In [1]:
import pandas as pd
pd.set_option('display.max_colwidth', None)

In [2]:
import numpy as np
np.bool = np.bool_

In [3]:
#current notebook name
notebook_name = __session__.replace('.ipynb','')[__session__.rfind('/')+1:] 

In [4]:
# HDFS base paths
hdfs_lakehouse_base_path = 'hdfs://localhost:9000/lakehouse/'
hdfs_warehouse_base_path = 'hdfs://localhost:9000/warehouse'

In [5]:
import os
dependencies = ["org.apache.spark:spark-avro_2.12:3.5.0",
                "io.delta:delta-iceberg_2.12:3.0.0","org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.0"]
os.environ['PYSPARK_SUBMIT_ARGS']= f"--packages {','.join(dependencies)} pyspark-shell"
os.environ['PYARROW_IGNORE_TIMEZONE'] = 'true'

In [6]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json, udf
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, BooleanType
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Initialize Spark Session with Kafka Support
spark = (SparkSession.builder
    .appName(notebook_name)
    .config("spark.log.level", "ERROR")
    .config("spark.sql.warehouse.dir", hdfs_warehouse_base_path)
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.sql.streaming.schemaInference", "true")  # Enable schema inference for streaming data
    .config("spark.jars.packages", 
            "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.0,"  # Kafka integration
            "io.delta:delta-core_2.12:2.4.0")  # Delta Lake support
    .enableHiveSupport()
    .getOrCreate()
)

25/03/17 20:19:31 WARN Utils: Your hostname, osbdet resolves to a loopback address: 127.0.0.1; using 10.0.2.15 instead (on interface enp0s1)
25/03/17 20:19:31 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Ivy Default Cache set to: /home/osbdet/.ivy2/cache
The jars for the packages stored in: /home/osbdet/.ivy2/jars
org.apache.spark#spark-avro_2.12 added as a dependency
io.delta#delta-iceberg_2.12 added as a dependency
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-5c1e93d2-85ff-4ce5-a4b7-232da663b64b;1.0
	confs: [default]
	found org.apache.spark#spark-avro_2.12;3.5.0 in central


:: loading settings :: url = jar:file:/home/osbdet/.jupyter_venv/lib/python3.11/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


	found org.tukaani#xz;1.9 in central
	found io.delta#delta-iceberg_2.12;3.0.0 in central
	found io.delta#delta-spark_2.12;3.0.0 in central
	found io.delta#delta-storage;3.0.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
	found org.scala-lang.modules#scala-collection-compat_2.12;2.1.1 in central
	found com.github.ben-manes.caffeine#caffeine;2.9.3 in central
	found org.checkerframework#checker-qual;3.19.0 in central
	found com.google.errorprone#error_prone_annotations;2.10.0 in central
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.5.0 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.5.0 in central
	found org.apache.kafka#kafka-clients;3.4.1 in central
	found org.lz4#lz4-java;1.8.0 in central
	found org.xerial.snappy#snappy-java;1.1.10.3 in central
	found org.slf4j#slf4j-api;2.0.7 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.4 in central
	found org.apache.hadoop#hadoop-client-api;3.3.4 in central
	found commons-logging#commons

In [7]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json, udf
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, BooleanType
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Define schema for incoming Twitter data
twitter_schema = StructType([
    StructField("date", StringType(), True),
    StructField("favorites", DoubleType(), True),
    StructField("id", StringType(), True),
    StructField("isRetweet", BooleanType(), True),
    StructField("retweets", DoubleType(), True),
    StructField("text", StringType(), True)
])


twitter_stream = (spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("subscribe", "btc_price")
    .option("startingOffsets", "latest")
    .load()
)
# Extract JSON from Kafka's `value` column
twitter_df = twitter_stream.withColumn(
    "parsed_value", from_json(col("value").cast("string"), twitter_schema)
).select("parsed_value.*")


In [8]:
import yfinance as yf
import requests
from pyspark.sql.functions import col
from pyspark.sql.types import StructType, StructField, StringType, FloatType, IntegerType, FloatType
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from pyspark.ml.regression import RandomForestRegressionModel
from pyspark.ml.feature import VectorAssembler


# Define HDFS path where the model is stored
model = RandomForestRegressionModel.load(f"{hdfs_lakehouse_base_path}/gold/trump_btc/rf_model")
print("✅ Model loaded successfully!")


# Initialize Sentiment Analyzer
analyzer = SentimentIntensityAnalyzer()

THRESHOLD_PERCENT = 5  # If predicted price is 5% above the current price, we BUY

def process_batch(batch_df, batch_id):

    if not batch_df.isEmpty():
        print(f"🚀 Processing Batch {batch_id}")

        # Convert Spark DataFrame to Pandas for sentiment analysis
        pdf = batch_df.toPandas()

        # Apply Sentiment Analysis using VADER
        pdf["sentiment_score"] = pdf["text"].apply(
            lambda text: analyzer.polarity_scores(text)["compound"] if isinstance(text, str) else 0.0
        )

        # Convert back to Spark DataFrame
        spark_df = spark.createDataFrame(pdf)

        # Extract the **average** sentiment score for all tweets in this batch
        avg_sentiment = spark_df.selectExpr("avg(sentiment_score) as avg_sentiment").collect()[0]["avg_sentiment"]

        # Fetch BTC & Fear/Greed Data and merge it with avg_sentiment
        model_data, current_btc_price = fetch_btc_fng_for_model(spark, avg_sentiment)

        # Convert feature columns into a vector column for Spark ML (Updated Columns)
        feature_columns = ["sentiment_score", "is_president", "Value", "BTC_Volume", "btc_current_price"]
        assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
        model_data = assembler.transform(model_data)

        # Apply the trained Spark ML model for prediction
        predictions = model.transform(model_data)

        # Select final output columns
        final_df = predictions.select(
            "sentiment_score", "is_president", "Value", "BTC_Volume", "btc_current_price", col("prediction").alias("predicted_btc_price")
        )

        # Convert final DataFrame to Pandas for decision-making
        final_pdf = final_df.toPandas()

        # Extract predicted price
        predicted_price = final_pdf["predicted_btc_price"].iloc[0]

        # Calculate percentage difference
        percent_change = ((predicted_price - current_btc_price) / current_btc_price) * 100

        # Trading Decision
        if percent_change > THRESHOLD_PERCENT:
            decision = "🚀 BUY"
        else:
            decision = "❌ DO NOTHING"

        # Print trading decision
        print(f"Current BTC Price: ${current_btc_price:.2f}")
        print(f"Predicted BTC Price: ${predicted_price:.2f}")
        print(f"Price Change: {percent_change:.2f}%")
        print(f"Trading Decision: {decision}")

        # Show predictions
        final_df.show(truncate=False)

def fetch_btc_fng_for_model(spark, sentiment_score):
    """
    Fetches the latest BTC price, volume, and Fear & Greed Index, 
    and returns a Spark DataFrame with the required columns.

    Parameters:
        spark (SparkSession): Active Spark session.
        sentiment_score (float): Average sentiment score from tweets.

    Returns:
        Spark DataFrame with selected columns for the ML model.
        Current BTC price (float) for decision-making.
    """

    # 🟢 Step 1: Get BTC Price and Volume
    btc = yf.Ticker("BTC-USD")
    latest_data = btc.history(period="1d")

    if not latest_data.empty:
        btc_current_price = float(np.random.uniform(2000, 10000)) #float(latest_data['Close'].iloc[-1])  # Current BTC price
        latest_volume = float(latest_data['Volume'].iloc[-1])  # Current BTC volume
        latest_date = latest_data.index[-1].strftime('%Y-%m-%d')
    else:
        btc_current_price = None
        latest_volume = None
        latest_date = datetime.today().strftime('%Y-%m-%d')

    # 🟢 Step 2: Get Fear & Greed Index
    response = requests.get("https://api.alternative.me/fng/?limit=1")
    data = response.json()

    index_value = int(data['data'][0]['value'])  # Fear & Greed Index
    classification = data['data'][0]['value_classification']

    # 🟢 Step 3: Define Schema for Model DataFrame (Updated Columns)
    schema = StructType([
        StructField("sentiment_score", FloatType(), True),
        StructField("is_president", IntegerType(), False),
        StructField("Value", IntegerType(), False),  # Renaming index_value as "Value"
        StructField("BTC_Volume", FloatType(), True),
        StructField("btc_current_price", FloatType(), True),
    ])

    # 🟢 Step 4: Create a Single Spark DataFrame
    model_df = spark.createDataFrame([(sentiment_score, 1, index_value, latest_volume, btc_current_price)], schema=schema)

    return model_df, btc_current_price


✅ Model loaded successfully!


In [9]:
query = twitter_df.writeStream \
    .foreachBatch(process_batch) \
    .start()

query.awaitTermination()


                                                                                

🚀 Processing Batch 1
Current BTC Price: $2797.57
Predicted BTC Price: $10808.41
Price Change: 286.35%
Trading Decision: 🚀 BUY
+---------------+------------+-----+-------------+-----------------+-------------------+
|sentiment_score|is_president|Value|BTC_Volume   |btc_current_price|predicted_btc_price|
+---------------+------------+-----+-------------+-----------------+-------------------+
|-0.9413        |1           |32   |2.66835702E10|2797.567         |10808.411351983068 |
+---------------+------------+-----+-------------+-----------------+-------------------+

🚀 Processing Batch 2


                                                                                

Current BTC Price: $6064.85
Predicted BTC Price: $9677.52
Price Change: 59.57%
Trading Decision: 🚀 BUY
+---------------+------------+-----+-------------+-----------------+-------------------+
|sentiment_score|is_president|Value|BTC_Volume   |btc_current_price|predicted_btc_price|
+---------------+------------+-----+-------------+-----------------+-------------------+
|0.3565         |1           |32   |2.66835702E10|6064.847         |9677.517685914816  |
+---------------+------------+-----+-------------+-----------------+-------------------+



ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/home/osbdet/.jupyter_venv/lib/python3.11/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/osbdet/.jupyter_venv/lib/python3.11/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/socket.py", line 706, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt


KeyboardInterrupt: 