In [1]:
import pandas as pd
pd.set_option('display.max_colwidth', None)

In [2]:
import numpy as np
np.bool = np.bool_

In [3]:
#current notebook name
notebook_name = __session__.replace('.ipynb','')[__session__.rfind('/')+1:] 

In [4]:
# HDFS base paths
hdfs_lakehouse_base_path = 'hdfs://localhost:9000/lakehouse/'
hdfs_warehouse_base_path = 'hdfs://localhost:9000/warehouse'

In [5]:
import os
dependencies = ["org.apache.spark:spark-avro_2.12:3.5.0",
                "io.delta:delta-iceberg_2.12:3.0.0","org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.0"]
os.environ['PYSPARK_SUBMIT_ARGS']= f"--packages {','.join(dependencies)} pyspark-shell"
os.environ['PYARROW_IGNORE_TIMEZONE'] = 'true'

In [6]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json, udf
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, BooleanType
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Initialize Spark Session with Kafka Support
spark = (SparkSession.builder
    .appName(notebook_name)
    .config("spark.log.level", "ERROR")
    .config("spark.sql.warehouse.dir", hdfs_warehouse_base_path)
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.sql.streaming.schemaInference", "true")  # Enable schema inference for streaming data
    .config("spark.jars.packages", 
            "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.0,"  # Kafka integration
            "io.delta:delta-core_2.12:2.4.0")  # Delta Lake support
    .enableHiveSupport()
    .getOrCreate()
)

25/03/20 18:58:52 WARN Utils: Your hostname, osbdet resolves to a loopback address: 127.0.0.1; using 10.0.2.15 instead (on interface enp0s1)
25/03/20 18:58:52 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Ivy Default Cache set to: /home/osbdet/.ivy2/cache
The jars for the packages stored in: /home/osbdet/.ivy2/jars
org.apache.spark#spark-avro_2.12 added as a dependency
io.delta#delta-iceberg_2.12 added as a dependency
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-1229d953-00e0-4222-8e9c-8cfb10ab3fe0;1.0
	confs: [default]
	found org.apache.spark#spark-avro_2.12;3.5.0 in central
	found org.tukaani#xz;1.9 in central
	found io.delta#delta-iceberg_2.12;3.0.0 in central


:: loading settings :: url = jar:file:/home/osbdet/.jupyter_venv/lib/python3.11/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


	found io.delta#delta-spark_2.12;3.0.0 in central
	found io.delta#delta-storage;3.0.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
	found org.scala-lang.modules#scala-collection-compat_2.12;2.1.1 in central
	found com.github.ben-manes.caffeine#caffeine;2.9.3 in central
	found org.checkerframework#checker-qual;3.19.0 in central
	found com.google.errorprone#error_prone_annotations;2.10.0 in central
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.5.0 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.5.0 in central
	found org.apache.kafka#kafka-clients;3.4.1 in central
	found org.lz4#lz4-java;1.8.0 in central
	found org.xerial.snappy#snappy-java;1.1.10.3 in central
	found org.slf4j#slf4j-api;2.0.7 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.4 in central
	found org.apache.hadoop#hadoop-client-api;3.3.4 in central
	found commons-logging#commons-logging;1.1.3 in central
	found com.google.code.findbugs#jsr305;3.0.0 in central
	found 

In [7]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json, udf
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, BooleanType
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Define schema for incoming Twitter data
twitter_schema = StructType([
    StructField("date", StringType(), True),
    StructField("favorites", DoubleType(), True),
    StructField("id", StringType(), True),
    StructField("isRetweet", BooleanType(), True),
    StructField("retweets", DoubleType(), True),
    StructField("text", StringType(), True)
])


twitter_stream = (spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("subscribe", "btc_price")
    .option("startingOffsets", "latest")
    .load()
)
# Extract JSON from Kafka's `value` column
twitter_df = twitter_stream.withColumn(
    "parsed_value", from_json(col("value").cast("string"), twitter_schema)
).select("parsed_value.*")


In [8]:
import yfinance as yf
import requests
from pyspark.sql.functions import col
from pyspark.sql.types import StructType, StructField, StringType, FloatType, IntegerType
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from pyspark.ml.classification import RandomForestClassificationModel
from pyspark.ml.feature import VectorAssembler
import random


# Define HDFS path where the classification model is stored
model = RandomForestClassificationModel.load(f"{hdfs_lakehouse_base_path}/gold/trump_btc/rf_model")
print("Classification Model Loaded Successfully!")

# Initialize Sentiment Analyzer
analyzer = SentimentIntensityAnalyzer()

def process_batch(batch_df, batch_id):
    if not batch_df.isEmpty():
        print(f"🚀 Processing Batch {batch_id}")

        # Convert Spark DataFrame to Pandas for sentiment analysis
        pdf = batch_df.toPandas()

        # Apply Sentiment Analysis using VADER
        pdf["sentiment_score"] = pdf["text"].apply(
            lambda text: analyzer.polarity_scores(text)["compound"] if isinstance(text, str) else 0.0
        )

        # Convert back to Spark DataFrame
        spark_df = spark.createDataFrame(pdf)

        # Extract the sentiment score for all tweets in this batch
        sentiment_score = spark_df.select("sentiment_score").collect()[0]["sentiment_score"]

        # Fetch BTC & Fear/Greed Data and merge it with sentiment
        model_data, current_btc_price = fetch_btc_fng_for_model(spark, sentiment_score)

        # Convert feature columns into a vector column for Spark ML
        feature_columns = ["sentiment_score", "is_president", "Value" ,"BTC_Volume", "btc_current_price"]
        assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
        model_data = assembler.transform(model_data)

        # Apply the trained Spark ML classification model for prediction
        predictions = model.transform(model_data)

        # Select final output columns
        final_df = predictions.select(
            "sentiment_score", "is_president", "Value", "BTC_Volume", "btc_current_price", col("prediction").alias("btc_price_increased")
        )

        # Convert final DataFrame to Pandas for decision-making
        final_pdf = final_df.toPandas()

        # Extract classification prediction (1 = BUY, 0 = DON'T BUY)
        btc_price_increased = int(final_pdf["btc_price_increased"].iloc[0])

        # Trading Decision Logic
        decision = " BUY!!!!!" if btc_price_increased == 1 else "DO NOTHING"

        # Print Trading Decision
        print(f"Current BTC Price: ${current_btc_price:.2f}")
        print(f"Predicted BTC Price Increase: {'YES' if btc_price_increased == 1 else 'NO'}")
        print(f"Trading Decision: {decision}")

        # Show Predictions
        final_df.show(truncate=False)

def get_random_volume():
    return random.randrange(int(3.67e5), int(1.09e11), int(5e9))

def fetch_btc_fng_for_model(spark, sentiment_score):

    
    btc = yf.Ticker("BTC-USD")
    latest_data = btc.history(period="1d")

    if not latest_data.empty:
        btc_current_price = float(latest_data['Close'].iloc[-1])
        latest_volume = float(get_random_volume()) #float(latest_data['Volume'].iloc[-1])  # Current BTC volume
        latest_date = latest_data.index[-1].strftime('%Y-%m-%d')
    else:
        btc_current_price = None
        latest_volume = None
        latest_date = datetime.today().strftime('%Y-%m-%d')

    #Get Fear & Greed Index
    response = requests.get("https://api.alternative.me/fng/?limit=1")
    data = response.json()

    index_value = int(data['data'][0]['value'])  # Fear & Greed Index
    classification = data['data'][0]['value_classification']

    # Define Schema
    schema = StructType([
        StructField("sentiment_score", FloatType(), True),
        StructField("is_president", IntegerType(), False),
        StructField("Value", IntegerType(), False),  # Renaming index_value as "Value"
        StructField("BTC_Volume", FloatType(), True),
        StructField("btc_current_price", FloatType(), True),
    ])

    #Create a Single Spark DataFrame
    #setting is president to 1
    model_df = spark.createDataFrame([(sentiment_score, 1, index_value, latest_volume, btc_current_price)], schema=schema)

    return model_df, btc_current_price


Classification Model Loaded Successfully!


In [9]:
query = twitter_df.writeStream \
    .foreachBatch(process_batch) \
    .start()

query.awaitTermination()


                                                                                

🚀 Processing Batch 1
Current BTC Price: $83981.65
Predicted BTC Price Increase: YES
Trading Decision: 🚀 BUY
+---------------+------------+-----+-------------+-----------------+-------------------+
|sentiment_score|is_president|Value|BTC_Volume   |btc_current_price|btc_price_increased|
+---------------+------------+-----+-------------+-----------------+-------------------+
|-0.5456        |1           |49   |1.00000367E11|83981.65         |1.0                |
+---------------+------------+-----+-------------+-----------------+-------------------+

🚀 Processing Batch 2
Current BTC Price: $83981.65
Predicted BTC Price Increase: NO
Trading Decision: DO NOTHING
+---------------+------------+-----+------------+-----------------+-------------------+
|sentiment_score|is_president|Value|BTC_Volume  |btc_current_price|btc_price_increased|
+---------------+------------+-----+------------+-----------------+-------------------+
|0.9767         |1           |49   |5.0000368E10|83981.65         |0.0

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/home/osbdet/.jupyter_venv/lib/python3.11/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/osbdet/.jupyter_venv/lib/python3.11/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/socket.py", line 706, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt


KeyboardInterrupt: 

🚀 Processing Batch 8


                                                                                

Current BTC Price: $83884.02
Predicted BTC Price Increase: NO
Trading Decision: DO NOTHING
+---------------+------------+-----+------------+-----------------+-------------------+
|sentiment_score|is_president|Value|BTC_Volume  |btc_current_price|btc_price_increased|
+---------------+------------+-----+------------+-----------------+-------------------+
|0.9611         |1           |49   |8.5000364E10|83884.016        |0.0                |
+---------------+------------+-----+------------+-----------------+-------------------+

🚀 Processing Batch 9
Current BTC Price: $84018.68
Predicted BTC Price Increase: YES
Trading Decision: 🚀 BUY
+---------------+------------+-----+------------+-----------------+-------------------+
|sentiment_score|is_president|Value|BTC_Volume  |btc_current_price|btc_price_increased|
+---------------+------------+-----+------------+-----------------+-------------------+
|-0.7269        |1           |49   |8.0000369E10|84018.68         |1.0                |
+-------

                                                                                

🚀 Processing Batch 17
Current BTC Price: $84048.09
Predicted BTC Price Increase: NO
Trading Decision: DO NOTHING
+---------------+------------+-----+------------+-----------------+-------------------+
|sentiment_score|is_president|Value|BTC_Volume  |btc_current_price|btc_price_increased|
+---------------+------------+-----+------------+-----------------+-------------------+
|0.9666         |1           |49   |3.5000365E10|84048.086        |0.0                |
+---------------+------------+-----+------------+-----------------+-------------------+

🚀 Processing Batch 18
Current BTC Price: $84048.09
Predicted BTC Price Increase: YES
Trading Decision: 🚀 BUY
+---------------+------------+-----+------------+-----------------+-------------------+
|sentiment_score|is_president|Value|BTC_Volume  |btc_current_price|btc_price_increased|
+---------------+------------+-----+------------+-----------------+-------------------+
|0.8977         |1           |49   |9.0000368E10|84048.086        |1.0   

                                                                                

🚀 Processing Batch 19
Current BTC Price: $84048.09
Predicted BTC Price Increase: NO
Trading Decision: DO NOTHING
+---------------+------------+-----+------------+-----------------+-------------------+
|sentiment_score|is_president|Value|BTC_Volume  |btc_current_price|btc_price_increased|
+---------------+------------+-----+------------+-----------------+-------------------+
|0.9167         |1           |49   |5.0000368E10|84048.086        |0.0                |
+---------------+------------+-----+------------+-----------------+-------------------+

🚀 Processing Batch 20
Current BTC Price: $84048.09
Predicted BTC Price Increase: NO
Trading Decision: DO NOTHING
+---------------+------------+-----+-------------+-----------------+-------------------+
|sentiment_score|is_president|Value|BTC_Volume   |btc_current_price|btc_price_increased|
+---------------+------------+-----+-------------+-----------------+-------------------+
|0.2467         |1           |49   |1.50003671E10|84048.086       

                                                                                

🚀 Processing Batch 21
Current BTC Price: $84048.09
Predicted BTC Price Increase: NO
Trading Decision: DO NOTHING
+---------------+------------+-----+-------------+-----------------+-------------------+
|sentiment_score|is_president|Value|BTC_Volume   |btc_current_price|btc_price_increased|
+---------------+------------+-----+-------------+-----------------+-------------------+
|-0.7724        |1           |49   |1.50003671E10|84048.086        |0.0                |
+---------------+------------+-----+-------------+-----------------+-------------------+



                                                                                

🚀 Processing Batch 22
Current BTC Price: $84048.09
Predicted BTC Price Increase: NO
Trading Decision: DO NOTHING
+---------------+------------+-----+------------+-----------------+-------------------+
|sentiment_score|is_president|Value|BTC_Volume  |btc_current_price|btc_price_increased|
+---------------+------------+-----+------------+-----------------+-------------------+
|0.3818         |1           |49   |3.5000365E10|84048.086        |0.0                |
+---------------+------------+-----+------------+-----------------+-------------------+



                                                                                

🚀 Processing Batch 23
Current BTC Price: $84048.09
Predicted BTC Price Increase: YES
Trading Decision: 🚀 BUY
+---------------+------------+-----+------------+-----------------+-------------------+
|sentiment_score|is_president|Value|BTC_Volume  |btc_current_price|btc_price_increased|
+---------------+------------+-----+------------+-----------------+-------------------+
|-0.4738        |1           |49   |4.5000368E10|84048.086        |1.0                |
+---------------+------------+-----+------------+-----------------+-------------------+

🚀 Processing Batch 24
Current BTC Price: $84048.09
Predicted BTC Price Increase: NO
Trading Decision: DO NOTHING
+---------------+------------+-----+------------+-----------------+-------------------+
|sentiment_score|is_president|Value|BTC_Volume  |btc_current_price|btc_price_increased|
+---------------+------------+-----+------------+-----------------+-------------------+
|0.6505         |1           |49   |3.5000365E10|84048.086        |0.0   

                                                                                

🚀 Processing Batch 25
Current BTC Price: $84048.09
Predicted BTC Price Increase: NO
Trading Decision: DO NOTHING
+---------------+------------+-----+-------------+-----------------+-------------------+
|sentiment_score|is_president|Value|BTC_Volume   |btc_current_price|btc_price_increased|
+---------------+------------+-----+-------------+-----------------+-------------------+
|0.9797         |1           |49   |2.00003666E10|84048.086        |0.0                |
+---------------+------------+-----+-------------+-----------------+-------------------+

🚀 Processing Batch 26
Current BTC Price: $84048.09
Predicted BTC Price Increase: NO
Trading Decision: DO NOTHING
+---------------+------------+-----+----------+-----------------+-------------------+
|sentiment_score|is_president|Value|BTC_Volume|btc_current_price|btc_price_increased|
+---------------+------------+-----+----------+-----------------+-------------------+
|0.5244         |1           |49   |367000.0  |84048.086        |0.0  

                                                                                

Current BTC Price: $84099.41
Predicted BTC Price Increase: NO
Trading Decision: DO NOTHING
+---------------+------------+-----+-----------+-----------------+-------------------+
|sentiment_score|is_president|Value|BTC_Volume |btc_current_price|btc_price_increased|
+---------------+------------+-----+-----------+-----------------+-------------------+
|-0.6093        |1           |49   |5.0003671E9|84099.41         |0.0                |
+---------------+------------+-----+-----------+-----------------+-------------------+

🚀 Processing Batch 41


                                                                                

Current BTC Price: $84099.41
Predicted BTC Price Increase: NO
Trading Decision: DO NOTHING
+---------------+------------+-----+----------+-----------------+-------------------+
|sentiment_score|is_president|Value|BTC_Volume|btc_current_price|btc_price_increased|
+---------------+------------+-----+----------+-----------------+-------------------+
|0.3818         |1           |49   |367000.0  |84099.41         |0.0                |
+---------------+------------+-----+----------+-----------------+-------------------+

🚀 Processing Batch 42


                                                                                

Current BTC Price: $84099.41
Predicted BTC Price Increase: NO
Trading Decision: DO NOTHING
+---------------+------------+-----+------------+-----------------+-------------------+
|sentiment_score|is_president|Value|BTC_Volume  |btc_current_price|btc_price_increased|
+---------------+------------+-----+------------+-----------------+-------------------+
|0.9683         |1           |49   |1.0500037E11|84099.41         |0.0                |
+---------------+------------+-----+------------+-----------------+-------------------+

🚀 Processing Batch 43


                                                                                

Current BTC Price: $84099.41
Predicted BTC Price Increase: NO
Trading Decision: DO NOTHING
+---------------+------------+-----+----------+-----------------+-------------------+
|sentiment_score|is_president|Value|BTC_Volume|btc_current_price|btc_price_increased|
+---------------+------------+-----+----------+-----------------+-------------------+
|0.2244         |1           |49   |367000.0  |84099.41         |0.0                |
+---------------+------------+-----+----------+-----------------+-------------------+



                                                                                

🚀 Processing Batch 44
Current BTC Price: $84099.41
Predicted BTC Price Increase: YES
Trading Decision: 🚀 BUY
+---------------+------------+-----+------------+-----------------+-------------------+
|sentiment_score|is_president|Value|BTC_Volume  |btc_current_price|btc_price_increased|
+---------------+------------+-----+------------+-----------------+-------------------+
|0.4007         |1           |49   |8.0000369E10|84099.41         |1.0                |
+---------------+------------+-----+------------+-----------------+-------------------+

🚀 Processing Batch 45
Current BTC Price: $84112.97
Predicted BTC Price Increase: YES
Trading Decision: 🚀 BUY
+---------------+------------+-----+------------+-----------------+-------------------+
|sentiment_score|is_president|Value|BTC_Volume  |btc_current_price|btc_price_increased|
+---------------+------------+-----+------------+-----------------+-------------------+
|0.3818         |1           |49   |5.5000367E10|84112.97         |1.0       

                                                                                

🚀 Processing Batch 50
Current BTC Price: $84112.97
Predicted BTC Price Increase: NO
Trading Decision: DO NOTHING
+---------------+------------+-----+-----------+-----------------+-------------------+
|sentiment_score|is_president|Value|BTC_Volume |btc_current_price|btc_price_increased|
+---------------+------------+-----+-----------+-----------------+-------------------+
|0.9643         |1           |49   |5.0003671E9|84112.97         |0.0                |
+---------------+------------+-----+-----------+-----------------+-------------------+



                                                                                

🚀 Processing Batch 51
Current BTC Price: $84014.14
Predicted BTC Price Increase: NO
Trading Decision: DO NOTHING
+---------------+------------+-----+------------+-----------------+-------------------+
|sentiment_score|is_president|Value|BTC_Volume  |btc_current_price|btc_price_increased|
+---------------+------------+-----+------------+-----------------+-------------------+
|0.9544         |1           |49   |4.0000369E10|84014.14         |0.0                |
+---------------+------------+-----+------------+-----------------+-------------------+

🚀 Processing Batch 52
Current BTC Price: $84014.14
Predicted BTC Price Increase: YES
Trading Decision: 🚀 BUY
+---------------+------------+-----+------------+-----------------+-------------------+
|sentiment_score|is_president|Value|BTC_Volume  |btc_current_price|btc_price_increased|
+---------------+------------+-----+------------+-----------------+-------------------+
|-0.3595        |1           |49   |8.0000369E10|84014.14         |1.0   

                                                                                

🚀 Processing Batch 56
Current BTC Price: $84014.14
Predicted BTC Price Increase: YES
Trading Decision: 🚀 BUY
+---------------+------------+-----+------------+-----------------+-------------------+
|sentiment_score|is_president|Value|BTC_Volume  |btc_current_price|btc_price_increased|
+---------------+------------+-----+------------+-----------------+-------------------+
|-0.6249        |1           |49   |5.5000367E10|84014.14         |1.0                |
+---------------+------------+-----+------------+-----------------+-------------------+

🚀 Processing Batch 57
Current BTC Price: $84014.14
Predicted BTC Price Increase: YES
Trading Decision: 🚀 BUY
+---------------+------------+-----+------------+-----------------+-------------------+
|sentiment_score|is_president|Value|BTC_Volume  |btc_current_price|btc_price_increased|
+---------------+------------+-----+------------+-----------------+-------------------+
|0.0516         |1           |49   |1.0500037E11|84014.14         |1.0       

In [None]:
query = twitter_df.writeStream \
    .outputMode("append") \
    .format("console") \
    .start()

query.awaitTermination()
