In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, BooleanType, TimestampType

# Initialize Spark Session
spark = SparkSession.builder \
    .appName("KafkaBTCConsumer") \
    .master("local[*]") \
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.4.1") \
    .getOrCreate()

25/03/13 19:08:31 WARN Utils: Your hostname, osbdet resolves to a loopback address: 127.0.0.1; using 10.0.2.15 instead (on interface enp0s1)
25/03/13 19:08:31 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Ivy Default Cache set to: /home/osbdet/.ivy2/cache
The jars for the packages stored in: /home/osbdet/.ivy2/jars
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-dff98fd8-48f3-46a9-a8e7-feb3350e6384;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.4.1 in central


:: loading settings :: url = jar:file:/home/osbdet/.jupyter_venv/lib/python3.11/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.4.1 in central
	found org.apache.kafka#kafka-clients;3.3.2 in central
	found org.lz4#lz4-java;1.8.0 in central
	found org.xerial.snappy#snappy-java;1.1.10.1 in central
	found org.slf4j#slf4j-api;2.0.6 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.4 in central
	found org.apache.hadoop#hadoop-client-api;3.3.4 in central
	found commons-logging#commons-logging;1.1.3 in central
	found com.google.code.findbugs#jsr305;3.0.0 in central
	found org.apache.commons#commons-pool2;2.11.1 in central
:: resolution report :: resolve 265ms :: artifacts dl 6ms
	:: modules in use:
	com.google.code.findbugs#jsr305;3.0.0 from central in [default]
	commons-logging#commons-logging;1.1.3 from central in [default]
	org.apache.commons#commons-pool2;2.11.1 from central in [default]
	org.apache.hadoop#hadoop-client-api;3.3.4 from central in [default]
	org.apache.hadoop#hadoop-client-runtime;3.3.4 from central in [default]
	org.apache.ka

In [2]:

# Define Kafka topic and broker
kafka_topic = "btc_price"
kafka_bootstrap_servers = "localhost:9092"  # Change if using a remote Kafka server

In [3]:


# Define the schema of incoming data
schema = StructType([
    StructField("date", StringType(), True),  # Change to TimestampType() if timestamp format is consistent
    StructField("favorites", DoubleType(), True),
    StructField("id", StringType(), True),  # Use StringType() to avoid precision issues with large numbers
    StructField("isRetweet", BooleanType(), True),
    StructField("retweets", DoubleType(), True),
    StructField("text", StringType(), True)
])


In [4]:

# Read data from Kafka
kafka_df = spark.readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", kafka_bootstrap_servers) \
    .option("subscribe", kafka_topic) \
    .option("startingOffsets", "latest") \
    .load()

In [5]:

# Deserialize Kafka message value (assuming JSON format)
parsed_df = kafka_df.selectExpr("CAST(value AS STRING)") \
    .select(from_json(col("value"), schema).alias("data")) \
    .select("data.*")

# Convert 'date' to TimestampType for proper handling (optional)
parsed_df = parsed_df.withColumn("date", col("date").cast(TimestampType()))

In [6]:
# Display the streaming data in console
query = parsed_df.writeStream \
    .outputMode("append") \
    .format("console") \
    .option("truncate", "false") \
    .start()

query.awaitTermination()


25/03/13 19:08:33 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-3ff1d983-00ea-4d2c-8e98-645602c06581. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
25/03/13 19:08:33 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
25/03/13 19:08:34 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.


-------------------------------------------
Batch: 0
-------------------------------------------
+----+---------+---+---------+--------+----+
|date|favorites|id |isRetweet|retweets|text|
+----+---------+---+---------+--------+----+
+----+---------+---+---------+--------+----+



                                                                                

-------------------------------------------
Batch: 1
-------------------------------------------
+-------------------+---------+------------------+---------+--------+--------------------------------------------------------------------------------------------------+
|date               |favorites|id                |isRetweet|retweets|text                                                                                              |
+-------------------+---------+------------------+---------+--------+--------------------------------------------------------------------------------------------------+
|2012-09-27 22:51:48|13.0     |251423922069839874|false    |20.0    |@BENJAMINPRICE18  Go for the gold standard--aim high. Be tenacious and have passion for your work.|
+-------------------+---------+------------------+---------+--------+--------------------------------------------------------------------------------------------------+

-------------------------------------------
Batch: 2
----

                                                                                

-------------------------------------------
Batch: 4
-------------------------------------------
+-------------------+---------+------------------+---------+--------+----------------------------------------------------------------------------------------------------------------------+
|date               |favorites|id                |isRetweet|retweets|text                                                                                                                  |
+-------------------+---------+------------------+---------+--------+----------------------------------------------------------------------------------------------------------------------+
|2014-11-24 20:45:44|292.0    |536968931278598144|false    |195.0   |Congratulations to Chuck Hagel on one of the shortest tenures as Sec. of Defense. Another terrible appointee by Obama.|
+-------------------+---------+------------------+---------+--------+------------------------------------------------------------------------------

                                                                                

-------------------------------------------
Batch: 5
-------------------------------------------
+-------------------+---------+-------------------+---------+--------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|date               |favorites|id                 |isRetweet|retweets|text                                                                                                                                                                                                                                                                                            |
+-------------------+---------+-------------------+---------+--------+-----------------------------------------------------------------------------------------------------------------

                                                                                

-------------------------------------------
Batch: 6
-------------------------------------------
+-------------------+---------+------------------+---------+--------+---------------------------------------------------------------------------------------------------------------------+
|date               |favorites|id                |isRetweet|retweets|text                                                                                                                 |
+-------------------+---------+------------------+---------+--------+---------------------------------------------------------------------------------------------------------------------+
|2011-09-06 22:23:34|105.0    |111172713015545856|false    |167.0   |@BarackObama trillion $ stimulus was supposed to create jobs -- instead it gave us more debt and higher unemployment.|
+-------------------+---------+------------------+---------+--------+----------------------------------------------------------------------------------

                                                                                

-------------------------------------------
Batch: 7
-------------------------------------------
+-------------------+---------+------------------+---------+--------+--------------------------------------------------------------------------------------------+
|date               |favorites|id                |isRetweet|retweets|text                                                                                        |
+-------------------+---------+------------------+---------+--------+--------------------------------------------------------------------------------------------+
|2024-01-06 22:05:42|0.0      |111711036554255473|true     |0.0     |"RT @realDonaldTrumpThe stakes of this election could not be higher. Under Crooked Joe Biden|
+-------------------+---------+------------------+---------+--------+--------------------------------------------------------------------------------------------+



                                                                                

-------------------------------------------
Batch: 8
-------------------------------------------
+-------------------+---------+------------------+---------+--------+------------------------------------------------------------------------------------------------------------------------------------------------+
|date               |favorites|id                |isRetweet|retweets|text                                                                                                                                            |
+-------------------+---------+------------------+---------+--------+------------------------------------------------------------------------------------------------------------------------------------------------+
|2018-01-18 00:36:44|57473.0  |953773110553186312|false    |12213.0 |Main Street is BACK! Strongest Holiday Sales bump since the Great Recession -- beating forecasts by BILLIONS OF DOLLARS. https://t.co/kDcGbMieSt|
+-------------------+---------+------------

                                                                                

-------------------------------------------
Batch: 9
-------------------------------------------
+-------------------+---------+------------------+---------+--------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|date               |favorites|id                |isRetweet|retweets|text                                                                                                                                                                                                                                                                                                                                                                 |
+-------------------+---------+----------------

                                                                                

-------------------------------------------
Batch: 10
-------------------------------------------
+-------------------+---------+------------------+---------+--------+-----------------------------------------------------------------------------------------------------------------------------------------+
|date               |favorites|id                |isRetweet|retweets|text                                                                                                                                     |
+-------------------+---------+------------------+---------+--------+-----------------------------------------------------------------------------------------------------------------------------------------+
|2015-09-05 23:54:41|7162.0   |640281925270286336|false    |4564.0  |The hedge fund guys (gals) have to pay higher taxes ASAP. They are paying practically nothing. We must reduce taxes for the middle class!|
+-------------------+---------+------------------+---------+--------+-

                                                                                

-------------------------------------------
Batch: 12
-------------------------------------------
+-------------------+---------+------------------+---------+--------+---------------------------------------------------------------------------+
|date               |favorites|id                |isRetweet|retweets|text                                                                       |
+-------------------+---------+------------------+---------+--------+---------------------------------------------------------------------------+
|2012-12-11 22:28:15|128.0    |278612182021853184|false    |223.0   |The Republicans must use the debt ceiling as leverage to make a great deal!|
+-------------------+---------+------------------+---------+--------+---------------------------------------------------------------------------+

-------------------------------------------
Batch: 13
-------------------------------------------
+-------------------+---------+-------------------+---------+--------+---

                                                                                

-------------------------------------------
Batch: 15
-------------------------------------------
+-------------------+---------+------------------+---------+--------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|date               |favorites|id                |isRetweet|retweets|text                                                                                                                                                                                                                                                                                                               

                                                                                

-------------------------------------------
Batch: 35
-------------------------------------------
+-------------------+---------+------------------+---------+--------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|date               |favorites|id                |isRetweet|retweets|text                                                                                                                                                                                

                                                                                

-------------------------------------------
Batch: 36
-------------------------------------------
+-------------------+---------+-------------------+---------+--------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|date               |favorites|id                 |isRetweet|retweets|text                                                                                                                                                                                                       |
+-------------------+---------+-------------------+---------+--------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|2019-09-07 05:05:01|58931.0  |1170171081853931520|false    |13923.0 |China j

                                                                                

-------------------------------------------
Batch: 37
-------------------------------------------
+-------------------+---------+------------------+---------+--------+--------------------------------------------------------------------------------------------------------------------------+
|date               |favorites|id                |isRetweet|retweets|text                                                                                                                      |
+-------------------+---------+------------------+---------+--------+--------------------------------------------------------------------------------------------------------------------------+
|2023-03-29 19:09:13|8506.0   |110107672829779373|false    |2728.0  |"https://justthenews.com/government/security/tuetwo-years-later-jan-6-video-footage-raises-new-questions-about-police-and"|
+-------------------+---------+------------------+---------+--------+-------------------------------------------------------------

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/home/osbdet/.jupyter_venv/lib/python3.11/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/osbdet/.jupyter_venv/lib/python3.11/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/socket.py", line 706, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt


KeyboardInterrupt: 