In [1]:
import threading

# Helper thread that keeps the Spark StreamingContext from blocking the Jupyter kernel
        
class StreamingThread(threading.Thread):
    """Thread wrapper that drives a streaming context off the notebook's main thread.

    ``run`` starts the wrapped context and then blocks in ``awaitTermination``,
    so the blocking wait happens on this thread instead of in the calling cell.
    ``stop`` asks the wrapped context to shut down.
    """

    def __init__(self, ssc):
        """Remember the streaming context (``ssc``) this thread will drive."""
        super(StreamingThread, self).__init__()
        self.ssc = ssc

    def run(self):
        """Thread body: start the context, then wait until it terminates."""
        context = self.ssc
        context.start()
        context.awaitTermination()

    def stop(self):
        """Print a notice and stop the streaming context.

        The stop is requested as graceful and leaves the underlying
        SparkContext running (``stopSparkContext=False``).
        """
        print('----- Stopping... this may take a few seconds -----')
        shutdown_options = {'stopSparkContext': False, 'stopGraceFully': True}
        self.ssc.stop(**shutdown_options)

In [2]:
# Display the SparkContext. `sc` is not created in this notebook, so it is
# presumably injected by the kernel/environment -- TODO confirm.
sc

In [3]:
# Display the SparkSession. `spark` is not created in this notebook, so it is
# presumably injected by the kernel/environment -- TODO confirm.
spark

In [4]:
import random
from pyspark.streaming import StreamingContext
from pyspark.sql import Row
from pyspark.sql.functions import udf, struct, array, col, lit
from pyspark.sql.types import StringType

In [5]:
# Module-level cache state used by process(): a flag recording whether the
# model has been loaded yet, and the slot that will hold the loaded model.
globals().update(models_loaded=False, my_model=None)

# Placeholder scorer: a real implementation would use the model cached in
# globals()['my_model'] instead of a random number.
def predict(df):
    """Return a pseudo-random probability in [0.0, 1.0).

    The ``df`` argument is accepted but not used by this toy implementation.
    """
    probability = random.random()
    return probability

# Wrap predict() as a Spark UDF. predict() returns a Python float, so the
# return type is declared as "double" (a DDL type string accepted by udf());
# declaring StringType() here would make the `pred` column text instead of numeric.
predict_udf = udf(predict, "double")

def process(time, rdd):
    """Handle one batch of the stream: parse it, display it, and add a prediction.

    Registered in this notebook as a ``foreachRDD`` callback.

    Parameters
    ----------
    time
        Value identifying the batch; only used (via ``str``) in the printed banner.
    rdd
        The batch contents. It is handed to ``spark.read.json``, so it is
        expected to hold JSON records. Empty batches are skipped.

    Returns
    -------
    None
    """
    # Nothing arrived in this batch: skip it without printing anything.
    if rdd.isEmpty():
        return

    banner = "========= %s =========" % str(time)
    print(banner)

    # Turn the batch into a data frame and show the raw records.
    df = spark.read.json(rdd)
    df.show()

    # Pack every column into a single struct so the UDF receives the whole
    # row, then attach the UDF result as the "pred" column.
    row_as_struct = struct([df[name] for name in df.columns])
    df_withpreds = df.withColumn("pred", predict_udf(row_as_struct))
    df_withpreds.show()

    # Predicting through a Python UDF (User Defined Function), as above, works,
    # but normally you would use an MLlib model that you built and saved with Spark.
    # Such a model must not be reloaded on every call to "process", so it is
    # loaded once and cached in module-level globals.
    namespace = globals()
    if not namespace['models_loaded']:
        # Load your models here.
        namespace['my_model'] = '***'  # Replace '***' with e.g.:    [...].load('my_logistic_regression')
        namespace['models_loaded'] = True

    # To predict with the loaded model, uncomment the lines below:

    # df_result = globals()['my_model'].transform(df)
    # df_result.show()

In [6]:
# Create the streaming context on top of the existing SparkContext, with a
# batch duration of 10 (seconds, per the PySpark StreamingContext API).
ssc = StreamingContext(sparkContext=sc, batchDuration=10)



In [7]:
# Read text lines from the remote socket and pass every batch to process().
lines = ssc.socketTextStream(hostname="seppe.net", port=7778)
lines.foreachRDD(process)

In [8]:
# Run the streaming context on a background thread so this cell returns
# instead of blocking in awaitTermination(); output from process() keeps
# appearing below while the stream is running.
ssc_t = StreamingThread(ssc)
ssc_t.start()

24/04/09 14:30:38 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/04/09 14:30:38 WARN BlockManager: Block input-0-1712665838600 replicated to only 0 peer(s) instead of 1 peers
                                                                                



24/04/09 14:30:41 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/04/09 14:30:41 WARN BlockManager: Block input-0-1712665840800 replicated to only 0 peer(s) instead of 1 peers
                                                                                

+--------+--------+---------+---------+-------------------+--------------------+------------+--------------------+--------------------+--------+-----+
|     aid|comments|   domain|frontpage|          posted_at|         source_text|source_title|               title|                 url|    user|votes|
+--------+--------+---------+---------+-------------------+--------------------+------------+--------------------+--------------------+--------+-----+
|39975168|       0|vaxry.net|     true|2024-04-09 00:34:16|Vaxry's Blog\n\nV...|Vaxry's Blog|Freedesktop/RedHa...|https://blog.vaxr...|jamesp33|    7|
+--------+--------+---------+---------+-------------------+--------------------+------------+--------------------+--------------------+--------+-----+



24/04/09 14:30:43 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/04/09 14:30:43 WARN BlockManager: Block input-0-1712665842800 replicated to only 0 peer(s) instead of 1 peers
                                                                                

+--------+--------+---------+---------+-------------------+--------------------+------------+--------------------+--------------------+--------+-----+-------------------+
|     aid|comments|   domain|frontpage|          posted_at|         source_text|source_title|               title|                 url|    user|votes|               pred|
+--------+--------+---------+---------+-------------------+--------------------+------------+--------------------+--------------------+--------+-----+-------------------+
|39975168|       0|vaxry.net|     true|2024-04-09 00:34:16|Vaxry's Blog\n\nV...|Vaxry's Blog|Freedesktop/RedHa...|https://blog.vaxr...|jamesp33|    7|0.23992257892086788|
+--------+--------+---------+---------+-------------------+--------------------+------------+--------------------+--------------------+--------+-----+-------------------+



24/04/09 14:30:46 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/04/09 14:30:46 WARN BlockManager: Block input-0-1712665845800 replicated to only 0 peer(s) instead of 1 peers
24/04/09 14:30:50 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/04/09 14:30:50 WARN BlockManager: Block input-0-1712665849800 replicated to only 0 peer(s) instead of 1 peers
                                                                                

+--------+--------+------------------+---------+-------------------+--------------------+--------------------+--------------------+--------------------+--------+-----+
|     aid|comments|            domain|frontpage|          posted_at|         source_text|        source_title|               title|                 url|    user|votes|
+--------+--------+------------------+---------+-------------------+--------------------+--------------------+--------------------+--------------------+--------+-----+
|39975182|       0|cosmosmagazine.com|    false|2024-04-09 00:37:23|Why human ancesto...|Why human ancesto...|Human ancestors v...|https://cosmosmag...|    geox|    2|
|39975191|       0|        morphic.sh|    false|2024-04-09 00:39:32|Morphic\n\nMorphi...|             Morphic|Morphic: Open-sou...|https://www.morph...|mpereira|    1|
|39975196|       0|        ollama.com|    false|2024-04-09 00:40:21|Embedding models ...|Embedding models ...|    Embedding Models|https://ollama.co...| ushakov

24/04/09 14:30:53 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/04/09 14:30:53 WARN BlockManager: Block input-0-1712665852800 replicated to only 0 peer(s) instead of 1 peers
24/04/09 14:30:58 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/04/09 14:30:58 WARN BlockManager: Block input-0-1712665857800 replicated to only 0 peer(s) instead of 1 peers
                                                                                

+--------+--------+--------------------+---------+-------------------+--------------------+--------------------+--------------------+--------------------+---------+-----+
|     aid|comments|              domain|frontpage|          posted_at|         source_text|        source_title|               title|                 url|     user|votes|
+--------+--------+--------------------+---------+-------------------+--------------------+--------------------+--------------------+--------------------+---------+-----+
|39975228|       0|bleepingcomputer.com|    false|2024-04-09 00:45:29|Just a moment...\...|    Just a moment...|Fake Facebook Mid...|https://www.bleep...|     cdme|    3|
|39975230|       0|           axios.com|    false|2024-04-09 00:45:58|Just a moment...\...|    Just a moment...|Meta will label A...|https://www.axios...|     cdme|    1|
|39975232|       0|           nautil.us|    false|2024-04-09 00:46:17|The Paradox of th...|The Paradox of th...|The Paradox of th...|https://naut

24/04/09 14:31:02 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/04/09 14:31:02 WARN BlockManager: Block input-0-1712665862000 replicated to only 0 peer(s) instead of 1 peers


In [9]:
# Request a graceful stop of the streaming context via the helper thread;
# StreamingThread.stop() passes stopSparkContext=False, so `sc` stays usable.
ssc_t.stop()

----- Stopping... this may take a few seconds -----


24/04/09 14:31:03 WARN SocketReceiver: Error receiving data
java.net.SocketException: Socket closed
	at java.net.SocketInputStream.socketRead0(Native Method)
	at java.net.SocketInputStream.socketRead(SocketInputStream.java:116)
	at java.net.SocketInputStream.read(SocketInputStream.java:171)
	at java.net.SocketInputStream.read(SocketInputStream.java:141)
	at sun.nio.cs.StreamDecoder.readBytes(StreamDecoder.java:284)
	at sun.nio.cs.StreamDecoder.implRead(StreamDecoder.java:326)
	at sun.nio.cs.StreamDecoder.read(StreamDecoder.java:178)
	at java.io.InputStreamReader.read(InputStreamReader.java:184)
	at java.io.BufferedReader.fill(BufferedReader.java:161)
	at java.io.BufferedReader.readLine(BufferedReader.java:324)
	at java.io.BufferedReader.readLine(BufferedReader.java:389)
	at org.apache.spark.streaming.dstream.SocketReceiver$$anon$2.getNext(SocketInputDStream.scala:121)
	at org.apache.spark.streaming.dstream.SocketReceiver$$anon$2.getNext(SocketInputDStream.scala:119)
	at org.apache.spar

+--------+--------+------------+---------+-------------------+--------------------+--------------------+--------------------+--------------------+----+-----+
|     aid|comments|      domain|frontpage|          posted_at|         source_text|        source_title|               title|                 url|user|votes|
+--------+--------+------------+---------+-------------------+--------------------+--------------------+--------------------+--------------------+----+-----+
|39975234|       0|mashable.com|     true|2024-04-09 00:46:23|X's AI chatbot Gr...|Elon Musk's X pus...|X's AI chatbot Gr...|https://mashable....|cdme|    3|
+--------+--------+------------+---------+-------------------+--------------------+--------------------+--------------------+--------------------+----+-----+

+--------+--------+------------+---------+-------------------+--------------------+--------------------+--------------------+--------------------+----+-----+------------------+
|     aid|comments|      domain|