In [1]:
import threading

# Helper thread that keeps the Spark StreamingContext from blocking Jupyter
        
class StreamingThread(threading.Thread):
    """Background thread that drives a Spark ``StreamingContext``.

    ``run`` starts the context and then waits on ``awaitTermination``, so the
    wait happens on this thread rather than on the thread that created it.
    """

    def __init__(self, ssc):
        """Keep a reference to the streaming context ``ssc`` to drive."""
        threading.Thread.__init__(self)
        self.ssc = ssc

    def run(self):
        """Thread body: start the context, then wait until it terminates."""
        context = self.ssc
        context.start()
        context.awaitTermination()

    def stop(self):
        """Print a notice, then ask the streaming context to stop gracefully.

        The call passes ``stopSparkContext=False``, so the underlying
        SparkContext is not stopped along with the streaming context.
        """
        notice = '----- Stopping... this may take a few seconds -----'
        print(notice)
        self.ssc.stop(stopSparkContext=False, stopGraceFully=True)

In [2]:
# Display the SparkContext. `sc` is not created in any visible cell —
# presumably it is provided by the PySpark session/kernel (TODO confirm).
sc

In [3]:
# Display the SparkSession. `spark` is not created in any visible cell —
# presumably it is provided by the PySpark session/kernel (TODO confirm).
spark

In [6]:
import random
from pyspark.streaming import StreamingContext
from pyspark.sql import Row
from pyspark.sql.functions import udf, struct, array, col, lit, lower, when
from pyspark.sql.types import StringType
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml.feature import HashingTF, IDF
from pyspark.ml.tuning import CrossValidator, CrossValidatorModel
from pyspark.sql.functions import regexp_replace

In [7]:
# Location of the persisted CrossValidatorModel, named once so this cell has a
# single place to edit when the model directory moves.
MODEL_PATH = 'C:/Users/lenne/anaconda3/envs/AA/Advanced_Analytics/Assignment_3/spark/models'

# process() reloads the model whenever `models_loaded` is False. Start from
# False so the flag exists even if the load below raises.
models_loaded = False

# Load eagerly so a wrong path fails here instead of inside the streaming
# callback.
my_model = CrossValidatorModel.load(MODEL_PATH)

# The model is now in memory: record that, so the first non-empty batch does
# not load the same model from disk a second time.
models_loaded = True

In [8]:
from pyspark.sql import functions as F

def process(time, rdd):
    """Score one micro-batch with the saved model and print the results.

    Written to be passed to ``DStream.foreachRDD``. Empty batches are skipped
    without printing anything. For a non-empty batch the function prints a
    header with the batch time, shows the prepared rows, and then shows the
    same rows together with the model's ``prediction`` column.

    Args:
        time: Batch time; only used (via ``str``) in the printed header.
        rdd: RDD of JSON records for this batch.

    Returns:
        None. All results are written to stdout via ``show()``.
    """
    # Make sure the function can handle empty incoming batches.
    if rdd.isEmpty():
        return

    print("========= %s =========" % str(time))

    # Parse the batch's JSON records into a DataFrame.
    batch_df = spark.read.json(rdd)

    # Drop the columns that are not passed on, then derive `label`: rows where
    # `frontpage` equals True get 1, every other row gets 0.
    trimmed_df = batch_df.drop("url", "posted_at", "domain", "user")
    is_frontpage = trimmed_df["frontpage"] == True
    labelled_df = trimmed_df.withColumn(
        "label", when(is_frontpage, 1).otherwise(0)
    ).drop("frontpage")

    labelled_df.show()

    # Load the model on first use if the module-level flag says it is missing.
    module_state = globals()
    if not module_state['models_loaded']:
        module_state['my_model'] = CrossValidatorModel.load('C:/Users/lenne/anaconda3/envs/AA/Advanced_Analytics/Assignment_3/spark/models')
        module_state['models_loaded'] = True

    # Additional preprocessing applied before the pipeline: lower-case the
    # text columns.
    model_input = labelled_df
    for text_column in ("title", "source_title", "source_text"):
        model_input = model_input.withColumn(text_column, F.lower(F.col(text_column)))

    scored_df = module_state['my_model'].transform(model_input)
    output_columns = [
        'aid', 'comments', 'source_text', 'source_title',
        'title', 'votes', 'label', 'prediction',
    ]
    scored_df.select(*output_columns).show()

In [9]:
# Stream settings: micro-batch interval (seconds) and the text socket to read.
BATCH_INTERVAL_SECONDS = 10
STREAM_HOST, STREAM_PORT = "seppe.net", 7778

# Build the streaming context on the existing SparkContext and hand every
# micro-batch RDD from the socket stream to process().
ssc = StreamingContext(sc, BATCH_INTERVAL_SECONDS)
lines = ssc.socketTextStream(STREAM_HOST, STREAM_PORT)
lines.foreachRDD(process)

# Start the context on the helper thread so this cell returns immediately.
ssc_t = StreamingThread(ssc)
ssc_t.start()



+--------+--------+--------------------+--------------------+--------------------+-----+-----+
|     aid|comments|         source_text|        source_title|               title|votes|label|
+--------+--------+--------------------+--------------------+--------------------+-----+-----+
|40407080|       0|Spiral Dynamics -...|     Spiral Dynamics|     Spiral Dynamics|    1|    0|
|40407087|       2|SMBlog -- 23 Febr...|SMBlog -- 23 Febr...|Usenet, authentic...|    1|    0|
|40407091|       0|The true story be...|The true story be...|The true story be...|    1|    0|
+--------+--------+--------------------+--------------------+--------------------+-----+-----+

+--------+--------+--------------------+--------------------+--------------------+-----+-----+----------+
|     aid|comments|         source_text|        source_title|               title|votes|label|prediction|
+--------+--------+--------------------+--------------------+--------------------+-----+-----+----------+
|40407080|      

Exception in thread Thread-8:
Traceback (most recent call last):
  File "C:\Users\lenne\anaconda3\lib\threading.py", line 980, in _bootstrap_inner
    self.run()
  File "C:\Users\lenne\AppData\Local\Temp\ipykernel_9240\265281298.py", line 11, in run
  File "C:\Users\lenne\anaconda3\envs\AA\Advanced_Analytics\Assignment_3\spark\spark-3.5.1-bin-hadoop3\python\pyspark\streaming\context.py", line 239, in awaitTermination
    self._jssc.awaitTermination()
  File "C:\Users\lenne\anaconda3\envs\AA\Advanced_Analytics\Assignment_3\spark\spark-3.5.1-bin-hadoop3\python\lib\py4j-0.10.9.7-src.zip\py4j\java_gateway.py", line 1322, in __call__
  File "C:\Users\lenne\anaconda3\envs\AA\Advanced_Analytics\Assignment_3\spark\spark-3.5.1-bin-hadoop3\python\pyspark\errors\exceptions\captured.py", line 179, in deco
    return f(*a, **kw)
  File "C:\Users\lenne\anaconda3\envs\AA\Advanced_Analytics\Assignment_3\spark\spark-3.5.1-bin-hadoop3\python\lib\py4j-0.10.9.7-src.zip\py4j\protocol.py", line 326, in get_

+--------+--------+--------------------+--------------------+--------------------+-----+-----+
|     aid|comments|         source_text|        source_title|               title|votes|label|
+--------+--------+--------------------+--------------------+--------------------+-----+-----+
|40407106|       0|AI deepfake Putin...|AI deepfake Putin...|AI deepfake Putin...|    1|    0|
|40407139|       0|Wine Consumption ...|Wine Consumption ...|Wine Consumption ...|    2|    0|
|40407166|       0|The One Place in ...|The One Place in ...|The competition t...|    1|    0|
+--------+--------+--------------------+--------------------+--------------------+-----+-----+

+--------+--------+--------------------+--------------------+--------------------+-----+-----+----------+
|     aid|comments|         source_text|        source_title|               title|votes|label|prediction|
+--------+--------+--------------------+--------------------+--------------------+-----+-----+----------+
|40407106|      

+--------+--------+--------------------+--------------------+--------------------+-----+-----+
|     aid|comments|         source_text|        source_title|               title|votes|label|
+--------+--------+--------------------+--------------------+--------------------+-----+-----+
|40407340|       0|How Optical Mice ...|How Optical Mice ...|How Optical Mice ...|    1|    0|
|40407364|       0|Nex Playground | ...|Nex Playground | ...|Nex Playground: A...|    1|    0|
+--------+--------+--------------------+--------------------+--------------------+-----+-----+

+--------+--------+--------------------+--------------------+--------------------+-----+-----+----------+
|     aid|comments|         source_text|        source_title|               title|votes|label|prediction|
+--------+--------+--------------------+--------------------+--------------------+-----+-----+----------+
|40407340|       0|how optical mice ...|how optical mice ...|how optical mice ...|    1|    0|       0.0|
|4040

In [10]:
# Stop the streaming context through the helper thread's stop(); that call
# passes stopSparkContext=False, so the SparkContext itself is left running.
ssc_t.stop()

----- Stopping... this may take a few seconds -----
+--------+--------+--------------------+--------------------+--------------------+-----+-----+
|     aid|comments|         source_text|        source_title|               title|votes|label|
+--------+--------+--------------------+--------------------+--------------------+-----+-----+
|40407417|       0|Pre-RFC Live quer...|Pre-RFC Live quer...|GraphQL Live Quer...|    1|    0|
|40407420|       0|Simplifying Linux...|Simplifying Linux...|Simplifying Linux...|    2|    0|
|40407425|       0|The Pitch for Pod...|The Pitch for Pod...|Research: Podcast...|    1|    0|
|40407451|       0|INTERNET Database...|INTERNET Database...|Walter Russell's ...|    2|    0|
|40407455|       0|Radical Quantum B...|Radical Quantum B...|Quantum Breakthro...|    1|    0|
+--------+--------+--------------------+--------------------+--------------------+-----+-----+

+--------+--------+--------------------+--------------------+--------------------+-----+----

+--------+--------+--------------------+--------------------+--------------------+-----+-----+----------+
|     aid|comments|         source_text|        source_title|               title|votes|label|prediction|
+--------+--------+--------------------+--------------------+--------------------+-----+-----+----------+
|40407604|       0|new hope for an a...|nearby trappist-1...|nearby trappist-1...|    1|    0|       0.0|
|40407617|       0|what's the price ...|what's the price ...|what's the price ...|    1|    0|       0.0|
|40407659|       0|italy's $12.7bn b...|italy's $12.7bn b...|italy's $12.7b br...|   10|    1|       1.0|
+--------+--------+--------------------+--------------------+--------------------+-----+-----+----------+

