In [12]:
import threading

# Helper thread that keeps the Spark StreamingContext from blocking the Jupyter kernel
        
class StreamingThread(threading.Thread):
    """Background thread that drives a Spark ``StreamingContext``.

    ``run`` starts the wrapped context and then waits on
    ``awaitTermination()``, so the waiting happens on this thread rather
    than in the notebook cell that launched it.
    """

    def __init__(self, ssc):
        """Remember the streaming context this thread will drive.

        Args:
            ssc: Object exposing ``start()``, ``awaitTermination()`` and
                ``stop(...)`` (a ``StreamingContext`` in this notebook).
        """
        super().__init__()
        self.ssc = ssc

    def run(self):
        """Thread body: start the context, then wait until it terminates."""
        context = self.ssc
        context.start()
        context.awaitTermination()

    def stop(self):
        """Ask the streaming context to stop gracefully.

        The underlying SparkContext is left running
        (``stopSparkContext=False``).
        """
        print('----- Stopping... this may take a few seconds -----')
        # `stopGraceFully` (capital F) is the keyword spelling PySpark uses.
        self.ssc.stop(stopSparkContext=False, stopGraceFully=True)

In [13]:
sc

In [14]:
spark

In [15]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf, length
from pyspark.ml import PipelineModel
from pyspark.streaming import StreamingContext
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer
from pyspark.sql.types import StructType, StructField, StringType, BooleanType, LongType, DoubleType
from pyspark.ml import Pipeline
from pyspark.ml.classification import GBTClassifier, GBTClassificationModel
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [16]:
# Build (or fetch) the SparkSession used for scoring the stream.
# NOTE(review): earlier cells already evaluate `sc` and `spark`, so
# getOrCreate() presumably returns that existing session -- confirm that the
# memory/core settings below actually take effect in that case.
spark = (
    SparkSession.builder
    .appName("FrontpagePrediction")
    .config("spark.executor.memory", "2g")
    .config("spark.driver.memory", "2g")
    .config("spark.executor.cores", "2")
    .config("spark.driver.cores", "2")
    .getOrCreate()
)


# Explicit schema for the incoming JSON records; every field is nullable.
schema = StructType([
    StructField(field_name, field_type(), True)
    for field_name, field_type in [
        ("aid", StringType),
        ("title", StringType),
        ("url", StringType),
        ("domain", StringType),
        ("votes", LongType),
        ("user", StringType),
        ("posted_at", StringType),
        ("comments", LongType),
        ("source_title", StringType),
        ("source_text", StringType),
        ("frontpage", BooleanType),
    ]
])

def get_prediction(probability):
    """Turn a class-probability vector into a hard 0.0 / 1.0 label.

    Args:
        probability: Indexable sequence whose element ``[1]`` is compared
            against the 0.5 cut-off.

    Returns:
        ``0.0`` when ``probability[1] <= 0.5``, otherwise ``1.0``.
    """
    positive_score = probability[1]
    if positive_score <= 0.5:
        return 0.0
    return 1.0

# Wrap get_prediction as a Spark UDF returning a double, so it can be applied to a column.
get_prediction_udf = udf(get_prediction, returnType=DoubleType())

# Locations of the fitted artifacts and of the CSV output appended per batch.
PIPELINE_PATH = "/Users/xiaodi/anaconda3/spark/notebooks/saved_pipeline"
GBT_MODEL_PATH = "/Users/xiaodi/anaconda3/spark/notebooks/saved_gbt_model"
PREDICTIONS_PATH = "/Users/xiaodi/anaconda3/spark/notebooks/predictions"

# Columns written to the predictions CSV for every scored record.
OUTPUT_COLUMNS = [
    "aid", "title", "url", "domain", "votes", "user", "posted_at",
    "comments", "frontpage",
    "corrected_prediction",
]

# Fitted models, filled lazily by _get_models() and reused by later batches.
# Re-running this cell resets the cache, so the models are then reloaded.
_loaded_models = {}


def _get_models():
    """Return ``(pipeline_model, gbt_model)``, loading each from disk at most once."""
    if "pipeline" not in _loaded_models:
        _loaded_models["pipeline"] = PipelineModel.load(PIPELINE_PATH)
    if "gbt" not in _loaded_models:
        _loaded_models["gbt"] = GBTClassificationModel.load(GBT_MODEL_PATH)
    return _loaded_models["pipeline"], _loaded_models["gbt"]


def process(time, rdd):
    """Score one micro-batch and append the predictions to the CSV output.

    Registered through ``foreachRDD``. Empty batches are skipped without any
    output. For a non-empty batch the records are parsed with ``schema``,
    run through the saved feature pipeline and the saved GBT model, shown in
    the notebook, and appended to ``PREDICTIONS_PATH``.

    Both models are loaded on the first non-empty batch only and then reused;
    previously the feature pipeline was re-read from disk for every batch.

    Args:
        time: Batch time supplied by the streaming framework; only used in
            the printed header.
        rdd: RDD for the current batch; read as JSON text with ``schema``.

    Returns:
        None.
    """
    if rdd.isEmpty():
        return

    print("========= %s =========" % str(time))

    # Parse the raw batch with the explicit schema and display what was captured.
    df = spark.read.schema(schema).json(rdd)
    df.show(n=20)

    # Cast the boolean target to string.
    # NOTE(review): presumably the saved pipeline indexes `frontpage` as a
    # string label -- confirm against the training notebook.
    df = df.withColumn("frontpage", col("frontpage").cast("string"))
    # Extra feature: character length of the scraped page text.
    df = df.withColumn("text_length", length(col("source_text")))

    pipeline_model, gbt_model = _get_models()

    # Feature extraction followed by the classifier.
    df_transformed = pipeline_model.transform(df)
    df_result = gbt_model.transform(df_transformed)

    # Derive a hard 0.0 / 1.0 label from the probability vector.
    df_result = df_result.withColumn(
        "corrected_prediction", get_prediction_udf(col("probability"))
    )

    # Display the first 20 scored rows.
    df_result.show(n=20)

    # Append this batch's predictions to the CSV output.
    df_result.select(*OUTPUT_COLUMNS) \
             .write.mode("append").option("header", "true").csv(PREDICTIONS_PATH)




In [17]:
# Streaming context on the existing SparkContext, using a batch interval of 10 seconds.
ssc = StreamingContext(sc, batchDuration=10)

In [18]:
# Read text records from the remote socket and hand every batch RDD to process().
lines = ssc.socketTextStream(hostname="seppe.net", port=7778)
lines.foreachRDD(process)

In [19]:
# Drive the streaming context from a helper thread; call ssc_t.stop() to end the stream.
ssc_t = StreamingThread(ssc=ssc)
ssc_t.start()

24/05/25 20:48:40 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:48:40 WARN BlockManager: Block input-0-1716662920000 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:48:41 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:48:41 WARN BlockManager: Block input-0-1716662921000 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:48:42 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:48:42 WARN BlockManager: Block input-0-1716662922000 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:48:43 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:48:43 WARN BlockManager: Block input-0-1716662923000 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:48:46 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:48:46 WARN BlockManager: Block input-0-1716662926000 replicated to

+--------+--------------------+--------------------+--------------------+-----+--------------+-------------------+--------+--------------------+--------------------+---------+
|     aid|               title|                 url|              domain|votes|          user|          posted_at|comments|        source_title|         source_text|frontpage|
+--------+--------------------+--------------------+--------------------+-----+--------------+-------------------+--------+--------------------+--------------------+---------+
|40473125|Where can I find ...|https://github.co...|github.com/turbob...|    2|    Turboblack|2024-05-25 06:50:32|       2|GitHub - turbobla...|GitHub - turbobla...|    false|
|40473128|  EU Approves AI Act|https://vcsi.org/...|            vcsi.org|   25|  rippeltippel|2024-05-25 06:50:54|      24|EU Approves Histo...|EU Approves Histo...|     true|
|40473130|Signal's Meredith...|https://techcrunc...|      techcrunch.com|   10|        doener|2024-05-25 06:51:13|      

24/05/25 20:48:51 WARN DAGScheduler: Broadcasting large task binary with size 1418.6 KiB
24/05/25 20:48:51 WARN DAGScheduler: Broadcasting large task binary with size 1418.6 KiB
24/05/25 20:48:51 WARN DAGScheduler: Broadcasting large task binary with size 1418.6 KiB
24/05/25 20:48:52 WARN DAGScheduler: Broadcasting large task binary with size 1414.7 KiB


+--------+--------------------+--------------------+--------------------+-----+--------------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------------------+--------------------+--------------------+------------+-----------------+--------------------+-----+--------------------+--------------------+----------+--------------------+
|     aid|               title|                 url|              domain|votes|          user|          posted_at|comments|        source_title|         source_text|frontpage|text_length|               words|      filtered_words|         tf_features|      tfidf_features|domain_index|       domain_vec|            features|label|       rawPrediction|         probability|prediction|corrected_prediction|
+--------+--------------------+--------------------+--------------------+-----+--------------+-------------------+--------+--------------------+--------------------+---------+-----------+-----

24/05/25 20:48:52 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:48:52 WARN BlockManager: Block input-0-1716662932000 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:48:54 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:48:54 WARN BlockManager: Block input-0-1716662934200 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:48:57 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:48:57 WARN BlockManager: Block input-0-1716662937200 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:48:59 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:48:59 WARN BlockManager: Block input-0-1716662939200 replicated to only 0 peer(s) instead of 1 peers
                                                                                

+--------+--------------------+--------------------+--------------------+-----+--------------+-------------------+--------+--------------------+--------------------+---------+
|     aid|               title|                 url|              domain|votes|          user|          posted_at|comments|        source_title|         source_text|frontpage|
+--------+--------------------+--------------------+--------------------+-----+--------------+-------------------+--------+--------------------+--------------------+---------+
|40473167|           Supertape|https://www.super...|       supertape.com|    3|handfuloflight|2024-05-25 07:02:07|       0|    Just a moment...|Just a moment...\...|    false|
|40473198|   Mistral Fine-Tune|https://github.co...|github.com/mistralai|    5|     alexmolas|2024-05-25 07:09:47|       0|GitHub - mistrala...|GitHub - mistrala...|     true|
|40473206|Google's A.I. Sea...|https://www.nytim...|         nytimes.com|    2|      ivyirwin|2024-05-25 07:11:45|      

24/05/25 20:49:01 WARN DAGScheduler: Broadcasting large task binary with size 1418.6 KiB
24/05/25 20:49:01 WARN DAGScheduler: Broadcasting large task binary with size 1418.6 KiB


+--------+--------------------+--------------------+--------------------+-----+--------------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------------------+--------------------+--------------------+------------+------------------+--------------------+-----+--------------------+--------------------+----------+--------------------+
|     aid|               title|                 url|              domain|votes|          user|          posted_at|comments|        source_title|         source_text|frontpage|text_length|               words|      filtered_words|         tf_features|      tfidf_features|domain_index|        domain_vec|            features|label|       rawPrediction|         probability|prediction|corrected_prediction|
+--------+--------------------+--------------------+--------------------+-----+--------------+-------------------+--------+--------------------+--------------------+---------+-----------+---

24/05/25 20:49:01 WARN DAGScheduler: Broadcasting large task binary with size 1414.7 KiB
24/05/25 20:49:02 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:49:02 WARN BlockManager: Block input-0-1716662942200 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:49:07 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:49:07 WARN BlockManager: Block input-0-1716662947200 replicated to only 0 peer(s) instead of 1 peers
                                                                                

+--------+--------------------+--------------------+----------+-----+-------------+-------------------+--------+--------------------+--------------------+---------+
|     aid|               title|                 url|    domain|votes|         user|          posted_at|comments|        source_title|         source_text|frontpage|
+--------+--------------------+--------------------+----------+-----+-------------+-------------------+--------+--------------------+--------------------+---------+
|40473340|ICQ will stop wor...|https://icq.com/d...|   icq.com|    1|Wasserpuncher|2024-05-25 07:44:50|       2|                 ICQ|ICQ\n\n  * Sign I...|    false|
|40473375|Singapore Airline...|https://www.natur...|nature.com|    3|     zeristor|2024-05-25 07:52:34|       0|Singapore Airline...|Singapore Airline...|     true|
+--------+--------------------+--------------------+----------+-----+-------------+-------------------+--------+--------------------+--------------------+---------+



24/05/25 20:49:11 WARN DAGScheduler: Broadcasting large task binary with size 1418.6 KiB
24/05/25 20:49:11 WARN DAGScheduler: Broadcasting large task binary with size 1418.6 KiB


+--------+--------------------+--------------------+----------+-----+-------------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------------------+--------------------+--------------------+------------+-----------------+--------------------+-----+--------------------+--------------------+----------+--------------------+
|     aid|               title|                 url|    domain|votes|         user|          posted_at|comments|        source_title|         source_text|frontpage|text_length|               words|      filtered_words|         tf_features|      tfidf_features|domain_index|       domain_vec|            features|label|       rawPrediction|         probability|prediction|corrected_prediction|
+--------+--------------------+--------------------+----------+-----+-------------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+-----------------

24/05/25 20:49:11 WARN DAGScheduler: Broadcasting large task binary with size 1414.7 KiB
24/05/25 20:49:11 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:49:11 WARN BlockManager: Block input-0-1716662951400 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:49:15 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:49:15 WARN BlockManager: Block input-0-1716662955600 replicated to only 0 peer(s) instead of 1 peers
                                                                                

+--------+--------------------+--------------------+--------------------+-----+--------------+-------------------+--------+--------------------+--------------------+---------+
|     aid|               title|                 url|              domain|votes|          user|          posted_at|comments|        source_title|         source_text|frontpage|
+--------+--------------------+--------------------+--------------------+-----+--------------+-------------------+--------+--------------------+--------------------+---------+
|40473381|Tail Recursion El...|http://neopythoni...|neopythonic.blogs...|    1|      mpweiher|2024-05-25 07:54:10|       0|Tail Recursion El...|Neopythonic: Tail...|    false|
|40473382| Book Covers of 2023|https://lithub.co...|          lithub.com|    2|handfuloflight|2024-05-25 07:54:35|       0|The 139 Best Book...|The 139 Best Book...|    false|
+--------+--------------------+--------------------+--------------------+-----+--------------+-------------------+------

24/05/25 20:49:20 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:49:20 WARN BlockManager: Block input-0-1716662960600 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:49:21 WARN DAGScheduler: Broadcasting large task binary with size 1418.6 KiB
24/05/25 20:49:21 WARN DAGScheduler: Broadcasting large task binary with size 1418.6 KiB


+--------+--------------------+--------------------+--------------------+-----+--------------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------------------+--------------------+--------------------+------------+------------------+--------------------+-----+--------------------+--------------------+----------+--------------------+
|     aid|               title|                 url|              domain|votes|          user|          posted_at|comments|        source_title|         source_text|frontpage|text_length|               words|      filtered_words|         tf_features|      tfidf_features|domain_index|        domain_vec|            features|label|       rawPrediction|         probability|prediction|corrected_prediction|
+--------+--------------------+--------------------+--------------------+-----+--------------+-------------------+--------+--------------------+--------------------+---------+-----------+---

24/05/25 20:49:21 WARN DAGScheduler: Broadcasting large task binary with size 1414.7 KiB
24/05/25 20:49:21 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:49:21 WARN BlockManager: Block input-0-1716662961600 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:49:26 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:49:26 WARN BlockManager: Block input-0-1716662966600 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:49:28 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:49:28 WARN BlockManager: Block input-0-1716662968600 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:49:29 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:49:29 WARN BlockManager: Block input-0-1716662969600 replicated to only 0 peer(s) instead of 1 peers
                                                                                

+--------+--------------------+--------------------+--------------+-----+--------------+-------------------+--------+--------------------+--------------------+---------+
|     aid|               title|                 url|        domain|votes|          user|          posted_at|comments|        source_title|         source_text|frontpage|
+--------+--------------------+--------------------+--------------+-----+--------------+-------------------+--------+--------------------+--------------------+---------+
|40473387|Timeless and Unfo...|https://www.paper...| papertrue.com|    1|handfuloflight|2024-05-25 07:54:58|       0|50 Timeless and U...|50 Timeless and U...|    false|
|40473395|Knot-based Key Ex...|https://eprint.ia...|      iacr.org|    1|     g0xA52A2A|2024-05-25 07:56:32|       0|                NULL|                \n\n|    false|
|40473417|Choosing Technolo...|https://www.eno-w...|eno-writer.com|    1|        trbznk|2024-05-25 08:00:46|       0|012 - choosing te...|012 - choosi

24/05/25 20:49:30 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:49:30 WARN BlockManager: Block input-0-1716662970600 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:49:31 WARN DAGScheduler: Broadcasting large task binary with size 1418.6 KiB
24/05/25 20:49:31 WARN DAGScheduler: Broadcasting large task binary with size 1418.6 KiB


+--------+--------------------+--------------------+--------------+-----+--------------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------------------+--------------------+--------------------+------------+------------------+--------------------+-----+--------------------+--------------------+----------+--------------------+
|     aid|               title|                 url|        domain|votes|          user|          posted_at|comments|        source_title|         source_text|frontpage|text_length|               words|      filtered_words|         tf_features|      tfidf_features|domain_index|        domain_vec|            features|label|       rawPrediction|         probability|prediction|corrected_prediction|
+--------+--------------------+--------------------+--------------+-----+--------------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+

24/05/25 20:49:31 WARN DAGScheduler: Broadcasting large task binary with size 1414.7 KiB
24/05/25 20:49:36 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:49:36 WARN BlockManager: Block input-0-1716662975800 replicated to only 0 peer(s) instead of 1 peers
                                                                                

+--------+--------------------+--------------------+------------+-----+----------+-------------------+--------+--------------------+--------------------+---------+
|     aid|               title|                 url|      domain|votes|      user|          posted_at|comments|        source_title|         source_text|frontpage|
+--------+--------------------+--------------------+------------+-----+----------+-------------------+--------+--------------------+--------------------+---------+
|40473443|Tech Analysis – L...|https://www.lttla...| lttlabs.com|    1|  xbmcuser|2024-05-25 08:08:45|       0|    Just a moment...|Just a moment...\...|    false|
|40473446|From Pollution to...|https://www.cremi...|cremieux.xyz|    2|arrowsmith|2024-05-25 08:09:14|       0|From Pollution to...|From Pollution to...|    false|
+--------+--------------------+--------------------+------------+-----+----------+-------------------+--------+--------------------+--------------------+---------+



24/05/25 20:49:40 WARN DAGScheduler: Broadcasting large task binary with size 1418.6 KiB
24/05/25 20:49:41 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:49:41 WARN BlockManager: Block input-0-1716662980800 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:49:41 WARN DAGScheduler: Broadcasting large task binary with size 1418.6 KiB


+--------+--------------------+--------------------+------------+-----+----------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------------------+--------------------+--------------------+------------+------------+--------------------+-----+--------------------+--------------------+----------+--------------------+
|     aid|               title|                 url|      domain|votes|      user|          posted_at|comments|        source_title|         source_text|frontpage|text_length|               words|      filtered_words|         tf_features|      tfidf_features|domain_index|  domain_vec|            features|label|       rawPrediction|         probability|prediction|corrected_prediction|
+--------+--------------------+--------------------+------------+-----+----------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------------------+---------

24/05/25 20:49:41 WARN DAGScheduler: Broadcasting large task binary with size 1414.7 KiB
24/05/25 20:49:43 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:49:43 WARN BlockManager: Block input-0-1716662982800 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:49:46 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:49:46 WARN BlockManager: Block input-0-1716662986600 replicated to only 0 peer(s) instead of 1 peers
                                                                                

+--------+--------------------+--------------------+---------------+-----+-------+-------------------+--------+--------------------+--------------------+---------+
|     aid|               title|                 url|         domain|votes|   user|          posted_at|comments|        source_title|         source_text|frontpage|
+--------+--------------------+--------------------+---------------+-----+-------+-------------------+--------+--------------------+--------------------+---------+
|40473454|Male birth contro...|https://newatlas....|   newatlas.com|   34|   geox|2024-05-25 08:11:11|      10|Male birth contro...|Male birth contro...|     true|
|40473464|         Don't Panic|https://www.towel...|   towelday.org|    3|   hggh|2024-05-25 08:14:47|       0|         Don't Panic|Don't Panic\n\nDO...|     true|
|40473467|Post Office lobbi...|https://www.thegu...|theguardian.com|    1|chrisjj|2024-05-25 08:15:03|       0|‘A tragedy is not...|‘A tragedy is not...|    false|
+--------+------

24/05/25 20:49:51 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:49:51 WARN BlockManager: Block input-0-1716662990800 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:49:51 WARN DAGScheduler: Broadcasting large task binary with size 1418.6 KiB
24/05/25 20:49:51 WARN DAGScheduler: Broadcasting large task binary with size 1418.6 KiB


+--------+--------------------+--------------------+---------------+-----+-------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------------------+--------------------+--------------------+------------+-----------------+--------------------+-----+--------------------+--------------------+----------+--------------------+
|     aid|               title|                 url|         domain|votes|   user|          posted_at|comments|        source_title|         source_text|frontpage|text_length|               words|      filtered_words|         tf_features|      tfidf_features|domain_index|       domain_vec|            features|label|       rawPrediction|         probability|prediction|corrected_prediction|
+--------+--------------------+--------------------+---------------+-----+-------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------------------

24/05/25 20:49:51 WARN DAGScheduler: Broadcasting large task binary with size 1414.7 KiB
24/05/25 20:49:55 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:49:55 WARN BlockManager: Block input-0-1716662994800 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:49:58 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:49:58 WARN BlockManager: Block input-0-1716662997800 replicated to only 0 peer(s) instead of 1 peers
                                                                                

+--------+--------------------+--------------------+--------------------+-----+----------+-------------------+--------+--------------------+--------------------+---------+
|     aid|               title|                 url|              domain|votes|      user|          posted_at|comments|        source_title|         source_text|frontpage|
+--------+--------------------+--------------------+--------------------+-----+----------+-------------------+--------+--------------------+--------------------+---------+
|40473468|BackgroundMusic: ...|https://github.co...|github.com/kylene...|    1|cadmium-44|2024-05-25 08:15:19|       0|GitHub - kyleneid...|GitHub - kyleneid...|    false|
|40473531|Elon Musk slams M...|https://www.pguru...|          pgurus.com|    1|    taubek|2024-05-25 08:24:58|       0|Elon Musk Slams M...|Elon Musk Slams M...|    false|
|40473550|Big tech has dist...|https://www.thegu...|     theguardian.com|    3|  lastdong|2024-05-25 08:28:56|       0|Big tech has dist...|

24/05/25 20:50:01 WARN DAGScheduler: Broadcasting large task binary with size 1418.6 KiB
24/05/25 20:50:01 WARN DAGScheduler: Broadcasting large task binary with size 1418.6 KiB


+--------+--------------------+--------------------+--------------------+-----+----------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------------------+--------------------+--------------------+------------+----------------+--------------------+-----+--------------------+--------------------+----------+--------------------+
|     aid|               title|                 url|              domain|votes|      user|          posted_at|comments|        source_title|         source_text|frontpage|text_length|               words|      filtered_words|         tf_features|      tfidf_features|domain_index|      domain_vec|            features|label|       rawPrediction|         probability|prediction|corrected_prediction|
+--------+--------------------+--------------------+--------------------+-----+----------+-------------------+--------+--------------------+--------------------+---------+-----------+-------------------

24/05/25 20:50:01 WARN DAGScheduler: Broadcasting large task binary with size 1414.7 KiB
24/05/25 20:50:03 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:50:03 WARN BlockManager: Block input-0-1716663003000 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:50:05 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:50:05 WARN BlockManager: Block input-0-1716663005000 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:50:08 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:50:08 WARN BlockManager: Block input-0-1716663008000 replicated to only 0 peer(s) instead of 1 peers
                                                                                

+--------+--------------------+--------------------+-------------+-----+------------+-------------------+--------+--------------------+--------------------+---------+
|     aid|               title|                 url|       domain|votes|        user|          posted_at|comments|        source_title|         source_text|frontpage|
+--------+--------------------+--------------------+-------------+-----+------------+-------------------+--------+--------------------+--------------------+---------+
|40473595|Pāṇini: Catching ...|https://blog.gran...| granthika.co|    1|stareatgoats|2024-05-25 08:34:54|       0|Pāṇini: Catching ...|Pāṇini: Catching ...|    false|
|40473618|Parents are getti...|https://www.shaan...|shaanpuri.com|    2|       cdcro|2024-05-25 08:38:12|       0|Your parents are ...|Your parents are ...|    false|
|40473619|          Discipline|https://lichess.o...|  lichess.org|    1|       fzliu|2024-05-25 08:38:26|       0|          Discipline|BenjiPortheault's...|    false

24/05/25 20:50:11 WARN DAGScheduler: Broadcasting large task binary with size 1418.6 KiB
24/05/25 20:50:11 WARN DAGScheduler: Broadcasting large task binary with size 1418.6 KiB


+--------+--------------------+--------------------+-------------+-----+------------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------------------+--------------------+--------------------+------------+-------------------+--------------------+-----+--------------------+--------------------+----------+--------------------+
|     aid|               title|                 url|       domain|votes|        user|          posted_at|comments|        source_title|         source_text|frontpage|text_length|               words|      filtered_words|         tf_features|      tfidf_features|domain_index|         domain_vec|            features|label|       rawPrediction|         probability|prediction|corrected_prediction|
+--------+--------------------+--------------------+-------------+-----+------------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+-------

24/05/25 20:50:11 WARN DAGScheduler: Broadcasting large task binary with size 1414.7 KiB
24/05/25 20:50:13 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:50:13 WARN BlockManager: Block input-0-1716663013000 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:50:18 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:50:18 WARN BlockManager: Block input-0-1716663018000 replicated to only 0 peer(s) instead of 1 peers
                                                                                

+--------+--------------------+--------------------+----------------+-----+---------+-------------------+--------+--------------------+--------------------+---------+
|     aid|               title|                 url|          domain|votes|     user|          posted_at|comments|        source_title|         source_text|frontpage|
+--------+--------------------+--------------------+----------------+-----+---------+-------------------+--------+--------------------+--------------------+---------+
|40473656|Three New Superco...|https://spectrum....|        ieee.org|    2|westurner|2024-05-25 08:46:35|       0|Three New Superco...|Three New Superco...|    false|
|40473685|What makes a coun...|https://2019.watt...|wattenberger.com|    1|    tsujp|2024-05-25 08:55:03|       0| Amelia Wattenberger|Amelia Wattenberg...|    false|
+--------+--------------------+--------------------+----------------+-----+---------+-------------------+--------+--------------------+--------------------+---------

24/05/25 20:50:20 WARN DAGScheduler: Broadcasting large task binary with size 1418.6 KiB
24/05/25 20:50:21 WARN DAGScheduler: Broadcasting large task binary with size 1418.6 KiB


+--------+--------------------+--------------------+----------------+-----+---------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------------------+--------------------+--------------------+------------+-----------------+--------------------+-----+--------------------+--------------------+----------+--------------------+
|     aid|               title|                 url|          domain|votes|     user|          posted_at|comments|        source_title|         source_text|frontpage|text_length|               words|      filtered_words|         tf_features|      tfidf_features|domain_index|       domain_vec|            features|label|       rawPrediction|         probability|prediction|corrected_prediction|
+--------+--------------------+--------------------+----------------+-----+---------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+-----------

24/05/25 20:50:21 WARN DAGScheduler: Broadcasting large task binary with size 1414.7 KiB
24/05/25 20:50:22 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:50:22 WARN BlockManager: Block input-0-1716663022000 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:50:23 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:50:23 WARN BlockManager: Block input-0-1716663023000 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:50:24 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:50:24 WARN BlockManager: Block input-0-1716663024000 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:50:26 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:50:26 WARN BlockManager: Block input-0-1716663026200 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:50:29 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only

+--------+--------------------+--------------------+----------------+-----+-------------+-------------------+--------+--------------------+--------------------+---------+
|     aid|               title|                 url|          domain|votes|         user|          posted_at|comments|        source_title|         source_text|frontpage|
+--------+--------------------+--------------------+----------------+-----+-------------+-------------------+--------+--------------------+--------------------+---------+
|40473704|       Singles Atlas|https://singlesat...|singlesatlas.com|    2|        faebi|2024-05-25 08:59:00|       0|World singles map...|World singles map...|    false|
|40473725|What's New on PHP...|https://developer...| developerjoy.co|    3|      falcon_|2024-05-25 09:05:09|       0|        DeveloperJoy|What's new on PHP...|    false|
|40473774|     From Vim to Zed|https://registers...|thorstenball.com|    2|      redbell|2024-05-25 09:16:24|       0|     From Vim to Zed|From V

24/05/25 20:50:31 WARN DAGScheduler: Broadcasting large task binary with size 1418.6 KiB
24/05/25 20:50:31 WARN DAGScheduler: Broadcasting large task binary with size 1418.6 KiB


+--------+--------------------+--------------------+----------------+-----+-------------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------------------+--------------------+--------------------+------------+-------------------+--------------------+-----+--------------------+--------------------+----------+--------------------+
|     aid|               title|                 url|          domain|votes|         user|          posted_at|comments|        source_title|         source_text|frontpage|text_length|               words|      filtered_words|         tf_features|      tfidf_features|domain_index|         domain_vec|            features|label|       rawPrediction|         probability|prediction|corrected_prediction|
+--------+--------------------+--------------------+----------------+-----+-------------+-------------------+--------+--------------------+--------------------+---------+-----------+----------------

24/05/25 20:50:31 WARN DAGScheduler: Broadcasting large task binary with size 1414.7 KiB
24/05/25 20:50:32 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:50:32 WARN BlockManager: Block input-0-1716663032200 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:50:37 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:50:37 WARN BlockManager: Block input-0-1716663037200 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:50:38 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:50:38 WARN BlockManager: Block input-0-1716663038200 replicated to only 0 peer(s) instead of 1 peers
                                                                                

+--------+--------------------+--------------------+-------------+-----+---------------+-------------------+--------+--------------------+--------------------+---------+
|     aid|               title|                 url|       domain|votes|           user|          posted_at|comments|        source_title|         source_text|frontpage|
+--------+--------------------+--------------------+-------------+-----+---------------+-------------------+--------+--------------------+--------------------+---------+
|40473810|The Mechanics of ...|https://www.reute...|  reuters.com|    2|PuddleOfSausage|2024-05-25 09:25:13|       0|The mechanics of ...|How the Singapore...|    false|
|40473836|The Penrose Triangle|https://en.wikipe...|wikipedia.org|    1|         markx2|2024-05-25 09:34:02|       0|    Penrose triangle|Penrose triangle ...|    false|
|40473855|Interior design c...|https://algedra.a...|   algedra.ae|    1|    ameliaalana|2024-05-25 09:37:55|       0|Best Interior Des...|Best Interio

24/05/25 20:50:41 WARN DAGScheduler: Broadcasting large task binary with size 1418.6 KiB
24/05/25 20:50:41 WARN DAGScheduler: Broadcasting large task binary with size 1418.6 KiB


+--------+--------------------+--------------------+-------------+-----+---------------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------------------+--------------------+--------------------+------------+------------------+--------------------+-----+--------------------+--------------------+----------+--------------------+
|     aid|               title|                 url|       domain|votes|           user|          posted_at|comments|        source_title|         source_text|frontpage|text_length|               words|      filtered_words|         tf_features|      tfidf_features|domain_index|        domain_vec|            features|label|       rawPrediction|         probability|prediction|corrected_prediction|
+--------+--------------------+--------------------+-------------+-----+---------------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+

24/05/25 20:50:41 WARN DAGScheduler: Broadcasting large task binary with size 1414.7 KiB
24/05/25 20:50:43 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:50:43 WARN BlockManager: Block input-0-1716663043200 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:50:46 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:50:46 WARN BlockManager: Block input-0-1716663046400 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:50:48 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:50:48 WARN BlockManager: Block input-0-1716663048200 replicated to only 0 peer(s) instead of 1 peers
                                                                                

+--------+--------------------+--------------------+--------------------+-----+---------------+-------------------+--------+--------------------+--------------------+---------+
|     aid|               title|                 url|              domain|votes|           user|          posted_at|comments|        source_title|         source_text|frontpage|
+--------+--------------------+--------------------+--------------------+-----+---------------+-------------------+--------+--------------------+--------------------+---------+
|40473876|            SmidgION|https://nanoporet...|    nanoporetech.com|    1|       pr337h4m|2024-05-25 09:42:15|       0|            SmidgION|SmidgION | Oxford...|    false|
|40473889|"The mother of al...|https://github.co...|github.com/gothin...|    1|      probhavsh|2024-05-25 09:43:49|       0|GitHub - gothinks...|GitHub - gothinks...|    false|
|40473909|Last major Arabic...|https://www.thegu...|     theguardian.com|    4|YeGoblynQueenne|2024-05-25 09:48:52|

24/05/25 20:50:51 WARN DAGScheduler: Broadcasting large task binary with size 1418.6 KiB
24/05/25 20:50:51 WARN DAGScheduler: Broadcasting large task binary with size 1418.6 KiB


+--------+--------------------+--------------------+--------------------+-----+---------------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------------------+--------------------+--------------------+------------+----------------+--------------------+-----+--------------------+--------------------+----------+--------------------+
|     aid|               title|                 url|              domain|votes|           user|          posted_at|comments|        source_title|         source_text|frontpage|text_length|               words|      filtered_words|         tf_features|      tfidf_features|domain_index|      domain_vec|            features|label|       rawPrediction|         probability|prediction|corrected_prediction|
+--------+--------------------+--------------------+--------------------+-----+---------------+-------------------+--------+--------------------+--------------------+---------+-----------+----

24/05/25 20:50:51 WARN DAGScheduler: Broadcasting large task binary with size 1414.7 KiB
24/05/25 20:50:53 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:50:53 WARN BlockManager: Block input-0-1716663053400 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:50:55 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:50:55 WARN BlockManager: Block input-0-1716663055200 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:50:58 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:50:58 WARN BlockManager: Block input-0-1716663058400 replicated to only 0 peer(s) instead of 1 peers
                                                                                

+--------+--------------------+--------------------+--------------------+-----+------------+-------------------+--------+--------------------+--------------------+---------+
|     aid|               title|                 url|              domain|votes|        user|          posted_at|comments|        source_title|         source_text|frontpage|
+--------+--------------------+--------------------+--------------------+-----+------------+-------------------+--------+--------------------+--------------------+---------+
|40473916|    Nushell Bashisms|https://www.nushe...|          nushell.sh|    1|        tosh|2024-05-25 09:50:54|       0|Nushell Bashisms ...|Nushell Bashisms ...|    false|
|40473923|Daylight – A More...|https://daylightc...|daylightcomputer.com|    1|        gavi|2024-05-25 09:52:54|       0|Daylight | A More...|Daylight | A More...|    false|
|40473935|Installing Adobe'...|https://linuxtech...|        linuxtech.in|    3|avinvarghese|2024-05-25 09:56:08|       0|Installin

24/05/25 20:51:01 WARN DAGScheduler: Broadcasting large task binary with size 1418.6 KiB
24/05/25 20:51:01 WARN DAGScheduler: Broadcasting large task binary with size 1418.6 KiB


+--------+--------------------+--------------------+--------------------+-----+------------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------------------+--------------------+--------------------+------------+------------+--------------------+-----+--------------------+--------------------+----------+--------------------+
|     aid|               title|                 url|              domain|votes|        user|          posted_at|comments|        source_title|         source_text|frontpage|text_length|               words|      filtered_words|         tf_features|      tfidf_features|domain_index|  domain_vec|            features|label|       rawPrediction|         probability|prediction|corrected_prediction|
+--------+--------------------+--------------------+--------------------+-----+------------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+

24/05/25 20:51:01 WARN DAGScheduler: Broadcasting large task binary with size 1414.7 KiB
24/05/25 20:51:03 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:51:03 WARN BlockManager: Block input-0-1716663063400 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:51:08 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:51:08 WARN BlockManager: Block input-0-1716663068400 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:51:09 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:51:09 WARN BlockManager: Block input-0-1716663069400 replicated to only 0 peer(s) instead of 1 peers
                                                                                

+--------+--------------------+--------------------+------------+-----+--------+-------------------+--------+--------------------+--------------------+---------+
|     aid|               title|                 url|      domain|votes|    user|          posted_at|comments|        source_title|         source_text|frontpage|
+--------+--------------------+--------------------+------------+-----+--------+-------------------+--------+--------------------+--------------------+---------+
|40473950|Boeing 737-9 MAX ...|https://www.bosto...|  boston.com|    3| hosteur|2024-05-25 10:00:10|       0|Boston flight mak...|Boston flight mak...|    false|
|40473961|How the German co...|https://www.dw.co...|      dw.com|    2|    tosh|2024-05-25 10:05:03|       0|How the German co...|How the German co...|    false|
|40473984|Show HN: LaraDocs...|https://laradocs.dev|laradocs.dev|    1|TonnnnUK|2024-05-25 10:13:27|       0|            LaraDocs|LaraDocs - Larave...|    false|
+--------+------------------

24/05/25 20:51:11 WARN DAGScheduler: Broadcasting large task binary with size 1418.6 KiB
24/05/25 20:51:11 WARN DAGScheduler: Broadcasting large task binary with size 1418.6 KiB


+--------+--------------------+--------------------+------------+-----+--------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------------------+--------------------+--------------------+------------+------------------+--------------------+-----+--------------------+--------------------+----------+--------------------+
|     aid|               title|                 url|      domain|votes|    user|          posted_at|comments|        source_title|         source_text|frontpage|text_length|               words|      filtered_words|         tf_features|      tfidf_features|domain_index|        domain_vec|            features|label|       rawPrediction|         probability|prediction|corrected_prediction|
+--------+--------------------+--------------------+------------+-----+--------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------------------+---

24/05/25 20:51:11 WARN DAGScheduler: Broadcasting large task binary with size 1414.7 KiB
24/05/25 20:52:10 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:52:10 WARN BlockManager: Block input-0-1716663130600 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:52:15 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:52:15 WARN BlockManager: Block input-0-1716663134800 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:52:15 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:52:15 WARN BlockManager: Block input-0-1716663135600 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:52:18 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:52:18 WARN BlockManager: Block input-0-1716663137800 replicated to only 0 peer(s) instead of 1 peers
                                                                                

+--------+--------------------+--------------------+-----------------+-----+-----------+-------------------+--------+--------------------+--------------------+---------+
|     aid|               title|                 url|           domain|votes|       user|          posted_at|comments|        source_title|         source_text|frontpage|
+--------+--------------------+--------------------+-----------------+-----+-----------+-------------------+--------+--------------------+--------------------+---------+
|40474019|Deploying Perl Da...|https://www.perl....|         perl.com|    1|thunderbong|2024-05-25 10:22:49|       0|Deploying Dancer ...|Deploying Dancer ...|    false|
|40474021|A Woman Who Left ...|https://bikepacki...|  bikepacking.com|    5|     latexr|2024-05-25 10:23:54|       0|A Woman Who Left ...|Weighing in on "M...|     true|
|40474023|Front Crawl Swimm...|https://www.scien...|sciencedirect.com|    1|       wslh|2024-05-25 10:24:02|       0|       ScienceDirect|ScienceDirec

24/05/25 20:52:20 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:52:20 WARN BlockManager: Block input-0-1716663140600 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:52:21 WARN DAGScheduler: Broadcasting large task binary with size 1418.6 KiB
24/05/25 20:52:21 WARN DAGScheduler: Broadcasting large task binary with size 1418.6 KiB


+--------+--------------------+--------------------+-----------------+-----+-----------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------------------+--------------------+--------------------+------------+------------------+--------------------+-----+--------------------+--------------------+----------+--------------------+
|     aid|               title|                 url|           domain|votes|       user|          posted_at|comments|        source_title|         source_text|frontpage|text_length|               words|      filtered_words|         tf_features|      tfidf_features|domain_index|        domain_vec|            features|label|       rawPrediction|         probability|prediction|corrected_prediction|
+--------+--------------------+--------------------+-----------------+-----+-----------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+

24/05/25 20:52:21 WARN DAGScheduler: Broadcasting large task binary with size 1414.7 KiB
24/05/25 20:52:21 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:52:21 WARN BlockManager: Block input-0-1716663141600 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:52:23 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:52:23 WARN BlockManager: Block input-0-1716663143600 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:52:28 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:52:28 WARN BlockManager: Block input-0-1716663147800 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:52:29 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:52:29 WARN BlockManager: Block input-0-1716663148800 replicated to only 0 peer(s) instead of 1 peers
                                                                                

+--------+--------------------+--------------------+------------------+-----+------------+-------------------+--------+--------------------+--------------------+---------+
|     aid|               title|                 url|            domain|votes|        user|          posted_at|comments|        source_title|         source_text|frontpage|
+--------+--------------------+--------------------+------------------+-----+------------+-------------------+--------+--------------------+--------------------+---------+
|40474049|Google once again...|https://www.genig...|     genigears.com|    5|   zerohedge|2024-05-25 10:28:57|       2|Google's AI Overv...|Google's AI Overv...|    false|
|40474051|Using a light air...|http://www.stepha...|stephan-schwab.com|    1|    mpweiher|2024-05-25 10:29:36|       0|Using a light air...|Using a light air...|    false|
|40474052|Trees and Dags in...|https://soundness...|     soundness.dev|    3|      cbeach|2024-05-25 10:29:39|       0|Soundness: Dendro...|

24/05/25 20:52:31 WARN DAGScheduler: Broadcasting large task binary with size 1418.6 KiB
24/05/25 20:52:31 WARN DAGScheduler: Broadcasting large task binary with size 1418.6 KiB


+--------+--------------------+--------------------+------------------+-----+------------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------------------+--------------------+--------------------+------------+------------------+--------------------+-----+--------------------+--------------------+----------+--------------------+
|     aid|               title|                 url|            domain|votes|        user|          posted_at|comments|        source_title|         source_text|frontpage|text_length|               words|      filtered_words|         tf_features|      tfidf_features|domain_index|        domain_vec|            features|label|       rawPrediction|         probability|prediction|corrected_prediction|
+--------+--------------------+--------------------+------------------+-----+------------+-------------------+--------+--------------------+--------------------+---------+-----------+---------------

24/05/25 20:52:31 WARN DAGScheduler: Broadcasting large task binary with size 1414.7 KiB
24/05/25 20:52:33 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:52:33 WARN BlockManager: Block input-0-1716663153000 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:52:37 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:52:37 WARN BlockManager: Block input-0-1716663157000 replicated to only 0 peer(s) instead of 1 peers
                                                                                

+--------+--------------------+--------------------+---------------+-----+-------+-------------------+--------+--------------------+--------------------+---------+
|     aid|               title|                 url|         domain|votes|   user|          posted_at|comments|        source_title|         source_text|frontpage|
+--------+--------------------+--------------------+---------------+-----+-------+-------------------+--------+--------------------+--------------------+---------+
|40474069|Triathlon Forum: ...|https://forum.slo...| slowtwitch.com|    1|   wslh|2024-05-25 10:32:16|       0|Early Thoughts on...|Early Thoughts on...|    false|
|40474079| Embracing Ambiguity|https://en.wikive...|wikiversity.org|    1|yamrzou|2024-05-25 10:34:55|       0|Embracing Ambigui...|Embracing Ambigui...|    false|
+--------+--------------------+--------------------+---------------+-----+-------+-------------------+--------+--------------------+--------------------+---------+



24/05/25 20:52:40 WARN DAGScheduler: Broadcasting large task binary with size 1418.6 KiB
24/05/25 20:52:41 WARN DAGScheduler: Broadcasting large task binary with size 1418.6 KiB


+--------+--------------------+--------------------+---------------+-----+-------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------------------+--------------------+--------------------+------------+------------+--------------------+-----+--------------------+--------------------+----------+--------------------+
|     aid|               title|                 url|         domain|votes|   user|          posted_at|comments|        source_title|         source_text|frontpage|text_length|               words|      filtered_words|         tf_features|      tfidf_features|domain_index|  domain_vec|            features|label|       rawPrediction|         probability|prediction|corrected_prediction|
+--------+--------------------+--------------------+---------------+-----+-------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------------------+---------

24/05/25 20:52:41 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:52:41 WARN BlockManager: Block input-0-1716663161000 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:52:41 WARN DAGScheduler: Broadcasting large task binary with size 1414.7 KiB
24/05/25 20:52:43 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:52:43 WARN BlockManager: Block input-0-1716663163000 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:52:45 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:52:45 WARN BlockManager: Block input-0-1716663165000 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:52:50 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:52:50 WARN BlockManager: Block input-0-1716663170000 replicated to only 0 peer(s) instead of 1 peers


+--------+--------------------+--------------------+-------------+-----+-------+-------------------+--------+--------------------+--------------------+---------+
|     aid|               title|                 url|       domain|votes|   user|          posted_at|comments|        source_title|         source_text|frontpage|
+--------+--------------------+--------------------+-------------+-----+-------+-------------------+--------+--------------------+--------------------+---------+
|40474084|KDE Plasma 6.1 Be...|https://kde.org/a...|      kde.org|    1|jrepinc|2024-05-25 10:36:31|       0|KDE Plasma 6.1 Be...|KDE Plasma 6.1 Be...|    false|
|40474101|The Route of a Te...|https://web.archi...|  archive.org|    3|  dewey|2024-05-25 10:41:31|       0|The Route of a Te...|The Route of a Te...|     true|
|40474108|Ghost in the Ethe...|https://blog.benj...|benjojo.co.uk|    1|  fanf2|2024-05-25 10:42:03|       0|Ghost in the ethe...|Ghost in the ethe...|    false|
+--------+------------------

24/05/25 20:52:51 WARN DAGScheduler: Broadcasting large task binary with size 1418.6 KiB
24/05/25 20:52:51 WARN DAGScheduler: Broadcasting large task binary with size 1418.6 KiB


+--------+--------------------+--------------------+-------------+-----+-------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------------------+--------------------+--------------------+------------+------------------+--------------------+-----+--------------------+--------------------+----------+--------------------+
|     aid|               title|                 url|       domain|votes|   user|          posted_at|comments|        source_title|         source_text|frontpage|text_length|               words|      filtered_words|         tf_features|      tfidf_features|domain_index|        domain_vec|            features|label|       rawPrediction|         probability|prediction|corrected_prediction|
+--------+--------------------+--------------------+-------------+-----+-------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------------------+---

24/05/25 20:52:51 WARN DAGScheduler: Broadcasting large task binary with size 1414.7 KiB
24/05/25 20:52:53 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:52:53 WARN BlockManager: Block input-0-1716663173000 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:52:57 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:52:57 WARN BlockManager: Block input-0-1716663177000 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:53:00 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:53:00 WARN BlockManager: Block input-0-1716663180000 replicated to only 0 peer(s) instead of 1 peers


+--------+--------------------+--------------------+--------------------+-----+-----------+-------------------+--------+--------------------+--------------------+---------+
|     aid|               title|                 url|              domain|votes|       user|          posted_at|comments|        source_title|         source_text|frontpage|
+--------+--------------------+--------------------+--------------------+-----+-----------+-------------------+--------+--------------------+--------------------+---------+
|40474125|The Minimum Marke...|https://www.ellio...|    elliotcsmith.com|    1|     smitec|2024-05-25 10:45:56|       0|The Minimum Marke...|The Minimum Marke...|    false|
|40474165|Lapis: A Web Fram...|https://leafo.net...|           leafo.net|   14|thunderbong|2024-05-25 10:56:41|       0|Lapis - A web fra...|Lapis - A web fra...|     true|
|40474180|It's cheaper to b...|https://twitter.c...|twitter.com/billf...|    7|     doener|2024-05-25 11:01:20|       4|               

24/05/25 20:53:01 WARN DAGScheduler: Broadcasting large task binary with size 1418.6 KiB
24/05/25 20:53:01 WARN DAGScheduler: Broadcasting large task binary with size 1418.6 KiB


+--------+--------------------+--------------------+--------------------+-----+-----------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------------------+--------------------+--------------------+------------+------------+--------------------+-----+--------------------+--------------------+----------+--------------------+
|     aid|               title|                 url|              domain|votes|       user|          posted_at|comments|        source_title|         source_text|frontpage|text_length|               words|      filtered_words|         tf_features|      tfidf_features|domain_index|  domain_vec|            features|label|       rawPrediction|         probability|prediction|corrected_prediction|
+--------+--------------------+--------------------+--------------------+-----+-----------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+---

24/05/25 20:53:01 WARN DAGScheduler: Broadcasting large task binary with size 1414.7 KiB
24/05/25 20:53:02 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:53:02 WARN BlockManager: Block input-0-1716663182200 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:53:06 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:53:06 WARN BlockManager: Block input-0-1716663186000 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:53:08 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:53:08 WARN BlockManager: Block input-0-1716663188200 replicated to only 0 peer(s) instead of 1 peers
                                                                                

+--------+--------------------+--------------------+--------------------+-----+--------+-------------------+--------+--------------------+--------------------+---------+
|     aid|               title|                 url|              domain|votes|    user|          posted_at|comments|        source_title|         source_text|frontpage|
+--------+--------------------+--------------------+--------------------+-----+--------+-------------------+--------+--------------------+--------------------+---------+
|40474195|     Bing API Outage|https://twitter.c...|twitter.com/nixcraft|    1|  doener|2024-05-25 11:05:09|       0|                   X|X\n\nDon’t miss w...|    false|
|40474201|I'm in love with ...|https://unherd.co...|          unherd.com|   27|elsewhen|2024-05-25 11:13:47|      21|I'm in love with ...|I'm in love with ...|     true|
|40474202|Google just updat...|https://www.bbc.c...|             bbc.com|   68|sonabinu|2024-05-25 11:14:26|      31|Google just updat...|Google just 

24/05/25 20:53:11 WARN DAGScheduler: Broadcasting large task binary with size 1418.6 KiB
24/05/25 20:53:11 WARN DAGScheduler: Broadcasting large task binary with size 1418.6 KiB


+--------+--------------------+--------------------+--------------------+-----+--------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------------------+--------------------+--------------------+------------+-----------------+--------------------+-----+--------------------+--------------------+----------+--------------------+
|     aid|               title|                 url|              domain|votes|    user|          posted_at|comments|        source_title|         source_text|frontpage|text_length|               words|      filtered_words|         tf_features|      tfidf_features|domain_index|       domain_vec|            features|label|       rawPrediction|         probability|prediction|corrected_prediction|
+--------+--------------------+--------------------+--------------------+-----+--------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--

24/05/25 20:53:11 WARN DAGScheduler: Broadcasting large task binary with size 1414.7 KiB
24/05/25 20:53:13 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:53:13 WARN BlockManager: Block input-0-1716663193200 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:53:16 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:53:16 WARN BlockManager: Block input-0-1716663196400 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:53:18 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:53:18 WARN BlockManager: Block input-0-1716663198200 replicated to only 0 peer(s) instead of 1 peers
                                                                                

+--------+--------------------+--------------------+-----------+-----+---------------+-------------------+--------+--------------------+--------------------+---------+
|     aid|               title|                 url|     domain|votes|           user|          posted_at|comments|        source_title|         source_text|frontpage|
+--------+--------------------+--------------------+-----------+-----+---------------+-------------------+--------+--------------------+--------------------+---------+
|40474207|What airlines can...|https://www.ft.co...|     ft.com|    2|quick_brown_fox|2024-05-25 11:18:51|       0|                NULL|What airlines can...|    false|
|40474212|Atomstr: RSS/Atom...|https://atomstr.d...|  data.haus|    2|     janandonly|2024-05-25 11:20:25|       0|             atomstr|atomstr\n\n# atom...|    false|
|40474227|Fitting High-Leve...|https://www.youtu...|youtube.com|    1|      hasheddan|2024-05-25 11:24:19|       0|Fitting High-Leve...|Fitting High-Leve...|   

24/05/25 20:53:21 WARN DAGScheduler: Broadcasting large task binary with size 1418.6 KiB
24/05/25 20:53:21 WARN DAGScheduler: Broadcasting large task binary with size 1418.6 KiB
24/05/25 20:53:21 WARN DAGScheduler: Broadcasting large task binary with size 1414.7 KiB


+--------+--------------------+--------------------+-----------+-----+---------------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------------------+--------------------+--------------------+------------+------------------+--------------------+-----+--------------------+--------------------+----------+--------------------+
|     aid|               title|                 url|     domain|votes|           user|          posted_at|comments|        source_title|         source_text|frontpage|text_length|               words|      filtered_words|         tf_features|      tfidf_features|domain_index|        domain_vec|            features|label|       rawPrediction|         probability|prediction|corrected_prediction|
+--------+--------------------+--------------------+-----------+-----+---------------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+------

24/05/25 20:53:23 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:53:23 WARN BlockManager: Block input-0-1716663203200 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:53:26 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:53:26 WARN BlockManager: Block input-0-1716663206400 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:53:27 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:53:27 WARN BlockManager: Block input-0-1716663207200 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:53:28 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:53:28 WARN BlockManager: Block input-0-1716663208400 replicated to only 0 peer(s) instead of 1 peers
                                                                                

+--------+--------------------+--------------------+------------------+-----+-----------+-------------------+--------+--------------------+--------------------+---------+
|     aid|               title|                 url|            domain|votes|       user|          posted_at|comments|        source_title|         source_text|frontpage|
+--------+--------------------+--------------------+------------------+-----+-----------+-------------------+--------+--------------------+--------------------+---------+
|40474234|NASA finds more i...|https://arstechni...|   arstechnica.com|    1|andreiursan|2024-05-25 11:25:38|       0|NASA finds more i...|NASA finds more i...|    false|
|40474235|The potential of ...|https://www.europ...|         europa.eu|    1|      kimmk|2024-05-25 11:25:43|       0|                NULL|                \n\n|    false|
|40474236|Publishing AI Slo...|https://daringfir...|daringfireball.net|    9|   mpweiher|2024-05-25 11:25:52|       0|Publishing AI Slo...|Daring

24/05/25 20:53:30 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:53:30 WARN BlockManager: Block input-0-1716663210200 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:53:31 WARN DAGScheduler: Broadcasting large task binary with size 1418.6 KiB
24/05/25 20:53:31 WARN DAGScheduler: Broadcasting large task binary with size 1418.6 KiB


+--------+--------------------+--------------------+------------------+-----+-----------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------------------+--------------------+--------------------+------------+------------------+--------------------+-----+--------------------+--------------------+----------+--------------------+
|     aid|               title|                 url|            domain|votes|       user|          posted_at|comments|        source_title|         source_text|frontpage|text_length|               words|      filtered_words|         tf_features|      tfidf_features|domain_index|        domain_vec|            features|label|       rawPrediction|         probability|prediction|corrected_prediction|
+--------+--------------------+--------------------+------------------+-----+-----------+-------------------+--------+--------------------+--------------------+---------+-----------+------------------

24/05/25 20:53:31 WARN DAGScheduler: Broadcasting large task binary with size 1414.7 KiB
24/05/25 20:53:33 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:53:33 WARN BlockManager: Block input-0-1716663213200 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:53:35 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:53:35 WARN BlockManager: Block input-0-1716663215200 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:53:36 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:53:36 WARN BlockManager: Block input-0-1716663216400 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:53:37 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:53:37 WARN BlockManager: Block input-0-1716663217400 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:53:38 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only

+--------+--------------------+--------------------+-------------------+-----+------------+-------------------+--------+--------------------+--------------------+---------+
|     aid|               title|                 url|             domain|votes|        user|          posted_at|comments|        source_title|         source_text|frontpage|
+--------+--------------------+--------------------+-------------------+-----+------------+-------------------+--------+--------------------+--------------------+---------+
|40474294|Lessons from the ...|https://arxiv.org...|          arxiv.org|    3|veryluckyxyz|2024-05-25 11:42:23|       0|Lessons from the ...|[2405.14782] Less...|     true|
|40474296|Feed and Blogroll...|https://andregarz...|    andregarzia.com|    1|todsacerdoti|2024-05-25 11:42:44|       0|Feed and Blogroll...|Feed and Blogroll...|    false|
|40474304|Ilgpu: A Modern G...|  https://ilgpu.net/|          ilgpu.net|    1|   PaulHoule|2024-05-25 11:43:37|       0|ILGPU - A Moder

24/05/25 20:53:41 WARN DAGScheduler: Broadcasting large task binary with size 1418.6 KiB
24/05/25 20:53:41 WARN DAGScheduler: Broadcasting large task binary with size 1418.6 KiB
24/05/25 20:53:41 WARN DAGScheduler: Broadcasting large task binary with size 1418.6 KiB
24/05/25 20:53:41 WARN DAGScheduler: Broadcasting large task binary with size 1414.7 KiB


+--------+--------------------+--------------------+-------------------+-----+------------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------------------+--------------------+--------------------+------------+-----------------+--------------------+-----+--------------------+--------------------+----------+--------------------+
|     aid|               title|                 url|             domain|votes|        user|          posted_at|comments|        source_title|         source_text|frontpage|text_length|               words|      filtered_words|         tf_features|      tfidf_features|domain_index|       domain_vec|            features|label|       rawPrediction|         probability|prediction|corrected_prediction|
+--------+--------------------+--------------------+-------------------+-----+------------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------

24/05/25 20:53:41 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:53:41 WARN BlockManager: Block input-0-1716663221600 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:53:44 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:53:44 WARN BlockManager: Block input-0-1716663224600 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:53:48 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:53:48 WARN BlockManager: Block input-0-1716663228400 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:53:49 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:53:49 WARN BlockManager: Block input-0-1716663229600 replicated to only 0 peer(s) instead of 1 peers
                                                                                

+--------+--------------------+--------------------+-------------------+-----+-----------+-------------------+--------+--------------------+--------------------+---------+
|     aid|               title|                 url|             domain|votes|       user|          posted_at|comments|        source_title|         source_text|frontpage|
+--------+--------------------+--------------------+-------------------+-----+-----------+-------------------+--------+--------------------+--------------------+---------+
|40474351|Apple elaborates ...|https://9to5mac.c...|        9to5mac.com|    1|  f_allwein|2024-05-25 11:50:45|       0|Apple elaborates ...|Apple elaborates ...|    false|
|40474376|SQLCipher is a se...|https://www.zetet...|        zetetic.net|    1|     doener|2024-05-25 11:55:09|       0|SQLCipher Design ...|SQLCipher Design ...|    false|
|40474425|    Hacked Happiness|https://www.mindp...|       mindpluz.com|    1|   samleecs|2024-05-25 12:05:17|       0|Mindpluz: Persona...|

24/05/25 20:53:51 WARN DAGScheduler: Broadcasting large task binary with size 1418.6 KiB
24/05/25 20:53:51 WARN DAGScheduler: Broadcasting large task binary with size 1418.6 KiB


+--------+--------------------+--------------------+-------------------+-----+-----------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------------------+--------------------+--------------------+------------+------------------+--------------------+-----+--------------------+--------------------+----------+--------------------+
|     aid|               title|                 url|             domain|votes|       user|          posted_at|comments|        source_title|         source_text|frontpage|text_length|               words|      filtered_words|         tf_features|      tfidf_features|domain_index|        domain_vec|            features|label|       rawPrediction|         probability|prediction|corrected_prediction|
+--------+--------------------+--------------------+-------------------+-----+-----------+-------------------+--------+--------------------+--------------------+---------+-----------+---------------

24/05/25 20:53:51 WARN DAGScheduler: Broadcasting large task binary with size 1414.7 KiB
24/05/25 20:53:52 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:53:52 WARN BlockManager: Block input-0-1716663232600 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:53:53 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:53:53 WARN BlockManager: Block input-0-1716663233400 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:53:55 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:53:55 WARN BlockManager: Block input-0-1716663235600 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:53:57 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:53:57 WARN BlockManager: Block input-0-1716663237600 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:53:59 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only

+--------+--------------------+--------------------+--------------+-----+-------------+-------------------+--------+--------------------+--------------------+---------+
|     aid|               title|                 url|        domain|votes|         user|          posted_at|comments|        source_title|         source_text|frontpage|
+--------+--------------------+--------------------+--------------+-----+-------------+-------------------+--------+--------------------+--------------------+---------+
|40474464|Benefits with CSV...|https://impler.io...|     impler.io|    4| bhavikchavda|2024-05-25 12:12:15|       0|ROI on CSV Excel ...|ROI on CSV Excel ...|    false|
|40474501|LeechBlock – A Si...|https://www.progi...|proginosko.com|    1|alabhyajindal|2024-05-25 12:17:27|       0|          LeechBlock|LeechBlock | A Si...|    false|
|40474505|System Settings g...|https://www.macru...| macrumors.com|    1|       doener|2024-05-25 12:17:46|       0|macOS 15 System S...|macOS 15 System S.

24/05/25 20:54:00 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:54:00 WARN BlockManager: Block input-0-1716663240600 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:54:01 WARN DAGScheduler: Broadcasting large task binary with size 1418.6 KiB
24/05/25 20:54:01 WARN DAGScheduler: Broadcasting large task binary with size 1418.6 KiB


+--------+--------------------+--------------------+--------------+-----+-------------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------------------+--------------------+--------------------+------------+-----------------+--------------------+-----+--------------------+--------------------+----------+--------------------+
|     aid|               title|                 url|        domain|votes|         user|          posted_at|comments|        source_title|         source_text|frontpage|text_length|               words|      filtered_words|         tf_features|      tfidf_features|domain_index|       domain_vec|            features|label|       rawPrediction|         probability|prediction|corrected_prediction|
+--------+--------------------+--------------------+--------------+-----+-------------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+-----

24/05/25 20:54:01 WARN DAGScheduler: Broadcasting large task binary with size 1414.7 KiB
24/05/25 20:54:04 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:54:04 WARN BlockManager: Block input-0-1716663244600 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:54:06 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:54:06 WARN BlockManager: Block input-0-1716663245800 replicated to only 0 peer(s) instead of 1 peers
                                                                                

+--------+--------------------+--------------------+--------------------+-----+------+-------------------+--------+--------------------+--------------------+---------+
|     aid|               title|                 url|              domain|votes|  user|          posted_at|comments|        source_title|         source_text|frontpage|
+--------+--------------------+--------------------+--------------------+-----+------+-------------------+--------+--------------------+--------------------+---------+
|40474516|The deskilling of...|https://www.baldu...| baldurbjarnason.com|   29|loop22|2024-05-25 12:18:53|       9|The deskilling of...|The deskilling of...|     true|
|40474517|The day Putin cri...|https://not-enter...|not-entertainment...|    1|microt|2024-05-25 12:18:57|       0|             Vietnam|Vietnam\n\nSkip t...|    false|
|40474520|        Horror Vacui|https://en.wikipe...|       wikipedia.org|    1| EndXA|2024-05-25 12:19:41|       0|Horror vacui (phy...|Horror vacui (phy...|   

24/05/25 20:54:11 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:54:11 WARN BlockManager: Block input-0-1716663250800 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:54:11 WARN DAGScheduler: Broadcasting large task binary with size 1418.6 KiB
24/05/25 20:54:11 WARN DAGScheduler: Broadcasting large task binary with size 1418.6 KiB


+--------+--------------------+--------------------+--------------------+-----+------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------------------+--------------------+--------------------+------------+----------------+--------------------+-----+--------------------+--------------------+----------+--------------------+
|     aid|               title|                 url|              domain|votes|  user|          posted_at|comments|        source_title|         source_text|frontpage|text_length|               words|      filtered_words|         tf_features|      tfidf_features|domain_index|      domain_vec|            features|label|       rawPrediction|         probability|prediction|corrected_prediction|
+--------+--------------------+--------------------+--------------------+-----+------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+----------

24/05/25 20:54:11 WARN DAGScheduler: Broadcasting large task binary with size 1414.7 KiB
24/05/25 20:54:12 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:54:12 WARN BlockManager: Block input-0-1716663252600 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:54:18 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:54:18 WARN BlockManager: Block input-0-1716663257800 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:54:20 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:54:20 WARN BlockManager: Block input-0-1716663260000 replicated to only 0 peer(s) instead of 1 peers


+--------+--------------------+--------------------+--------------------+-----+----------+-------------------+--------+--------------------+--------------------+---------+
|     aid|               title|                 url|              domain|votes|      user|          posted_at|comments|        source_title|         source_text|frontpage|
+--------+--------------------+--------------------+--------------------+-----+----------+-------------------+--------+--------------------+--------------------+---------+
|40474521|David Heinemeier ...|https://highperfo...|highperformancesq...|    2|      tosh|2024-05-25 12:19:42|       0|David Heinemeier ...|David Heinemeier ...|    false|
|40474535|OpenAI: Non-dispa...|https://twitter.c...|twitter.com/jacob...|    1|    doener|2024-05-25 12:21:13|       0|                   X|X\n\nDon’t miss w...|    false|
|40474540|In defense of usi...|https://www.dseba...|      dsebastien.net|    2|dSebastien|2024-05-25 12:22:32|       0|Why You Should Ta...|

24/05/25 20:54:21 WARN DAGScheduler: Broadcasting large task binary with size 1418.6 KiB
24/05/25 20:54:21 WARN DAGScheduler: Broadcasting large task binary with size 1418.6 KiB


+--------+--------------------+--------------------+--------------------+-----+----------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------------------+--------------------+--------------------+------------+------------+--------------------+-----+--------------------+--------------------+----------+--------------------+
|     aid|               title|                 url|              domain|votes|      user|          posted_at|comments|        source_title|         source_text|frontpage|text_length|               words|      filtered_words|         tf_features|      tfidf_features|domain_index|  domain_vec|            features|label|       rawPrediction|         probability|prediction|corrected_prediction|
+--------+--------------------+--------------------+--------------------+-----+----------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+------

24/05/25 20:54:21 WARN DAGScheduler: Broadcasting large task binary with size 1414.7 KiB
                                                                                

+--------+--------------------+--------------------+--------+-----+-----+-------------------+--------+--------------------+--------------------+---------+
|     aid|               title|                 url|  domain|votes| user|          posted_at|comments|        source_title|         source_text|frontpage|
+--------+--------------------+--------------------+--------+-----+-----+-------------------+--------+--------------------+--------------------+---------+
|40474545|How we crafted a ...|https://www.craft...|craft.do|    2|gklka|2024-05-25 12:23:09|       0|How we crafted a ...|How we crafted a ...|    false|
+--------+--------------------+--------------------+--------+-----+-----+-------------------+--------+--------------------+--------------------+---------+



24/05/25 20:54:30 WARN DAGScheduler: Broadcasting large task binary with size 1418.6 KiB
24/05/25 20:54:31 WARN DAGScheduler: Broadcasting large task binary with size 1414.7 KiB


+--------+--------------------+--------------------+--------+-----+-----+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------------------+--------------------+--------------------+------------+------------+--------------------+-----+--------------------+--------------------+----------+--------------------+
|     aid|               title|                 url|  domain|votes| user|          posted_at|comments|        source_title|         source_text|frontpage|text_length|               words|      filtered_words|         tf_features|      tfidf_features|domain_index|  domain_vec|            features|label|       rawPrediction|         probability|prediction|corrected_prediction|
+--------+--------------------+--------------------+--------+-----+-----+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------------------+--------------------+---------------

24/05/25 20:55:24 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:55:24 WARN BlockManager: Block input-0-1716663324000 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:55:28 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:55:28 WARN BlockManager: Block input-0-1716663328200 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:55:29 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:55:29 WARN BlockManager: Block input-0-1716663329200 replicated to only 0 peer(s) instead of 1 peers
                                                                                

+--------+--------------------+--------------------+--------------+-----+---------+-------------------+--------+--------------------+--------------------+---------+
|     aid|               title|                 url|        domain|votes|     user|          posted_at|comments|        source_title|         source_text|frontpage|
+--------+--------------------+--------------------+--------------+-----+---------+-------------------+--------+--------------------+--------------------+---------+
|40474549|Fauci aide trigge...|https://usrtk.org...|     usrtk.org|    1|miguelazo|2024-05-25 12:23:50|       0|Fauci aide trigge...|Fauci aide trigge...|    false|
|40474557|Kombucha-associat...|https://journals....|      plos.org|   12|bookofjoe|2024-05-25 12:24:38|       0|Kombucha Tea-asso...|Kombucha Tea-asso...|     true|
|40474563|MoonScript A prog...|https://moonscrip...|moonscript.org|    2| raytopia|2024-05-25 12:25:34|       0|MoonScript, a lan...|MoonScript, a lan...|    false|
+--------+

24/05/25 20:55:31 WARN DAGScheduler: Broadcasting large task binary with size 1418.6 KiB
24/05/25 20:55:31 WARN DAGScheduler: Broadcasting large task binary with size 1418.6 KiB


+--------+--------------------+--------------------+--------------+-----+---------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------------------+--------------------+--------------------+------------+-------------------+--------------------+-----+--------------------+--------------------+----------+--------------------+
|     aid|               title|                 url|        domain|votes|     user|          posted_at|comments|        source_title|         source_text|frontpage|text_length|               words|      filtered_words|         tf_features|      tfidf_features|domain_index|         domain_vec|            features|label|       rawPrediction|         probability|prediction|corrected_prediction|
+--------+--------------------+--------------------+--------------+-----+---------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+-------------

24/05/25 20:55:31 WARN DAGScheduler: Broadcasting large task binary with size 1414.7 KiB
24/05/25 20:55:32 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:55:32 WARN BlockManager: Block input-0-1716663332200 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:55:33 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:55:33 WARN BlockManager: Block input-0-1716663333200 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:55:34 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:55:34 WARN BlockManager: Block input-0-1716663334200 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:55:37 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:55:37 WARN BlockManager: Block input-0-1716663337600 replicated to only 0 peer(s) instead of 1 peers
                                                                                

+--------+--------------------+--------------------+-------------+-----+--------------+-------------------+--------+--------------------+--------------------+---------+
|     aid|               title|                 url|       domain|votes|          user|          posted_at|comments|        source_title|         source_text|frontpage|
+--------+--------------------+--------------------+-------------+-----+--------------+-------------------+--------+--------------------+--------------------+---------+
|40474564|Neuralink Compres...|https://content.n...|neuralink.com|    1|         atrus|2024-05-25 12:25:42|       0|                NULL|                NULL|    false|
|40474594|Developers are no...|https://0xff.nu/d...|      0xff.nu|   21|          hxii|2024-05-25 12:30:12|      13|Developers aren't...|Developers aren't...|     true|
|40474603|OpenBSD – Clang -...|https://www.undea...| undeadly.org|    6|peter_hansteen|2024-05-25 12:32:00|       0|clang -fret-clean...|clang -fret-clean.

24/05/25 20:55:41 WARN DAGScheduler: Broadcasting large task binary with size 1418.6 KiB
24/05/25 20:55:41 ERROR PythonUDFRunner: Python worker exited unexpectedly (crashed)
org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/Users/xiaodi/anaconda3/spark/spark-3.5.1-bin-hadoop3/python/lib/pyspark.zip/pyspark/worker.py", line 1225, in main
    eval_type = read_int(infile)
                ^^^^^^^^^^^^^^^^
  File "/Users/xiaodi/anaconda3/spark/spark-3.5.1-bin-hadoop3/python/lib/pyspark.zip/pyspark/serializers.py", line 596, in read_int
    raise EOFError
EOFError

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:572)
	at org.apache.spark.sql.execution.python.BasePythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:94)
	at org.apache.spark.sql.execution.python.BasePythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:75)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunne

Exception in thread Thread-10:
Traceback (most recent call last):
  File "/Users/xiaodi/anaconda3/lib/python3.11/threading.py", line 1038, in _bootstrap_inner
    self.run()
  File "/var/folders/cz/pmt5rhc96x1dr6fqpqr44bmr0000gn/T/ipykernel_33325/265281298.py", line 11, in run
  File "/Users/xiaodi/anaconda3/spark/spark-3.5.1-bin-hadoop3/python/pyspark/streaming/context.py", line 239, in awaitTermination
    self._jssc.awaitTermination()
  File "/Users/xiaodi/anaconda3/spark/spark-3.5.1-bin-hadoop3/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py", line 1322, in __call__
  File "/Users/xiaodi/anaconda3/spark/spark-3.5.1-bin-hadoop3/python/pyspark/errors/exceptions/captured.py", line 179, in deco
    return f(*a, **kw)
           ^^^^^^^^^^^
  File "/Users/xiaodi/anaconda3/spark/spark-3.5.1-bin-hadoop3/python/lib/py4j-0.10.9.7-src.zip/py4j/protocol.py", line 326, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling o13646.awaitTermination.
: org.apache.s

24/05/25 20:55:41 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:55:41 WARN BlockManager: Block input-0-1716663341400 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:55:42 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:55:42 WARN BlockManager: Block input-0-1716663342600 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:55:45 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:55:45 WARN BlockManager: Block input-0-1716663345400 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:55:49 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:55:49 WARN BlockManager: Block input-0-1716663349400 replicated to only 0 peer(s) instead of 1 peers
                                                                                

+--------+--------------------+--------------------+--------------------+-----+----------+-------------------+--------+--------------------+--------------------+---------+
|     aid|               title|                 url|              domain|votes|      user|          posted_at|comments|        source_title|         source_text|frontpage|
+--------+--------------------+--------------------+--------------------+-----+----------+-------------------+--------+--------------------+--------------------+---------+
|40474607|Bill Hwang Had $2...|https://www.bloom...|       bloomberg.com|    3|DeathArrow|2024-05-25 12:32:25|       0|           Bloomberg|Bloomberg - Are y...|    false|
|40474619|MCU #13: building...|https://lcamtuf.s...|lcamtuf.substack.com|    2| chmaynard|2024-05-25 12:34:53|       0|MCU #13: building...|MCU #13: building...|    false|
|40474632|Spark apps optimi...|https://medium.co...|   medium.com/akkidx|    1|  akkidx06|2024-05-25 12:37:31|       0|    Just a moment...|

24/05/25 20:55:51 WARN DAGScheduler: Broadcasting large task binary with size 1418.6 KiB
24/05/25 20:55:51 WARN DAGScheduler: Broadcasting large task binary with size 1418.6 KiB


+--------+--------------------+--------------------+--------------------+-----+----------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------------------+--------------------+--------------------+------------+-------------------+--------------------+-----+--------------------+--------------------+----------+--------------------+
|     aid|               title|                 url|              domain|votes|      user|          posted_at|comments|        source_title|         source_text|frontpage|text_length|               words|      filtered_words|         tf_features|      tfidf_features|domain_index|         domain_vec|            features|label|       rawPrediction|         probability|prediction|corrected_prediction|
+--------+--------------------+--------------------+--------------------+-----+----------+-------------------+--------+--------------------+--------------------+---------+-----------+-------------

24/05/25 20:55:51 WARN DAGScheduler: Broadcasting large task binary with size 1414.7 KiB
24/05/25 20:55:52 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:55:52 WARN BlockManager: Block input-0-1716663352600 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:55:56 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:55:56 WARN BlockManager: Block input-0-1716663356600 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:55:58 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:55:58 WARN BlockManager: Block input-0-1716663357800 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:56:00 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:56:00 WARN BlockManager: Block input-0-1716663359800 replicated to only 0 peer(s) instead of 1 peers
                                                                                

+--------+--------------------+--------------------+------------------+-----+---------+-------------------+--------+--------------------+--------------------+---------+
|     aid|               title|                 url|            domain|votes|     user|          posted_at|comments|        source_title|         source_text|frontpage|
+--------+--------------------+--------------------+------------------+-----+---------+-------------------+--------+--------------------+--------------------+---------+
|40474653|Tools for Thought...|https://maggieapp...|maggieappleton.com|    2|andsoitis|2024-05-25 12:41:57|       0|Tools for Thought...|Tools for Thought...|    false|
|40474654|Centimeter-scale ...|https://www.natur...|        nature.com|    1|PaulHoule|2024-05-25 12:42:03|       0|Centimeter-scale ...|Centimeter-scale ...|    false|
|40474697|At Random: The bu...|https://harpers.o...|       harpers.org|    1|    EndXA|2024-05-25 12:47:51|       0|           At Random|At Random, by Chr.

24/05/25 20:56:01 WARN DAGScheduler: Broadcasting large task binary with size 1418.6 KiB
24/05/25 20:56:01 WARN DAGScheduler: Broadcasting large task binary with size 1418.6 KiB


+--------+--------------------+--------------------+------------------+-----+---------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------------------+--------------------+--------------------+------------+------------------+--------------------+-----+--------------------+--------------------+----------+--------------------+
|     aid|               title|                 url|            domain|votes|     user|          posted_at|comments|        source_title|         source_text|frontpage|text_length|               words|      filtered_words|         tf_features|      tfidf_features|domain_index|        domain_vec|            features|label|       rawPrediction|         probability|prediction|corrected_prediction|
+--------+--------------------+--------------------+------------------+-----+---------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+---

24/05/25 20:56:01 WARN DAGScheduler: Broadcasting large task binary with size 1414.7 KiB
24/05/25 20:56:05 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:56:05 WARN BlockManager: Block input-0-1716663364800 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:56:06 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:56:06 WARN BlockManager: Block input-0-1716663365800 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:56:09 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:56:09 WARN BlockManager: Block input-0-1716663368800 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:56:10 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:56:10 WARN BlockManager: Block input-0-1716663369800 replicated to only 0 peer(s) instead of 1 peers
                                                                                

+--------+--------------------+--------------------+--------------------+-----+-------------+-------------------+--------+--------------------+--------------------+---------+
|     aid|               title|                 url|              domain|votes|         user|          posted_at|comments|        source_title|         source_text|frontpage|
+--------+--------------------+--------------------+--------------------+-----+-------------+-------------------+--------+--------------------+--------------------+---------+
|40474701|Why does Arc prox...|https://twitter.c...|twitter.com/rafal...|    2|   rpastuszak|2024-05-25 12:48:25|       0|                   X|X\n\nDon’t miss w...|    false|
|40474710|Linux as the new ...|https://www.hey.c...|             hey.com|    3|RyeCombinator|2024-05-25 12:49:54|       0|           HEY World|HEY — HEY World\n...|    false|
|40474712|Abusing Go's Infr...|https://reverse.p...|              put.as|   81|         efge|2024-05-25 12:50:00|       7|Abu

24/05/25 20:56:11 WARN DAGScheduler: Broadcasting large task binary with size 1418.6 KiB
24/05/25 20:56:11 WARN DAGScheduler: Broadcasting large task binary with size 1418.6 KiB


+--------+--------------------+--------------------+--------------------+-----+-------------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------------------+--------------------+--------------------+------------+-----------------+--------------------+-----+--------------------+--------------------+----------+--------------------+
|     aid|               title|                 url|              domain|votes|         user|          posted_at|comments|        source_title|         source_text|frontpage|text_length|               words|      filtered_words|         tf_features|      tfidf_features|domain_index|       domain_vec|            features|label|       rawPrediction|         probability|prediction|corrected_prediction|
+--------+--------------------+--------------------+--------------------+-----+-------------+-------------------+--------+--------------------+--------------------+---------+-----------+--------

24/05/25 20:56:11 WARN DAGScheduler: Broadcasting large task binary with size 1414.7 KiB
                                                                                

+--------+--------------------+--------------------+-----------------+-----+-----+-------------------+--------+--------------------+--------------------+---------+
|     aid|               title|                 url|           domain|votes| user|          posted_at|comments|        source_title|         source_text|frontpage|
+--------+--------------------+--------------------+-----------------+-----+-----+-------------------+--------+--------------------+--------------------+---------+
|40474736|You spend more wh...|https://kenthendr...|kenthendricks.com|    1|alihm|2024-05-25 12:55:06|       0|Why you spend mor...|Why you spend mor...|    false|
+--------+--------------------+--------------------+-----------------+-----+-----+-------------------+--------+--------------------+--------------------+---------+



24/05/25 20:56:20 WARN DAGScheduler: Broadcasting large task binary with size 1418.6 KiB
24/05/25 20:56:21 WARN DAGScheduler: Broadcasting large task binary with size 1414.7 KiB


+--------+--------------------+--------------------+-----------------+-----+-----+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------------------+--------------------+--------------------+------------+------------+--------------------+-----+--------------------+--------------------+----------+--------------------+
|     aid|               title|                 url|           domain|votes| user|          posted_at|comments|        source_title|         source_text|frontpage|text_length|               words|      filtered_words|         tf_features|      tfidf_features|domain_index|  domain_vec|            features|label|       rawPrediction|         probability|prediction|corrected_prediction|
+--------+--------------------+--------------------+-----------------+-----+-----+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------------------+---------

24/05/25 20:59:13 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:59:13 WARN BlockManager: Block input-0-1716663553200 replicated to only 0 peer(s) instead of 1 peers
                                                                                

+--------+--------------------+--------------------+--------+-----+---------+-------------------+--------+------------+-----------+---------+
|     aid|               title|                 url|  domain|votes|     user|          posted_at|comments|source_title|source_text|frontpage|
+--------+--------------------+--------------------+--------+-----+---------+-------------------+--------+------------+-----------+---------+
|40474759|The Cognitive Des...|http://gjgreenber...|ucla.edu|    7|andsoitis|2024-05-25 12:58:20|       2|        NULL|       \n\n|     true|
+--------+--------------------+--------------------+--------+-----+---------+-------------------+--------+------------+-----------+---------+



24/05/25 20:59:20 WARN DAGScheduler: Broadcasting large task binary with size 1418.6 KiB


+--------+--------------------+--------------------+--------+-----+---------+-------------------+--------+------------+-----------+---------+-----------+-----+--------------+-------------+--------------+------------+-------------------+--------------------+-----+--------------------+--------------------+----------+--------------------+
|     aid|               title|                 url|  domain|votes|     user|          posted_at|comments|source_title|source_text|frontpage|text_length|words|filtered_words|  tf_features|tfidf_features|domain_index|         domain_vec|            features|label|       rawPrediction|         probability|prediction|corrected_prediction|
+--------+--------------------+--------------------+--------+-----+---------+-------------------+--------+------------+-----------+---------+-----------+-----+--------------+-------------+--------------+------------+-------------------+--------------------+-----+--------------------+--------------------+----------+--------

24/05/25 20:59:21 WARN DAGScheduler: Broadcasting large task binary with size 1414.7 KiB
24/05/25 21:01:16 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 21:01:16 WARN BlockManager: Block input-0-1716663676600 replicated to only 0 peer(s) instead of 1 peers
                                                                                

+--------+--------------------+--------------------+-------------+-----+--------+-------------------+--------+--------------------+--------------------+---------+
|     aid|               title|                 url|       domain|votes|    user|          posted_at|comments|        source_title|         source_text|frontpage|
+--------+--------------------+--------------------+-------------+-----+--------+-------------------+--------+--------------------+--------------------+---------+
|40474779|Production of nex...|https://www.autoc...|autocar.co.uk|    1|peutetre|2024-05-25 13:01:03|       0|Production of nex...|Production of nex...|    false|
+--------+--------------------+--------------------+-------------+-----+--------+-------------------+--------+--------------------+--------------------+---------+



24/05/25 21:01:20 WARN DAGScheduler: Broadcasting large task binary with size 1418.6 KiB


+--------+--------------------+--------------------+-------------+-----+--------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------------------+--------------------+--------------------+------------+------------+--------------------+-----+--------------------+--------------------+----------+--------------------+
|     aid|               title|                 url|       domain|votes|    user|          posted_at|comments|        source_title|         source_text|frontpage|text_length|               words|      filtered_words|         tf_features|      tfidf_features|domain_index|  domain_vec|            features|label|       rawPrediction|         probability|prediction|corrected_prediction|
+--------+--------------------+--------------------+-------------+-----+--------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------------------+------------

24/05/25 21:01:21 WARN DAGScheduler: Broadcasting large task binary with size 1414.7 KiB
[Stage 1237:>                                                       (0 + 1) / 1]

In [20]:
# Shut down the streaming job started earlier.
# ssc_t is presumably the StreamingThread helper instance created in a previous
# cell (TODO confirm); its stop() prints a "Stopping..." notice and calls
# ssc.stop(stopSparkContext=False, stopGraceFully=True), so the SparkContext is
# left running for the evaluation cells that follow.
ssc_t.stop()

----- Stopping... this may take a few seconds -----


24/05/25 21:04:13 ERROR ReceiverTracker: Deregistered receiver for stream 0: Stopped by driver
24/05/25 21:04:13 WARN SocketReceiver: Error receiving data
java.net.SocketException: Socket closed
	at java.net.SocketInputStream.socketRead0(Native Method)
	at java.net.SocketInputStream.socketRead(SocketInputStream.java:116)
	at java.net.SocketInputStream.read(SocketInputStream.java:171)
	at java.net.SocketInputStream.read(SocketInputStream.java:141)
	at sun.nio.cs.StreamDecoder.readBytes(StreamDecoder.java:284)
	at sun.nio.cs.StreamDecoder.implRead(StreamDecoder.java:326)
	at sun.nio.cs.StreamDecoder.read(StreamDecoder.java:178)
	at java.io.InputStreamReader.read(InputStreamReader.java:184)
	at java.io.BufferedReader.fill(BufferedReader.java:161)
	at java.io.BufferedReader.readLine(BufferedReader.java:324)
	at java.io.BufferedReader.readLine(BufferedReader.java:389)
	at org.apache.spark.streaming.dstream.SocketReceiver$$anon$2.getNext(SocketInputDStream.scala:121)
	at org.apache.spark.str

In [28]:
# Read the saved predictions back in; the CSV files carry a header row.
predictions_df = (
    spark.read
    .option("header", "true")
    .csv("/Users/xiaodi/anaconda3/spark/notebooks/predictions")
)

# Define a function that converts the textual boolean ("true"/"false") to a double
def bool_to_double(value):
    """Map the string "true" to 1.0 and "false" to 0.0; any other value gives None."""
    if value == "true":
        return 1.0
    if value == "false":
        return 0.0
    return None

# Wrap the converter as a Spark UDF and add a numeric copy of the actual label
bool_to_double_udf = udf(bool_to_double, DoubleType())
predictions_df = predictions_df.withColumn("frontpage_num", bool_to_double_udf(predictions_df["frontpage"]))

# Preview the first rows as a sanity check
predictions_df.show(n=5)

# Column handles reused by every filter below (actual label vs. model output)
actual = predictions_df["frontpage_num"]
predicted = predictions_df["corrected_prediction"]

# Compute the correct predictions and class balance
correct_predictions = predictions_df.filter(actual == predicted).count()
total_predictions = predictions_df.count()
true_case = predictions_df.filter(actual == 1.0).count()
false_case = predictions_df.filter(actual == 0.0).count()

# Guard against an empty predictions folder, consistent with the metrics below
accuracy = correct_predictions / total_predictions if total_predictions != 0 else 0

# Confusion-matrix cells
tp = predictions_df.filter((actual == 1.0) & (predicted == 1.0)).count()  # True Positives
# True negatives: actually not on the front page AND predicted not on the front page.
# (Previously this filtered on predicted == 1.0, which duplicated the false positives.)
tn = predictions_df.filter((actual == 0.0) & (predicted == 0.0)).count()  # True Negatives
fp = predictions_df.filter((actual == 0.0) & (predicted == 1.0)).count()  # False Positives
fn = predictions_df.filter((actual == 1.0) & (predicted == 0.0)).count()  # False Negatives

# Derived metrics; each falls back to 0 when its denominator is 0
precision = tp / (tp + fp) if tp + fp != 0 else 0  # Precision
recall = tp / (tp + fn) if tp + fn != 0 else 0  # Recall
f1 = 2 * precision * recall / (precision + recall) if precision + recall != 0 else 0  # F1 Score

print(f"Correct Predictions: {correct_predictions}")
print(f"Total Predictions: {total_predictions}")
print("Frontpage = True:", true_case)
print("Frontpage = False:", false_case)
print(f"Accuracy: {accuracy}")
print("Precision: ", precision)
print("Recall: ", recall)
print("F1 Score: ", f1)


+--------+--------------------+--------------------+---------------+-----+------------+-------------------+--------+---------+--------------------+-------------+
|     aid|               title|                 url|         domain|votes|        user|          posted_at|comments|frontpage|corrected_prediction|frontpage_num|
+--------+--------------------+--------------------+---------------+-----+------------+-------------------+--------+---------+--------------------+-------------+
|40474069|Triathlon Forum: ...|https://forum.slo...| slowtwitch.com|    1|        wslh|2024-05-25 10:32:16|       0|    false|                 0.0|          0.0|
|40473550|Big tech has dist...|https://www.thegu...|theguardian.com|    3|    lastdong|2024-05-25 08:28:56|       0|     true|                 0.0|          1.0|
|40474234|NASA finds more i...|https://arstechni...|arstechnica.com|    1| andreiursan|2024-05-25 11:25:38|       0|    false|                 0.0|          0.0|
|40474064|If You Wish to Un.