In [1]:
import threading

# Helper thread that runs the Spark StreamingContext in the background so it does not block Jupyter
        
class StreamingThread(threading.Thread):
    """Thread wrapper that drives a streaming context off the main thread.

    The wrapped context is stored on ``self.ssc``. ``run`` starts it and
    waits for termination; ``stop`` asks it to shut down gracefully while
    leaving the underlying SparkContext alive.
    """

    def __init__(self, ssc):
        # Explicit base-class initialisation; equivalent to super().__init__().
        threading.Thread.__init__(self)
        self.ssc = ssc

    def run(self):
        """Start the streaming context and block this thread until it ends."""
        context = self.ssc
        context.start()
        context.awaitTermination()

    def stop(self):
        """Announce the shutdown, then stop the streaming context gracefully."""
        notice = '----- Stopping... this may take a few seconds -----'
        print(notice)
        # Keep the SparkContext running so the notebook session stays usable.
        self.ssc.stop(stopGraceFully=True, stopSparkContext=False)

In [2]:
# Display the SparkContext (not defined in this notebook; presumably injected by the PySpark kernel -- confirm).
sc

In [3]:
# Display the SparkSession (not defined before this cell; presumably injected by the PySpark kernel -- confirm).
spark

In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf, length
from pyspark.ml import PipelineModel
from pyspark.streaming import StreamingContext
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer
from pyspark.sql.types import StructType, StructField, StringType, BooleanType, LongType, DoubleType
from pyspark.ml import Pipeline
from pyspark.ml.classification import GBTClassifier, GBTClassificationModel
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [5]:
# Obtain (or create) the SparkSession used by the streaming job.
# NOTE: when a session already exists, getOrCreate() returns it and Spark
# warns that only runtime SQL configurations take effect, so the memory and
# core settings below are only applied when a fresh session is created.
spark = (
    SparkSession.builder
    .appName("FrontpagePrediction")
    .config("spark.executor.memory", "2g")
    .config("spark.driver.memory", "2g")
    .config("spark.executor.cores", "2")
    .config("spark.driver.cores", "2")
    .getOrCreate()
)


# Schema of the incoming JSON records; every field is nullable.
schema = (
    StructType()
    .add("aid", StringType(), True)
    .add("title", StringType(), True)
    .add("url", StringType(), True)
    .add("domain", StringType(), True)
    .add("votes", LongType(), True)
    .add("user", StringType(), True)
    .add("posted_at", StringType(), True)
    .add("comments", LongType(), True)
    .add("source_title", StringType(), True)
    .add("source_text", StringType(), True)
    .add("frontpage", BooleanType(), True)
)

# Define a function that converts the model's probability vector into a hard 0.0/1.0 prediction
def get_prediction(probability):
    """Threshold a probability vector into a hard class label.

    Args:
        probability: indexable sequence; element 1 is compared against 0.5.

    Returns:
        0.0 when ``probability[1] <= 0.5``, otherwise 1.0.
    """
    positive_score = probability[1]
    if positive_score <= 0.5:
        return 0.0
    return 1.0

# Wrap get_prediction as a Spark UDF that returns a double column.
get_prediction_udf = udf(f=get_prediction, returnType=DoubleType())

# Define main process function
# Locations of the persisted model artifacts and of the prediction output.
PIPELINE_MODEL_PATH = "/Users/xiaodi/anaconda3/spark/notebooks/saved_pipeline"
GBT_MODEL_PATH = "/Users/xiaodi/anaconda3/spark/notebooks/saved_gbt_model"
PREDICTIONS_PATH = "/Users/xiaodi/anaconda3/spark/notebooks/predictions"


def _cached_model(cache_key, loader, path):
    """Return the model stored in module globals under ``cache_key``.

    The model is loaded with ``loader.load(path)`` on first use only, so later
    micro-batches reuse the in-memory object instead of re-reading it from disk.
    """
    model = globals().get(cache_key)
    if model is None:
        model = loader.load(path)
        globals()[cache_key] = model
    return model


def process(time, rdd):
    """Score one micro-batch of JSON records and append the results to CSV.

    Args:
        time: batch timestamp handed over by ``foreachRDD``; only printed.
        rdd: RDD of JSON strings for this batch.

    Returns:
        None. Empty batches are skipped. Otherwise the raw and the scored
        frames are shown, and selected columns plus ``corrected_prediction``
        are appended as CSV under ``PREDICTIONS_PATH``.
    """
    if rdd.isEmpty():
        return

    print("========= %s =========" % str(time))

    # Convert rdd into data frame with specific schema
    df = spark.read.schema(schema).json(rdd)
    df.show(n=20)  # display the data we are capturing

    # Convert frontpage from boolean to string
    df = df.withColumn("frontpage", col("frontpage").cast("string"))
    # Add a new feature 'text_length'
    df = df.withColumn("text_length", length(col("source_text")))

    # Load the feature pipeline once and reuse it; previously it was re-read
    # from disk on every batch while only the GBT model was cached.
    pipeline_model = _cached_model("my_pipeline_model", PipelineModel, PIPELINE_MODEL_PATH)

    # Employ pipeline to convert and extract features
    df_transformed = pipeline_model.transform(df)

    # Load the saved GBT model once; 'my_model' and 'models_loaded' are kept
    # as module globals for compatibility with the earlier caching scheme.
    gbt_model = _cached_model("my_model", GBTClassificationModel, GBT_MODEL_PATH)
    globals()['models_loaded'] = True

    # Make predictions
    df_result = gbt_model.transform(df_transformed)

    # Convert result into correct prediction
    df_result = df_result.withColumn("corrected_prediction", get_prediction_udf(col("probability")))

    # Display the first 20 lines
    df_result.show(n=20)

    # Save the results to csv file
    df_result.select("aid", "title", "url", "domain", "votes", "user", "posted_at",
                     "comments", "frontpage",
                     "corrected_prediction") \
             .write.mode("append").option("header", "true").csv(PREDICTIONS_PATH)




24/05/25 20:20:42 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [6]:
# Streaming context on the existing SparkContext with a batch duration of 10.
ssc = StreamingContext(sparkContext=sc, batchDuration=10)



In [7]:
# Read text lines from the remote socket and hand every batch RDD to process().
lines = ssc.socketTextStream(hostname="seppe.net", port=7778)
lines.foreachRDD(process)

In [8]:
# Run the streaming context in a background thread; call ssc_t.stop() to end it.
ssc_t = StreamingThread(ssc=ssc)
ssc_t.start()

24/05/25 20:20:49 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:20:49 WARN BlockManager: Block input-0-1716661249000 replicated to only 0 peer(s) instead of 1 peers
                                                                                



                                                                                

+--------+--------------------+--------------------+---------------+-----+------+-------------------+--------+--------------------+--------------------+---------+
|     aid|               title|                 url|         domain|votes|  user|          posted_at|comments|        source_title|         source_text|frontpage|
+--------+--------------------+--------------------+---------------+-----+------+-------------------+--------+--------------------+--------------------+---------+
|40473012|MPs urge under-16...|https://www.thegu...|theguardian.com|    3|kieloo|2024-05-25 06:23:19|       0|MPs urge under-16...|MPs urge under-16...|    false|
+--------+--------------------+--------------------+---------------+-----+------+-------------------+--------+--------------------+--------------------+---------+



24/05/25 20:20:53 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:20:53 WARN BlockManager: Block input-0-1716661253200 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:20:56 WARN DAGScheduler: Broadcasting large task binary with size 1407.9 KiB
24/05/25 20:20:57 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
24/05/25 20:20:57 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:20:57 WARN BlockManager: Block input-0-1716661257200 replicated to only 0 peer(s) instead of 1 peers
                                                                                

+--------+--------------------+--------------------+---------------+-----+------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------------------+--------------------+--------------------+------------+----------------+--------------------+-----+--------------------+--------------------+----------+--------------------+
|     aid|               title|                 url|         domain|votes|  user|          posted_at|comments|        source_title|         source_text|frontpage|text_length|               words|      filtered_words|         tf_features|      tfidf_features|domain_index|      domain_vec|            features|label|       rawPrediction|         probability|prediction|corrected_prediction|
+--------+--------------------+--------------------+---------------+-----+------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------------------+----

24/05/25 20:20:57 WARN DAGScheduler: Broadcasting large task binary with size 1407.4 KiB
                                                                                

+--------+---------------+--------------------+---------------+-----+--------------+-------------------+--------+---------------+--------------------+---------+
|     aid|          title|                 url|         domain|votes|          user|          posted_at|comments|   source_title|         source_text|frontpage|
+--------+---------------+--------------------+---------------+-----+--------------+-------------------+--------+---------------+--------------------+---------+
|40473014|Angle of Repose|https://en.wikipe...|  wikipedia.org|    2|         Hooke|2024-05-25 06:23:56|       0|Angle of repose|Angle of repose -...|    false|
|40473053|  Endless Tools|https://endlessto...|endlesstools.io|    3|handfuloflight|2024-05-25 06:32:47|       0|  Endless Tools|Endless Tools\n\n...|    false|
+--------+---------------+--------------------+---------------+-----+--------------+-------------------+--------+---------------+--------------------+---------+



24/05/25 20:21:01 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:21:01 WARN BlockManager: Block input-0-1716661261200 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:21:01 WARN DAGScheduler: Broadcasting large task binary with size 1407.9 KiB
24/05/25 20:21:01 WARN DAGScheduler: Broadcasting large task binary with size 1407.9 KiB


+--------+---------------+--------------------+---------------+-----+--------------+-------------------+--------+---------------+--------------------+---------+-----------+--------------------+--------------------+--------------------+--------------------+------------+----------------+--------------------+-----+--------------------+--------------------+----------+--------------------+
|     aid|          title|                 url|         domain|votes|          user|          posted_at|comments|   source_title|         source_text|frontpage|text_length|               words|      filtered_words|         tf_features|      tfidf_features|domain_index|      domain_vec|            features|label|       rawPrediction|         probability|prediction|corrected_prediction|
+--------+---------------+--------------------+---------------+-----+--------------+-------------------+--------+---------------+--------------------+---------+-----------+--------------------+--------------------+----------

24/05/25 20:21:02 WARN DAGScheduler: Broadcasting large task binary with size 1407.4 KiB
24/05/25 20:21:06 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:21:06 WARN BlockManager: Block input-0-1716661266200 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:21:07 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:21:07 WARN BlockManager: Block input-0-1716661267400 replicated to only 0 peer(s) instead of 1 peers
                                                                                

+--------+--------------------+--------------------+-------------------+-----+--------+-------------------+--------+--------------------+--------------------+---------+
|     aid|               title|                 url|             domain|votes|    user|          posted_at|comments|        source_title|         source_text|frontpage|
+--------+--------------------+--------------------+-------------------+-----+--------+-------------------+--------+--------------------+--------------------+---------+
|40473060|America's premier...|https://www.thegu...|    theguardian.com|    4| defrost|2024-05-25 06:36:16|       2|America’s premier...|America’s premier...|     true|
|40473063|Will Google Have ...|https://www.busin...|businessinsider.com|    5|ulrischa|2024-05-25 06:36:51|       0|Why Google is (pr...|Will Google Have ...|     true|
|40473089|Lauryn Hill's 'Mi...|https://apnews.co...|         apnews.com|    2|    geox|2024-05-25 06:42:09|       0|Lauryn Hill's cla...|'The Miseducation.

24/05/25 20:21:11 WARN DAGScheduler: Broadcasting large task binary with size 1407.9 KiB
24/05/25 20:21:11 WARN DAGScheduler: Broadcasting large task binary with size 1407.9 KiB


+--------+--------------------+--------------------+-------------------+-----+--------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------------------+--------------------+--------------------+------------+-----------------+--------------------+-----+--------------------+--------------------+----------+--------------------+
|     aid|               title|                 url|             domain|votes|    user|          posted_at|comments|        source_title|         source_text|frontpage|text_length|               words|      filtered_words|         tf_features|      tfidf_features|domain_index|       domain_vec|            features|label|       rawPrediction|         probability|prediction|corrected_prediction|
+--------+--------------------+--------------------+-------------------+-----+--------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+-----

24/05/25 20:21:11 WARN DAGScheduler: Broadcasting large task binary with size 1407.4 KiB
24/05/25 20:21:12 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:21:12 WARN BlockManager: Block input-0-1716661272200 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:21:14 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:21:14 WARN BlockManager: Block input-0-1716661274200 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:21:17 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:21:17 WARN BlockManager: Block input-0-1716661277400 replicated to only 0 peer(s) instead of 1 peers
                                                                                

+--------+--------------------+--------------------+--------------------+-----+------------+-------------------+--------+--------------------+--------------------+---------+
|     aid|               title|                 url|              domain|votes|        user|          posted_at|comments|        source_title|         source_text|frontpage|
+--------+--------------------+--------------------+--------------------+-----+------------+-------------------+--------+--------------------+--------------------+---------+
|40473117|The Cost of a Sub...|https://math.berk...|   math.berkeley.edu|    1|         mhb|2024-05-25 06:47:40|       0|                NULL|                \n\n|    false|
|40473125|Where can I find ...|https://github.co...|github.com/turbob...|    2|  Turboblack|2024-05-25 06:50:32|       2|GitHub - turbobla...|GitHub - turbobla...|    false|
|40473128|  EU Approves AI Act|https://vcsi.org/...|            vcsi.org|   25|rippeltippel|2024-05-25 06:50:54|      24|EU Approv

24/05/25 20:21:20 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:21:20 WARN BlockManager: Block input-0-1716661280200 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:21:21 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:21:21 WARN BlockManager: Block input-0-1716661281200 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:21:21 WARN DAGScheduler: Broadcasting large task binary with size 1407.9 KiB
24/05/25 20:21:21 WARN DAGScheduler: Broadcasting large task binary with size 1407.9 KiB


+--------+--------------------+--------------------+--------------------+-----+------------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------------------+--------------------+--------------------+------------+------------+--------------------+-----+--------------------+--------------------+----------+--------------------+
|     aid|               title|                 url|              domain|votes|        user|          posted_at|comments|        source_title|         source_text|frontpage|text_length|               words|      filtered_words|         tf_features|      tfidf_features|domain_index|  domain_vec|            features|label|       rawPrediction|         probability|prediction|corrected_prediction|
+--------+--------------------+--------------------+--------------------+-----+------------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+

24/05/25 20:21:21 WARN DAGScheduler: Broadcasting large task binary with size 1407.4 KiB
24/05/25 20:21:24 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:21:24 WARN BlockManager: Block input-0-1716661284400 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:21:25 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:21:25 WARN BlockManager: Block input-0-1716661285400 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:21:28 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:21:28 WARN BlockManager: Block input-0-1716661288400 replicated to only 0 peer(s) instead of 1 peers
                                                                                

+--------+--------------------+--------------------+---------------+-----+--------------+-------------------+--------+--------------------+--------------------+---------+
|     aid|               title|                 url|         domain|votes|          user|          posted_at|comments|        source_title|         source_text|frontpage|
+--------+--------------------+--------------------+---------------+-----+--------------+-------------------+--------+--------------------+--------------------+---------+
|40473130|Signal's Meredith...|https://techcrunc...| techcrunch.com|   10|        doener|2024-05-25 06:51:13|       0|TechCrunch is par...|TechCrunch is par...|     true|
|40473137|        Stable Audio|https://www.stabl...|stableaudio.com|    2|handfuloflight|2024-05-25 06:53:32|       0|Stable Audio - Ge...|Stable Audio - Ge...|    false|
|40473161|Consensus in Inte...|https://www.mnot....|       mnot.net|    1|  todsacerdoti|2024-05-25 06:59:55|       0|Consensus in Inte...|Consen

24/05/25 20:21:31 WARN DAGScheduler: Broadcasting large task binary with size 1407.9 KiB
24/05/25 20:21:31 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:21:31 WARN BlockManager: Block input-0-1716661291400 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:21:31 WARN DAGScheduler: Broadcasting large task binary with size 1407.9 KiB


+--------+--------------------+--------------------+---------------+-----+--------------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------------------+--------------------+--------------------+------------+-----------------+--------------------+-----+--------------------+--------------------+----------+--------------------+
|     aid|               title|                 url|         domain|votes|          user|          posted_at|comments|        source_title|         source_text|frontpage|text_length|               words|      filtered_words|         tf_features|      tfidf_features|domain_index|       domain_vec|            features|label|       rawPrediction|         probability|prediction|corrected_prediction|
+--------+--------------------+--------------------+---------------+-----+--------------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------

24/05/25 20:21:31 WARN DAGScheduler: Broadcasting large task binary with size 1407.4 KiB
24/05/25 20:21:33 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:21:33 WARN BlockManager: Block input-0-1716661293600 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:21:34 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:21:34 WARN BlockManager: Block input-0-1716661294400 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:21:37 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:21:37 WARN BlockManager: Block input-0-1716661297600 replicated to only 0 peer(s) instead of 1 peers
                                                                                

+--------+--------------------+--------------------+--------------------+-----+--------------+-------------------+--------+--------------------+--------------------+---------+
|     aid|               title|                 url|              domain|votes|          user|          posted_at|comments|        source_title|         source_text|frontpage|
+--------+--------------------+--------------------+--------------------+-----+--------------+-------------------+--------+--------------------+--------------------+---------+
|40473198|   Mistral Fine-Tune|https://github.co...|github.com/mistralai|    5|     alexmolas|2024-05-25 07:09:47|       0|GitHub - mistrala...|GitHub - mistrala...|     true|
|40473206|Google's A.I. Sea...|https://www.nytim...|         nytimes.com|    2|      ivyirwin|2024-05-25 07:11:45|       2|Google’s A.I. Sea...|Google AI Overvie...|    false|
|40473240|This week in KDE:...|https://pointiest...|  pointieststick.com|    2|TangerineDream|2024-05-25 07:20:42|      

24/05/25 20:21:41 WARN DAGScheduler: Broadcasting large task binary with size 1407.9 KiB
24/05/25 20:21:41 WARN DAGScheduler: Broadcasting large task binary with size 1407.9 KiB


+--------+--------------------+--------------------+--------------------+-----+--------------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------------------+--------------------+--------------------+------------+------------------+--------------------+-----+--------------------+--------------------+----------+--------------------+
|     aid|               title|                 url|              domain|votes|          user|          posted_at|comments|        source_title|         source_text|frontpage|text_length|               words|      filtered_words|         tf_features|      tfidf_features|domain_index|        domain_vec|            features|label|       rawPrediction|         probability|prediction|corrected_prediction|
+--------+--------------------+--------------------+--------------------+-----+--------------+-------------------+--------+--------------------+--------------------+---------+-----------+---

24/05/25 20:21:41 WARN DAGScheduler: Broadcasting large task binary with size 1407.4 KiB
24/05/25 20:21:42 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:21:42 WARN BlockManager: Block input-0-1716661302400 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:21:43 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:21:43 WARN BlockManager: Block input-0-1716661303600 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:21:48 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:21:48 WARN BlockManager: Block input-0-1716661308600 replicated to only 0 peer(s) instead of 1 peers
                                                                                

+--------+--------------------+--------------------+--------------------+-----+-------------+-------------------+--------+--------------------+--------------------+---------+
|     aid|               title|                 url|              domain|votes|         user|          posted_at|comments|        source_title|         source_text|frontpage|
+--------+--------------------+--------------------+--------------------+-----+-------------+-------------------+--------+--------------------+--------------------+---------+
|40473340|ICQ will stop wor...|https://icq.com/d...|             icq.com|    1|Wasserpuncher|2024-05-25 07:44:50|       2|                 ICQ|ICQ\n\n  * Sign I...|    false|
|40473375|Singapore Airline...|https://www.natur...|          nature.com|    3|     zeristor|2024-05-25 07:52:34|       0|Singapore Airline...|Singapore Airline...|     true|
|40473381|Tail Recursion El...|http://neopythoni...|neopythonic.blogs...|    1|     mpweiher|2024-05-25 07:54:10|       0|Tai

24/05/25 20:21:51 WARN DAGScheduler: Broadcasting large task binary with size 1407.9 KiB
24/05/25 20:21:51 WARN DAGScheduler: Broadcasting large task binary with size 1407.9 KiB


+--------+--------------------+--------------------+--------------------+-----+-------------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------------------+--------------------+--------------------+------------+-----------------+--------------------+-----+--------------------+--------------------+----------+--------------------+
|     aid|               title|                 url|              domain|votes|         user|          posted_at|comments|        source_title|         source_text|frontpage|text_length|               words|      filtered_words|         tf_features|      tfidf_features|domain_index|       domain_vec|            features|label|       rawPrediction|         probability|prediction|corrected_prediction|
+--------+--------------------+--------------------+--------------------+-----+-------------+-------------------+--------+--------------------+--------------------+---------+-----------+--------

24/05/25 20:21:51 WARN DAGScheduler: Broadcasting large task binary with size 1407.4 KiB
24/05/25 20:21:52 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:21:52 WARN BlockManager: Block input-0-1716661311800 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:21:54 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:21:54 WARN BlockManager: Block input-0-1716661314000 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:21:55 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:21:55 WARN BlockManager: Block input-0-1716661315000 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:21:59 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:21:59 WARN BlockManager: Block input-0-1716661319000 replicated to only 0 peer(s) instead of 1 peers
                                                                                

+--------+--------------------+--------------------+--------------+-----+--------------+-------------------+--------+--------------------+--------------------+---------+
|     aid|               title|                 url|        domain|votes|          user|          posted_at|comments|        source_title|         source_text|frontpage|
+--------+--------------------+--------------------+--------------+-----+--------------+-------------------+--------+--------------------+--------------------+---------+
|40473382| Book Covers of 2023|https://lithub.co...|    lithub.com|    2|handfuloflight|2024-05-25 07:54:35|       0|The 139 Best Book...|The 139 Best Book...|    false|
|40473387|Timeless and Unfo...|https://www.paper...| papertrue.com|    1|handfuloflight|2024-05-25 07:54:58|       0|50 Timeless and U...|50 Timeless and U...|    false|
|40473395|Knot-based Key Ex...|https://eprint.ia...|      iacr.org|    1|     g0xA52A2A|2024-05-25 07:56:32|       0|                NULL|            

24/05/25 20:22:01 WARN DAGScheduler: Broadcasting large task binary with size 1407.9 KiB
24/05/25 20:22:01 WARN DAGScheduler: Broadcasting large task binary with size 1407.9 KiB


+--------+--------------------+--------------------+--------------+-----+--------------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------------------+--------------------+--------------------+------------+------------------+--------------------+-----+--------------------+--------------------+----------+--------------------+
|     aid|               title|                 url|        domain|votes|          user|          posted_at|comments|        source_title|         source_text|frontpage|text_length|               words|      filtered_words|         tf_features|      tfidf_features|domain_index|        domain_vec|            features|label|       rawPrediction|         probability|prediction|corrected_prediction|
+--------+--------------------+--------------------+--------------+-----+--------------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+

24/05/25 20:22:01 WARN DAGScheduler: Broadcasting large task binary with size 1407.4 KiB
24/05/25 20:22:03 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:22:03 WARN BlockManager: Block input-0-1716661323000 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:22:06 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:22:06 WARN BlockManager: Block input-0-1716661326000 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:22:09 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:22:09 WARN BlockManager: Block input-0-1716661329000 replicated to only 0 peer(s) instead of 1 peers
                                                                                

+--------+--------------------+--------------------+--------------+-----+---------+-------------------+--------+--------------------+--------------------+---------+
|     aid|               title|                 url|        domain|votes|     user|          posted_at|comments|        source_title|         source_text|frontpage|
+--------+--------------------+--------------------+--------------+-----+---------+-------------------+--------+--------------------+--------------------+---------+
|40473421|C4AI releases two...|https://huggingfa...|huggingface.co|    1|alexmolas|2024-05-25 08:02:07|       0|C4AI Aya 23 - a H...|C4AI Aya 23 - a H...|    false|
|40473439|   SVG Status Monkey|https://vectorart...|  vectorart.ai|    2|   tm11zz|2024-05-25 08:07:32|       0|Status Monkey - V...|Status Monkey - V...|    false|
|40473443|Tech Analysis – L...|https://www.lttla...|   lttlabs.com|    1| xbmcuser|2024-05-25 08:08:45|       0|    Just a moment...|Just a moment...\...|    false|
+--------+

24/05/25 20:22:11 WARN DAGScheduler: Broadcasting large task binary with size 1407.9 KiB
24/05/25 20:22:11 WARN DAGScheduler: Broadcasting large task binary with size 1407.9 KiB


+--------+--------------------+--------------------+--------------+-----+---------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------------------+--------------------+--------------------+------------+-----------------+--------------------+-----+--------------------+--------------------+----------+--------------------+
|     aid|               title|                 url|        domain|votes|     user|          posted_at|comments|        source_title|         source_text|frontpage|text_length|               words|      filtered_words|         tf_features|      tfidf_features|domain_index|       domain_vec|            features|label|       rawPrediction|         probability|prediction|corrected_prediction|
+--------+--------------------+--------------------+--------------+-----+---------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+-----------------

24/05/25 20:22:11 WARN DAGScheduler: Broadcasting large task binary with size 1407.4 KiB
24/05/25 20:22:12 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:22:12 WARN BlockManager: Block input-0-1716661332000 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:22:16 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:22:16 WARN BlockManager: Block input-0-1716661336200 replicated to only 0 peer(s) instead of 1 peers
                                                                                

+--------+--------------------+--------------------+------------+-----+----------+-------------------+--------+--------------------+--------------------+---------+
|     aid|               title|                 url|      domain|votes|      user|          posted_at|comments|        source_title|         source_text|frontpage|
+--------+--------------------+--------------------+------------+-----+----------+-------------------+--------+--------------------+--------------------+---------+
|40473446|From Pollution to...|https://www.cremi...|cremieux.xyz|    2|arrowsmith|2024-05-25 08:09:14|       0|From Pollution to...|From Pollution to...|    false|
|40473454|Male birth contro...|https://newatlas....|newatlas.com|   34|      geox|2024-05-25 08:11:11|      10|Male birth contro...|Male birth contro...|     true|
+--------+--------------------+--------------------+------------+-----+----------+-------------------+--------+--------------------+--------------------+---------+



24/05/25 20:22:21 WARN DAGScheduler: Broadcasting large task binary with size 1407.9 KiB
24/05/25 20:22:21 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:22:21 WARN BlockManager: Block input-0-1716661341000 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:22:21 WARN DAGScheduler: Broadcasting large task binary with size 1407.9 KiB


+--------+--------------------+--------------------+------------+-----+----------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------------------+--------------------+--------------------+------------+-----------------+--------------------+-----+--------------------+--------------------+----------+--------------------+
|     aid|               title|                 url|      domain|votes|      user|          posted_at|comments|        source_title|         source_text|frontpage|text_length|               words|      filtered_words|         tf_features|      tfidf_features|domain_index|       domain_vec|            features|label|       rawPrediction|         probability|prediction|corrected_prediction|
+--------+--------------------+--------------------+------------+-----+----------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------------------

24/05/25 20:22:21 WARN DAGScheduler: Broadcasting large task binary with size 1407.4 KiB
24/05/25 20:22:22 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:22:22 WARN BlockManager: Block input-0-1716661342000 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:22:26 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:22:26 WARN BlockManager: Block input-0-1716661346000 replicated to only 0 peer(s) instead of 1 peers
                                                                                

+--------+--------------------+--------------------+--------------------+-----+----------+-------------------+--------+--------------------+--------------------+---------+
|     aid|               title|                 url|              domain|votes|      user|          posted_at|comments|        source_title|         source_text|frontpage|
+--------+--------------------+--------------------+--------------------+-----+----------+-------------------+--------+--------------------+--------------------+---------+
|40473464|         Don't Panic|https://www.towel...|        towelday.org|    3|      hggh|2024-05-25 08:14:47|       0|         Don't Panic|Don't Panic\n\nDO...|     true|
|40473467|Post Office lobbi...|https://www.thegu...|     theguardian.com|    1|   chrisjj|2024-05-25 08:15:03|       0|‘A tragedy is not...|‘A tragedy is not...|    false|
|40473468|BackgroundMusic: ...|https://github.co...|github.com/kylene...|    1|cadmium-44|2024-05-25 08:15:19|       0|GitHub - kyleneid...|

24/05/25 20:22:31 WARN DAGScheduler: Broadcasting large task binary with size 1407.9 KiB
24/05/25 20:22:31 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:22:31 WARN BlockManager: Block input-0-1716661351000 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:22:31 WARN DAGScheduler: Broadcasting large task binary with size 1407.9 KiB


+--------+--------------------+--------------------+--------------------+-----+----------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------------------+--------------------+--------------------+------------+----------------+--------------------+-----+--------------------+--------------------+----------+--------------------+
|     aid|               title|                 url|              domain|votes|      user|          posted_at|comments|        source_title|         source_text|frontpage|text_length|               words|      filtered_words|         tf_features|      tfidf_features|domain_index|      domain_vec|            features|label|       rawPrediction|         probability|prediction|corrected_prediction|
+--------+--------------------+--------------------+--------------------+-----+----------+-------------------+--------+--------------------+--------------------+---------+-----------+-------------------

24/05/25 20:22:31 WARN DAGScheduler: Broadcasting large task binary with size 1407.4 KiB
24/05/25 20:22:36 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:22:36 WARN BlockManager: Block input-0-1716661356200 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:22:38 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:22:38 WARN BlockManager: Block input-0-1716661358400 replicated to only 0 peer(s) instead of 1 peers
                                                                                

+--------+--------------------+--------------------+---------------+-----+------------+-------------------+--------+--------------------+--------------------+---------+
|     aid|               title|                 url|         domain|votes|        user|          posted_at|comments|        source_title|         source_text|frontpage|
+--------+--------------------+--------------------+---------------+-----+------------+-------------------+--------+--------------------+--------------------+---------+
|40473531|Elon Musk slams M...|https://www.pguru...|     pgurus.com|    1|      taubek|2024-05-25 08:24:58|       0|Elon Musk Slams M...|Elon Musk Slams M...|    false|
|40473550|Big tech has dist...|https://www.thegu...|theguardian.com|    3|    lastdong|2024-05-25 08:28:56|       0|Big tech has dist...|Big tech has dist...|     true|
|40473595|Pāṇini: Catching ...|https://blog.gran...|   granthika.co|    1|stareatgoats|2024-05-25 08:34:54|       0|Pāṇini: Catching ...|Pāṇini: Catching .

24/05/25 20:22:40 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:22:40 WARN BlockManager: Block input-0-1716661360400 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:22:41 WARN DAGScheduler: Broadcasting large task binary with size 1407.9 KiB
24/05/25 20:22:41 WARN DAGScheduler: Broadcasting large task binary with size 1407.9 KiB


+--------+--------------------+--------------------+---------------+-----+------------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------------------+--------------------+--------------------+------------+----------------+--------------------+-----+--------------------+--------------------+----------+--------------------+
|     aid|               title|                 url|         domain|votes|        user|          posted_at|comments|        source_title|         source_text|frontpage|text_length|               words|      filtered_words|         tf_features|      tfidf_features|domain_index|      domain_vec|            features|label|       rawPrediction|         probability|prediction|corrected_prediction|
+--------+--------------------+--------------------+---------------+-----+------------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+-------

24/05/25 20:22:41 WARN DAGScheduler: Broadcasting large task binary with size 1407.4 KiB
24/05/25 20:22:43 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:22:43 WARN BlockManager: Block input-0-1716661363400 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:22:48 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:22:48 WARN BlockManager: Block input-0-1716661368400 replicated to only 0 peer(s) instead of 1 peers
                                                                                

+--------+--------------------+--------------------+-------------+-----+---------+-------------------+--------+--------------------+--------------------+---------+
|     aid|               title|                 url|       domain|votes|     user|          posted_at|comments|        source_title|         source_text|frontpage|
+--------+--------------------+--------------------+-------------+-----+---------+-------------------+--------+--------------------+--------------------+---------+
|40473618|Parents are getti...|https://www.shaan...|shaanpuri.com|    2|    cdcro|2024-05-25 08:38:12|       0|Your parents are ...|Your parents are ...|    false|
|40473619|          Discipline|https://lichess.o...|  lichess.org|    1|    fzliu|2024-05-25 08:38:26|       0|          Discipline|BenjiPortheault's...|    false|
|40473656|Three New Superco...|https://spectrum....|     ieee.org|    2|westurner|2024-05-25 08:46:35|       0|Three New Superco...|Three New Superco...|    false|
+--------+------

24/05/25 20:22:51 WARN DAGScheduler: Broadcasting large task binary with size 1407.9 KiB
24/05/25 20:22:51 WARN DAGScheduler: Broadcasting large task binary with size 1407.9 KiB


+--------+--------------------+--------------------+-------------+-----+---------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------------------+--------------------+--------------------+------------+-------------------+--------------------+-----+--------------------+--------------------+----------+--------------------+
|     aid|               title|                 url|       domain|votes|     user|          posted_at|comments|        source_title|         source_text|frontpage|text_length|               words|      filtered_words|         tf_features|      tfidf_features|domain_index|         domain_vec|            features|label|       rawPrediction|         probability|prediction|corrected_prediction|
+--------+--------------------+--------------------+-------------+-----+---------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+----------------

24/05/25 20:22:51 WARN DAGScheduler: Broadcasting large task binary with size 1407.4 KiB
24/05/25 20:22:51 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:22:51 WARN BlockManager: Block input-0-1716661371600 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:22:56 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:22:56 WARN BlockManager: Block input-0-1716661376600 replicated to only 0 peer(s) instead of 1 peers
                                                                                

+--------+--------------------+--------------------+----------------+-----+-----+-------------------+--------+--------------------+--------------------+---------+
|     aid|               title|                 url|          domain|votes| user|          posted_at|comments|        source_title|         source_text|frontpage|
+--------+--------------------+--------------------+----------------+-----+-----+-------------------+--------+--------------------+--------------------+---------+
|40473685|What makes a coun...|https://2019.watt...|wattenberger.com|    1|tsujp|2024-05-25 08:55:03|       0| Amelia Wattenberger|Amelia Wattenberg...|    false|
|40473704|       Singles Atlas|https://singlesat...|singlesatlas.com|    2|faebi|2024-05-25 08:59:00|       0|World singles map...|World singles map...|    false|
+--------+--------------------+--------------------+----------------+-----+-----+-------------------+--------+--------------------+--------------------+---------+



24/05/25 20:23:00 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:23:00 WARN BlockManager: Block input-0-1716661380600 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:23:01 WARN DAGScheduler: Broadcasting large task binary with size 1407.9 KiB
24/05/25 20:23:01 WARN DAGScheduler: Broadcasting large task binary with size 1407.9 KiB


+--------+--------------------+--------------------+----------------+-----+-----+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------------------+--------------------+--------------------+------------+------------+--------------------+-----+--------------------+--------------------+----------+--------------------+
|     aid|               title|                 url|          domain|votes| user|          posted_at|comments|        source_title|         source_text|frontpage|text_length|               words|      filtered_words|         tf_features|      tfidf_features|domain_index|  domain_vec|            features|label|       rawPrediction|         probability|prediction|corrected_prediction|
+--------+--------------------+--------------------+----------------+-----+-----+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------------------+------------

24/05/25 20:23:01 WARN DAGScheduler: Broadcasting large task binary with size 1407.4 KiB
24/05/25 20:23:02 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:23:02 WARN BlockManager: Block input-0-1716661382600 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:23:04 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:23:04 WARN BlockManager: Block input-0-1716661383800 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:23:05 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:23:05 WARN BlockManager: Block input-0-1716661385600 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:23:08 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:23:08 WARN BlockManager: Block input-0-1716661388600 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:23:10 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only

+--------+--------------------+--------------------+----------------+-----+---------------+-------------------+--------+--------------------+--------------------+---------+
|     aid|               title|                 url|          domain|votes|           user|          posted_at|comments|        source_title|         source_text|frontpage|
+--------+--------------------+--------------------+----------------+-----+---------------+-------------------+--------+--------------------+--------------------+---------+
|40473725|What's New on PHP...|https://developer...| developerjoy.co|    3|        falcon_|2024-05-25 09:05:09|       0|        DeveloperJoy|What's new on PHP...|    false|
|40473774|     From Vim to Zed|https://registers...|thorstenball.com|    2|        redbell|2024-05-25 09:16:24|       0|     From Vim to Zed|From Vim to Zed -...|    false|
|40473791|Uvalde Families A...|https://www.nytim...|     nytimes.com|    1|        docdeek|2024-05-25 09:21:31|       0|Uvalde Families

24/05/25 20:23:11 WARN DAGScheduler: Broadcasting large task binary with size 1407.9 KiB
24/05/25 20:23:11 WARN DAGScheduler: Broadcasting large task binary with size 1407.9 KiB
24/05/25 20:23:11 WARN DAGScheduler: Broadcasting large task binary with size 1407.9 KiB
24/05/25 20:23:11 WARN DAGScheduler: Broadcasting large task binary with size 1407.4 KiB


+--------+--------------------+--------------------+----------------+-----+---------------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------------------+--------------------+--------------------+------------+-------------------+--------------------+-----+--------------------+--------------------+----------+--------------------+
|     aid|               title|                 url|          domain|votes|           user|          posted_at|comments|        source_title|         source_text|frontpage|text_length|               words|      filtered_words|         tf_features|      tfidf_features|domain_index|         domain_vec|            features|label|       rawPrediction|         probability|prediction|corrected_prediction|
+--------+--------------------+--------------------+----------------+-----+---------------+-------------------+--------+--------------------+--------------------+---------+-----------+----------

24/05/25 20:23:13 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:23:13 WARN BlockManager: Block input-0-1716661392800 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:23:14 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:23:14 WARN BlockManager: Block input-0-1716661393800 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:23:17 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:23:17 WARN BlockManager: Block input-0-1716661397000 replicated to only 0 peer(s) instead of 1 peers
                                                                                

+--------+--------------------+--------------------+--------------------+-----+-----------+-------------------+--------+--------------------+--------------------+---------+
|     aid|               title|                 url|              domain|votes|       user|          posted_at|comments|        source_title|         source_text|frontpage|
+--------+--------------------+--------------------+--------------------+-----+-----------+-------------------+--------+--------------------+--------------------+---------+
|40473855|Interior design c...|https://algedra.a...|          algedra.ae|    1|ameliaalana|2024-05-25 09:37:55|       0|Best Interior Des...|Best Interior Des...|    false|
|40473876|            SmidgION|https://nanoporet...|    nanoporetech.com|    1|   pr337h4m|2024-05-25 09:42:15|       0|            SmidgION|SmidgION | Oxford...|    false|
|40473889|"The mother of al...|https://github.co...|github.com/gothin...|    1|  probhavsh|2024-05-25 09:43:49|       0|GitHub - gothin

24/05/25 20:23:21 WARN DAGScheduler: Broadcasting large task binary with size 1407.9 KiB
24/05/25 20:23:21 WARN DAGScheduler: Broadcasting large task binary with size 1407.9 KiB


+--------+--------------------+--------------------+--------------------+-----+-----------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------------------+--------------------+--------------------+------------+------------------+--------------------+-----+--------------------+--------------------+----------+--------------------+
|     aid|               title|                 url|              domain|votes|       user|          posted_at|comments|        source_title|         source_text|frontpage|text_length|               words|      filtered_words|         tf_features|      tfidf_features|domain_index|        domain_vec|            features|label|       rawPrediction|         probability|prediction|corrected_prediction|
+--------+--------------------+--------------------+--------------------+-----+-----------+-------------------+--------+--------------------+--------------------+---------+-----------+------------

24/05/25 20:23:21 WARN DAGScheduler: Broadcasting large task binary with size 1407.4 KiB
24/05/25 20:24:20 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:24:20 WARN BlockManager: Block input-0-1716661460000 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:24:22 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:24:22 WARN BlockManager: Block input-0-1716661462000 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:24:26 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:24:26 WARN BlockManager: Block input-0-1716661466000 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:24:30 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:24:30 WARN BlockManager: Block input-0-1716661470000 replicated to only 0 peer(s) instead of 1 peers


+--------+--------------------+--------------------+--------------------+-----+---------------+-------------------+--------+--------------------+--------------------+---------+
|     aid|               title|                 url|              domain|votes|           user|          posted_at|comments|        source_title|         source_text|frontpage|
+--------+--------------------+--------------------+--------------------+-----+---------------+-------------------+--------+--------------------+--------------------+---------+
|40473909|Last major Arabic...|https://www.thegu...|     theguardian.com|    4|YeGoblynQueenne|2024-05-25 09:48:52|       0|Last major Arabic...|Last major Arabic...|     true|
|40473916|    Nushell Bashisms|https://www.nushe...|          nushell.sh|    1|           tosh|2024-05-25 09:50:54|       0|Nushell Bashisms ...|Nushell Bashisms ...|    false|
|40473923|Daylight – A More...|https://daylightc...|daylightcomputer.com|    1|           gavi|2024-05-25 09:52:54|

24/05/25 20:24:31 WARN DAGScheduler: Broadcasting large task binary with size 1407.9 KiB
24/05/25 20:24:31 WARN DAGScheduler: Broadcasting large task binary with size 1407.9 KiB


+--------+--------------------+--------------------+--------------------+-----+---------------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------------------+--------------------+--------------------+------------+----------------+--------------------+-----+--------------------+--------------------+----------+--------------------+
|     aid|               title|                 url|              domain|votes|           user|          posted_at|comments|        source_title|         source_text|frontpage|text_length|               words|      filtered_words|         tf_features|      tfidf_features|domain_index|      domain_vec|            features|label|       rawPrediction|         probability|prediction|corrected_prediction|
+--------+--------------------+--------------------+--------------------+-----+---------------+-------------------+--------+--------------------+--------------------+---------+-----------+----

24/05/25 20:24:31 WARN DAGScheduler: Broadcasting large task binary with size 1407.4 KiB
24/05/25 20:24:35 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:24:35 WARN BlockManager: Block input-0-1716661475000 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:24:40 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:24:40 WARN BlockManager: Block input-0-1716661480000 replicated to only 0 peer(s) instead of 1 peers


+--------+--------------------+--------------------+------------+-----+------------+-------------------+--------+--------------------+--------------------+---------+
|     aid|               title|                 url|      domain|votes|        user|          posted_at|comments|        source_title|         source_text|frontpage|
+--------+--------------------+--------------------+------------+-----+------------+-------------------+--------+--------------------+--------------------+---------+
|40473935|Installing Adobe'...|https://linuxtech...|linuxtech.in|    3|avinvarghese|2024-05-25 09:56:08|       0|Installing Adobe'...|Installing Adobe'...|    false|
|40473950|Boeing 737-9 MAX ...|https://www.bosto...|  boston.com|    3|     hosteur|2024-05-25 10:00:10|       0|Boston flight mak...|Boston flight mak...|    false|
+--------+--------------------+--------------------+------------+-----+------------+-------------------+--------+--------------------+--------------------+---------+



24/05/25 20:24:41 WARN DAGScheduler: Broadcasting large task binary with size 1407.9 KiB
24/05/25 20:24:41 WARN DAGScheduler: Broadcasting large task binary with size 1407.9 KiB


+--------+--------------------+--------------------+------------+-----+------------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------------------+--------------------+--------------------+------------+------------+--------------------+-----+--------------------+--------------------+----------+--------------------+
|     aid|               title|                 url|      domain|votes|        user|          posted_at|comments|        source_title|         source_text|frontpage|text_length|               words|      filtered_words|         tf_features|      tfidf_features|domain_index|  domain_vec|            features|label|       rawPrediction|         probability|prediction|corrected_prediction|
+--------+--------------------+--------------------+------------+-----+------------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------------------+---

24/05/25 20:24:41 WARN DAGScheduler: Broadcasting large task binary with size 1407.4 KiB
24/05/25 20:24:43 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:24:43 WARN BlockManager: Block input-0-1716661483000 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:24:47 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:24:47 WARN BlockManager: Block input-0-1716661487000 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:24:48 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:24:48 WARN BlockManager: Block input-0-1716661488200 replicated to only 0 peer(s) instead of 1 peers
                                                                                

+--------+--------------------+--------------------+---------------+-----+-----------+-------------------+--------+--------------------+--------------------+---------+
|     aid|               title|                 url|         domain|votes|       user|          posted_at|comments|        source_title|         source_text|frontpage|
+--------+--------------------+--------------------+---------------+-----+-----------+-------------------+--------+--------------------+--------------------+---------+
|40473961|How the German co...|https://www.dw.co...|         dw.com|    2|       tosh|2024-05-25 10:05:03|       0|How the German co...|How the German co...|    false|
|40473984|Show HN: LaraDocs...|https://laradocs.dev|   laradocs.dev|    1|   TonnnnUK|2024-05-25 10:13:27|       0|            LaraDocs|LaraDocs - Larave...|    false|
|40474019|Deploying Perl Da...|https://www.perl....|       perl.com|    1|thunderbong|2024-05-25 10:22:49|       0|Deploying Dancer ...|Deploying Dancer ...|   

24/05/25 20:24:50 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:24:50 WARN BlockManager: Block input-0-1716661490200 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:24:51 WARN DAGScheduler: Broadcasting large task binary with size 1407.9 KiB
24/05/25 20:24:51 WARN DAGScheduler: Broadcasting large task binary with size 1407.9 KiB


+--------+--------------------+--------------------+---------------+-----+-----------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------------------+--------------------+--------------------+------------+------------------+--------------------+-----+--------------------+--------------------+----------+--------------------+
|     aid|               title|                 url|         domain|votes|       user|          posted_at|comments|        source_title|         source_text|frontpage|text_length|               words|      filtered_words|         tf_features|      tfidf_features|domain_index|        domain_vec|            features|label|       rawPrediction|         probability|prediction|corrected_prediction|
+--------+--------------------+--------------------+---------------+-----+-----------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+------

24/05/25 20:24:51 WARN DAGScheduler: Broadcasting large task binary with size 1407.4 KiB
24/05/25 20:24:55 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:24:55 WARN BlockManager: Block input-0-1716661495200 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:24:56 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:24:56 WARN BlockManager: Block input-0-1716661496200 replicated to only 0 peer(s) instead of 1 peers
                                                                                

+--------+--------------------+--------------------+-----------------+-----+----------+-------------------+--------+--------------------+--------------------+---------+
|     aid|               title|                 url|           domain|votes|      user|          posted_at|comments|        source_title|         source_text|frontpage|
+--------+--------------------+--------------------+-----------------+-----+----------+-------------------+--------+--------------------+--------------------+---------+
|40474023|Front Crawl Swimm...|https://www.scien...|sciencedirect.com|    1|      wslh|2024-05-25 10:24:02|       0|       ScienceDirect|ScienceDirect\n\n...|    false|
|40474048|California city p...|https://www.thegu...|  theguardian.com|    2|passwordle|2024-05-25 10:28:24|       0|‘Psychologically ...|‘Psychologically ...|    false|
|40474049|Google once again...|https://www.genig...|    genigears.com|    5| zerohedge|2024-05-25 10:28:57|       2|Google's AI Overv...|Google's AI Overv.

24/05/25 20:25:01 WARN DAGScheduler: Broadcasting large task binary with size 1407.9 KiB
24/05/25 20:25:01 WARN DAGScheduler: Broadcasting large task binary with size 1407.9 KiB


+--------+--------------------+--------------------+-----------------+-----+----------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------------------+--------------------+--------------------+------------+------------------+--------------------+-----+--------------------+--------------------+----------+--------------------+
|     aid|               title|                 url|           domain|votes|      user|          posted_at|comments|        source_title|         source_text|frontpage|text_length|               words|      filtered_words|         tf_features|      tfidf_features|domain_index|        domain_vec|            features|label|       rawPrediction|         probability|prediction|corrected_prediction|
+--------+--------------------+--------------------+-----------------+-----+----------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+---

24/05/25 20:25:01 WARN DAGScheduler: Broadcasting large task binary with size 1407.4 KiB
24/05/25 20:25:01 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:25:01 WARN BlockManager: Block input-0-1716661501200 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:25:05 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:25:05 WARN BlockManager: Block input-0-1716661505200 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:25:06 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:25:06 WARN BlockManager: Block input-0-1716661506400 replicated to only 0 peer(s) instead of 1 peers
                                                                                

+--------+--------------------+--------------------+------------------+-----+--------+-------------------+--------+--------------------+--------------------+---------+
|     aid|               title|                 url|            domain|votes|    user|          posted_at|comments|        source_title|         source_text|frontpage|
+--------+--------------------+--------------------+------------------+-----+--------+-------------------+--------+--------------------+--------------------+---------+
|40474051|Using a light air...|http://www.stepha...|stephan-schwab.com|    1|mpweiher|2024-05-25 10:29:36|       0|Using a light air...|Using a light air...|    false|
|40474052|Trees and Dags in...|https://soundness...|     soundness.dev|    3|  cbeach|2024-05-25 10:29:39|       0|Soundness: Dendro...|Soundness: Dendro...|    false|
|40474057|Framework for Int...|https://www.mdpi....|          mdpi.com|    2|    wslh|2024-05-25 10:30:07|       0|Framework for Int...|Sensors | Free Fu...|   

24/05/25 20:25:10 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:25:10 WARN BlockManager: Block input-0-1716661510200 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:25:11 WARN DAGScheduler: Broadcasting large task binary with size 1407.9 KiB
24/05/25 20:25:11 WARN DAGScheduler: Broadcasting large task binary with size 1407.9 KiB


+--------+--------------------+--------------------+------------------+-----+--------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------------------+--------------------+--------------------+------------+-----------------+--------------------+-----+--------------------+--------------------+----------+--------------------+
|     aid|               title|                 url|            domain|votes|    user|          posted_at|comments|        source_title|         source_text|frontpage|text_length|               words|      filtered_words|         tf_features|      tfidf_features|domain_index|       domain_vec|            features|label|       rawPrediction|         probability|prediction|corrected_prediction|
+--------+--------------------+--------------------+------------------+-----+--------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------

24/05/25 20:25:11 WARN DAGScheduler: Broadcasting large task binary with size 1407.4 KiB
24/05/25 20:25:11 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:25:11 WARN BlockManager: Block input-0-1716661511400 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:25:16 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:25:16 WARN BlockManager: Block input-0-1716661516400 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:25:17 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:25:17 WARN BlockManager: Block input-0-1716661517400 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:25:19 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:25:19 WARN BlockManager: Block input-0-1716661519400 replicated to only 0 peer(s) instead of 1 peers
                                                                                

+--------+--------------------+--------------------+---------------+-----+------------+-------------------+--------+--------------------+--------------------+---------+
|     aid|               title|                 url|         domain|votes|        user|          posted_at|comments|        source_title|         source_text|frontpage|
+--------+--------------------+--------------------+---------------+-----+------------+-------------------+--------+--------------------+--------------------+---------+
|40474064|If You Wish to Un...|https://frappe.io...|      frappe.io|    2|todsacerdoti|2024-05-25 10:31:01|       0|If You Wish to Tr...|If You Wish to Tr...|    false|
|40474069|Triathlon Forum: ...|https://forum.slo...| slowtwitch.com|    1|        wslh|2024-05-25 10:32:16|       0|Early Thoughts on...|Early Thoughts on...|    false|
|40474079| Embracing Ambiguity|https://en.wikive...|wikiversity.org|    1|     yamrzou|2024-05-25 10:34:55|       0|Embracing Ambigui...|Embracing Ambigui.

24/05/25 20:25:21 WARN DAGScheduler: Broadcasting large task binary with size 1407.9 KiB
24/05/25 20:25:21 WARN DAGScheduler: Broadcasting large task binary with size 1407.9 KiB


+--------+--------------------+--------------------+---------------+-----+------------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------------------+--------------------+--------------------+------------+------------------+--------------------+-----+--------------------+--------------------+----------+--------------------+
|     aid|               title|                 url|         domain|votes|        user|          posted_at|comments|        source_title|         source_text|frontpage|text_length|               words|      filtered_words|         tf_features|      tfidf_features|domain_index|        domain_vec|            features|label|       rawPrediction|         probability|prediction|corrected_prediction|
+--------+--------------------+--------------------+---------------+-----+------------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+---

24/05/25 20:25:21 WARN DAGScheduler: Broadcasting large task binary with size 1407.4 KiB
24/05/25 20:25:24 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:25:24 WARN BlockManager: Block input-0-1716661524400 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:25:27 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:25:27 WARN BlockManager: Block input-0-1716661527600 replicated to only 0 peer(s) instead of 1 peers
                                                                                

+--------+--------------------+--------------------+----------------+-----+------+-------------------+--------+--------------------+--------------------+---------+
|     aid|               title|                 url|          domain|votes|  user|          posted_at|comments|        source_title|         source_text|frontpage|
+--------+--------------------+--------------------+----------------+-----+------+-------------------+--------+--------------------+--------------------+---------+
|40474108|Ghost in the Ethe...|https://blog.benj...|   benjojo.co.uk|    1| fanf2|2024-05-25 10:42:03|       0|Ghost in the ethe...|Ghost in the ethe...|    false|
|40474125|The Minimum Marke...|https://www.ellio...|elliotcsmith.com|    1|smitec|2024-05-25 10:45:56|       0|The Minimum Marke...|The Minimum Marke...|    false|
+--------+--------------------+--------------------+----------------+-----+------+-------------------+--------+--------------------+--------------------+---------+



24/05/25 20:25:31 WARN DAGScheduler: Broadcasting large task binary with size 1407.9 KiB
24/05/25 20:25:31 WARN DAGScheduler: Broadcasting large task binary with size 1407.9 KiB


+--------+--------------------+--------------------+----------------+-----+------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------------------+--------------------+--------------------+------------+------------------+--------------------+-----+--------------------+--------------------+----------+--------------------+
|     aid|               title|                 url|          domain|votes|  user|          posted_at|comments|        source_title|         source_text|frontpage|text_length|               words|      filtered_words|         tf_features|      tfidf_features|domain_index|        domain_vec|            features|label|       rawPrediction|         probability|prediction|corrected_prediction|
+--------+--------------------+--------------------+----------------+-----+------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+------------------

24/05/25 20:25:31 WARN DAGScheduler: Broadcasting large task binary with size 1407.4 KiB
24/05/25 20:25:31 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:25:31 WARN BlockManager: Block input-0-1716661531400 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:25:33 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:25:33 WARN BlockManager: Block input-0-1716661533400 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:25:38 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:25:38 WARN BlockManager: Block input-0-1716661538400 replicated to only 0 peer(s) instead of 1 peers
                                                                                

+--------+--------------------+--------------------+--------------------+-----+-----------+-------------------+--------+--------------------+--------------------+---------+
|     aid|               title|                 url|              domain|votes|       user|          posted_at|comments|        source_title|         source_text|frontpage|
+--------+--------------------+--------------------+--------------------+-----+-----------+-------------------+--------+--------------------+--------------------+---------+
|40474165|Lapis: A Web Fram...|https://leafo.net...|           leafo.net|   14|thunderbong|2024-05-25 10:56:41|       0|Lapis - A web fra...|Lapis - A web fra...|     true|
|40474180|It's cheaper to b...|https://twitter.c...|twitter.com/billf...|    7|     doener|2024-05-25 11:01:20|       4|                   X|X\n\nDon’t miss w...|     true|
|40474195|     Bing API Outage|https://twitter.c...|twitter.com/nixcraft|    1|     doener|2024-05-25 11:05:09|       0|               

24/05/25 20:25:41 WARN DAGScheduler: Broadcasting large task binary with size 1407.9 KiB
24/05/25 20:25:41 WARN DAGScheduler: Broadcasting large task binary with size 1407.9 KiB


+--------+--------------------+--------------------+--------------------+-----+-----------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------------------+--------------------+--------------------+------------+------------+--------------------+-----+--------------------+--------------------+----------+--------------------+
|     aid|               title|                 url|              domain|votes|       user|          posted_at|comments|        source_title|         source_text|frontpage|text_length|               words|      filtered_words|         tf_features|      tfidf_features|domain_index|  domain_vec|            features|label|       rawPrediction|         probability|prediction|corrected_prediction|
+--------+--------------------+--------------------+--------------------+-----+-----------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+---

24/05/25 20:25:41 WARN DAGScheduler: Broadcasting large task binary with size 1407.4 KiB
24/05/25 20:25:42 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:25:42 WARN BlockManager: Block input-0-1716661542600 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:25:47 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:25:47 WARN BlockManager: Block input-0-1716661547600 replicated to only 0 peer(s) instead of 1 peers
                                                                                

+--------+--------------------+--------------------+----------+-----+--------+-------------------+--------+--------------------+--------------------+---------+
|     aid|               title|                 url|    domain|votes|    user|          posted_at|comments|        source_title|         source_text|frontpage|
+--------+--------------------+--------------------+----------+-----+--------+-------------------+--------+--------------------+--------------------+---------+
|40474201|I'm in love with ...|https://unherd.co...|unherd.com|   27|elsewhen|2024-05-25 11:13:47|      21|I'm in love with ...|I'm in love with ...|     true|
|40474202|Google just updat...|https://www.bbc.c...|   bbc.com|   68|sonabinu|2024-05-25 11:14:26|      31|Google just updat...|Google just updat...|     true|
+--------+--------------------+--------------------+----------+-----+--------+-------------------+--------+--------------------+--------------------+---------+



24/05/25 20:25:51 WARN DAGScheduler: Broadcasting large task binary with size 1407.9 KiB
24/05/25 20:25:51 WARN DAGScheduler: Broadcasting large task binary with size 1407.9 KiB


+--------+--------------------+--------------------+----------+-----+--------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------------------+--------------------+--------------------+------------+------------------+--------------------+-----+--------------------+--------------------+----------+--------------------+
|     aid|               title|                 url|    domain|votes|    user|          posted_at|comments|        source_title|         source_text|frontpage|text_length|               words|      filtered_words|         tf_features|      tfidf_features|domain_index|        domain_vec|            features|label|       rawPrediction|         probability|prediction|corrected_prediction|
+--------+--------------------+--------------------+----------+-----+--------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------------------+---------

24/05/25 20:25:51 WARN DAGScheduler: Broadcasting large task binary with size 1407.4 KiB
24/05/25 20:25:51 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:25:51 WARN BlockManager: Block input-0-1716661551600 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:25:54 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:25:54 WARN BlockManager: Block input-0-1716661554600 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:26:00 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:26:00 WARN BlockManager: Block input-0-1716661559800 replicated to only 0 peer(s) instead of 1 peers
                                                                                

+--------+--------------------+--------------------+--------------------+-----+---------------+-------------------+--------+------------+--------------------+---------+
|     aid|               title|                 url|              domain|votes|           user|          posted_at|comments|source_title|         source_text|frontpage|
+--------+--------------------+--------------------+--------------------+-----+---------------+-------------------+--------+------------+--------------------+---------+
|40474203|I Built a Simple ...|https://billburst...|billburst.vercel.app|    1|          aiibe|2024-05-25 11:15:01|       0|   BillBurst|BillBurst\n\nQuic...|    false|
|40474207|What airlines can...|https://www.ft.co...|              ft.com|    2|quick_brown_fox|2024-05-25 11:18:51|       0|        NULL|What airlines can...|    false|
+--------+--------------------+--------------------+--------------------+-----+---------------+-------------------+--------+------------+------------------

24/05/25 20:26:01 WARN DAGScheduler: Broadcasting large task binary with size 1407.9 KiB
24/05/25 20:26:01 WARN DAGScheduler: Broadcasting large task binary with size 1407.9 KiB


+--------+--------------------+--------------------+--------------------+-----+---------------+-------------------+--------+------------+--------------------+---------+-----------+--------------------+--------------------+--------------------+--------------------+------------+------------------+--------------------+-----+--------------------+--------------------+----------+--------------------+
|     aid|               title|                 url|              domain|votes|           user|          posted_at|comments|source_title|         source_text|frontpage|text_length|               words|      filtered_words|         tf_features|      tfidf_features|domain_index|        domain_vec|            features|label|       rawPrediction|         probability|prediction|corrected_prediction|
+--------+--------------------+--------------------+--------------------+-----+---------------+-------------------+--------+------------+--------------------+---------+-----------+--------------------+---

24/05/25 20:26:01 WARN DAGScheduler: Broadcasting large task binary with size 1407.4 KiB
24/05/25 20:26:02 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:26:02 WARN BlockManager: Block input-0-1716661561800 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:26:04 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:26:04 WARN BlockManager: Block input-0-1716661563800 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:26:09 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:26:09 WARN BlockManager: Block input-0-1716661568800 replicated to only 0 peer(s) instead of 1 peers
                                                                                

+--------+--------------------+--------------------+---------------+-----+-----------+-------------------+--------+--------------------+--------------------+---------+
|     aid|               title|                 url|         domain|votes|       user|          posted_at|comments|        source_title|         source_text|frontpage|
+--------+--------------------+--------------------+---------------+-----+-----------+-------------------+--------+--------------------+--------------------+---------+
|40474212|Atomstr: RSS/Atom...|https://atomstr.d...|      data.haus|    2| janandonly|2024-05-25 11:20:25|       0|             atomstr|atomstr\n\n# atom...|    false|
|40474227|Fitting High-Leve...|https://www.youtu...|    youtube.com|    1|  hasheddan|2024-05-25 11:24:19|       0|Fitting High-Leve...|Fitting High-Leve...|    false|
|40474234|NASA finds more i...|https://arstechni...|arstechnica.com|    1|andreiursan|2024-05-25 11:25:38|       0|NASA finds more i...|NASA finds more i...|   

24/05/25 20:26:10 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:26:10 WARN BlockManager: Block input-0-1716661570600 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:26:11 WARN DAGScheduler: Broadcasting large task binary with size 1407.9 KiB
24/05/25 20:26:11 WARN DAGScheduler: Broadcasting large task binary with size 1407.9 KiB


+--------+--------------------+--------------------+---------------+-----+-----------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------------------+--------------------+--------------------+------------+-----------------+--------------------+-----+--------------------+--------------------+----------+--------------------+
|     aid|               title|                 url|         domain|votes|       user|          posted_at|comments|        source_title|         source_text|frontpage|text_length|               words|      filtered_words|         tf_features|      tfidf_features|domain_index|       domain_vec|            features|label|       rawPrediction|         probability|prediction|corrected_prediction|
+--------+--------------------+--------------------+---------------+-----+-----------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------

24/05/25 20:26:11 WARN DAGScheduler: Broadcasting large task binary with size 1407.4 KiB
24/05/25 20:26:16 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:26:16 WARN BlockManager: Block input-0-1716661575800 replicated to only 0 peer(s) instead of 1 peers
                                                                                

+--------+--------------------+--------------------+------------------+-----+--------+-------------------+--------+--------------------+--------------------+---------+
|     aid|               title|                 url|            domain|votes|    user|          posted_at|comments|        source_title|         source_text|frontpage|
+--------+--------------------+--------------------+------------------+-----+--------+-------------------+--------+--------------------+--------------------+---------+
|40474236|Publishing AI Slo...|https://daringfir...|daringfireball.net|    9|mpweiher|2024-05-25 11:25:52|       0|Publishing AI Slo...|Daring Fireball: ...|     true|
|40474238|A startup's "tabl...|https://om.co/202...|             om.co|    2|mpweiher|2024-05-25 11:26:06|       0|A startup’s “tabl...|A startup’s “tabl...|    false|
+--------+--------------------+--------------------+------------------+-----+--------+-------------------+--------+--------------------+--------------------+---

24/05/25 20:26:21 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:26:21 WARN BlockManager: Block input-0-1716661580800 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:26:21 WARN DAGScheduler: Broadcasting large task binary with size 1407.9 KiB
24/05/25 20:26:21 WARN DAGScheduler: Broadcasting large task binary with size 1407.9 KiB


+--------+--------------------+--------------------+------------------+-----+--------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------------------+--------------------+--------------------+------------+------------------+--------------------+-----+--------------------+--------------------+----------+--------------------+
|     aid|               title|                 url|            domain|votes|    user|          posted_at|comments|        source_title|         source_text|frontpage|text_length|               words|      filtered_words|         tf_features|      tfidf_features|domain_index|        domain_vec|            features|label|       rawPrediction|         probability|prediction|corrected_prediction|
+--------+--------------------+--------------------+------------------+-----+--------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+------

24/05/25 20:26:21 WARN DAGScheduler: Broadcasting large task binary with size 1407.4 KiB
24/05/25 20:26:24 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:26:24 WARN BlockManager: Block input-0-1716661583800 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:26:25 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:26:25 WARN BlockManager: Block input-0-1716661584800 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:26:28 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:26:28 WARN BlockManager: Block input-0-1716661587800 replicated to only 0 peer(s) instead of 1 peers
                                                                                

+--------+--------------------+--------------------+-------------------+-----+------------+-------------------+--------+--------------------+--------------------+---------+
|     aid|               title|                 url|             domain|votes|        user|          posted_at|comments|        source_title|         source_text|frontpage|
+--------+--------------------+--------------------+-------------------+-----+------------+-------------------+--------+--------------------+--------------------+---------+
|40474294|Lessons from the ...|https://arxiv.org...|          arxiv.org|    3|veryluckyxyz|2024-05-25 11:42:23|       0|Lessons from the ...|[2405.14782] Less...|     true|
|40474296|Feed and Blogroll...|https://andregarz...|    andregarzia.com|    1|todsacerdoti|2024-05-25 11:42:44|       0|Feed and Blogroll...|Feed and Blogroll...|    false|
|40474304|Ilgpu: A Modern G...|  https://ilgpu.net/|          ilgpu.net|    1|   PaulHoule|2024-05-25 11:43:37|       0|ILGPU - A Moder

24/05/25 20:26:31 WARN DAGScheduler: Broadcasting large task binary with size 1407.9 KiB
24/05/25 20:26:31 WARN DAGScheduler: Broadcasting large task binary with size 1407.9 KiB


+--------+--------------------+--------------------+-------------------+-----+------------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------------------+--------------------+--------------------+------------+----------------+--------------------+-----+--------------------+--------------------+----------+--------------------+
|     aid|               title|                 url|             domain|votes|        user|          posted_at|comments|        source_title|         source_text|frontpage|text_length|               words|      filtered_words|         tf_features|      tfidf_features|domain_index|      domain_vec|            features|label|       rawPrediction|         probability|prediction|corrected_prediction|
+--------+--------------------+--------------------+-------------------+-----+------------+-------------------+--------+--------------------+--------------------+---------+-----------+----------------

24/05/25 20:26:31 WARN DAGScheduler: Broadcasting large task binary with size 1407.4 KiB
24/05/25 20:26:33 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:26:33 WARN BlockManager: Block input-0-1716661592800 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:26:34 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:26:34 WARN BlockManager: Block input-0-1716661593800 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:26:38 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:26:38 WARN BlockManager: Block input-0-1716661598000 replicated to only 0 peer(s) instead of 1 peers
                                                                                

+--------+--------------------+--------------------+-------------+-----+------------+-------------------+--------+--------------------+--------------------+---------+
|     aid|               title|                 url|       domain|votes|        user|          posted_at|comments|        source_title|         source_text|frontpage|
+--------+--------------------+--------------------+-------------+-----+------------+-------------------+--------+--------------------+--------------------+---------+
|40474334|Teenage Engineeri...|https://www.ifixi...|   ifixit.com|    2|    follower|2024-05-25 11:47:37|       0|How Teenage Engin...|How Teenage Engin...|    false|
|40474342|Microsoft Launche...|https://www.micro...|microsoft.com|    2|NayamAmarshe|2024-05-25 11:48:49|       0|Copilot for Teleg...|Copilot for Teleg...|    false|
|40474351|Apple elaborates ...|https://9to5mac.c...|  9to5mac.com|    1|   f_allwein|2024-05-25 11:50:45|       0|Apple elaborates ...|Apple elaborates ...|    false

24/05/25 20:26:41 WARN DAGScheduler: Broadcasting large task binary with size 1407.9 KiB
24/05/25 20:26:41 WARN DAGScheduler: Broadcasting large task binary with size 1407.9 KiB


+--------+--------------------+--------------------+-------------+-----+------------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------------------+--------------------+--------------------+------------+-----------------+--------------------+-----+--------------------+--------------------+----------+--------------------+
|     aid|               title|                 url|       domain|votes|        user|          posted_at|comments|        source_title|         source_text|frontpage|text_length|               words|      filtered_words|         tf_features|      tfidf_features|domain_index|       domain_vec|            features|label|       rawPrediction|         probability|prediction|corrected_prediction|
+--------+--------------------+--------------------+-------------+-----+------------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+-----------

24/05/25 20:26:41 WARN DAGScheduler: Broadcasting large task binary with size 1407.4 KiB
24/05/25 20:26:43 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:26:43 WARN BlockManager: Block input-0-1716661603000 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:26:48 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:26:48 WARN BlockManager: Block input-0-1716661608000 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:26:50 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:26:50 WARN BlockManager: Block input-0-1716661610000 replicated to only 0 peer(s) instead of 1 peers


+--------+--------------------+--------------------+------------+-----+--------+-------------------+--------+--------------------+--------------------+---------+
|     aid|               title|                 url|      domain|votes|    user|          posted_at|comments|        source_title|         source_text|frontpage|
+--------+--------------------+--------------------+------------+-----+--------+-------------------+--------+--------------------+--------------------+---------+
|40474376|SQLCipher is a se...|https://www.zetet...| zetetic.net|    1|  doener|2024-05-25 11:55:09|       0|SQLCipher Design ...|SQLCipher Design ...|    false|
|40474425|    Hacked Happiness|https://www.mindp...|mindpluz.com|    1|samleecs|2024-05-25 12:05:17|       0|Mindpluz: Persona...|Mindpluz: Persona...|    false|
+--------+--------------------+--------------------+------------+-----+--------+-------------------+--------+--------------------+--------------------+---------+



24/05/25 20:26:51 WARN DAGScheduler: Broadcasting large task binary with size 1407.9 KiB
24/05/25 20:26:51 WARN DAGScheduler: Broadcasting large task binary with size 1407.9 KiB


+--------+--------------------+--------------------+------------+-----+--------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------------------+--------------------+--------------------+------------+------------+--------------------+-----+--------------------+--------------------+----------+--------------------+
|     aid|               title|                 url|      domain|votes|    user|          posted_at|comments|        source_title|         source_text|frontpage|text_length|               words|      filtered_words|         tf_features|      tfidf_features|domain_index|  domain_vec|            features|label|       rawPrediction|         probability|prediction|corrected_prediction|
+--------+--------------------+--------------------+------------+-----+--------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------------------+---------------

24/05/25 20:26:51 WARN DAGScheduler: Broadcasting large task binary with size 1407.4 KiB
24/05/25 20:26:53 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:26:53 WARN BlockManager: Block input-0-1716661613000 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:26:58 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:26:58 WARN BlockManager: Block input-0-1716661618000 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:26:59 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:26:59 WARN BlockManager: Block input-0-1716661619200 replicated to only 0 peer(s) instead of 1 peers
                                                                                

+--------+--------------------+--------------------+-------------------+-----+-------------+-------------------+--------+--------------------+--------------------+---------+
|     aid|               title|                 url|             domain|votes|         user|          posted_at|comments|        source_title|         source_text|frontpage|
+--------+--------------------+--------------------+-------------------+-----+-------------+-------------------+--------+--------------------+--------------------+---------+
|40474451|Unsafe OpenAI exa...|https://cookbook....|cookbook.openai.com|    1|  upwardbound|2024-05-25 12:10:24|       0|How to call funct...|How to call funct...|    false|
|40474464|Benefits with CSV...|https://impler.io...|          impler.io|    4| bhavikchavda|2024-05-25 12:12:15|       0|ROI on CSV Excel ...|ROI on CSV Excel ...|    false|
|40474501|LeechBlock – A Si...|https://www.progi...|     proginosko.com|    1|alabhyajindal|2024-05-25 12:17:27|       0|         

24/05/25 20:27:01 WARN DAGScheduler: Broadcasting large task binary with size 1407.9 KiB
24/05/25 20:27:01 WARN DAGScheduler: Broadcasting large task binary with size 1407.9 KiB
24/05/25 20:27:01 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:27:01 WARN BlockManager: Block input-0-1716661621200 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:27:01 WARN DAGScheduler: Broadcasting large task binary with size 1407.4 KiB


+--------+--------------------+--------------------+-------------------+-----+-------------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------------------+--------------------+--------------------+------------+-----------------+--------------------+-----+--------------------+--------------------+----------+--------------------+
|     aid|               title|                 url|             domain|votes|         user|          posted_at|comments|        source_title|         source_text|frontpage|text_length|               words|      filtered_words|         tf_features|      tfidf_features|domain_index|       domain_vec|            features|label|       rawPrediction|         probability|prediction|corrected_prediction|
+--------+--------------------+--------------------+-------------------+-----+-------------+-------------------+--------+--------------------+--------------------+---------+-----------+-----------

24/05/25 20:27:03 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:27:03 WARN BlockManager: Block input-0-1716661623000 replicated to only 0 peer(s) instead of 1 peers
                                                                                

+--------+--------------------+--------------------+------------+-----+---------+-------------------+--------+--------------------+--------------------+---------+
|     aid|               title|                 url|      domain|votes|     user|          posted_at|comments|        source_title|         source_text|frontpage|
+--------+--------------------+--------------------+------------+-----+---------+-------------------+--------+--------------------+--------------------+---------+
|40474507|Bill Maher: Ameri...|https://www.thefp...|   thefp.com|    2|   cebert|2024-05-25 12:17:51|       0|Bill Maher: Ameri...|Bill Maher: Ameri...|    false|
|40474510|Memory Sealing "M...|https://www.phoro...|phoronix.com|    1|westurner|2024-05-25 12:17:58|       0|Memory Sealing "m...|Memory Sealing "m...|    false|
+--------+--------------------+--------------------+------------+-----+---------+-------------------+--------+--------------------+--------------------+---------+



24/05/25 20:27:11 WARN DAGScheduler: Broadcasting large task binary with size 1407.9 KiB
24/05/25 20:27:11 WARN DAGScheduler: Broadcasting large task binary with size 1407.9 KiB


+--------+--------------------+--------------------+------------+-----+---------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------------------+--------------------+--------------------+------------+-----------------+--------------------+-----+--------------------+--------------------+----------+--------------------+
|     aid|               title|                 url|      domain|votes|     user|          posted_at|comments|        source_title|         source_text|frontpage|text_length|               words|      filtered_words|         tf_features|      tfidf_features|domain_index|       domain_vec|            features|label|       rawPrediction|         probability|prediction|corrected_prediction|
+--------+--------------------+--------------------+------------+-----+---------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------------------+--

24/05/25 20:27:11 WARN DAGScheduler: Broadcasting large task binary with size 1407.4 KiB
24/05/25 20:28:06 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:28:06 WARN BlockManager: Block input-0-1716661686400 replicated to only 0 peer(s) instead of 1 peers
                                                                                

+--------+--------------------+--------------------+-------------------+-----+------+-------------------+--------+--------------------+--------------------+---------+
|     aid|               title|                 url|             domain|votes|  user|          posted_at|comments|        source_title|         source_text|frontpage|
+--------+--------------------+--------------------+-------------------+-----+------+-------------------+--------+--------------------+--------------------+---------+
|40474516|The deskilling of...|https://www.baldu...|baldurbjarnason.com|   29|loop22|2024-05-25 12:18:53|       9|The deskilling of...|The deskilling of...|     true|
+--------+--------------------+--------------------+-------------------+-----+------+-------------------+--------+--------------------+--------------------+---------+



24/05/25 20:28:10 WARN DAGScheduler: Broadcasting large task binary with size 1407.9 KiB
24/05/25 20:28:11 WARN DAGScheduler: Broadcasting large task binary with size 1407.4 KiB


+--------+--------------------+--------------------+-------------------+-----+------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------------------+--------------------+--------------------+------------+------------+--------------------+-----+--------------------+--------------------+----------+--------------------+
|     aid|               title|                 url|             domain|votes|  user|          posted_at|comments|        source_title|         source_text|frontpage|text_length|               words|      filtered_words|         tf_features|      tfidf_features|domain_index|  domain_vec|            features|label|       rawPrediction|         probability|prediction|corrected_prediction|
+--------+--------------------+--------------------+-------------------+-----+------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------------------+

24/05/25 20:28:11 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:28:11 WARN BlockManager: Block input-0-1716661691400 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:28:13 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:28:13 WARN BlockManager: Block input-0-1716661693400 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:28:16 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:28:16 WARN BlockManager: Block input-0-1716661696400 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:28:18 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:28:18 WARN BlockManager: Block input-0-1716661698400 replicated to only 0 peer(s) instead of 1 peers
                                                                                

+--------+--------------------+--------------------+--------------------+-----+------+-------------------+--------+--------------------+--------------------+---------+
|     aid|               title|                 url|              domain|votes|  user|          posted_at|comments|        source_title|         source_text|frontpage|
+--------+--------------------+--------------------+--------------------+-----+------+-------------------+--------+--------------------+--------------------+---------+
|40474517|The day Putin cri...|https://not-enter...|not-entertainment...|    1|microt|2024-05-25 12:18:57|       0|             Vietnam|Vietnam\n\nSkip t...|    false|
|40474520|        Horror Vacui|https://en.wikipe...|       wikipedia.org|    1| EndXA|2024-05-25 12:19:41|       0|Horror vacui (phy...|Horror vacui (phy...|    false|
|40474521|David Heinemeier ...|https://highperfo...|highperformancesq...|    2|  tosh|2024-05-25 12:19:42|       0|David Heinemeier ...|David Heinemeier ...|   

24/05/25 20:28:20 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:28:20 WARN BlockManager: Block input-0-1716661700400 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:28:21 WARN DAGScheduler: Broadcasting large task binary with size 1407.9 KiB
24/05/25 20:28:21 WARN DAGScheduler: Broadcasting large task binary with size 1407.9 KiB


+--------+--------------------+--------------------+--------------------+-----+------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------------------+--------------------+--------------------+------------+----------------+--------------------+-----+--------------------+--------------------+----------+--------------------+
|     aid|               title|                 url|              domain|votes|  user|          posted_at|comments|        source_title|         source_text|frontpage|text_length|               words|      filtered_words|         tf_features|      tfidf_features|domain_index|      domain_vec|            features|label|       rawPrediction|         probability|prediction|corrected_prediction|
+--------+--------------------+--------------------+--------------------+-----+------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+----------

24/05/25 20:28:21 WARN DAGScheduler: Broadcasting large task binary with size 1407.4 KiB
24/05/25 20:28:22 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:28:22 WARN BlockManager: Block input-0-1716661702600 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:28:23 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:28:23 WARN BlockManager: Block input-0-1716661703600 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:28:25 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:28:25 WARN BlockManager: Block input-0-1716661704800 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:28:29 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:28:29 WARN BlockManager: Block input-0-1716661708800 replicated to only 0 peer(s) instead of 1 peers
                                                                                

+--------+--------------------+--------------------+--------------+-----+----------+-------------------+--------+--------------------+--------------------+---------+
|     aid|               title|                 url|        domain|votes|      user|          posted_at|comments|        source_title|         source_text|frontpage|
+--------+--------------------+--------------------+--------------+-----+----------+-------------------+--------+--------------------+--------------------+---------+
|40474540|In defense of usi...|https://www.dseba...|dsebastien.net|    2|dSebastien|2024-05-25 12:22:32|       0|Why You Should Ta...|Why You Should Ta...|    false|
|40474545|How we crafted a ...|https://www.craft...|      craft.do|    2|     gklka|2024-05-25 12:23:09|       0|How we crafted a ...|How we crafted a ...|    false|
|40474549|Fauci aide trigge...|https://usrtk.org...|     usrtk.org|    1| miguelazo|2024-05-25 12:23:50|       0|Fauci aide trigge...|Fauci aide trigge...|    false|
|404

24/05/25 20:28:31 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:28:31 WARN BlockManager: Block input-0-1716661710800 replicated to only 0 peer(s) instead of 1 peers
24/05/25 20:28:31 WARN DAGScheduler: Broadcasting large task binary with size 1407.9 KiB
24/05/25 20:28:31 WARN DAGScheduler: Broadcasting large task binary with size 1407.9 KiB


+--------+--------------------+--------------------+--------------+-----+----------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------------------+--------------------+--------------------+------------+-------------------+--------------------+-----+--------------------+--------------------+----------+--------------------+
|     aid|               title|                 url|        domain|votes|      user|          posted_at|comments|        source_title|         source_text|frontpage|text_length|               words|      filtered_words|         tf_features|      tfidf_features|domain_index|         domain_vec|            features|label|       rawPrediction|         probability|prediction|corrected_prediction|
+--------+--------------------+--------------------+--------------+-----+----------+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+----------

24/05/25 20:28:31 WARN DAGScheduler: Broadcasting large task binary with size 1407.4 KiB
                                                                                

+--------+--------------------+--------------------+-------------+-----+-----+-------------------+--------+------------+-----------+---------+
|     aid|               title|                 url|       domain|votes| user|          posted_at|comments|source_title|source_text|frontpage|
+--------+--------------------+--------------------+-------------+-----+-----+-------------------+--------+------------+-----------+---------+
|40474564|Neuralink Compres...|https://content.n...|neuralink.com|    1|atrus|2024-05-25 12:25:42|       0|        NULL|       NULL|    false|
+--------+--------------------+--------------------+-------------+-----+-----+-------------------+--------+------------+-----------+---------+



24/05/25 20:28:40 WARN DAGScheduler: Broadcasting large task binary with size 1407.9 KiB
24/05/25 20:28:41 ERROR PythonUDFRunner: Python worker exited unexpectedly (crashed)
org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/Users/xiaodi/anaconda3/spark/spark-3.5.1-bin-hadoop3/python/lib/pyspark.zip/pyspark/worker.py", line 1225, in main
    eval_type = read_int(infile)
                ^^^^^^^^^^^^^^^^
  File "/Users/xiaodi/anaconda3/spark/spark-3.5.1-bin-hadoop3/python/lib/pyspark.zip/pyspark/serializers.py", line 596, in read_int
    raise EOFError
EOFError

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:572)
	at org.apache.spark.sql.execution.python.BasePythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:94)
	at org.apache.spark.sql.execution.python.BasePythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:75)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunne

24/05/25 20:30:35 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/25 20:30:35 WARN BlockManager: Block input-0-1716661835000 replicated to only 0 peer(s) instead of 1 peers
                                                                                

+--------+--------------------+--------------------+-------+-----+----+-------------------+--------+--------------------+--------------------+---------+
|     aid|               title|                 url| domain|votes|user|          posted_at|comments|        source_title|         source_text|frontpage|
+--------+--------------------+--------------------+-------+-----+----+-------------------+--------+--------------------+--------------------+---------+
|40474594|Developers are no...|https://0xff.nu/d...|0xff.nu|   21|hxii|2024-05-25 12:30:12|      13|Developers aren't...|Developers aren't...|     true|
+--------+--------------------+--------------------+-------+-----+----+-------------------+--------+--------------------+--------------------+---------+



24/05/25 20:30:40 WARN DAGScheduler: Broadcasting large task binary with size 1407.9 KiB


+--------+--------------------+--------------------+-------+-----+----+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------------------+--------------------+--------------------+------------+------------+--------------------+-----+--------------------+--------------------+----------+--------------------+
|     aid|               title|                 url| domain|votes|user|          posted_at|comments|        source_title|         source_text|frontpage|text_length|               words|      filtered_words|         tf_features|      tfidf_features|domain_index|  domain_vec|            features|label|       rawPrediction|         probability|prediction|corrected_prediction|
+--------+--------------------+--------------------+-------+-----+----+-------------------+--------+--------------------+--------------------+---------+-----------+--------------------+--------------------+--------------------+--------------------+

24/05/25 20:30:41 WARN DAGScheduler: Broadcasting large task binary with size 1407.4 KiB
[Stage 0:>                                                          (0 + 1) / 1]

In [9]:
# Stop the streaming job from the notebook. Presumably ssc_t is the StreamingThread
# helper from the first cell (its creation is not shown here -- confirm); that helper's
# stop() shuts the StreamingContext down gracefully and leaves the SparkContext running.
ssc_t.stop()

----- Stopping... this may take a few seconds -----


24/05/25 20:31:41 WARN SocketReceiver: Error receiving data
java.net.SocketException: Socket closed
	at java.net.SocketInputStream.socketRead0(Native Method)
	at java.net.SocketInputStream.socketRead(SocketInputStream.java:116)
	at java.net.SocketInputStream.read(SocketInputStream.java:171)
	at java.net.SocketInputStream.read(SocketInputStream.java:141)
	at sun.nio.cs.StreamDecoder.readBytes(StreamDecoder.java:284)
	at sun.nio.cs.StreamDecoder.implRead(StreamDecoder.java:326)
	at sun.nio.cs.StreamDecoder.read(StreamDecoder.java:178)
	at java.io.InputStreamReader.read(InputStreamReader.java:184)
	at java.io.BufferedReader.fill(BufferedReader.java:161)
	at java.io.BufferedReader.readLine(BufferedReader.java:324)
	at java.io.BufferedReader.readLine(BufferedReader.java:389)
	at org.apache.spark.streaming.dstream.SocketReceiver$$anon$2.getNext(SocketInputDStream.scala:121)
	at org.apache.spark.streaming.dstream.SocketReceiver$$anon$2.getNext(SocketInputDStream.scala:119)
	at org.apache.spar

In [11]:
# Read the saved prediction CSV files back in; the first line of each file is
# treated as the column header. No schema is supplied to the reader.
predictions_df = (
    spark.read
    .option("header", "true")
    .csv("/Users/xiaodi/anaconda3/spark/notebooks/predictions")
)

# Define a function to convert boolean text ("true"/"false") to a numeric label (1.0/0.0)
def bool_to_double(value):
    """Map a boolean-like value to a numeric label.

    The predictions CSV is read without a schema, so the ``frontpage`` column
    arrives as text. Matching is case-insensitive and ignores surrounding
    whitespace so that variants such as ``"True"`` or ``" false "`` are not
    silently turned into missing labels.

    Args:
        value: ``"true"``/``"false"`` text, a real ``bool``, or ``None``.

    Returns:
        ``1.0`` for true, ``0.0`` for false, and ``None`` for anything that is
        not recognisably boolean (including ``None`` itself).
    """
    # Accept genuine booleans as well, in case the column is ever read with a typed schema.
    if isinstance(value, bool):
        return 1.0 if value else 0.0
    if not isinstance(value, str):
        return None

    normalized = value.strip().lower()
    if normalized == "true":
        return 1.0
    if normalized == "false":
        return 0.0
    return None

# Wrap the converter as a Spark UDF and add the ground-truth label as a numeric column
bool_to_double_udf = udf(bool_to_double, DoubleType())
predictions_df = predictions_df.withColumn("frontpage_num", bool_to_double_udf(predictions_df["frontpage"]))

# Preview up to 50 rows as a sanity check
predictions_df.show(n=50)

# Compute the correct predictions and accuracy.
# The CSV was read without a schema, so corrected_prediction is text; cast it
# explicitly instead of relying on implicit string-to-double coercion in the comparison.
predicted_label = predictions_df["corrected_prediction"].cast("double")
correct_predictions = predictions_df.filter(predictions_df["frontpage_num"] == predicted_label).count()
total_predictions = predictions_df.count()

# An empty predictions directory would otherwise raise ZeroDivisionError here.
accuracy = correct_predictions / total_predictions if total_predictions else float("nan")

print(f"Correct Predictions: {correct_predictions}")
print(f"Total Predictions: {total_predictions}")
print(f"Accuracy: {accuracy}")


+--------+--------------------+--------------------+--------------------+-----+---------------+-------------------+--------+---------+--------------------+-------------+
|     aid|               title|                 url|              domain|votes|           user|          posted_at|comments|frontpage|corrected_prediction|frontpage_num|
+--------+--------------------+--------------------+--------------------+-----+---------------+-------------------+--------+---------+--------------------+-------------+
|40474069|Triathlon Forum: ...|https://forum.slo...|      slowtwitch.com|    1|           wslh|2024-05-25 10:32:16|       0|    false|                 1.0|          0.0|
|40474234|NASA finds more i...|https://arstechni...|     arstechnica.com|    1|    andreiursan|2024-05-25 11:25:38|       0|    false|                 1.0|          0.0|
|40473550|Big tech has dist...|https://www.thegu...|     theguardian.com|    3|       lastdong|2024-05-25 08:28:56|       0|     true|                

Correct Predictions: 38
Total Predictions: 138
Accuracy: 0.2753623188405797
