In [1]:
!pip install pyspark



In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("ColabSpark").getOrCreate()
print("Spark running:", spark.version)

Spark running: 3.5.1


In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lower, trim
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer, IDF

import pandas as pd
import numpy as np

# 1. Start Spark
spark = SparkSession.builder \
    .appName("NewsTFIDF") \
    .getOrCreate()

# 2. Load CSV
df = spark.read.csv(
    "all-the-news-2-1-cleaned.csv",
    header=True,
    inferSchema=True,
    mode="DROPMALFORMED"
).select("article")

In [6]:
# 3. Normalize text (lowercase + strip)
df = df.withColumn("article", trim(lower(col("article"))))
df = df.filter(col("article").isNotNull() & (col("article") != ""))
df.show(5)

+--------------------+
|             article|
+--------------------+
|"this post is par...|
|davos, switzerlan...|
|paris (reuters) -...|
|"paris hilton arr...|
|caracas (reuters)...|
+--------------------+
only showing top 5 rows



In [13]:
regex_tokenizer = RegexTokenizer(
    inputCol="article",
    outputCol="words",
    pattern="[a-zA-Z]{2,}",  # words of length >= 2
    gaps=False               # extract words instead of splitting
)

words_df = regex_tokenizer.transform(df)
words_df.show(5)

+--------------------+--------------------+
|             article|               words|
+--------------------+--------------------+
|"this post is par...|[this, post, is, ...|
|davos, switzerlan...|[davos, switzerla...|
|paris (reuters) -...|[paris, reuters, ...|
|"paris hilton arr...|[paris, hilton, a...|
|caracas (reuters)...|[caracas, reuters...|
+--------------------+--------------------+
only showing top 5 rows



In [14]:
# 5. Remove stopwords
remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
filtered_df = remover.transform(words_df)
filtered_df.show(5)

+--------------------+--------------------+--------------------+
|             article|               words|      filtered_words|
+--------------------+--------------------+--------------------+
|"this post is par...|[this, post, is, ...|[post, part, poly...|
|davos, switzerlan...|[davos, switzerla...|[davos, switzerla...|
|paris (reuters) -...|[paris, reuters, ...|[paris, reuters, ...|
|"paris hilton arr...|[paris, hilton, a...|[paris, hilton, a...|
|caracas (reuters)...|[caracas, reuters...|[caracas, reuters...|
+--------------------+--------------------+--------------------+
only showing top 5 rows



In [18]:
# 6. Term Frequency (CountVectorizer)
cv = CountVectorizer(inputCol="filtered_words", outputCol="raw_features", vocabSize=10000)
cv_model = cv.fit(filtered_df)
tf_df = cv_model.transform(filtered_df)
tf_df.show(5)


+--------------------+--------------------+--------------------+--------------------+
|             article|               words|      filtered_words|        raw_features|
+--------------------+--------------------+--------------------+--------------------+
|"this post is par...|[this, post, is, ...|[post, part, poly...|(10000,[3,5,8,9,1...|
|davos, switzerlan...|[davos, switzerla...|[davos, switzerla...|(10000,[1,3,5,10,...|
|paris (reuters) -...|[paris, reuters, ...|[paris, reuters, ...|(10000,[2,5,8,9,1...|
|"paris hilton arr...|[paris, hilton, a...|[paris, hilton, a...|(10000,[14,55,125...|
|caracas (reuters)...|[caracas, reuters...|[caracas, reuters...|(10000,[0,3,6,9,1...|
+--------------------+--------------------+--------------------+--------------------+
only showing top 5 rows



In [19]:
# 7. TF-IDF
idf = IDF(inputCol="raw_features", outputCol="features")
idf_model = idf.fit(tf_df)
tfidf_df = idf_model.transform(tf_df)
tfidf_df.show(5)

+--------------------+--------------------+--------------------+--------------------+--------------------+
|             article|               words|      filtered_words|        raw_features|            features|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|"this post is par...|[this, post, is, ...|[post, part, poly...|(10000,[3,5,8,9,1...|(10000,[3,5,8,9,1...|
|davos, switzerlan...|[davos, switzerla...|[davos, switzerla...|(10000,[1,3,5,10,...|(10000,[1,3,5,10,...|
|paris (reuters) -...|[paris, reuters, ...|[paris, reuters, ...|(10000,[2,5,8,9,1...|(10000,[2,5,8,9,1...|
|"paris hilton arr...|[paris, hilton, a...|[paris, hilton, a...|(10000,[14,55,125...|(10000,[14,55,125...|
|caracas (reuters)...|[caracas, reuters...|[caracas, reuters...|(10000,[0,3,6,9,1...|(10000,[0,3,6,9,1...|
+--------------------+--------------------+--------------------+--------------------+--------------------+
only showing top 5 rows



In [20]:
# 8. Inspect results
print("Vocabulary size:", len(cv_model.vocabulary))
print("Sample feature names:", cv_model.vocabulary[:20])

Vocabulary size: 10000
Sample feature names: ['said', 'trump', 'one', 'people', 'like', 'new', 'also', 'year', 'time', 're', 'president', 'first', 'two', 'us', 'last', 'even', 'think', 'get', 'years', 'percent']


In [21]:
# Collect small sample into Pandas
sample = tfidf_df.select("features").limit(5).collect()
feature_names = cv_model.vocabulary

df_tfidf_head = pd.DataFrame(
    [np.array(row.features.toArray()) for row in sample],
    columns=feature_names
)

print(df_tfidf_head.head())

       said     trump       one    people  like       new      also  year  \
0  0.000000  0.000000  0.000000  1.118502   0.0  4.450080  0.000000   0.0   
1  0.000000  4.409789  0.000000  1.118502   0.0  0.890016  0.000000   0.0   
2  0.000000  0.000000  1.635308  0.000000   0.0  1.780032  0.000000   0.0   
3  0.000000  0.000000  0.000000  0.000000   0.0  0.000000  0.000000   0.0   
4  2.344656  0.000000  0.000000  1.118502   0.0  0.000000  0.938229   0.0   

       time        re  ...  wrestler  ankle  ernst  imprisonment  uphill  \
0  1.139936  4.405035  ...       0.0    0.0    0.0           0.0     0.0   
1  0.000000  0.000000  ...       0.0    0.0    0.0           0.0     0.0   
2  2.279872  2.936690  ...       0.0    0.0    0.0           0.0     0.0   
3  0.000000  0.000000  ...       0.0    0.0    0.0           0.0     0.0   
4  0.000000  1.468345  ...       0.0    0.0    0.0           0.0     0.0   

   leisure  levy  unfolded  societal   nc  
0      0.0   0.0       0.0       0.0