In [1]:
!pip install pyspark



In [2]:
!curl https://raw.githubusercontent.com/mosesyhc/de300-2025sp-class/refs/heads/main/agnews_clean.csv -O

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 33.2M  100 33.2M    0     0  29.9M      0  0:00:01  0:00:01 --:--:-- 29.9M


In [3]:
!echo $JAVA_HOME
!java -version


java version "1.8.0_451"
Java(TM) SE Runtime Environment (build 1.8.0_451-b10)
Java HotSpot(TM) 64-Bit Server VM (build 25.451-b10, mixed mode)


In [4]:
from pyspark.sql import SparkSession

spark = (SparkSession.builder
         .master("local[*]")
         .appName("AG news")
         .getOrCreate()
        )

agnews = spark.read.csv("agnews_clean.csv", inferSchema=True, header=True)

# turning the second column from a string to an array
import pyspark.sql.functions as F
from pyspark.sql.types import ArrayType, StringType
agnews = agnews.withColumn('filtered', F.from_json('filtered', ArrayType(StringType())))

25/05/22 13:58:06 WARN Utils: Your hostname, MacBook-Pro-1120.local resolves to a loopback address: 127.0.0.1; using 10.0.0.8 instead (on interface en0)
25/05/22 13:58:06 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/05/22 13:58:07 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
# each row contains the document id and a list of filtered words
agnews.show(5, truncate=30)

+---+------------------------------+
|_c0|                      filtered|
+---+------------------------------+
|  0|[wall, st, bears, claw, bac...|
|  1|[carlyle, looks, toward, co...|
|  2|[oil, economy, cloud, stock...|
|  3|[iraq, halts, oil, exports,...|
|  4|[oil, prices, soar, time, r...|
+---+------------------------------+
only showing top 5 rows



25/05/22 13:58:12 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , filtered
 Schema: _c0, filtered
Expected: _c0 but found: 
CSV file: file:///Users/bengutstein/Desktop/homework3/agnews_clean.csv


In [None]:
# STEP 4: Create an RDD with (doc_id, filtered_words)
print("Creating RDD...")
rdd = agnews.select("_c0", "filtered").rdd

# STEP 5: Word-document frequency (for TF)
print("Mapping word-document pairs...")
word_doc_pairs = rdd.flatMap(lambda row: [((word, row["_c0"]), 1) for word in row["filtered"]])

print("Reducing term counts...")
term_counts = word_doc_pairs.reduceByKey(lambda x, y: x + y)

# Document lengths (for normalizing TF)
print("Calculating document lengths...")
doc_lengths = rdd.map(lambda row: (row["_c0"], len(row["filtered"])))

# Monitor partitions to ensure Spark is progressing
doc_lengths.foreachPartition(lambda part: print("Processed one doc_length partition."))

doc_lengths_dict = dict(doc_lengths.collect())

# Compute Term Frequency (TF)
print("Computing TF...")
tf = term_counts.map(lambda x: (x[0], x[1] / doc_lengths_dict[x[0][1]]))

# Compute Document Frequencies for each term
print("Calculating document frequency for each term...")
unique_word_doc = rdd.flatMap(lambda row: [(word, row["_c0"]) for word in set(row["filtered"])])
doc_freq = unique_word_doc.distinct().map(lambda x: (x[0], 1)).reduceByKey(lambda x, y: x + y)

# Log document count
N = agnews.count()
print(f"Total documents: {N}")

Creating RDD...
Mapping word-document pairs...
Reducing term counts...
Calculating document lengths...


25/05/22 13:58:15 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , filtered
 Schema: _c0, filtered
Expected: _c0 but found: 
CSV file: file:///Users/bengutstein/Desktop/homework3/agnews_clean.csv
Processed one doc_length partition.
Processed one doc_length partition.Processed one doc_length partition.

Processed one doc_length partition.
Processed one doc_length partition.
Processed one doc_length partition.
Processed one doc_length partition.
Processed one doc_length partition.
Processed one doc_length partition.
25/05/22 13:58:15 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , filtered
 Schema: _c0, filtered
Expected: _c0 but found: 
CSV file: file:///Users/bengutstein/Desktop/homework3/agnews_clean.csv
                                                                                

Computing TF...
Calculating document frequency for each term...
Total documents: 127600


In [None]:
import math
# IDF Collection
print("Collecting document frequencies and computing IDF in-memory...")
idf_dict = dict(doc_freq.collect())
idf_dict = {term: math.log(N / df) for term, df in idf_dict.items()}

# Join TF and IDF
print("Computing TF-IDF...")
tfidf = tf.map(lambda x: (x[0], x[1] * idf_dict[x[0][0]]))

# Group TF-IDF scores by document
print("Grouping TF-IDF scores by document ID...")
doc_tfidf = tfidf.map(lambda x: (x[0][1], [(x[0][0], x[1])])) \
                 .reduceByKey(lambda x, y: x + y)

# Print first 5 TF-IDF results
print("\nTF-IDF scores for first 5 documents:")
for doc_id, scores in doc_tfidf.take(5):
    print(f"Doc {doc_id}: {sorted(scores, key=lambda x: -x[1])[:5]}")  # top 5 terms

Collecting document frequencies and computing IDF in-memory...


25/05/22 13:58:39 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , filtered
 Schema: _c0, filtered
Expected: _c0 but found: 
CSV file: file:///Users/bengutstein/Desktop/homework3/agnews_clean.csv

CodeCache: size=131072Kb used=23346Kb max_used=23354Kb free=107725Kb
 bounds [0x00000001071f8000, 0x00000001088e8000, 0x000000010f1f8000]
 total_blobs=9360 nmethods=8405 adapters=867
 compilation: disabled (not enough contiguous free space left)


25/05/22 13:58:41 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , filtered
 Schema: _c0, filtered
Expected: _c0 but found: 
CSV file: file:///Users/bengutstein/Desktop/homework3/agnews_clean.csv


Computing TF-IDF...
Grouping TF-IDF scores by document ID...

TF-IDF scores for first 5 documents:




Doc 0: [('cynics', 0.563734318747707), ('wall', 0.5115985326511431), ('claw', 0.499114829314058), ('dwindling', 0.4572386180709258), ('sellers', 0.4468379768438066)]
Doc 9: [('cynics', 0.5340640914451961), ('wall', 0.48467229409055657), ('claw', 0.47284562777121286), ('dwindling', 0.43317342764614025), ('sellers', 0.4233201885888694)]
Doc 18: [('deficit', 0.540640233790213), ('swells', 0.4456552560521169), ('trade', 0.4080689115854956), ('8bn', 0.39631754501941335), ('imports', 0.31527891489726495)]
Doc 27: [('fall', 0.43167912777209483), ('shares', 0.386492653524941), ('tumble', 0.3419973653042592), ('quarter', 0.3411920291869863), ('disappointing', 0.29431935936888143)]
Doc 36: [('google', 0.34612714944621187), ('secrecy', 0.3432093194552543), ('confusing', 0.3299462238286493), ('submitted', 0.28716139264401674), ('news', 0.27869755632552773)]


                                                                                

In [7]:
# Part 2:SVM
!curl https://raw.githubusercontent.com/mosesyhc/de300-2025sp-class/refs/heads/main/w.csv -O
!curl https://raw.githubusercontent.com/mosesyhc/de300-2025sp-class/refs/heads/main/bias.csv -O
!curl https://raw.githubusercontent.com/mosesyhc/de300-2025sp-class/refs/heads/main/data_for_svm.csv -O

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0curl: (6) Could not resolve host: raw.githubusercontent.com
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0curl: (6) Could not resolve host: raw.githubusercontent.com
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0curl: (6) Could not resolve host: raw.githubusercontent.com


In [8]:
# This is an example to read the files. But you should consider using pyspark directly. 
# *Make sure you are not assuming a header*!!
import pandas as pd
data_svm = pd.read_csv('data_for_svm.csv', header=None)
w = pd.read_csv('w.csv', header=None)
bias = pd.read_csv('bias.csv', header=None)

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [10]:
import numpy as np

def loss_SVM(w, b, X, y, lambd=1.0):
    """
    Compute the SVM loss:
    L(w, b) = lambda * ||w||^2 + (1/n) * sum_i max(0, 1 - y_i * (w^T x_i + b))
    
    Parameters:
    - w: weight vector (64, 1)
    - b: bias (scalar)
    - X: feature matrix (n, 64)
    - y: labels (n,)
    - lambd: regularization parameter
    """
    # Convert to numpy arrays
    w = w.values.flatten()
    b = bias.values.flatten()[0]
    X = X.iloc[:, :-1].values
    y = X = data_svm.iloc[:, :-1].values
    y = data_svm.iloc[:, -1].values
    
    # Compute hinge loss terms: max(0, 1 - y_i * (w^T x_i + b))
    margins = 1 - y * (X @ w + b)
    hinge_loss = np.maximum(0, margins)

    # Final loss
    loss = lambd * np.linalg.norm(w)**2 + hinge_loss.mean()
    return loss


In [11]:
loss = loss_SVM(w, bias, data_svm, data_svm.iloc[:, -1])
print("SVM Loss:", loss)

SVM Loss: 1.0029403834857522


In [12]:
def predict_SVM(w, b, X):
    """
    Predicts labels using the SVM decision rule: sign(w^T x + b)
    
    Parameters:
    - w: weight vector (64,)
    - b: scalar bias
    - X: feature matrix (n, 64)
    
    Returns:
    - y_hat: predicted labels (n,), each ∈ {−1, +1}
    """
    w = w.values.flatten()
    b = b.values.flatten()[0]
    X = X.iloc[:, :-1].values  # only features, drop label column

    scores = X.dot(w) + b
    y_hat = np.sign(scores)
    return y_hat


In [13]:
y_pred = predict_SVM(w, bias, data_svm)
print("Predictions:\n", y_pred[:10])

Predictions:
 [-1. -1. -1.  1. -1.  1. -1. -1.  1. -1.]
