In [12]:
from pyspark.ml.classification import LinearSVC
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, VectorAssembler   
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


In [2]:
# Locations of the training and test CSV files (relative paths).
# Nothing is read here; these constants are consumed by the loading cell.
TRAINING_PATH, TESTING_PATH = (
    "./train_selected.csv",
    "./test_selected.csv",
)

In [3]:

# Obtain the Spark session for this notebook, registered under the app name "SVM".
spark = (
    SparkSession.builder
    .appName("SVM")
    .getOrCreate()
)
# Keep a handle on the session's underlying SparkContext.
sc = spark.sparkContext

In [5]:
# Both files are read with a header row and with column types inferred from the data.
csv_options = {"header": True, "inferSchema": True}
training_rdd = spark.read.csv(TRAINING_PATH, **csv_options)
testing_rdd = spark.read.csv(TESTING_PATH, **csv_options)

splits = (("Training", training_rdd), ("Testing", testing_rdd))

# Report the number of rows in each split.
for split_name, split_data in splits:
    print(f"{split_name} data length: ", split_data.count())

# Show how many rows carry each "label" value, training split first.
for _, split_data in splits:
    split_data.groupBy("label").count().show()

Training data length:  177041
Testing data length:  58329
+-----+------+
|label| count|
+-----+------+
|    1|101480|
|    0| 75561|
+-----+------+

+-----+-----+
|label|count|
+-----+-----+
|    1|33370|
|    0|24959|
+-----+-----+



In [14]:
# Feature selection: only the columns named below feed the model; every other
# column in the data (including "label") is left out of the feature vector.
selected_features = ("URLLength", "URLSimilarityIndex")
# Keep the order in which the columns appear in the training data.
feature_columns = [name for name in training_rdd.columns if name in selected_features]

# Fail fast instead of silently training on fewer features when an expected
# column is absent from the training data.
missing_features = [name for name in selected_features if name not in feature_columns]
if missing_features:
    raise ValueError(
        f"Training data is missing expected feature columns: {missing_features}"
    )

# Combine the selected columns into a single vector column named "features".
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

# Assemble features for both train and test datasets, keeping only the
# assembled vector and the label.
train_df = assembler.transform(training_rdd).select("features", "label")
test_df = assembler.transform(testing_rdd).select("features", "label")

In [15]:
# Configure a linear SVM that reads the assembled "features" vector and the
# "label" column, with at most 100 optimizer iterations.
svm = LinearSVC(
    featuresCol="features",
    labelCol="label",
    maxIter=100,
)

# Fit on the training split.
svm_model = svm.fit(train_df)

# Score the held-out test split; the result is reused by the evaluation cells.
predictions = svm_model.transform(test_df)


In [22]:
# Accuracy on the training split itself, for comparison with the test accuracy.
training_pred = svm_model.transform(train_df)
training_evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)
training_accuracy = training_evaluator.evaluate(training_pred)
print("Training accuracy: ", training_accuracy)


Training accuracy:  0.995667670200688


In [24]:
# Distribution of predicted classes on the test split.
prediction_counts = predictions.groupBy("prediction").count()
prediction_counts.show()

+----------+-----+
|prediction|count|
+----------+-----+
|       0.0|24691|
|       1.0|33638|
+----------+-----+



In [16]:

def _metric_evaluator(metric_name):
    """Build an evaluator that compares "label" with "prediction" for one metric."""
    return MulticlassClassificationEvaluator(
        labelCol="label",
        predictionCol="prediction",
        metricName=metric_name,
    )


# One evaluator per reported test-set metric.
evaluator = _metric_evaluator("accuracy")
precision_evaluator = _metric_evaluator("weightedPrecision")
recall_evaluator = _metric_evaluator("weightedRecall")
f1_evaluator = _metric_evaluator("f1")

# Evaluate the test predictions and report each metric to four decimals.
accuracy = evaluator.evaluate(predictions)
print(f"Test Accuracy = {accuracy:.4f}")

precision = precision_evaluator.evaluate(predictions)
print(f"Test Precision = {precision:.4f}")

recall = recall_evaluator.evaluate(predictions)
print(f"Test Recall = {recall:.4f}")

f1 = f1_evaluator.evaluate(predictions)
print(f"Test F1 Score = {f1:.4f}")

# Confusion-matrix counts shown as a table (nothing is plotted): one row per
# (label, prediction) pair that actually occurs in the predictions.
confusion_counts = predictions.groupBy("label", "prediction").count()
confusion_counts.show()

Test Accuracy = 0.9954
Test Precision = 0.9954
Test Recall = 0.9954
Test F1 Score = 0.9954
+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    0|       0.0|24691|
|    1|       1.0|33370|
|    0|       1.0|  268|
+-----+----------+-----+

