In [9]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import LinearSVC
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
print(spark.version)

3.4.0


In [3]:
spark_session = SparkSession.builder.master("local[*]").appName('Models').getOrCreate()

In [4]:
# Read datasets with spark
train_df_spark = spark_session.read.csv('../datasets/train.csv', header=True, inferSchema=True)
test_df_spark = spark_session.read.csv('../datasets/test.csv', header=True, inferSchema=True)

print("Size of the training data:", train_df_spark.count())
print("Size of the test data:", test_df_spark.count())

Size of the training data: 98550
Size of the test data: 42237


In [5]:
# Encode the features into a vector
featureassemble = VectorAssembler(inputCols=['f1','f2','f3'], outputCol='features')
output = featureassemble.transform(train_df_spark)
output.show(n=5)

testfeatureassemble = VectorAssembler(inputCols=['f1','f2','f3'], outputCol='features')
testoutput = testfeatureassemble.transform(test_df_spark)
testoutput.show(n=5)

+---+----+----+------+---------------+
| f1|  f2|  f3|target|       features|
+---+----+----+------+---------------+
|0.0| 6.0|61.0|   1.0| [0.0,6.0,61.0]|
|1.0|12.0|27.0|   0.0|[1.0,12.0,27.0]|
|0.0|14.0|17.0|   0.0|[0.0,14.0,17.0]|
|0.0| 6.0|24.0|   0.0| [0.0,6.0,24.0]|
|0.0| 6.0|38.0|   0.0| [0.0,6.0,38.0]|
+---+----+----+------+---------------+
only showing top 5 rows

+---+----+----+------+--------------+
| f1|  f2|  f3|target|      features|
+---+----+----+------+--------------+
|0.0| 1.0|26.0|   0.0|[0.0,1.0,26.0]|
|0.0| 8.0|45.0|   1.0|[0.0,8.0,45.0]|
|0.0|15.0| 1.0|   0.0|[0.0,15.0,1.0]|
|0.0| 6.0|36.0|   0.0|[0.0,6.0,36.0]|
|1.0| 4.0|46.0|   0.0|[1.0,4.0,46.0]|
+---+----+----+------+--------------+
only showing top 5 rows



In [6]:
# Select the features and the target column

train = output.select('features', 'target') 
train.show(n=5)

test = testoutput.select('features', 'target')
test.show(n=5)

+---------------+------+
|       features|target|
+---------------+------+
| [0.0,6.0,61.0]|   1.0|
|[1.0,12.0,27.0]|   0.0|
|[0.0,14.0,17.0]|   0.0|
| [0.0,6.0,24.0]|   0.0|
| [0.0,6.0,38.0]|   0.0|
+---------------+------+
only showing top 5 rows

+--------------+------+
|      features|target|
+--------------+------+
|[0.0,1.0,26.0]|   0.0|
|[0.0,8.0,45.0]|   1.0|
|[0.0,15.0,1.0]|   0.0|
|[0.0,6.0,36.0]|   0.0|
|[1.0,4.0,46.0]|   0.0|
+--------------+------+
only showing top 5 rows



In [7]:
classifier = LogisticRegression(labelCol='target').fit(train)

# Evaluate the model on the test data
results = classifier.evaluate(test) 

# Print the accuracy, precision, recall and f1 score
evaluator = MulticlassClassificationEvaluator(labelCol="target", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(results.predictions)
print(f"Accuracy: {accuracy*100:.2f}%")

evaluator = MulticlassClassificationEvaluator(labelCol="target", predictionCol="prediction", metricName="weightedPrecision")
weightedPrecision = evaluator.evaluate(results.predictions)
print(f"Weighted Precision: {weightedPrecision*100:.2f}%")

evaluator = MulticlassClassificationEvaluator(labelCol="target", predictionCol="prediction", metricName="weightedRecall")
weightedRecall = evaluator.evaluate(results.predictions)
print(f"Weighted Recall: {weightedRecall*100:.2f}%")

evaluator = MulticlassClassificationEvaluator(labelCol="target", predictionCol="prediction", metricName="f1")
f1 = evaluator.evaluate(results.predictions)
print(f"F1 Score: {f1*100:.2f}%")

Accuracy: 82.79%
Weighted Precision: 81.38%
Weighted Recall: 82.79%
F1 Score: 81.10%


In [8]:
classifier = NaiveBayes(labelCol='target', featuresCol='features')
classifier = classifier.fit(train)

# Evaluate the model on the test data
results = classifier.transform(test)

# Print the accuracy, precision, recall and f1 score
evaluator = MulticlassClassificationEvaluator(labelCol="target", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(results)
print(f"Accuracy: {accuracy*100:.2f}%")

evaluator = MulticlassClassificationEvaluator(labelCol="target", predictionCol="prediction", metricName="weightedPrecision")
weightedPrecision = evaluator.evaluate(results)
print(f"Weighted Precision: {weightedPrecision*100:.2f}%")

evaluator = MulticlassClassificationEvaluator(labelCol="target", predictionCol="prediction", metricName="weightedRecall")
weightedRecall = evaluator.evaluate(results)
print(f"Weighted Recall: {weightedRecall*100:.2f}%")

evaluator = MulticlassClassificationEvaluator(labelCol="target", predictionCol="prediction", metricName="f1")
f1 = evaluator.evaluate(results)
print(f"F1 Score: {f1*100:.2f}%")

Accuracy: 77.84%
Weighted Precision: 80.28%
Weighted Recall: 77.84%
F1 Score: 78.74%


In [10]:
classifier = RandomForestClassifier(labelCol='target', featuresCol='features')
classifier = classifier.fit(train)

# Evaluate the model on the test data
results = classifier.transform(test)

# Print the accuracy, precision, recall and f1 score
evaluator = MulticlassClassificationEvaluator(labelCol="target", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(results)
print(f"Accuracy: {accuracy*100:.2f}%")

evaluator = MulticlassClassificationEvaluator(labelCol="target", predictionCol="prediction", metricName="weightedPrecision")
weightedPrecision = evaluator.evaluate(results)
print(f"Weighted Precision: {weightedPrecision*100:.2f}%")

evaluator = MulticlassClassificationEvaluator(labelCol="target", predictionCol="prediction", metricName="weightedRecall")
weightedRecall = evaluator.evaluate(results)
print(f"Weighted Recall: {weightedRecall*100:.2f}%")

evaluator = MulticlassClassificationEvaluator(labelCol="target", predictionCol="prediction", metricName="f1")
f1 = evaluator.evaluate(results)
print(f"F1 Score: {f1*100:.2f}%")

Accuracy: 82.87%
Weighted Precision: 81.68%
Weighted Recall: 82.87%
F1 Score: 80.55%


In [11]:
classifier = LinearSVC(labelCol='target', featuresCol='features')
classifier = classifier.fit(train)

# Evaluate the model on the test data
results = classifier.transform(test)

# Print the accuracy, precision, recall and f1 score
evaluator = MulticlassClassificationEvaluator(labelCol="target", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(results)
print(f"Accuracy: {accuracy*100:.2f}%")

evaluator = MulticlassClassificationEvaluator(labelCol="target", predictionCol="prediction", metricName="weightedPrecision")
weightedPrecision = evaluator.evaluate(results)
print(f"Weighted Precision: {weightedPrecision*100:.2f}%")

evaluator = MulticlassClassificationEvaluator(labelCol="target", predictionCol="prediction", metricName="weightedRecall")
weightedRecall = evaluator.evaluate(results)
print(f"Weighted Recall: {weightedRecall*100:.2f}%")

evaluator = MulticlassClassificationEvaluator(labelCol="target", predictionCol="prediction", metricName="f1")
f1 = evaluator.evaluate(results)
print(f"F1 Score: {f1*100:.2f}%")

Accuracy: 82.83%
Weighted Precision: 81.47%
Weighted Recall: 82.83%
F1 Score: 80.87%


# Naive Bayes Classifier using Map-Reduce

In [85]:
# Convert the data to RDD
train_rdd_spark = train_df_spark.rdd

In [86]:
# Count the number of occurence of each class and each value of the features
f1_map = train_rdd_spark.map(lambda x: ((x[0]), 1))
f2_map = train_rdd_spark.map(lambda x: ((x[1]), 1))
f3_map = train_rdd_spark.map(lambda x: ((x[2]), 1))
target_map = train_rdd_spark.map(lambda x: (x[3], 1))

# Reduce the data to count the number of each class
f1_reduce = f1_map.reduceByKey(lambda x, y: x + y)
f2_reduce = f2_map.reduceByKey(lambda x, y: x + y)
f3_reduce = f3_map.reduceByKey(lambda x, y: x + y)
target_reduce = target_map.reduceByKey(lambda x, y: x + y)


# Map the data to count the occurence of each class with the different values of the features
f1_target_map = train_rdd_spark.map(lambda x: ((x[0], x[3]), 1))
f2_target_map = train_rdd_spark.map(lambda x: ((x[1], x[3]), 1))
f3_target_map = train_rdd_spark.map(lambda x: ((x[2], x[3]), 1))

# Reduce the data to count the number of each class
f1_target_reduce = f1_target_map.reduceByKey(lambda x, y: x + y)
f2_target_reduce = f2_target_map.reduceByKey(lambda x, y: x + y)
f3_target_reduce = f3_target_map.reduceByKey(lambda x, y: x + y)

In [87]:
# Need to calculate the propability of each class for each value of the features

# f1
prop_f1_target_reduce = f1_target_reduce.map(lambda x: (x[0][0], (x[0][1], x[1]))) #Convert the schema ((f1, class), count) -> (f1, (class, count))
prop_f1_target_reduce = prop_f1_target_reduce.join(f1_reduce) # Join the RDDs to get the total number of occurence of each f1 (f1, ((class, count), total)), that's why we needed the above step
prop_f1_target_reduce = prop_f1_target_reduce.map(lambda x: (x[0], (x[1][0][0], x[1][0][1]), x[1][1])) # Convert the schema (f1, ((class, count), total)) -> (f1, (class, count), total)
prop_f1_target_reduce = prop_f1_target_reduce.map(lambda x: (x[0], (x[1][0], x[1][1]/x[2])))
prop_f1_target_reduce = prop_f1_target_reduce.groupByKey().mapValues(list)

# f2
prop_f2_target_reduce = f2_target_reduce.map(lambda x: (x[0][0], (x[0][1], x[1]))) #Convert the schema ((f2, class), count) -> (f2, (class, count))
prop_f2_target_reduce = prop_f2_target_reduce.join(f2_reduce) # Join the RDDs to get the total number of occurence of each f2 (f2, ((class, count), total)), that's why we needed the above step
prop_f2_target_reduce = prop_f2_target_reduce.map(lambda x: (x[0], (x[1][0][0], x[1][0][1]), x[1][1])) # Convert the schema (f2, ((class, count), total)) -> (f2, (class, count), total)
prop_f2_target_reduce = prop_f2_target_reduce.map(lambda x: (x[0], (x[1][0], x[1][1]/x[2])))
prop_f2_target_reduce = prop_f2_target_reduce.groupByKey().mapValues(list)


# f3
prop_f3_target_reduce = f3_target_reduce.map(lambda x: (x[0][0], (x[0][1], x[1]))) #Convert the schema ((f3, class), count) -> (f3, (class, count))
prop_f3_target_reduce = prop_f3_target_reduce.join(f3_reduce) # Join the RDDs to get the total number of occurence of each f3 (f3, ((class, count), total)), that's why we needed the above step
prop_f3_target_reduce = prop_f3_target_reduce.map(lambda x: (x[0], (x[1][0][0], x[1][0][1]), x[1][1])) # Convert the schema (f3, ((class, count), total)) -> (f3, (class, count), total)
prop_f3_target_reduce = prop_f3_target_reduce.map(lambda x: (x[0], (x[1][0], x[1][1]/x[2])))
prop_f3_target_reduce = prop_f3_target_reduce.groupByKey().mapValues(list)


# Number of records
N = train_rdd_spark.count()
prop_target_reduce = target_reduce.map(lambda x: (x[0], x[1] / N))

In [88]:
# We want to convert the RDDs to dictionaries to be able to use them in the prediction function
prop_f1_target_reduce_dict = prop_f1_target_reduce.collectAsMap()
prop_f2_target_reduce_dict = prop_f2_target_reduce.collectAsMap()
prop_f3_target_reduce_dict = prop_f3_target_reduce.collectAsMap()
prop_target_reduce_dict = prop_target_reduce.collectAsMap()

# Need to sort the values of the dictionaries by the class to be able to use them in the prediction function
for key in prop_f1_target_reduce_dict:
    prop_f1_target_reduce_dict[key].sort(key=lambda x: x[0])
    
for key in prop_f2_target_reduce_dict:
    prop_f2_target_reduce_dict[key].sort(key=lambda x: x[0])
    
for key in prop_f3_target_reduce_dict:
    prop_f3_target_reduce_dict[key].sort(key=lambda x: x[0])
    
prop_target_reduce_dict = sorted(prop_target_reduce_dict.items(), key=lambda x: x[0])

In [89]:
# Function to predict the target target of a pet given its features
def predict(f1, f2, f3_):
    # f1
    f1_target = prop_f1_target_reduce_dict[f1] # This will return a list of tuples (class, probability)
    f1_target = [x[1] for x in f1_target] # We only want the probabilities (the class is the index of the list)
    # f2
    f2_target = prop_f2_target_reduce_dict[f2]
    f2_target = [x[1] for x in f2_target]
    
    # f3
    f3_target = prop_f3_target_reduce_dict[f3_]
    f3_target = [x[1] for x in f3_target]
    
    # We compute the product of the probabilities of each class given the features
    prob = [a*b*c for a,b,c in zip(f1_target, f2_target, f3_target)]
    
    # We compute the argmax of the probabilities
    prediction = prob.index(max(prob))
    
    return prediction

In [90]:
## Now we can predict the target target of a pet given its features
test_data = pd.read_csv("../datasets/test.csv")


# For each pet in the test set we predict the target target
f1s = test_data["f1"].tolist()
f2s = test_data["f2"].tolist()
f3s = test_data["f3"].tolist()
y_true = test_data["target"].tolist()

y_pred = []

for i in range(len(f1s)):
    f1 = f1s[i]
    f2 = f2s[i]
    f3_ = f3s[i]
    prediction = predict(f1, f2, f3_)
    y_pred.append(prediction)

print(f"Accuracy: {accuracy_score(y_true, y_pred)*100:.2f}%")
print(f"Precision: {precision_score(y_true, y_pred, average='weighted')*100:.2f}%")
print(f"Recall: {recall_score(y_true, y_pred, average='weighted')*100:.2f}%")
print(f"F1: {f1_score(y_true, y_pred, average='weighted')*100:.2f}%")

Accuracy: 80.83%
Precision: 81.79%
Recall: 80.83%
F1: 75.38%
