In [52]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.feature_selection import SelectKBest
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import LinearSVC
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


import findspark
findspark.init()

In [53]:
spark_session = SparkSession.builder.master("local[*]").config("spark.driver.memory", "15g").appName('Models').getOrCreate()

In [54]:
# Read datasets with spark
train_df_spark = spark_session.read.csv('train.csv', header=True, inferSchema=True)
test_df_spark = spark_session.read.csv('test.csv', header=True, inferSchema=True)

In [55]:
input_cols = train_df_spark.columns[:-1]
output_col = train_df_spark.columns[-1]

In [56]:
# Encode the features into a vector
featureassemble = VectorAssembler(inputCols=input_cols, outputCol='features')
output = featureassemble.transform(train_df_spark)
output.show(n=5)

testfeatureassemble = VectorAssembler(inputCols=input_cols, outputCol='features')
testoutput = testfeatureassemble.transform(test_df_spark)
testoutput.show(n=5)

+---+---+---+---+---+---+---+---+---+---+------+--------------------+
| f1| f2| f3| f4| f5| f6| f7| f8| f9|f10|target|            features|
+---+---+---+---+---+---+---+---+---+---+------+--------------------+
|  0| 21| 20| 61| 51|142|141|  8|  4|  0|     0|[0.0,21.0,20.0,61...|
|  0| 14| 15| 29| 35|164|168|  4|  4|  0|     0|[0.0,14.0,15.0,29...|
|  1| 20|  8| 65| 59|221|225|  4|  4|  0|     0|[1.0,20.0,8.0,65....|
|101| 14| 13| 78| 84|111|120|  4|  4|  0|     1|[101.0,14.0,13.0,...|
|  0|  0| 11| 20| 52| 47| 66|  7|  7|  0|     0|[0.0,0.0,11.0,20....|
+---+---+---+---+---+---+---+---+---+---+------+--------------------+
only showing top 5 rows

+---+---+---+---+---+---+---+---+---+---+------+--------------------+
| f1| f2| f3| f4| f5| f6| f7| f8| f9|f10|target|            features|
+---+---+---+---+---+---+---+---+---+---+------+--------------------+
|  0| 14| 13| 50| 47| 87| 65|  0|  4|  0|     0|[0.0,14.0,13.0,50...|
|  0| 14| 21| 65| 67|147|143|  4|  4|  0|     0|[0.0,14.0,21.0,65

In [57]:
# Select the features and the target column

train = output.select('features', 'target') 
train.show(n=5)

test = testoutput.select('features', 'target')
test.show(n=5)

+--------------------+------+
|            features|target|
+--------------------+------+
|[0.0,21.0,20.0,61...|     0|
|[0.0,14.0,15.0,29...|     0|
|[1.0,20.0,8.0,65....|     0|
|[101.0,14.0,13.0,...|     1|
|[0.0,0.0,11.0,20....|     0|
+--------------------+------+
only showing top 5 rows

+--------------------+------+
|            features|target|
+--------------------+------+
|[0.0,14.0,13.0,50...|     0|
|[0.0,14.0,21.0,65...|     0|
|[0.0,34.0,22.0,34...|     0|
|[0.0,14.0,13.0,39...|     0|
|[0.0,14.0,8.0,66....|     0|
+--------------------+------+
only showing top 5 rows



In [58]:
# Select the features and the target column
train = output.select('features', output_col)
train.show(n=5)

test = testoutput.select('features', output_col)
test.show(n=5)

+--------------------+------+
|            features|target|
+--------------------+------+
|[0.0,21.0,20.0,61...|     0|
|[0.0,14.0,15.0,29...|     0|
|[1.0,20.0,8.0,65....|     0|
|[101.0,14.0,13.0,...|     1|
|[0.0,0.0,11.0,20....|     0|
+--------------------+------+
only showing top 5 rows

+--------------------+------+
|            features|target|
+--------------------+------+
|[0.0,14.0,13.0,50...|     0|
|[0.0,14.0,21.0,65...|     0|
|[0.0,34.0,22.0,34...|     0|
|[0.0,14.0,13.0,39...|     0|
|[0.0,14.0,8.0,66....|     0|
+--------------------+------+
only showing top 5 rows



In [59]:
classifier = LogisticRegression(labelCol=output_col).fit(train)

# Evaluate the model on the test data
results = classifier.evaluate(test) 

# Print the accuracy, precision, recall and f1 score
evaluator = MulticlassClassificationEvaluator(labelCol=output_col, predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(results.predictions)
print(f"Accuracy: {accuracy*100:.2f}%")

evaluator = MulticlassClassificationEvaluator(labelCol=output_col, predictionCol="prediction", metricName="weightedPrecision")
weightedPrecision = evaluator.evaluate(results.predictions)
print(f"Weighted Precision: {weightedPrecision*100:.2f}%")

evaluator = MulticlassClassificationEvaluator(labelCol=output_col, predictionCol="prediction", metricName="weightedRecall")
weightedRecall = evaluator.evaluate(results.predictions)
print(f"Weighted Recall: {weightedRecall*100:.2f}%")

evaluator = MulticlassClassificationEvaluator(labelCol=output_col, predictionCol="prediction", metricName="f1")
f1 = evaluator.evaluate(results.predictions)
print(f"F1 Score: {f1*100:.2f}%")

Accuracy: 84.08%
Weighted Precision: 82.99%
Weighted Recall: 84.08%
F1 Score: 82.86%


In [60]:
classifier = NaiveBayes(labelCol=output_col, featuresCol='features')
classifier = classifier.fit(train)

# Evaluate the model on the test data
results = classifier.transform(test)

# Print the accuracy, precision, recall and f1 score
evaluator = MulticlassClassificationEvaluator(labelCol=output_col, predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(results)
print(f"Accuracy: {accuracy*100:.2f}%")

evaluator = MulticlassClassificationEvaluator(labelCol=output_col, predictionCol="prediction", metricName="weightedPrecision")
weightedPrecision = evaluator.evaluate(results)
print(f"Weighted Precision: {weightedPrecision*100:.2f}%")

evaluator = MulticlassClassificationEvaluator(labelCol=output_col, predictionCol="prediction", metricName="weightedRecall")
weightedRecall = evaluator.evaluate(results)
print(f"Weighted Recall: {weightedRecall*100:.2f}%")

evaluator = MulticlassClassificationEvaluator(labelCol=output_col, predictionCol="prediction", metricName="f1")
f1 = evaluator.evaluate(results)
print(f"F1 Score: {f1*100:.2f}%")

Accuracy: 76.79%
Weighted Precision: 79.18%
Weighted Recall: 76.79%
F1 Score: 77.70%


In [61]:
classifier = RandomForestClassifier(numTrees=100, labelCol=output_col, featuresCol='features')
classifier = classifier.fit(train)

# Evaluate the model on the test data
results = classifier.transform(test)

# Print the accuracy, precision, recall and f1 score
evaluator = MulticlassClassificationEvaluator(labelCol=output_col, predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(results)
print(f"Accuracy: {accuracy*100:.2f}%")

evaluator = MulticlassClassificationEvaluator(labelCol=output_col, predictionCol="prediction", metricName="weightedPrecision")
weightedPrecision = evaluator.evaluate(results)
print(f"Weighted Precision: {weightedPrecision*100:.2f}%")

evaluator = MulticlassClassificationEvaluator(labelCol=output_col, predictionCol="prediction", metricName="weightedRecall")
weightedRecall = evaluator.evaluate(results)
print(f"Weighted Recall: {weightedRecall*100:.2f}%")

evaluator = MulticlassClassificationEvaluator(labelCol=output_col, predictionCol="prediction", metricName="f1")
f1 = evaluator.evaluate(results)
print(f"F1 Score: {f1*100:.2f}%")

Accuracy: 83.54%
Weighted Precision: 82.67%
Weighted Recall: 83.54%
F1 Score: 81.37%


In [62]:
classifier = LinearSVC(labelCol=output_col, featuresCol='features')
classifier = classifier.fit(train)

# Evaluate the model on the test data
results = classifier.transform(test)

# Print the accuracy, precision, recall and f1 score
evaluator = MulticlassClassificationEvaluator(labelCol=output_col, predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(results)
print(f"Accuracy: {accuracy*100:.2f}%")

evaluator = MulticlassClassificationEvaluator(labelCol=output_col, predictionCol="prediction", metricName="weightedPrecision")
weightedPrecision = evaluator.evaluate(results)
print(f"Weighted Precision: {weightedPrecision*100:.2f}%")

evaluator = MulticlassClassificationEvaluator(labelCol=output_col, predictionCol="prediction", metricName="weightedRecall")
weightedRecall = evaluator.evaluate(results)
print(f"Weighted Recall: {weightedRecall*100:.2f}%")

evaluator = MulticlassClassificationEvaluator(labelCol=output_col, predictionCol="prediction", metricName="f1")
f1 = evaluator.evaluate(results)
print(f"F1 Score: {f1*100:.2f}%")

Accuracy: 83.93%
Weighted Precision: 82.85%
Weighted Recall: 83.93%
F1 Score: 82.45%


# Naive Bayes Classifier using Map-Reduce

In [63]:
# Convert the data to RDD
train_rdd_spark = train_df_spark.rdd

In [64]:
# Count the number of occurence of each class and each value of the features
f_map = []
for i in range(len(input_cols)):
    f_map.append(train_rdd_spark.map(lambda x: ((x[i]), 1)))
target_map = train_rdd_spark.map(lambda x: (x[len(input_cols)], 1))

# Reduce the data to count the number of each class
f_reduce = []
for i in range(len(input_cols)):
    f_reduce.append(f_map[i].reduceByKey(lambda x, y: x + y))
target_reduce = target_map.reduceByKey(lambda x, y: x + y)


# Map the data to count the occurence of each class with the different values of the features
f_target_map = []
for i in range(len(input_cols)):
    f_target_map.append(train_rdd_spark.map(lambda x: ((x[i], x[len(input_cols)]), 1)))

# Reduce the data to count the number of each class
f_target_reduce = []
for i in range(len(input_cols)):
    f_target_reduce.append(f_target_map[i].reduceByKey(lambda x, y: x + y))

In [65]:
# Calculate the probability of each class for each value of the features
prop_f_target_reduce = []
for i in range(len(input_cols)):
    prop_f_target_reduce.append(f_target_reduce[i].map(lambda x: (x[0][0], (x[0][1], x[1]))))
    prop_f_target_reduce[i] = prop_f_target_reduce[i].join(f_reduce[i])
    prop_f_target_reduce[i] = prop_f_target_reduce[i].map(lambda x: (x[0], (x[1][0][0], x[1][0][1]), x[1][1]))
    prop_f_target_reduce[i] = prop_f_target_reduce[i].map(lambda x: (x[0], (x[1][0], x[1][1] / x[2])))
    prop_f_target_reduce[i] = prop_f_target_reduce[i].groupByKey().mapValues(list)


# Find number of records
N = train_rdd_spark.count()
prop_target_reduce = target_reduce.map(lambda x: (x[0], x[1] / N))

In [66]:
# Convert the RDDs to dictionaries to use them in the prediction function
prop_f_target_reduce_dict = []
for i in range(len(input_cols)):
    prop_f_target_reduce_dict.append(prop_f_target_reduce[i].collectAsMap())
prop_target_reduce_dict = prop_target_reduce.collectAsMap()

# Sort the values of the dictionaries by the class to use them in the prediction function
for i in range(len(input_cols)):
    for key in prop_f_target_reduce_dict[i]:
        prop_f_target_reduce_dict[i][key].sort(key=lambda x: x[0])
    
prop_target_reduce_dict = sorted(prop_target_reduce_dict.items(), key=lambda x: x[0])

In [67]:
# Predict the target given features
def predict(features):
    f_target = []
    for i in range(len(input_cols)):
        if features[i]  in prop_f_target_reduce_dict[i]:
            f_target.append(prop_f_target_reduce_dict[i][features[i]])
            f_target[i] = [x[1] for x in f_target[i]]
            if len(f_target[i]) < len(prop_target_reduce_dict):
                if f_target[i][0] == 0:
                    f_target[i].insert(1, 0)
                else:
                    f_target[i].insert(0, 0)
        else:
            f_target.append([0] * len(prop_target_reduce_dict))
            
    prob = [1] * len(f_target[0])

    for j in range(len(f_target[0])):
        for i in range(len(input_cols)):
            prob[j] *= f_target[i][j]
    # Argmax
    prediction = prob.index(max(prob))
    
    return prediction

In [69]:
test_data = pd.read_csv("test.csv")

test_features = []
for i in range(len(input_cols)):
    test_features.append(test_data[input_cols[i]].tolist())

y_true = test_data[output_col].tolist()
y_pred = []

for i in range(len(test_features[0])):
    features = []
    for j in range(len(input_cols)):
        features.append(test_features[j][i])
    prediction = predict(features)
    y_pred.append(prediction)

print("Naive Bayes Classifier with MapReduce")
print(f"Accuracy: {accuracy_score(y_true, y_pred) * 100:.2f}%")
print(f"Precision: {precision_score(y_true, y_pred, average='weighted') * 100:.2f}%")
print(f"Recall: {recall_score(y_true, y_pred, average='weighted') * 100:.2f}%")
print(f"F1: {f1_score(y_true, y_pred, average='weighted') * 100:.2f}%")
# print(classification_report(y_true, y_pred))

Naive Bayes Classifier with MapReduce
Accuracy: 77.70%
Precision: 82.17%
Recall: 77.70%
F1: 68.11%
