In [1]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import SparkSession

import findspark
findspark.init()

In [2]:
spark_session = SparkSession.builder.master("local[*]").config("spark.driver.memory", "15g").appName('NB_MapReduce').getOrCreate()

In [3]:
# Read dataset with spark
train_df_spark = spark_session.read.csv('train.csv', header=True, inferSchema=True)

In [4]:
input_cols = train_df_spark.columns[:-1]
output_col = train_df_spark.columns[-1]

In [5]:
# Encode the features into a vector
featureassemble = VectorAssembler(inputCols=input_cols, outputCol='features')
output = featureassemble.transform(train_df_spark)
output.show(n=5)

+---+---+---+---+---+---+---+---+---+---+------+--------------------+
| f1| f2| f3| f4| f5| f6| f7| f8| f9|f10|target|            features|
+---+---+---+---+---+---+---+---+---+---+------+--------------------+
|  0| 21| 20| 61| 51|142|141|  8|  4|  0|     0|[0.0,21.0,20.0,61...|
|  0| 14| 15| 29| 35|164|168|  4|  4|  0|     0|[0.0,14.0,15.0,29...|
|  1| 20|  8| 65| 59|221|225|  4|  4|  0|     0|[1.0,20.0,8.0,65....|
|101| 14| 13| 78| 84|111|120|  4|  4|  0|     1|[101.0,14.0,13.0,...|
|  0|  0| 11| 20| 52| 47| 66|  7|  7|  0|     0|[0.0,0.0,11.0,20....|
+---+---+---+---+---+---+---+---+---+---+------+--------------------+
only showing top 5 rows



In [6]:
# Convert the data to RDD
train_rdd_spark = train_df_spark.rdd

In [7]:
# Count the number of occurence of each class and each value of the features
f_map = []
for i in range(len(input_cols)):
    f_map.append(train_rdd_spark.map(lambda x: ((x[i]), 1)))
target_map = train_rdd_spark.map(lambda x: (x[len(input_cols)], 1))

# Reduce the data to count the number of each class
f_reduce = []
for i in range(len(input_cols)):
    f_reduce.append(f_map[i].reduceByKey(lambda x, y: x + y))
target_reduce = target_map.reduceByKey(lambda x, y: x + y)


# Map the data to count the occurence of each class with the different values of the features
f_target_map = []
for i in range(len(input_cols)):
    f_target_map.append(train_rdd_spark.map(lambda x: ((x[i], x[len(input_cols)]), 1)))

# Reduce the data to count the number of each class
f_target_reduce = []
for i in range(len(input_cols)):
    f_target_reduce.append(f_target_map[i].reduceByKey(lambda x, y: x + y))

In [8]:
# Calculate the probability of each class for each value of the features
prop_f_target_reduce = []
for i in range(len(input_cols)):
    prop_f_target_reduce.append(f_target_reduce[i].map(lambda x: (x[0][0], (x[0][1], x[1]))))
    prop_f_target_reduce[i] = prop_f_target_reduce[i].join(f_reduce[i])
    prop_f_target_reduce[i] = prop_f_target_reduce[i].map(lambda x: (x[0], (x[1][0][0], x[1][0][1]), x[1][1]))
    prop_f_target_reduce[i] = prop_f_target_reduce[i].map(lambda x: (x[0], (x[1][0], x[1][1] / x[2])))
    prop_f_target_reduce[i] = prop_f_target_reduce[i].groupByKey().mapValues(list)


# Find number of records
N = train_rdd_spark.count()
prop_target_reduce = target_reduce.map(lambda x: (x[0], x[1] / N))

In [9]:
# Convert the RDDs to dictionaries to use them in the prediction function
prop_f_target_reduce_dict = []
for i in range(len(input_cols)):
    prop_f_target_reduce_dict.append(prop_f_target_reduce[i].collectAsMap())
prop_target_reduce_dict = prop_target_reduce.collectAsMap()

# Sort the values of the dictionaries by the class to use them in the prediction function
for i in range(len(input_cols)):
    for key in prop_f_target_reduce_dict[i]:
        prop_f_target_reduce_dict[i][key].sort(key=lambda x: x[0])
    
prop_target_reduce_dict = sorted(prop_target_reduce_dict.items(), key=lambda x: x[0])

In [10]:
# Predict the target given features
def predict(features):
    f_target = []
    for i in range(len(input_cols)):
        if features[i]  in prop_f_target_reduce_dict[i]:
            f_target.append(prop_f_target_reduce_dict[i][features[i]])
            f_target[i] = [x[1] for x in f_target[i]]
            if len(f_target[i]) < len(prop_target_reduce_dict):
                if f_target[i][0] == 0:
                    f_target[i].insert(1, 0)
                else:
                    f_target[i].insert(0, 0)
        else:
            f_target.append([0] * len(prop_target_reduce_dict))
            
    prob = [1] * len(f_target[0])

    for j in range(len(f_target[0])):
        for i in range(len(input_cols)):
            prob[j] *= f_target[i][j]
    # Argmax
    prediction = prob.index(max(prob))
    
    return prediction

In [11]:
test_data = pd.read_csv("test.csv")

test_features = []
for i in range(len(input_cols)):
    test_features.append(test_data[input_cols[i]].tolist())

y_true = test_data[output_col].tolist()
y_pred = []

for i in range(len(test_features[0])):
    features = []
    for j in range(len(input_cols)):
        features.append(test_features[j][i])
    prediction = predict(features)
    y_pred.append(prediction)

print("Naive Bayes Classifier with MapReduce")
print(f"Accuracy: {accuracy_score(y_true, y_pred) * 100:.2f}%")
print(f"Precision: {precision_score(y_true, y_pred, average='weighted') * 100:.2f}%")
print(f"Recall: {recall_score(y_true, y_pred, average='weighted') * 100:.2f}%")
print(f"F1: {f1_score(y_true, y_pred, average='weighted') * 100:.2f}%")

Naive Bayes Classifier with MapReduce
Accuracy: 77.70%
Precision: 82.17%
Recall: 77.70%
F1: 68.11%
