### Part of EECS 6893 - Big Data Analytics - Fall 2021 - Final Project 

* Group ID: 202112-53
* Title: Assaying MSD
* Contributors: Karpagam Murugappan; Arya Kasulla

Year Prediction using MLP Classifier

In [2]:
import os
from pyspark import SparkConf                                                                                                                 
from pyspark.context import SparkContext                                                                                                      
from pyspark.sql import SparkSession, SQLContext
from pyspark import *
from pyspark.sql import *
import pandas as pd
import numpy as np
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import matplotlib.pyplot as plt

# Use a 15pt default font for every matplotlib figure drawn in this notebook.
plt.rcParams.update({'font.size': 15})

In [3]:
'''
Create (or reuse) the local Spark session and expose its context.

getOrCreate() keeps this cell re-runnable: constructing SparkContext('local')
a second time in the same kernel fails because a context is already active.
'''
spark = SparkSession.builder.master('local').getOrCreate()
# Same handle the rest of the notebook knows as `sc`.
sc = spark.sparkContext

#### 90 input features

In [4]:
'''
Read the 90-feature input, stored in LibSVM format (the same format the
regression model requires), from ./data under the current working directory.

numFeatures is pinned to 90 so the vector width does not depend on which
feature indices happen to occur in the file; it must match the input layer
size used by the networks trained on this dataset.
'''
dataset = (
    spark.read.format("libsvm")
    .option("numFeatures", "90")
    .load(os.path.join(os.getcwd(), 'data', 'reg_data_90_features.txt'))
)

In [5]:
'''
Split the loaded rows with weights 0.8 (training) and 0.2 (testing).
The random seed is fixed at 100.
'''
train, test = dataset.randomSplit(
    weights=[0.8, 0.2],
    seed=100,
)

In [6]:
'''
Single-hidden-layer network for the 90-feature data:
  layers    : 90 input units -> 180 hidden units -> 89 output units
  blockSize : 128
  seed      : 1234
  maxIter   : 100
'''
layers = [90, 180, 89]
trainer = MultilayerPerceptronClassifier(
    layers=layers,
    maxIter=100,
    blockSize=128,
    seed=1234,
)

In [7]:
'''
Train the current `trainer` on the training split.
'''
model = trainer.fit(dataset=train)

In [8]:
# Score the held-out split with the fitted model and report its accuracy.
result = model.transform(test)
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
# Only the predicted and true labels are handed to the evaluator here.
predictionAndLabels = result.select("prediction", "label")
test_accuracy = evaluator.evaluate(predictionAndLabels)
print(f"Test set accuracy = {test_accuracy}")


Test set accuracy = 0.07671065635718853


In [9]:
'''
Three-hidden-layer network for the 90-feature data:
  layers    : 90 input units -> 100 -> 180 -> 100 hidden units -> 89 output units
  blockSize : 128
  seed      : 445
  maxIter   : 100
'''
layers = [90, 100, 180, 100, 89]
trainer = MultilayerPerceptronClassifier(
    layers=layers,
    maxIter=100,
    blockSize=128,
    seed=445,
)

In [10]:
'''
Train the current `trainer` on the training split.
'''
model = trainer.fit(dataset=train)

In [11]:
# Score the held-out split with the fitted model and report its accuracy.
result = model.transform(test)
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
# Only the predicted and true labels are handed to the evaluator here.
predictionAndLabels = result.select("prediction", "label")
test_accuracy = evaluator.evaluate(predictionAndLabels)
print(f"Test set accuracy = {test_accuracy}")


Test set accuracy = 0.07856132395403367


#### 12 input features

In [12]:
'''
Read the 12-feature input, stored in LibSVM format (the same format the
regression model requires), from ./data under the current working directory.

numFeatures is pinned to 12 so the vector width does not depend on which
feature indices happen to occur in the file; it must match the input layer
size used by the networks trained on this dataset.
'''
dataset = (
    spark.read.format("libsvm")
    .option("numFeatures", "12")
    .load(os.path.join(os.getcwd(), 'data', 'reg_data_12_features.txt'))
)

In [13]:
'''
Split the loaded rows with weights 0.8 (training) and 0.2 (testing).
The random seed is fixed at 987.
'''
train, test = dataset.randomSplit(
    weights=[0.8, 0.2],
    seed=987,
)

In [14]:
'''
Single-hidden-layer network for the 12-feature data:
  layers    : 12 input units -> 100 hidden units -> 89 output units
  blockSize : 128
  seed      : 223
  maxIter   : 100
'''
layers = [12, 100, 89]
trainer = MultilayerPerceptronClassifier(
    layers=layers,
    maxIter=100,
    blockSize=128,
    seed=223,
)

In [15]:
'''
Train the current `trainer` on the training split.
'''
model = trainer.fit(dataset=train)

In [16]:
# Score the held-out split with the fitted model and report its accuracy.
# `result` is kept because the metric cells that follow evaluate it again.
result = model.transform(test)
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
predictionAndLabels = result.select("prediction", "label")
test_accuracy = evaluator.evaluate(predictionAndLabels)
print(f"Test set accuracy = {test_accuracy}")


Test set accuracy = 0.07913108672580159


In [17]:
# Log loss on the full `result` frame (not just the prediction/label columns).
# `fpr_0` is only a scratch name; in this cell it holds the log loss.
tmp = MulticlassClassificationEvaluator().setMetricName('logLoss')
fpr_0 = tmp.evaluate(result)
print(f"Log loss: {fpr_0}")

Log loss: 3.334881019367738


In [18]:
# F1 score of the test predictions.
# `fpr_0` is only a scratch name; in this cell it holds the F1 score.
tmp = MulticlassClassificationEvaluator().setMetricName('f1')
fpr_0 = tmp.evaluate(result)
print(f"F1 score: {fpr_0}")

F1 score: 0.0514995597366196


In [19]:
# Weighted recall of the test predictions.
# `fpr_0` is only a scratch name; in this cell it holds the weighted recall.
tmp = MulticlassClassificationEvaluator().setMetricName('weightedRecall')
fpr_0 = tmp.evaluate(result)
print(f"Weighted Recall: {fpr_0}")

Weighted Recall: 0.07913108672580157


In [20]:
# Weighted precision of the test predictions.
# `fpr_0` is only a scratch name; in this cell it holds the weighted precision.
tmp = MulticlassClassificationEvaluator().setMetricName('weightedPrecision')
fpr_0 = tmp.evaluate(result)
print(f"weightedPrecision: {fpr_0}")

weightedPrecision: 0.05939671594106123


In [21]:
# Weighted F-measure of the test predictions (the recorded value matches the
# 'f1' cell's output for this model).
# `fpr_0` is only a scratch name; in this cell it holds the weighted F-measure.
tmp = MulticlassClassificationEvaluator().setMetricName('weightedFMeasure')
fpr_0 = tmp.evaluate(result)
print(f"weightedFMeasure: {fpr_0}")

weightedFMeasure: 0.0514995597366196


In [22]:
# Weighted true-positive rate of the test predictions (the recorded value
# matches the 'weightedRecall' cell's output for this model).
# `fpr_0` is only a scratch name; in this cell it holds the weighted TPR.
tmp = MulticlassClassificationEvaluator().setMetricName('weightedTruePositiveRate')
fpr_0 = tmp.evaluate(result)
print(f"weightedTruePositiveRate: {fpr_0}")

weightedTruePositiveRate: 0.07913108672580157


In [23]:
# Weighted false-positive rate of the test predictions.
tmp = MulticlassClassificationEvaluator().setMetricName('weightedFalsePositiveRate')
fpr_0 = tmp.evaluate(result)
print(f"weightedFalsePositiveRate: {fpr_0}")

weightedFalsePositiveRate: 0.06487606435634571


In [24]:
'''
Three-hidden-layer network for the 12-feature data:
  Input   - 12 units
  Hidden1 - 24 units
  Hidden2 - 144 units
  Hidden3 - 100 units
  Output  - 89 units
  blockSize : 128
  seed      : 203
  maxIter   : 100
'''
layers = [12, 24, 144, 100, 89]
trainer = MultilayerPerceptronClassifier(
    layers=layers,
    maxIter=100,
    blockSize=128,
    seed=203,
)

In [25]:
'''
Train the current `trainer` on the training split.
'''
model = trainer.fit(dataset=train)

In [26]:
# Score the held-out split with the fitted model and report its accuracy.
# `result` is kept because the metric cells that follow evaluate it again.
result = model.transform(test)
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
predictionAndLabels = result.select("prediction", "label")
test_accuracy = evaluator.evaluate(predictionAndLabels)
print(f"Test set accuracy = {test_accuracy}")


Test set accuracy = 0.07813793170598717


In [27]:
# Log loss on the full `result` frame (not just the prediction/label columns).
# `fpr_0` is only a scratch name; in this cell it holds the log loss.
tmp = MulticlassClassificationEvaluator().setMetricName('logLoss')
fpr_0 = tmp.evaluate(result)
print(f"Log loss: {fpr_0}")

Log loss: 3.3539402537912277


In [28]:
# F1 score of the test predictions.
# `fpr_0` is only a scratch name; in this cell it holds the F1 score.
tmp = MulticlassClassificationEvaluator().setMetricName('f1')
fpr_0 = tmp.evaluate(result)
print(f"F1 score: {fpr_0}")

F1 score: 0.03145371092166778


In [29]:
# Weighted recall of the test predictions.
# `fpr_0` is only a scratch name; in this cell it holds the weighted recall.
tmp = MulticlassClassificationEvaluator().setMetricName('weightedRecall')
fpr_0 = tmp.evaluate(result)
print(f"Weighted Recall: {fpr_0}")

Weighted Recall: 0.07813793170598718


In [30]:
# Weighted precision of the test predictions.
# `fpr_0` is only a scratch name; in this cell it holds the weighted precision.
tmp = MulticlassClassificationEvaluator().setMetricName('weightedPrecision')
fpr_0 = tmp.evaluate(result)
print(f"weightedPrecision: {fpr_0}")

weightedPrecision: 0.05209223728449882


In [31]:
# Weighted F-measure of the test predictions (the recorded value matches the
# 'f1' cell's output for this model).
# `fpr_0` is only a scratch name; in this cell it holds the weighted F-measure.
tmp = MulticlassClassificationEvaluator().setMetricName('weightedFMeasure')
fpr_0 = tmp.evaluate(result)
print(f"weightedFMeasure: {fpr_0}")

weightedFMeasure: 0.03145371092166778


In [32]:
# Weighted true-positive rate of the test predictions (the recorded value
# matches the 'weightedRecall' cell's output for this model).
# `fpr_0` is only a scratch name; in this cell it holds the weighted TPR.
tmp = MulticlassClassificationEvaluator().setMetricName('weightedTruePositiveRate')
fpr_0 = tmp.evaluate(result)
print(f"weightedTruePositiveRate: {fpr_0}")

weightedTruePositiveRate: 0.07813793170598718


In [33]:
# Weighted false-positive rate of the test predictions.
tmp = MulticlassClassificationEvaluator().setMetricName('weightedFalsePositiveRate')
fpr_0 = tmp.evaluate(result)
print(f"weightedFalsePositiveRate: {fpr_0}")

weightedFalsePositiveRate: 0.07173076387327393
