In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType, StructType, StructField, StringType
from pyspark.ml.classification import LinearSVC
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder, CrossValidatorModel

In [2]:
spark = SparkSession.builder.master("local[*]").appName("SparkML").getOrCreate()
spark

In [3]:
dfSpark = spark.read.csv("/content/drive/MyDrive/rddPreprocessed", header=True, inferSchema=True)

In [4]:
csvCols = dfSpark.columns
csvCols.remove('intelligence')
featureAssembler = VectorAssembler(inputCols=csvCols, outputCol="mlFeatures")
vaOutput = featureAssembler.transform(dfSpark)
finalizeData = vaOutput.select("mlFeatures", "intelligence")
finalizeData = finalizeData.dropna()
trainData, testData = finalizeData.randomSplit([0.80, 0.20])
finalizeData.show(5)

+--------------------+------------+
|          mlFeatures|intelligence|
+--------------------+------------+
|[0.32208684,0.367...|           0|
|[0.3193533,0.4034...|           0|
|[0.30920026,0.350...|           0|
|[0.30920026,0.364...|           0|
|[0.31380817,0.452...|           0|
+--------------------+------------+
only showing top 5 rows



In [6]:
#Linear SVC
spark.sparkContext.setJobGroup("LinearSVC", "Initialize and train LinearSVC Model")
svmModel = LinearSVC(featuresCol="mlFeatures", labelCol="intelligence", regParam=0.01)
accuracyEval = MulticlassClassificationEvaluator(labelCol="intelligence", predictionCol="prediction", metricName="accuracy")
f1Eval = MulticlassClassificationEvaluator(labelCol="intelligence", predictionCol="prediction", metricName="f1")
precisionEval = MulticlassClassificationEvaluator(labelCol="intelligence", predictionCol="prediction", metricName="weightedPrecision")
recallEval = MulticlassClassificationEvaluator(labelCol="intelligence", predictionCol="prediction", metricName="weightedRecall")

svmModel = svmModel.fit(trainData)
predictions = svmModel.transform(testData)
print("Accuracy: " + str(accuracyEval.evaluate(predictions)))
print("f1: " + str(f1Eval.evaluate(predictions)))
print("Precision: " + str(precisionEval.evaluate(predictions)))
print("Recall: " + str(recallEval.evaluate(predictions)))

Accuracy: 0.7248628884826326
f1: 0.7232240691651781
Precision: 0.7245396221535223
Recall: 0.7248628884826325


In [7]:
#Logistic Regressor
spark.sparkContext.setJobGroup("LogisticRegression", "Initialize and train LogisticRegression Model")
lrModel = LogisticRegression(featuresCol="mlFeatures", labelCol="intelligence", regParam=0.01)
accuracyEval = MulticlassClassificationEvaluator(labelCol="intelligence", predictionCol="prediction", metricName="accuracy")
f1Eval = MulticlassClassificationEvaluator(labelCol="intelligence", predictionCol="prediction", metricName="f1")
precisionEval = MulticlassClassificationEvaluator(labelCol="intelligence", predictionCol="prediction", metricName="weightedPrecision")
recallEval = MulticlassClassificationEvaluator(labelCol="intelligence", predictionCol="prediction", metricName="weightedRecall")

lrModel = lrModel.fit(trainData)
predictions = lrModel.transform(testData)
print("Accuracy: " + str(accuracyEval.evaluate(predictions)))
print("f1: " + str(f1Eval.evaluate(predictions)))
print("Precision: " + str(precisionEval.evaluate(predictions)))
print("Recall: " + str(recallEval.evaluate(predictions)))

Accuracy: 0.7120658135283364
f1: 0.7111966799418006
Precision: 0.7113361514010925
Recall: 0.7120658135283364


In [8]:
#RF Classifier
spark.sparkContext.setJobGroup("RandomForestClassifier", "Initialize and train RandomForestClassifier Model")
rfClassifier = RandomForestClassifier(
    featuresCol="mlFeatures", labelCol="intelligence",
    numTrees=20, maxDepth=10)
accuracyEval = MulticlassClassificationEvaluator(labelCol="intelligence", predictionCol="prediction", metricName="accuracy")
f1Eval = MulticlassClassificationEvaluator(labelCol="intelligence", predictionCol="prediction", metricName="f1")
precisionEval = MulticlassClassificationEvaluator(labelCol="intelligence", predictionCol="prediction", metricName="weightedPrecision")
recallEval = MulticlassClassificationEvaluator(labelCol="intelligence", predictionCol="prediction", metricName="weightedRecall")

rfClassifier = rfClassifier.fit(trainData)
predictions = rfClassifier.transform(testData)
print("Accuracy: " + str(accuracyEval.evaluate(predictions)))
print("f1: " + str(f1Eval.evaluate(predictions)))
print("Precision: " + str(precisionEval.evaluate(predictions)))
print("Recall: " + str(recallEval.evaluate(predictions)))

Accuracy: 0.8976234003656307
f1: 0.8970956070770195
Precision: 0.8995416897536028
Recall: 0.8976234003656307


In [9]:
# GBT Classifier
spark.sparkContext.setJobGroup("GBTClassifier", "Initialize and train GBTClassifier Model")
gbtClassifier = GBTClassifier(featuresCol='mlFeatures', labelCol='intelligence',
                                 stepSize=0.1, maxIter=50, maxDepth=4)
accuracyEval = MulticlassClassificationEvaluator(labelCol="intelligence", predictionCol="prediction", metricName="accuracy")
f1Eval = MulticlassClassificationEvaluator(labelCol="intelligence", predictionCol="prediction", metricName="f1")
precisionEval = MulticlassClassificationEvaluator(labelCol="intelligence", predictionCol="prediction", metricName="weightedPrecision")
recallEval = MulticlassClassificationEvaluator(labelCol="intelligence", predictionCol="prediction", metricName="weightedRecall")

gbtClassifier = gbtClassifier.fit(trainData)
predictions = gbtClassifier.transform(testData)
print("Accuracy: " + str(accuracyEval.evaluate(predictions)))
print("f1: " + str(f1Eval.evaluate(predictions)))
print("Precision: " + str(precisionEval.evaluate(predictions)))
print("Recall: " + str(recallEval.evaluate(predictions)))

Accuracy: 0.8711151736745887
f1: 0.8704305330639253
Precision: 0.8727623793669059
Recall: 0.8711151736745887


In [10]:
# Naive Bayes
spark.sparkContext.setJobGroup("NaiveBayes", "Initialize and train NaiveBayes Model")
nbModel = NaiveBayes(featuresCol="mlFeatures", labelCol="intelligence", modelType="gaussian")
accuracyEval = MulticlassClassificationEvaluator(labelCol="intelligence", predictionCol="prediction", metricName="accuracy")
f1Eval = MulticlassClassificationEvaluator(labelCol="intelligence", predictionCol="prediction", metricName="f1")
precisionEval = MulticlassClassificationEvaluator(labelCol="intelligence", predictionCol="prediction", metricName="weightedPrecision")
recallEval = MulticlassClassificationEvaluator(labelCol="intelligence", predictionCol="prediction", metricName="weightedRecall")

nbModel = nbModel.fit(trainData)
predictions = nbModel.transform(testData)
print("Accuracy: " + str(accuracyEval.evaluate(predictions)))
print("f1: " + str(f1Eval.evaluate(predictions)))
print("Precision: " + str(precisionEval.evaluate(predictions)))
print("Recall: " + str(recallEval.evaluate(predictions)))

Accuracy: 0.6252285191956124
f1: 0.6236559356851701
Precision: 0.6406645156384818
Recall: 0.6252285191956124
