In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType, StructType, StructField, StringType
from pyspark.ml.classification import LinearSVC
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.classification import NaiveBayes

In [3]:
spark = SparkSession.builder.master("local[*]").appName("SparkML").getOrCreate()
spark

In [4]:
dfSpark = spark.read.csv("/content/drive/MyDrive/rddPreprocessed", header=True, inferSchema=True)

In [5]:
csvCols = dfSpark.columns
csvCols.remove('intelligence')
featureAssembler = VectorAssembler(inputCols=csvCols, outputCol="mlFeatures")
vaOutput = featureAssembler.transform(dfSpark)
finalizeData = vaOutput.select("mlFeatures", "intelligence")
finalizeData = finalizeData.dropna()
trainData, testData = finalizeData.randomSplit([0.80, 0.20])
finalizeData.show(5)

+--------------------+------------+
|          mlFeatures|intelligence|
+--------------------+------------+
|[0.32208684,0.367...|           0|
|[0.3193533,0.4034...|           0|
|[0.30920026,0.350...|           0|
|[0.30920026,0.364...|           0|
|[0.31380817,0.452...|           0|
+--------------------+------------+
only showing top 5 rows



In [6]:
spark.sparkContext.setJobGroup("LinearSVC", "Initialize and train LinearSVC Model")
svm = LinearSVC(featuresCol="mlFeatures", labelCol="intelligence", regParam=0.01)
svmModel = svm.fit(trainData)
predictions = svmModel.evaluate(testData)
predictions.accuracy

0.7079796264855688

In [7]:
spark.sparkContext.setJobGroup("LogisticRegression", "Initialize and train LogisticRegression Model")
lrModel = LogisticRegression(featuresCol="mlFeatures", labelCol="intelligence", regParam=0.01)
lrModel = lrModel.fit(trainData)
predictions = lrModel.evaluate(testData)
predictions.accuracy

0.6884550084889643

In [8]:
spark.sparkContext.setJobGroup("RandomForestClassifier", "Initialize and train RandomForestClassifier Model")
rfClassifier = RandomForestClassifier(
    featuresCol="mlFeatures", labelCol="intelligence",
    numTrees=20, maxDepth=10
)
rfClassifier = rfClassifier.fit(trainData)
predictions = rfClassifier.evaluate(testData)
predictions.accuracy

0.8786078098471987

In [9]:
spark.sparkContext.setJobGroup("GBTClassifier", "Initialize and train GBTClassifier Model")
gbtClassifier = GBTClassifier(featuresCol='mlFeatures', labelCol='intelligence',
                                 stepSize=0.1, maxIter=50, maxDepth=4)
gbtClassifier = gbtClassifier.fit(trainData)
predictions = gbtClassifier.transform(testData)
evaluator = MulticlassClassificationEvaluator(labelCol="intelligence", predictionCol="prediction", metricName="accuracy")
evaluator.evaluate(predictions)

0.8505942275042445

In [10]:
spark.sparkContext.setJobGroup("NaiveBayes", "Initialize and train NaiveBayes Model")
nbModel = NaiveBayes(featuresCol="mlFeatures", labelCol="intelligence", modelType="gaussian")
nbTransformer = nbModel.fit(trainData)
predictions = nbTransformer.transform(testData)
evaluator = MulticlassClassificationEvaluator(labelCol="intelligence", predictionCol="prediction", metricName="accuracy")
evaluator.evaluate(predictions)

0.6570458404074703