In [337]:
#load required libraries
import pyspark
from pyspark.sql import Row
from pyspark.ml.classification import LogisticRegression,DecisionTreeClassifier,RandomForestClassifier,GBTClassifier,NaiveBayes,MultilayerPerceptronClassifier
from pyspark.ml.linalg import Vectors
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

#load data
data = sc.textFile("data.csv")

#Perform ETL
def _to_labeled_row(fields):
    """Turn one split CSV record into a ``Row(features, label)``.

    Every field is stripped of surrounding double quotes and spaces and parsed
    as a float. All columns except the last become the dense feature vector;
    the last column is converted to the integer class label.
    """
    values = [float(field.strip('"').strip(" ")) for field in fields]
    return Row(features=Vectors.dense(values[:-1]), label=int(values[-1]))

rows = (
    # blank lines cannot be parsed as floats, so drop them up front
    data.filter(lambda line: line.strip() != "")
    .map(lambda line: line.split(","))
    # drop the header record, recognised by "target" in its last column
    .filter(lambda fields: "target" not in fields[-1])
    .map(_to_labeled_row)
)
df = spark.createDataFrame(rows)

#separate training and test data
# A fixed seed keeps the 80/20 split -- and therefore the reported metrics --
# repeatable from one run to the next.
trainingData, testData = df.randomSplit([0.8, 0.2], seed=1234)

#Candidate classifiers
# Each entry is a zero-argument factory, so only the estimator named by
# CLASSIFIER_NAME is actually constructed. Change CLASSIFIER_NAME to try a
# different model; nothing else in the script needs to be edited.

# MLP layer sizes, input layer first and output layer last.
# NOTE(review): 30 must match the number of feature columns in data.csv and
# 2 the number of classes -- confirm if the dataset changes.
layers = [30, 50, 40, 2]

classifier_factories = {
    #Multilayer perceptron
    "multilayer_perceptron": lambda: MultilayerPerceptronClassifier(
        maxIter=100, layers=layers, blockSize=128, seed=1234
    ),
    #Logistic regression
    "logistic_regression": lambda: LogisticRegression(
        maxIter=10, regParam=0.3, elasticNetParam=0.8
    ),
    #Decision trees
    "decision_tree": lambda: DecisionTreeClassifier(),
    #Naive Bayes
    # NOTE(review): multinomial Naive Bayes expects non-negative feature
    # values -- verify the data before selecting it.
    "naive_bayes": lambda: NaiveBayes(smoothing=1.0, modelType="multinomial"),
    #Gradient boosted trees
    "gradient_boosted_trees": lambda: GBTClassifier(),
    #Random forest
    "random_forest": lambda: RandomForestClassifier(),
}

# Model used for the training and evaluation steps that follow.
CLASSIFIER_NAME = "random_forest"
clf = classifier_factories[CLASSIFIER_NAME]()


#training
model = clf.fit(trainingData)

#predict
predictions = model.transform(testData)

# Compare the predicted label with the true label, one metric at a time
def _score(metric_name):
    """Return the value of ``metric_name`` computed over ``predictions``."""
    evaluator = MulticlassClassificationEvaluator(
        labelCol="label",
        predictionCol="prediction",
        metricName=metric_name,
    )
    return evaluator.evaluate(predictions)

accuracy = _score("accuracy")
weightedRecall = _score("weightedRecall")
weightedPrecision = _score("weightedPrecision")
f1 = _score("f1")

#print out results
for template, value in (
    ("Test Accuracy = %g ", accuracy),
    ("weighted Recall = %s", weightedRecall),
    ("weighted Precision = %s", weightedPrecision),
    ("f1 = %s", f1),
):
    print(template % value)


Test Accuracy = 0.990991 
weighted Recall = 0.9909909909909911
weighted Precision = 0.9912219912219913
f1 = 0.991018425501184
