In [3]:
# Import packages
import os
import time
import pyspark
import findspark
import numpy as np
import pandas as pd

from pyspark import SparkContext, SparkConf
from pyspark.mllib.regression import LabeledPoint

from pyspark.ml import Pipeline
#from pyspark.ml.feature import *
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, StopWordsRemover, CountVectorizer
from pyspark.ml.feature import StringIndexer, VectorIndexer, OneHotEncoder, VectorAssembler, IndexToString

from pyspark.ml.linalg import Vectors
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

#from pyspark.sql.functions import *
import pyspark.sql.functions as F
from pyspark.sql.functions import col, when, explode
from pyspark.sql import SparkSession, SQLContext, Row, HiveContext

In [4]:
# Let findspark locate the local Spark installation for this Python process.
findspark.init()

# Create (or reuse) the Hive-enabled SparkSession that backs the Spark SQL environment.
spark = (
    SparkSession.builder
    .appName("test")
    .enableHiveSupport()
    .getOrCreate()
)

# Handles derived from the session: the SparkContext and a SQLContext built on it.
sc = spark.sparkContext
sqlContext = SQLContext(sc)

# Last expression of the cell: displays the Spark home directory findspark resolved.
findspark.find()

'C:\\spark\\spark-3.0.1-bin-hadoop2.7'

In [5]:
# Load the Titanic training CSV through the existing SparkSession (`spark`).
# The first row supplies the column names; no schema is inferred, so every
# column is read as a string and cast later where needed.
# The file location can be overridden with the TITANIC_TRAIN_CSV environment
# variable; it defaults to the original local path.
train_csv_path = os.environ.get(
    "TITANIC_TRAIN_CSV",
    "C:/Users/affiqazrin/Desktop/dataset/titanic/train.csv",
)
train = (
    spark.read.format("csv")
    .option("header", "true")
    .load(train_csv_path)
)

In [6]:
# Keep the label and categorical columns as read, and cast the numeric feature
# columns from string to float. The result replaces `train`.
categorical_columns = ["Survived", "Sex", "Embarked"]
numeric_columns = ["Pclass", "Age", "SibSp", "Fare"]
train = train.select(
    *[col(name) for name in categorical_columns],
    *[col(name).cast("float") for name in numeric_columns],
)

In [7]:
# Discard every row that has a null in any of the selected columns.
train = train.na.drop()

In [8]:
# Split into train (~70%) and test (~30%) sets.
# A fixed seed makes the split, and every metric computed from it, reproducible
# across runs; without it each execution draws a different split.
# Beware: it sorts the dataset, so rows do not keep their original order.
(traindf, testdf) = train.randomSplit([0.7, 0.3], seed=42)

In [9]:
# Index the categorical string columns. These are unfitted estimators: they learn
# their string -> index mapping when the pipeline is fitted, i.e. on the training
# split only (not on the whole dataset).
genderIndexer = StringIndexer(inputCol="Sex", outputCol="indexedSex")
embarkIndexer = StringIndexer(inputCol="Embarked", outputCol="indexedEmbarked")

# Label indexer. The default ordering (frequencyDesc) gives index 0.0 to the most
# frequent class, so "1" could become 0.0 on a split where survivors are the majority.
# Alphabetical ordering pins "0" -> 0.0 and "1" -> 1.0, which the evaluation cells
# rely on when they compare `prediction` with `Survived` cast to float.
surviveIndexer = StringIndexer(inputCol="Survived",
                               outputCol="indexedSurvived",
                               stringOrderType="alphabetAsc")

In [10]:
# One-hot encode each indexed categorical column into its own vector column.
genderEncoder, embarkEncoder = (
    OneHotEncoder(inputCol=indexed_col, outputCol=encoded_col)
    for indexed_col, encoded_col in (
        ("indexedSex", "sexVec"),
        ("indexedEmbarked", "embarkedVec"),
    )
)

In [11]:
# Columns concatenated, in this order, into the single "features" vector
# consumed by the classifier.
feature_columns = ["Pclass", "sexVec", "Age", "SibSp", "Fare", "embarkedVec"]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

In [12]:
# Random forest classifier trained on the assembled feature vector.
# An explicit seed keeps the forest's random sampling, and therefore the fitted
# model and its predictions, reproducible across runs.
rf = RandomForestClassifier(labelCol="indexedSurvived",
                            featuresCol="features",
                            seed=42)

# Chain indexers, encoders, assembler and forest in a Pipeline.
# Order matters: each stage consumes columns produced by the stages before it.
pipeline = Pipeline(stages=[surviveIndexer,
                            genderIndexer,
                            embarkIndexer,
                            genderEncoder,
                            embarkEncoder,
                            assembler,
                            rf])

# Train model. This also fits the indexers and encoders, on the training split only.
model = pipeline.fit(traindf)

# Predictions on the held-out split
predictions = model.transform(testdf)

In [13]:
# Show example rows: predicted label, true label and the assembled feature vector.
predictions.select("prediction", "Survived", "features").show()

+----------+--------+--------------------+
|prediction|Survived|            features|
+----------+--------+--------------------+
|       1.0|       0|[3.0,0.0,18.0,0.0...|
|       0.0|       0|(7,[0,2,4],[3.0,3...|
|       1.0|       0|[1.0,0.0,2.0,1.0,...|
|       1.0|       0|[2.0,0.0,24.0,0.0...|
|       1.0|       0|[2.0,0.0,38.0,0.0...|
|       1.0|       0|[2.0,0.0,57.0,0.0...|
|       0.0|       0|[3.0,0.0,2.0,4.0,...|
|       0.0|       0|[3.0,0.0,8.0,3.0,...|
|       0.0|       0|[3.0,0.0,10.0,0.0...|
|       0.0|       0|[3.0,0.0,11.0,4.0...|
|       0.0|       0|[3.0,0.0,18.0,0.0...|
|       0.0|       0|[3.0,0.0,18.0,2.0...|
|       0.0|       0|[3.0,0.0,20.0,0.0...|
|       0.0|       0|[3.0,0.0,20.0,1.0...|
|       0.0|       0|[3.0,0.0,22.0,0.0...|
|       0.0|       0|[3.0,0.0,25.0,0.0...|
|       0.0|       0|[3.0,0.0,25.0,1.0...|
|       1.0|       0|[3.0,0.0,26.0,1.0...|
|       0.0|       0|[3.0,0.0,30.0,1.0...|
|       1.0|       0|[3.0,0.0,31.0,1.0...|
+----------

In [14]:
# Collect the predictions to the driver as a pandas DataFrame and save them as CSV.
predictions_pd = predictions.toPandas()
predictions_pd.to_csv('titanic_READY.csv')

In [15]:
# Keep only the true label (cast to a numeric type for the evaluator) and the
# predicted label. Note: this replaces `predictions`, dropping its other columns.
true_label = col("Survived").cast("Float")
predicted_label = col("prediction")
predictions = predictions.select(true_label, predicted_label)

# Accuracy of the predictions against the true labels.
evaluator = MulticlassClassificationEvaluator(
    labelCol="Survived",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(predictions)

# The test error is the complement of the accuracy.
test_error = 1.0 - accuracy
print("Test Error = %g" % test_error)

Test Error = 0.22


In [16]:
# The classifier is the last stage of the fitted pipeline; indexing from the end
# stays correct if preprocessing stages are added or removed.
rfModel = model.stages[-1]
print(rfModel)  # summary only


def _metric_evaluator(metric_name):
    """Build an evaluator scoring `prediction` against `Survived` with the given metric."""
    return MulticlassClassificationEvaluator(labelCol="Survived",
                                             predictionCol="prediction",
                                             metricName=metric_name)


# Report each metric on the (label, prediction) frame built in the previous cell.
evaluator = _metric_evaluator("accuracy")
accuracy = evaluator.evaluate(predictions)
print("Accuracy = %g" % accuracy)

evaluatorf1 = _metric_evaluator("f1")
f1 = evaluatorf1.evaluate(predictions)
print("f1 = %g" % f1)

evaluatorwp = _metric_evaluator("weightedPrecision")
wp = evaluatorwp.evaluate(predictions)
print("weightedPrecision = %g" % wp)

evaluatorwr = _metric_evaluator("weightedRecall")
wr = evaluatorwr.evaluate(predictions)
print("weightedRecall = %g" % wr)

RandomForestClassificationModel: uid=RandomForestClassifier_b9c83511cc69, numTrees=20, numClasses=2, numFeatures=7
Accuracy = 0.78
f1 = 0.773593
weightedPrecision = 0.78431
weightedRecall = 0.78


In [None]:
# Shut down the SparkContext behind the session (the same object bound to `sc`).
spark.sparkContext.stop()