In [1]:
from __future__ import print_function, division
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.mllib.regression import LabeledPoint

import time
import os
import csv
from numpy import array

In [2]:
spark = SparkSession.builder.master("local[2]") \
   .appName("test") \
   .enableHiveSupport() \
   .getOrCreate()

sc = spark.sparkContext

## Data
https://www.kaggle.com/c/titanic/data

In [3]:
df_train = spark.read.csv("../data/titanic_train.csv", header=True)
df_test = spark.read.csv("../data/titanic_test.csv", header=True)

In [4]:
df_train.count()

891

In [5]:
df_test.count()

418

In [6]:
df_train

DataFrame[PassengerId: string, Survived: string, Pclass: string, Name: string, Sex: string, Age: string, SibSp: string, Parch: string, Ticket: string, Fare: string, Cabin: string, Embarked: string]

In [7]:
df_train.show()

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|  22|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|  38|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|  26|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|  35|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|  35|    0|    0|          373450|   8.05| null|       S|
|          6|       0|     3|    Moran, Mr. James|  male|null|    0|    0|      

In [8]:
df_train.printSchema()

root
 |-- PassengerId: string (nullable = true)
 |-- Survived: string (nullable = true)
 |-- Pclass: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- SibSp: string (nullable = true)
 |-- Parch: string (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: string (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [9]:
from pyspark.sql.functions import lit, col
df_train = df_train.withColumn('Mark',lit('train'))
df_test = (df_test.withColumn('Survived',lit(0))
                  .withColumn('Mark',lit('test')))

df_test = df_test[df_train.columns]
## Append Test data to Train data
df = df_train.unionAll(df_test)

In [10]:
df

DataFrame[PassengerId: string, Survived: string, Pclass: string, Name: string, Sex: string, Age: string, SibSp: string, Parch: string, Ticket: string, Fare: string, Cabin: string, Embarked: string, Mark: string]

In [11]:
# Convert Age, SibSp, Parch, Fare to Numeric
df = (df.withColumn('Age',df['Age'].cast("double"))
            .withColumn('SibSp',df['SibSp'].cast("double"))
            .withColumn('Parch',df['Parch'].cast("double"))
            .withColumn('Fare',df['Fare'].cast("double"))
            .withColumn('Survived',df['Survived'].cast("double"))
            )
df.printSchema()

root
 |-- PassengerId: string (nullable = true)
 |-- Survived: double (nullable = true)
 |-- Pclass: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: double (nullable = true)
 |-- Parch: double (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)
 |-- Mark: string (nullable = false)



In [12]:
df.count()

1309

In [13]:
df.groupBy('mark').count().show()

+-----+-----+
| mark|count|
+-----+-----+
|train|  891|
| test|  418|
+-----+-----+



## Impute missing Age and Fare with the Average

In [14]:
numVars = ['Survived','Age','SibSp','Parch','Fare']
def countNull(df, var):
    return df.where(df[var].isNull()).count()
 
missing = {var: countNull(df,var) for var in numVars}


In [15]:
missing

{'Age': 263, 'Fare': 1, 'Parch': 0, 'SibSp': 0, 'Survived': 0}

In [16]:
age_mean = df.groupBy().mean('Age').first()[0]
fare_mean = df.groupBy().mean('Fare').first()[0]
df = df.na.fill({'Age':age_mean,'Fare':fare_mean, 'Parch':0, 'Sex':'male', 'Embarked': 'S'})

In [17]:
missing = {var: countNull(df, var) for var in numVars}
missing



{'Age': 0, 'Fare': 0, 'Parch': 0, 'SibSp': 0, 'Survived': 0}

In [18]:
## Impute missing Embark

df = df.na.fill({'Embarked': 'S'})

In [19]:
df.show()

+-----------+--------+------+--------------------+------+------------------+-----+-----+----------------+-------+-----+--------+-----+
|PassengerId|Survived|Pclass|                Name|   Sex|               Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked| Mark|
+-----------+--------+------+--------------------+------+------------------+-----+-----+----------------+-------+-----+--------+-----+
|          1|     0.0|     3|Braund, Mr. Owen ...|  male|              22.0|  1.0|  0.0|       A/5 21171|   7.25| null|       S|train|
|          2|     1.0|     1|Cumings, Mrs. Joh...|female|              38.0|  1.0|  0.0|        PC 17599|71.2833|  C85|       C|train|
|          3|     1.0|     3|Heikkinen, Miss. ...|female|              26.0|  0.0|  0.0|STON/O2. 3101282|  7.925| null|       S|train|
|          4|     1.0|     1|Futrelle, Mrs. Ja...|female|              35.0|  1.0|  0.0|          113803|   53.1| C123|       S|train|
|          5|     0.0|     3|Allen, Mr. Willia...|  mal

# Extract Title from Name

In [20]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
 
## created user defined function to extract title
getTitle = udf(lambda name: name.split('.')[0].strip(),StringType())
df = df.withColumn('Title', getTitle(df['Name']))
 
df.select('Name','Title').show(3)


+--------------------+---------------+
|                Name|          Title|
+--------------------+---------------+
|Braund, Mr. Owen ...|     Braund, Mr|
|Cumings, Mrs. Joh...|   Cumings, Mrs|
|Heikkinen, Miss. ...|Heikkinen, Miss|
+--------------------+---------------+
only showing top 3 rows



## Index categorical variable

In [21]:
df.select('Sex').show()

+------+
|   Sex|
+------+
|  male|
|female|
|female|
|female|
|  male|
|  male|
|  male|
|  male|
|female|
|female|
|female|
|female|
|  male|
|  male|
|female|
|female|
|  male|
|  male|
|female|
|female|
+------+
only showing top 20 rows



In [22]:
## index Sex variable
from pyspark.ml.feature import StringIndexer
si = StringIndexer(inputCol = 'Sex', outputCol = 'Sex_indexed')
df_indexed = si.fit(df).transform(df).drop('Sex').withColumnRenamed('Sex_indexed','Sex')


In [23]:
df_indexed.select('Sex').show(5)

+---+
|Sex|
+---+
|0.0|
|1.0|
|1.0|
|1.0|
|0.0|
+---+
only showing top 5 rows



In [24]:
## make use of pipeline to index all categorical variables
catVars = ['Pclass','Sex','Embarked','Title']
  
## make use of pipeline to index all categorical variables
def indexer(df,col):
    si = StringIndexer(inputCol = col, outputCol = col+'_indexed').fit(df)
    return si
 
indexers = [indexer(df,col) for col in catVars]
 
from pyspark.ml import Pipeline
pipeline = Pipeline(stages = indexers)
df_indexed = pipeline.fit(df).transform(df)
 
df_indexed.select('Embarked','Embarked_indexed').show(3)


+--------+----------------+
|Embarked|Embarked_indexed|
+--------+----------------+
|       S|             0.0|
|       C|             1.0|
|       S|             0.0|
+--------+----------------+
only showing top 3 rows



## Convert to label/features format

In [25]:
catVarsIndexed = [i+'_indexed' for i in catVars]
featuresCol = numVars+catVarsIndexed
featuresCol.remove('Survived')
labelCol = ['Mark','Survived']

In [26]:
from pyspark.sql import Row
from pyspark.ml.linalg import Vectors
row = Row('mark','label','features')
 
df_indexed = df_indexed[labelCol+featuresCol]


In [27]:
# 0-mark, 1-label, 2-features
# map features to DenseVector
lf = df_indexed.rdd.map(lambda r: (row(r[0], r[1], Vectors.dense(r[2:])))).toDF()
# index label

 
lf.show(3)


+-----+-----+--------------------+
| mark|label|            features|
+-----+-----+--------------------+
|train|  0.0|[22.0,1.0,0.0,7.2...|
|train|  1.0|[38.0,1.0,0.0,71....|
|train|  1.0|[26.0,0.0,0.0,7.9...|
+-----+-----+--------------------+
only showing top 3 rows



In [None]:
# convert numeric label to categorical, which is required by
# decisionTree and randomForest
lf = StringIndexer(inputCol = 'label',outputCol='index').fit(lf).transform(lf)

In [30]:
lf.show()

+-----+-----+--------------------+-----+
| mark|label|            features|index|
+-----+-----+--------------------+-----+
|train|  0.0|[22.0,1.0,0.0,7.2...|  0.0|
|train|  1.0|[38.0,1.0,0.0,71....|  1.0|
|train|  1.0|[26.0,0.0,0.0,7.9...|  1.0|
|train|  1.0|[35.0,1.0,0.0,53....|  1.0|
|train|  0.0|[35.0,0.0,0.0,8.0...|  0.0|
|train|  0.0|[29.8811376673040...|  0.0|
|train|  0.0|[54.0,0.0,0.0,51....|  0.0|
|train|  0.0|[2.0,3.0,1.0,21.0...|  0.0|
|train|  1.0|[27.0,0.0,2.0,11....|  1.0|
|train|  1.0|[14.0,1.0,0.0,30....|  1.0|
|train|  1.0|[4.0,1.0,1.0,16.7...|  1.0|
|train|  1.0|[58.0,0.0,0.0,26....|  1.0|
|train|  0.0|[20.0,0.0,0.0,8.0...|  0.0|
|train|  0.0|[39.0,1.0,5.0,31....|  0.0|
|train|  0.0|[14.0,0.0,0.0,7.8...|  0.0|
|train|  1.0|[55.0,0.0,0.0,16....|  1.0|
|train|  0.0|[2.0,4.0,1.0,29.1...|  0.0|
|train|  1.0|[29.8811376673040...|  1.0|
|train|  0.0|[31.0,1.0,0.0,18....|  0.0|
|train|  1.0|[29.8811376673040...|  1.0|
+-----+-----+--------------------+-----+
only showing top

In [31]:
lf.groupBy('mark').count().show()

+-----+-----+
| mark|count|
+-----+-----+
|train|  891|
| test|  418|
+-----+-----+



In [32]:
train = lf.where(lf.mark =='train')
test = lf.where(lf.mark =='test')
 
# random split further to get train/validate
train,validate = train.randomSplit([0.7,0.3],seed =121)

In [33]:
print('Train Data Number of Row: '+ str(train.count()))
print('Validate Data Number of Row: '+ str(validate.count()))
print('Test Data Number of Row: '+ str(test.count()))


Train Data Number of Row: 637
Validate Data Number of Row: 254
Test Data Number of Row: 418


In [None]:
train.select('index','features').rdd.take(5)

# Apply Models from ML/MLLIB

## Logistic Regression

In [34]:
from pyspark.ml.classification import LogisticRegression
 
# regPara: lasso regularisation parameter (L1)
lr = LogisticRegression(maxIter = 100, regParam = 0.05, labelCol='index').fit(train.select('index','features'))
 
# Evaluate model based on auc ROC(default for binary classification)
from pyspark.ml.evaluation import BinaryClassificationEvaluator
 
def testModel(model, validate = validate):
    pred = model.transform(validate)
    evaluator = BinaryClassificationEvaluator(labelCol = 'index')
    return evaluator.evaluate(pred)
 
print ('AUC ROC of Logistic Regression model is: '+str(testModel(lr)))


AUC ROC of Logistic Regression model is: 0.8160377358490569


In [38]:
from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier
 
dt = DecisionTreeClassifier(maxDepth = 5, labelCol ='index').fit(train)
rf = RandomForestClassifier(numTrees = 10, labelCol = 'index').fit(train)
 

 

In [39]:
models = {'LogisticRegression':lr,
          'DecistionTree':dt,
          'RandomForest':rf}
 
modelPerf = {k:testModel(v) for k,v in models.items()}

In [40]:
modelPerf

{'DecistionTree': 0.5789775624681285,
 'LogisticRegression': 0.8160377358490568,
 'RandomForest': 0.8465068842427337}