<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Read-data" data-toc-modified-id="Read-data-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Read data</a></span></li><li><span><a href="#Select-columns-and-drop-nans" data-toc-modified-id="Select-columns-and-drop-nans-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Select columns and drop nans</a></span></li><li><span><a href="#Encoding" data-toc-modified-id="Encoding-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Encoding</a></span></li><li><span><a href="#Assembler" data-toc-modified-id="Assembler-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Assembler</a></span></li><li><span><a href="#Create-pipeline" data-toc-modified-id="Create-pipeline-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Create pipeline</a></span></li><li><span><a href="#Model-evaluation" data-toc-modified-id="Model-evaluation-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Model evaluation</a></span></li></ul></div>

In [1]:
import numpy as np
import pandas as pd
import pyspark
from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf # @udf("integer") def myfunc(x,y): return x - y
from pyspark.sql import functions as F # stddev format_number date_format, dayofyear, when
from pyspark.sql.types import StructField, StringType, IntegerType, StructType

print([(x.__name__,x.__version__) for x in [np, pd, pyspark]])

spark = pyspark.sql.SparkSession.builder.appName('titanic').getOrCreate()
sc = spark.sparkContext
sqlContext = SQLContext(sc)
sc.setLogLevel("INFO")

[('numpy', '1.17.1'), ('pandas', '0.25.1'), ('pyspark', '2.4.4')]


In [None]:
from pyspark.ml.feature import (VectorAssembler, VectorIndexer,
                               OneHotEncoder, StringIndexer)

from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline

from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Read data

In [2]:
!head -2 ../data/titanic.csv

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S


In [3]:
df = spark.read.csv('../data/titanic.csv',header=True,inferSchema=True)
print(df.count())
df.show()

891
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| null|       S|
|          6|       0|     3|    Moran, Mr. James|  male|null|    0|    0|  

In [4]:
df.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [5]:
print(df.columns)

['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']


# Select columns and drop nans

In [6]:
my_cols = [ 'Survived', 'Pclass', 'Sex', 'Age',
           'SibSp', 'Parch', 'Fare', 'Embarked']
df = df.select(my_cols)

In [7]:
my_final_data = df.dropna()

# Encoding

In [8]:
df.show(1)

+--------+------+----+----+-----+-----+----+--------+
|Survived|Pclass| Sex| Age|SibSp|Parch|Fare|Embarked|
+--------+------+----+----+-----+-----+----+--------+
|       0|     3|male|22.0|    1|    0|7.25|       S|
+--------+------+----+----+-----+-----+----+--------+
only showing top 1 row



In [9]:
from pyspark.ml.feature import (VectorAssembler, VectorIndexer,
                               OneHotEncoder, StringIndexer)

In [10]:
gender_indexer = StringIndexer(inputCol='Sex', outputCol='Sex_index')

gender_encoder = OneHotEncoder(inputCol='Sex_index', outputCol='Sex_vec')

In [11]:
embark_indexer = StringIndexer(inputCol='Embarked', outputCol='Embarked_index')

embark_encoder = OneHotEncoder(inputCol='Embarked_index', outputCol='Embarked_vec')

# Assembler

In [21]:
my_final_data.columns

['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']

In [22]:
assembler = VectorAssembler(inputCols=['Pclass','Sex_vec','Embarked_vec',
                                      'Age','SibSp','Parch','Fare'],
                           outputCol='features')

# Create pipeline

In [23]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline

In [24]:
lr = LogisticRegression(featuresCol='features',labelCol='Survived')

In [25]:
pipeline = Pipeline(stages=[gender_indexer, embark_indexer,
                           gender_encoder, embark_encoder,
                           assembler, lr])

In [26]:
train, test = my_final_data.randomSplit([0.7, 0.3])

In [27]:
lr_model = pipeline.fit(train)

In [31]:
results = lr_model.transform(test)
results.show(2)

+--------+------+------+----+-----+-----+------+--------+---------+--------------+-------------+-------------+--------------------+--------------------+--------------------+----------+
|Survived|Pclass|   Sex| Age|SibSp|Parch|  Fare|Embarked|Sex_index|Embarked_index|      Sex_vec| Embarked_vec|            features|       rawPrediction|         probability|prediction|
+--------+------+------+----+-----+-----+------+--------+---------+--------------+-------------+-------------+--------------------+--------------------+--------------------+----------+
|       0|     1|female| 2.0|    1|    2|151.55|       S|      1.0|           0.0|    (1,[],[])|(2,[0],[1.0])|[1.0,0.0,1.0,0.0,...|[-4.0693712215344...|[0.01680103157033...|       1.0|
|       0|     1|  male|19.0|    1|    0|  53.1|       S|      0.0|           0.0|(1,[0],[1.0])|(2,[0],[1.0])|[1.0,1.0,1.0,0.0,...|[-0.4982229035736...|[0.37795838384935...|       1.0|
+--------+------+------+----+-----+-----+------+--------+---------+--------

# Model evaluation

In [28]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [29]:
bc_eval = BinaryClassificationEvaluator(rawPredictionCol='prediction',
                                       labelCol='Survived')

In [32]:
results.select('Survived','prediction').show()

+--------+----------+
|Survived|prediction|
+--------+----------+
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       1.0|
|       0|       1.0|
+--------+----------+
only showing top 20 rows



In [33]:
auc = bc_eval.evaluate(results)
auc

0.7454212454212453