# Model Selection with PySpark ML

In [1]:
import findspark
findspark.init()
import pyspark

In [2]:
data_path = "../.assets/data/titanic/titanic.csv"

In [3]:
spark = pyspark.sql.SparkSession \
    .builder \
    .appName("TitanicClassifier") \
    .getOrCreate()


In [4]:
from pyspark.sql.types import StructType, StructField, DoubleType, IntegerType, StringType

schema = StructType([
             StructField('PassengerId', StringType()),
             StructField('Survived', IntegerType()),
             StructField('Pclass', IntegerType()),
             StructField('Name', StringType()),
             StructField('Sex', StringType()),
             StructField('Age', IntegerType()),
             StructField('SibSp', IntegerType()),
             StructField('Parch', IntegerType()),
             StructField('Ticket', StringType()),
             StructField('Fare', DoubleType()),
             StructField('Cabin', StringType()),
             StructField('Embarked', StringType())
        ])


In [5]:
data = spark.read.csv(data_path, header=True, schema=schema)


In [6]:
data.show(5)

+-----------+--------+------+--------------------+------+---+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex|Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+---+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male| 22|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female| 38|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female| 26|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female| 35|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male| 35|    0|    0|          373450|   8.05| null|       S|
+-----------+--------+------+--------------------+------+---+-----+-----+---------------

In [20]:
from pyspark.ml import Transformer, Estimator, Pipeline, PipelineModel
from pyspark.ml.feature import VectorAssembler, StringIndexer

## Algorithm Selection

In [21]:
from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier, GBTClassifier, LogisticRegression, NaiveBayes

In [28]:
classifiers = {
    "Decision Tree": DecisionTreeClassifier,
    "Random Forest": RandomForestClassifier,
    "Gradient-boosted Trees": GBTClassifier,
    "Logistic Regression": LogisticRegression,
    "Naive Bayes": NaiveBayes
}

In [29]:
preprocessing_stages = [
    StringIndexer(inputCol="Sex", outputCol="Sex_encoded"),
    StringIndexer(inputCol="Embarked", outputCol="Embarked_encoded")
]

In [30]:
assemble_features = VectorAssembler(
    inputCols=["Age", "Fare", "Sex_encoded", "Embarked_encoded"], 
    outputCol="features"
)

In [32]:
pipelines = { 
    Pipeline(
        stages=preprocessing_stages + [assemble_features, classifier_class()]
    )
    for (classifier_name, classifier_class) in classifiers.items()
}

In [26]:
pipelines

{Pipeline_092d1eb1adb6,
 Pipeline_137ae2c21fc8,
 Pipeline_45e8768051f2,
 Pipeline_7cb80ff1f86f,
 Pipeline_8b8ebe08f570}

## Hyperparameter Tuning

---
_This notebook is licensed under a [Creative Commons Attribution 4.0 International License (CC BY 4.0)](https://creativecommons.org/licenses/by/4.0/). Copyright © 2019 [Point 8 GmbH](https://point-8.de)_

