# Dataset acquisition

In [2]:
# Preview the first three raw lines of the training file (header + two records).
# Done in plain Python rather than a shell escape so the cell runs on any OS.
with open("../data/train.csv", encoding="utf-8") as csv_file:
    for _ in range(3):
        print(csv_file.readline(), end="")

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38,1,0,PC 17599,71.2833,C85,C


In [210]:
from pyspark.sql import SparkSession, DataFrame

# Spark session running against the "local" master, registered under the app
# name "titanic"; getOrCreate() hands back an already-running session if any.
spark = SparkSession.builder.master("local").appName("titanic").getOrCreate()

# Read the training CSV: first line is the header, column types are inferred.
train_df = spark.read.csv("../data/train.csv", header=True, inferSchema=True)
train_df.show(5)

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| null|       S|
+-----------+--------+------+--------------------+------+----+-----+-----+------

# Feature engineering and preprocessing

In [212]:
import re

from pyspark.ml import Pipeline, Transformer, Estimator, Model
from pyspark.ml.feature import OneHotEncoder, StringIndexer, StandardScaler, VectorAssembler
from pyspark.ml.linalg import Vectors
from pyspark.ml.regression import LinearRegression
from pyspark.sql.functions import col, regexp_extract, udf
from pyspark.sql.types import IntegerType, DoubleType, StringType

class ExtractAccompaniedFeature(Transformer):
    """Add an integer ``Accompanied_encoded`` flag column.

    The flag is the integer cast of ``SibSp + Parch >= 1``.
    """

    def _transform(self, dataset):
        """Return ``dataset`` with the ``Accompanied_encoded`` column appended.

        ``_transform`` is the hook that ``Transformer`` declares abstract; the
        inherited public ``transform(dataset, params=None)`` delegates to it.
        Overriding only ``transform`` leaves the class abstract on PySpark
        versions that enforce the ABC.

        :param dataset: DataFrame containing ``SibSp`` and ``Parch`` columns.
        :return: a new DataFrame with the extra column.
        """
        has_company = (dataset["SibSp"] + dataset["Parch"]) >= 1
        return dataset.withColumn(
            "Accompanied_encoded",
            has_company.cast(IntegerType())
        )

class HandleMissingEmbarked(Estimator):
    """Estimator that learns the most frequent ``Embarked`` value.

    Fitting produces a ``HandleMissingEmbarkedModel`` carrying that value.
    """

    def _fit(self, dataset):
        """Compute the mode of ``Embarked`` and wrap it in a model.

        ``_fit`` is the hook that ``Estimator`` declares abstract; the
        inherited public ``fit(dataset, params=None)`` delegates to it.

        :param dataset: DataFrame containing an ``Embarked`` column.
        :return: ``HandleMissingEmbarkedModel`` holding the most frequent value.
        :raises ValueError: if ``Embarked`` has no non-null value to learn from.
        """
        top_row = (
            # Nulls are excluded so that "missing" can never be picked as the
            # fill value (fillna would reject None).
            dataset.dropna(subset=["Embarked"])
            .groupBy("Embarked")
            .count()
            # Secondary sort key makes the choice deterministic on ties.
            .orderBy(["count", "Embarked"], ascending=[False, True])
            .first()
        )
        if top_row is None:
            raise ValueError(
                "Cannot impute 'Embarked': the column has no non-null values."
            )
        return HandleMissingEmbarkedModel(top_row["Embarked"])
        
class HandleMissingEmbarkedModel(Model):
    """Model that fills null ``Embarked`` entries with a fixed value."""

    def __init__(self, mostFrequentValue):
        """Store the fill value.

        :param mostFrequentValue: value written into null ``Embarked`` cells.
        """
        # Initialise the Params/Identifiable machinery of the base class.
        super().__init__()
        self.mostFrequentValue = mostFrequentValue

    def _transform(self, dataset):
        """Return ``dataset`` with null ``Embarked`` values replaced.

        ``_transform`` is the hook that the base transformer declares
        abstract; the inherited public ``transform(dataset, params=None)``
        delegates to it.

        :param dataset: DataFrame containing an ``Embarked`` column.
        :return: a new DataFrame; only the ``Embarked`` column is filled.
        """
        return dataset.fillna(self.mostFrequentValue, subset=["Embarked"])

@udf(returnType=StringType())
def replace_title(s):
    """Rewrite rare honorifics inside a string to a common equivalent.

    The pattern families are tried in order (Mrs., Miss., Mr.). The first
    family that matches is substituted everywhere in ``s`` and the result is
    returned; later families are not applied. A string matching no family is
    returned unchanged.

    :param s: input text, or ``None`` for a null cell.
    :return: the rewritten text, or ``None`` when ``s`` is ``None``.
    """
    # A null cell reaches the UDF as None; re.search would raise on it.
    if s is None:
        return None

    # Raw strings: "\." is an invalid escape sequence in a normal literal.
    substitutions = (
        (r"(Mme\.|Ms\.|Countess\.|Lady\.)", "Mrs."),
        (r"(Mlle\.)", "Miss."),
        (r"(Don\.|Major\.|Sir\.|Col\.|Capt\.)", "Mr."),
    )
    for pattern, canonical in substitutions:
        if re.search(pattern, s):
            return re.sub(pattern, canonical, s)
    return s

@udf
def replace_empty(s):
    """Map the empty string to ``"No-Title"``; return any other value as is."""
    return "No-Title" if s == "" else s

class ExtractTitle(Transformer):
    """Derive a ``Title`` column from the ``Name`` column.

    ``Title`` is the first match of a fixed honorific pattern in ``Name``.
    When ``regexp_extract`` yields an empty string, ``replace_empty`` turns it
    into ``"No-Title"``.
    """

    def _transform(self, dataset):
        """Return ``dataset`` with the ``Title`` column added.

        ``_transform`` is the hook that ``Transformer`` declares abstract; the
        inherited public ``transform(dataset, params=None)`` delegates to it.

        :param dataset: DataFrame containing a ``Name`` column.
        :return: a new DataFrame with a ``Title`` column.
        """
        # NOTE(review): only these six honorifics are recognised, and
        # `replace_title` is not applied to `Name` beforehand, so titles such
        # as "Mme." or "Col." end up as "No-Title" -- confirm this is intended.
        titles_extract_pattern = r'(Mr\.|Mrs\.|Miss\.|Master\.|Dr\.|Rev\.)'
        with_raw_title = dataset.withColumn(
            "Title", regexp_extract("Name", titles_extract_pattern, 1)
        )
        return with_raw_title.withColumn("Title", replace_empty("Title"))

class HandleMissingAge(Estimator):
    """Estimator that learns a regression for imputing missing ``Age``.

    A linear regression is fitted on the rows whose ``Age`` is present, using
    the ``Pclass_encoded`` and ``Sex_encoded`` columns as features.
    """

    def __init__(self):
        """Build the (unfitted) assembler + regression pipeline."""
        # Initialise the Params/Identifiable machinery of the base class.
        super().__init__()

        assembler = VectorAssembler(
            inputCols=["Pclass_encoded", "Sex_encoded"],
            outputCol='features_class_sex'
        )

        regressor = LinearRegression(
            featuresCol="features_class_sex",
            labelCol='Age',
            predictionCol='Age_imputed',
            regParam=0.3
        )

        self.pipe = Pipeline(stages=[assembler, regressor])

    def _fit(self, dataset):
        """Fit the age regressor on rows with a known ``Age``.

        ``_fit`` is the hook that ``Estimator`` declares abstract; the
        inherited public ``fit(dataset, params=None)`` delegates to it.

        :param dataset: DataFrame containing ``Age``, ``Pclass_encoded`` and
            ``Sex_encoded`` columns.
        :return: ``HandleMissingAgeModel`` wrapping the fitted pipeline.
        """
        # Rows without a label cannot be used to train the regression.
        known_age = dataset.where(dataset["Age"].isNotNull())
        return HandleMissingAgeModel(self.pipe.fit(known_age))

    
class HandleMissingAgeModel(Model):
    """Model that adds an ``Age_imputed`` column.

    Rows with a null ``Age`` receive the regressor's prediction; rows with a
    known ``Age`` receive that value.
    """

    def __init__(self, ageRegressor):
        """Store the fitted regressor.

        :param ageRegressor: fitted pipeline whose ``transform`` adds the
            ``features_class_sex`` and ``Age_imputed`` columns.
        """
        # Initialise the Params/Identifiable machinery of the base class.
        super().__init__()
        self.ageRegressor = ageRegressor

    def _transform(self, dataset):
        """Return every row of ``dataset`` with ``Age_imputed`` filled in.

        ``_transform`` is the hook that the base transformer declares
        abstract; the inherited public ``transform(dataset, params=None)``
        delegates to it.

        :param dataset: DataFrame containing ``Age`` plus the feature columns
            the regressor expects.
        :return: rows with a predicted age followed by rows with a known age.
        """
        missing_age = dataset.where(dataset["Age"].isNull())
        known_age = dataset.where(dataset["Age"].isNotNull())

        # Predict only where Age is missing; the assembled feature vector is
        # an intermediate column and is dropped again.
        predicted = (
            self.ageRegressor
            .transform(missing_age)
            .drop("features_class_sex")
        )

        # Known ages are copied through unchanged. The cast keeps the column
        # type aligned with the regressor's double-typed prediction column.
        observed = known_age.withColumn(
            "Age_imputed", known_age["Age"].cast(DoubleType())
        )

        # Both halves are kept (previously the known-age rows were lost).
        # Matching by name guards against column-order drift between them.
        return predicted.unionByName(observed)
    
# Preprocessing pipeline. Stage order matters: the custom feature/imputation
# steps run first, then every categorical column is indexed, then one-hot
# encoded, then Fare is standardised, and finally Age is imputed from the
# encoded Pclass/Sex columns.
pipe = Pipeline(stages=[
    ExtractAccompaniedFeature(),
    ExtractTitle(),
    HandleMissingEmbarked(),
    # Category -> numeric index.
    *[
        StringIndexer(inputCol=source, outputCol=target)
        for source, target in [
            ("Pclass", 'Pclass_indexed'),
            ("Sex", 'Sex_indexed'),
            ("Embarked", 'Embarked_indexed'),
            ("Title", 'Title_indexed'),
        ]
    ],
    # Numeric index -> one-hot vector.
    *[
        OneHotEncoder(inputCol=source, outputCol=target, handleInvalid='keep')
        for source, target in [
            ("Pclass_indexed", 'Pclass_encoded'),
            ("Sex_indexed", 'Sex_encoded'),
            ("Embarked_indexed", 'Embarked_encoded'),
            ("Title_indexed", 'Title_encoded'),
        ]
    ],
    # StandardScaler takes a vector column, so Fare is wrapped first.
    VectorAssembler(inputCols=["Fare"], outputCol='Fare_vect'),
    StandardScaler(withMean=True, inputCol="Fare_vect", outputCol='Fare_std'),
    HandleMissingAge(),
])

# Fit on the training frame and apply the fitted pipeline to that same frame.
transformed = pipe.fit(train_df).transform(train_df)
transformed.show(5)

+-----------+--------+------+--------------------+------+----+-----+-----+------+------+-----+--------+-------------------+-----+--------------+-----------+----------------+-------------+--------------+-------------+----------------+-------------+---------+--------------------+------------------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|Ticket|  Fare|Cabin|Embarked|Accompanied_encoded|Title|Pclass_indexed|Sex_indexed|Embarked_indexed|Title_indexed|Pclass_encoded|  Sex_encoded|Embarked_encoded|Title_encoded|Fare_vect|            Fare_std|       Age_imputed|
+-----------+--------+------+--------------------+------+----+-----+-----+------+------+-----+--------+-------------------+-----+--------------+-----------+----------------+-------------+--------------+-------------+----------------+-------------+---------+--------------------+------------------+
|          6|       0|     3|    Moran, Mr. James|  male|null|    0|    0|330877|8.4583| null|       Q|   