In [7]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

In [16]:
# Load the Titanic training split (relative path) into a DataFrame.
data = pd.read_csv('../../data/titanic/train.csv')
# Preview the first rows as a quick sanity check of the available columns.
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


#### Pipeline:

- Pipeline: applies a list of transformations sequentially (apply one step, then the next, ...)
- imputer: replaces NaN with a fill value (e.g. the mean for numerical features or the most frequent value for categorical features)
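- encoder/scaler: converts the imputed values into a model-friendly form (e.g. one-hot encoding for categorical features, standardization for numerical features)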

- ColumnTransformer: applies different transformations to different subsets of columns side by side (e.g. one pipeline for the numerical features and another for the non-numerical features)

In [37]:
# All scikit-learn imports used by this cell, grouped in one place.
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Fixed seed so the random forest (and therefore the selected hyper-parameters
# and the predictions) is reproducible across kernel restarts.
RANDOM_STATE = 42

# Feature matrix and target vector.
# X_train holds the input features, y_train holds the label to predict.
# Free-text / high-cardinality columns are removed up front; any remaining
# column that is not listed in num_features or cat_features (e.g. PassengerId)
# is discarded later by the ColumnTransformer (remainder='drop').
X_train = data.drop(["Survived", "Name", "Ticket", "Cabin"], axis=1)
y_train = data["Survived"]

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode. Categories unseen during fit are encoded as all zeros
# instead of raising at predict time.
cat_features = ["Sex", "Embarked"]
cat_processor = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Numerical branch: fill missing values with the column mean, then standardize.
num_features = ["Pclass", "Age", "SibSp", "Parch", "Fare"]
num_processor = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Route each group of columns to its own branch; drop every other column.
feature_processor = ColumnTransformer(
    transformers=[
        ('num', num_processor, num_features),
        ('cat', cat_processor, cat_features),
    ],
    remainder='drop',
)

# Full model: preprocessing followed by the classifier, so the imputers,
# scaler and encoder are re-fitted inside every cross-validation fold.
clf = Pipeline(steps=[
    ('preprocessor', feature_processor),
    ('clf', RandomForestClassifier(random_state=RANDOM_STATE)),
])

# Grid search over the forest size and tree depth.
# The 'clf__' prefix addresses parameters of the pipeline step named 'clf'.
param_dict = {
    'clf__n_estimators': [100, 200, 300],
    'clf__max_depth': [5, 10, 15, 20],
}

grid = GridSearchCV(clf, param_grid=param_dict, cv=5, scoring='accuracy')
grid.fit(X_train, y_train)

In [38]:
# Hyper-parameter combination selected by the grid search.
grid.best_params_

{'clf__max_depth': 10, 'clf__n_estimators': 300}

In [41]:
# test data
test_data = pd.read_csv('../../data/titanic/test.csv')
# drop unused columns
test_data = test_data[num_features + cat_features]
test_data.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex,Embarked
0,3,34.5,0,0,7.8292,male,Q
1,3,47.0,1,0,7.0,female,S
2,2,62.0,0,0,9.6875,male,Q
3,3,27.0,0,0,8.6625,male,S
4,3,22.0,1,1,12.2875,female,S


In [44]:
# Pipeline (preprocessing + classifier) built with the best parameters found by the grid search.
best_clf = grid.best_estimator_

# Predict a label for every row of the test set; the pipeline runs its
# preprocessing on test_data before the classifier sees it.
y_pred = best_clf.predict(test_data)
y_pred

array([0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1,
       1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,