# Titanic

In [68]:
# Main packages
import numpy as np
import pandas as pd
import xgboost as xgb

# Classifiers
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingRegressor
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Tools
from sklearn.model_selection import cross_val_score

In [118]:
# Importing data
test = pd.read_csv('data/test.csv',index_col='PassengerId')
train = pd.read_csv('data/train.csv',index_col='PassengerId')

## Data processing functions

In [119]:
def data_eng(df):
    # Filling NA of original columns we'll use
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    df['Fare'] = df['Fare'].fillna(df['Fare'].mean())
    
    # Creating columns
    ## Based on the Name column
    df['Mr'] = df['Name'].str.contains('Mr') & ~df['Name'].str.contains('Mrs')
    df['Mrs'] = df['Name'].str.contains('Mrs')
    df['Miss'] = df['Name'].str.contains('Miss')
    
    ## Based on the Embarked column
    df['Cherbourg'] = df['Embarked'].str.contains('C')
    df['Southampton'] = df['Embarked'].str.contains('S')
    df['Queenstown'] = df['Embarked'].str.contains('Q')
       
    ## Based on the Sex column
    df['Sex'] = ~df['Sex'].str.contains('female')
    
    ## Based on SibSp and Parch
    df['Family'] = df['SibSp'] + df['Parch'] + 1
    
    # Combining columns
    df['Pclass*Age'] = (df['Pclass']*df['Age']).apply(lambda x: 1/x if x != 0 else 999999999)
    df['Fare*Age'] = (df['Fare']*df['Age']).apply(lambda x: 1/x if x != 0 else 999999999)
    df['Pclass*Age*Fare'] = df['Pclass*Age']*df['Fare']
    
    return df

def fill_na(df):    
    
    df['Pclass*Age'] = df['Pclass*Age'].fillna(df['Pclass*Age'].mean())
    df['Fare*Age'] = df['Fare*Age'].fillna(df['Fare*Age'].mean())
    df['Pclass*Age*Fare'] = df['Pclass*Age*Fare'].fillna(df['Pclass*Age*Fare'].mean())
    df[['Cherbourg', 'Southampton', 'Queenstown']] = df[['Cherbourg', 'Southampton', 'Queenstown']].fillna(value = False)
    
    return df

def prepare_df(df):
    
    df = data_eng(df)    
    # Dropping columns
    df = df.drop(['Name', 'Ticket', 'Cabin', 'Embarked'], axis=1)
    
    return fill_na(df)

Applying data processing on train and then splitting dataset into **X_train** and **y_train**.

In [None]:
train = prepare_df(train)
X_train = train.drop('Survived', axis=1)
y_train = train['Survived']

Applying data processing on test

In [120]:
test = prepare_df(test)

## Training part

Listing potentiels models

In [169]:
models = {
    'RandomForest' : {
        'model' : RandomForestClassifier(max_depth=3)
    },
    'LogisticRegression' : {
        'model' : LogisticRegression(solver='lbfgs')
    },
    'DecisionTree' : {
        'model' : DecisionTreeClassifier(max_depth=3)
    },
    'AdaBoost' : {
        'model' : AdaBoostClassifier(DecisionTreeClassifier(max_depth=3), algorithm="SAMME.R", learning_rate=0.2)
    },
    'XGBoost' : {
        'model' : xgb.XGBClassifier(max_depth=3)
    },
}

Checking cross validation scores for each model independantely, on **X1_train** and **y1_train**.

In [170]:
import warnings
warnings.filterwarnings('ignore')

# Check scores with Cross Validation
for clf in [x['model'] for x in models.values()]:
    print('-'*60)
    print(f'Classifier :\n{clf}\n\nMean score over 10 folds : {cross_val_score(clf, X1_train, y1_train, cv=10).mean()}\n\n')

------------------------------------------------------------
Classifier :
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=3, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

Mean score over 10 folds : 0.8106429070580015


------------------------------------------------------------
Classifier :
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

Mean score over 10 folds : 0.6719522266692077


------------------------------------------------------------
Cla

Predicting for each model independantely. **Training on X1_train** but **predicting on X2_train**.

In [171]:
# Predictions
for elt in methods.values():
    elt['predictions'] = elt['method'].fit(X1_train, y1_train).predict(X2_train)
    print('-'*60)
    print(f'Classifier : {elt["method"].__class__} is done predicting !\n')

------------------------------------------------------------
Classifier : <class 'sklearn.ensemble.forest.RandomForestClassifier'> is done predicting !

------------------------------------------------------------
Classifier : <class 'sklearn.linear_model.logistic.LogisticRegression'> is done predicting !

------------------------------------------------------------
Classifier : <class 'sklearn.tree.tree.DecisionTreeClassifier'> is done predicting !

------------------------------------------------------------
Classifier : <class 'sklearn.ensemble.weight_boosting.AdaBoostClassifier'> is done predicting !

------------------------------------------------------------
Classifier : <class 'xgboost.sklearn.XGBClassifier'> is done predicting !



Creating a dataframe based on **models X2_train predictions**.

In [172]:
# Building res df
clf_res_dict = {key: value['predictions'] for (key, value) in methods.items()}
clf_res_df = pd.DataFrame(clf_res_dict)
clf_res_df.head()

Unnamed: 0,RandomForest,LogisticRegression,DecisionTree,AdaBoost,XGBoost
0,0,0,0,0,0
1,0,0,0,0,0
2,0,0,0,0,0
3,1,0,1,1,1
4,1,0,1,1,1


Listing potential meta classifiers :

In [None]:
meta_clfs = [
    DecisionTreeClassifier(min_samples_split=40, max_depth=2),
    LogisticRegression(solver='lbfgs'),
    xgb.XGBClassifier(max_depth=3)
]

For each classifier, check its cross validation score with **clf_res_df as input and y2_train as output**.

In [173]:
for meta_clf in meta_clfs:
    print(f"{meta_clf.__class__} score : {cross_val_score(meta_clf, clf_res_df, y2_train, cv=10).mean()}")

<class 'sklearn.tree.tree.DecisionTreeClassifier'> score : 0.8035413985413985
<class 'sklearn.linear_model.logistic.LogisticRegression'> score : 0.8091763191763193
<class 'xgboost.sklearn.XGBClassifier'> score : 0.8063985413985414


Selecting meta classifier :

In [None]:
meta_clf = meta_clfs[1]

Training the selected meta classifier with **clf_res_df as input and y2_train as output**.

In [174]:
meta_clfs[1].fit(clf_res_df, y2_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

## Generating the results

Predicting results from the test dataset.

In [175]:
for elt in methods.values():
    elt['predictions_test'] = elt['method'].predict(test)
    print('-'*60)
    print(f'Classifier : {elt["method"].__class__} is done predicting on the test !\n')

------------------------------------------------------------
Classifier : <class 'sklearn.ensemble.forest.RandomForestClassifier'> is done predicting on the test !

------------------------------------------------------------
Classifier : <class 'sklearn.linear_model.logistic.LogisticRegression'> is done predicting on the test !

------------------------------------------------------------
Classifier : <class 'sklearn.tree.tree.DecisionTreeClassifier'> is done predicting on the test !

------------------------------------------------------------
Classifier : <class 'sklearn.ensemble.weight_boosting.AdaBoostClassifier'> is done predicting on the test !

------------------------------------------------------------
Classifier : <class 'xgboost.sklearn.XGBClassifier'> is done predicting on the test !



Predicting final results using the meta classifier, using the results above.

In [176]:
# Building res df for the test
clf_res_dict_test = {key: value['predictions_test'] for (key, value) in methods.items()}
clf_res_df_test = pd.DataFrame(clf_res_dict_test)
clf_res_df_test.head()

Unnamed: 0,RandomForest,LogisticRegression,DecisionTree,AdaBoost,XGBoost
0,0,0,0,0,0
1,0,0,1,0,0
2,0,0,0,0,0
3,0,0,0,1,0
4,1,0,1,0,0


Generating the .csv file

In [177]:
df_res = pd.DataFrame({'PassengerId':test.index, 'Survived':meta_clf.predict(clf_res_df_test)})

In [178]:
df_res.to_csv('still_have_hope_dt.csv', index=False)