In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# filter warnings
import warnings
warnings.filterwarnings('ignore')

from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings(action='ignore')

%matplotlib inline

In [2]:
gender_submission = pd.read_csv('titanic/gender_submission.csv')
test_df = pd.read_csv('titanic/test.csv')
train_df = pd.read_csv('titanic/train.csv')
train_df_copy = train_df.copy()

## Data Cleaning and Encoding

In [3]:
import patsy as pts

def data_cleaning(df):
    # PassengerId
    df.drop(['PassengerId'],axis=1,inplace=True)

    # Age
    for ind in df[df.Age.isnull()].index:
        # filters
        filter1 = df['Sex'] == df.loc[ind, 'Sex']
        filter2 = df['Pclass'] == df.loc[ind, 'Pclass']
        filter3 = df['SibSp'] == df.loc[ind, 'SibSp']
        filter4 = df['Parch'] == df.loc[ind, 'Parch']
        fill_value = df[filter1][filter2][filter3][filter4]['Age'].median()

        # if filter result is nan, we fill with the global median
        if pd.isna(fill_value):
            fill_value = df['Age'].median()

        # fill in values
        df.loc[ind, 'Age'] = fill_value

    # Cabin
    df['Cabin'] = (df.Cabin
     .apply(lambda x: x[0] if pd.notna(x) else 'Z')
     .apply(lambda x: 'Y' if x in ['C','E','D','B','F'] else x)
     .apply(lambda x: 'X' if x in ['G','A','T'] else x))
    
    # Embarked
    df.Embarked.fillna(df['Embarked'].mode()[0],inplace=True)

    # title
    df['Title'] = df.Name.apply(lambda x: x.split(', ')[1].split('. ')[0])
    df["Title"] = df["Title"].replace(['Lady', 'the Countess','Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
    df["Title"] = df["Title"].map({"Master":"Master", "Miss":"Female", "Ms" : "Female" , "Mme":"Female", "Mlle":"Female", "Mrs":"Female", "Mr":"Male", "Rare":"Rare"})

    # family size and alone status
    df['FamilySize'] = df['Parch'] + df['SibSp'] + 1
    df['IsAlone'] = df['FamilySize'].apply(lambda x: 1 if x == 1 else 0)
    
    
    # data encoding
    y, X = pts.dmatrices('Survived ~ Pclass + C(Sex) + Age + SibSp + Parch + Fare + ' +
                        'C(Embarked) + C(Title)', data=df,
                        return_type='dataframe')
    X.columns = [i.replace('[','').replace(']','') for i in X.columns]
    pd.concat([X,y]).info()

    return X, y

In [4]:
# clean it in one line
X, y = data_cleaning(train_df)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1782 entries, 0 to 890
Data columns (total 13 columns):
Age                 891 non-null float64
C(Embarked)T.Q      891 non-null float64
C(Embarked)T.S      891 non-null float64
C(Sex)T.male        891 non-null float64
C(Title)T.Male      891 non-null float64
C(Title)T.Master    891 non-null float64
C(Title)T.Rare      891 non-null float64
Fare                891 non-null float64
Intercept           891 non-null float64
Parch               891 non-null float64
Pclass              891 non-null float64
SibSp               891 non-null float64
Survived            891 non-null float64
dtypes: float64(13)
memory usage: 194.9 KB


## Modeling

In [5]:
from xgboost import XGBClassifier
from sklearn import ensemble, linear_model, svm, naive_bayes, discriminant_analysis, neighbors, tree
from sklearn.preprocessing import normalize
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve
import time

In [6]:
# Naive Bayes Parameters tunning 
# NBC = naive_bayes.BernoulliNB()
# nb_param_grid = {
#                     'alpha': range(1,100,20),
#                     'fit_prior': [True,False]}


# RFC Parameters tunning 
RFC = ensemble.RandomForestClassifier()
rf_param_grid = {"max_depth": [None],
              "max_features": [1, 3, 10],
              "min_samples_split": [3, 10],
              "min_samples_leaf": [1, 3, 10],
              "bootstrap": [False],
              "n_estimators" :[100,300],
              "criterion": ["gini"]}


# XGB Parameters tunning, reference
# https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/
xgb = XGBClassifier()
xgb_param_grid = {
                'max_depth':[4,6],
                'min_child_weight':[4,6],
                'gamma':[i/10.0 for i in range(0,5,2)],
                'subsample':[i/10.0 for i in range(6,10,2)],
                'colsample_bytree':[i/10.0 for i in range(6,10,2)],
                'reg_alpha':[1e-2, 0.1, 1],
}

# DA Parameters tunning 
DAC = discriminant_analysis.LinearDiscriminantAnalysis()
dac_param_grid = {
                'solver':['svd','lsqr'],
                'n_components':[i for i in range(0,5,1)],
}

# LRC tunning 
LRC = linear_model.LogisticRegression()
lrc_param_grid = {
    'C': [0.1,1,10],
    'solver':['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
}

# GB tunning 
GBC = ensemble.gradient_boosting.GradientBoostingClassifier()
gbc_param_grid = {
              'learning_rate': [0.1, 0.05],
              'max_depth': [5,6],
              'min_samples_leaf': [31],
              'n_estimators':[30,70],
              'max_features':range(4,8,4),
              'subsample':[0.6,0.75]
              }


In [7]:
# put all models into one box
dic = {
# 'NBC':[NBC,nb_param_grid],
'RFC':[RFC,rf_param_grid],
'XGB':[xgb,xgb_param_grid],
'DAC':[DAC,dac_param_grid],
'LRC':[LRC,lrc_param_grid],
'GBC':[GBC,gbc_param_grid]}

In [8]:
kfold = StratifiedKFold(n_splits=10)

def para_tuning(dic, X, y, n_jobs=4):
    '''dictionary format should be:
    { <Name of Model> : [<model>, <parameter grid>]}
    '''
    model_ls = []
    
    for model in dic:
        # grid search cross validation for hyper-parameter tunings
        gs = GridSearchCV(dic[model][0],param_grid = dic[model][1], cv=kfold, scoring="accuracy", n_jobs = n_jobs, verbose = 1)
        gs.fit(X,y)

        # this is the best classifier
        model_ls.append([model,gs.best_estimator_])
    
    return model_ls

In [9]:
# tune features in one line.. just kidding... we need all the above lines
model_ls = para_tuning(dic,X,y,4)

Fitting 10 folds for each of 36 candidates, totalling 360 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    3.6s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   12.3s
[Parallel(n_jobs=4)]: Done 360 out of 360 | elapsed:   25.7s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 10 folds for each of 144 candidates, totalling 1440 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    1.7s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    6.4s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:   14.5s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:   26.0s
[Parallel(n_jobs=4)]: Done 1242 tasks      | elapsed:   42.4s
[Parallel(n_jobs=4)]: Done 1440 out of 1440 | elapsed:   49.6s finished


Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 10 folds for each of 15 candidates, totalling 150 fits


[Parallel(n_jobs=4)]: Done 150 out of 150 | elapsed:    1.4s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 10 folds for each of 16 candidates, totalling 160 fits


[Parallel(n_jobs=4)]: Done 160 out of 160 | elapsed:    2.1s finished


In [10]:
from sklearn.ensemble import VotingClassifier
# this is the voting classifier
voting_clf = VotingClassifier(estimators=model_ls, voting='soft')

## Cross validation

In [11]:
# 10 fold cross validation
def cv_test(clf,X,y):
    cv_scores = cross_val_score(clf, X, y = y, scoring = "accuracy", cv = kfold, n_jobs=4)
    score = cv_scores.mean()
    standard_deviation = cv_scores.std()
    print("{:.4%}".format(score))

In [12]:
# test models
for model in model_ls:
    print(model[0])
    cv_test(model[1],X,y)

# this voting classifier does not like to be put into a list...
# it will break down the whole kernel if you do it...
print('Voting Classifier')
cv_test(voting_clf,X,y)

RFC
83.1669%
XGB
84.1832%
DAC
82.9447%
LRC
83.0546%
GBC
82.7149%
Voting Classifier
83.9547%


In [13]:
from sklearn.model_selection import train_test_split

# 20 times train test split and take the mean.
# validation set :)

dic={}

# --------------- loop --------------- #
for model in model_ls:
    for i in range(20):
        X_train, X_test, y_train, y_test = \
            train_test_split(X, y, test_size=0.33)
        
        for model in model_ls:
            model[1].fit(X_train,y_train)
            sc = model[1].score(X_test,y_test)
        
            if model[0] not in dic.keys():
                dic[model[0]] = [sc]
            elif model[0] in dic.keys():
                dic[model[0]].append(sc)
            else:
                print('what?')
        
        voting_clf.fit(X_train,y_train)
        sc = voting_clf.score(X_test,y_test)
        if 'voting' not in dic.keys():
            dic['voting'] = [sc]
        elif 'voting' in dic.keys():
            dic['voting'].append(sc)
        else:
            print('what?')
# --------------- loop --------------- #
            
import numpy as np
for i in dic.keys():
    print(i)
    print("{:.4%}".format(np.mean(dic[i])))

RFC
82.8542%
XGB
82.5322%
DAC
82.5390%
LRC
82.7763%
GBC
81.7153%
voting
83.2136%
