In [1038]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.metrics import confusion_matrix, classification_report

In [1039]:
df_train = pd.read_csv('train.csv', index_col='PassengerId')
df_test = pd.read_csv('test.csv', index_col='PassengerId')
df_full = pd.concat([df_train,df_test])

In [1040]:
df_full.head()

Unnamed: 0_level_0,Age,Cabin,Embarked,Fare,Name,Parch,Pclass,Sex,SibSp,Survived,Ticket
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,22.0,,S,7.25,"Braund, Mr. Owen Harris",0,3,male,1,0.0,A/5 21171
2,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,1,female,1,1.0,PC 17599
3,26.0,,S,7.925,"Heikkinen, Miss. Laina",0,3,female,0,1.0,STON/O2. 3101282
4,35.0,C123,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,1,female,1,1.0,113803
5,35.0,,S,8.05,"Allen, Mr. William Henry",0,3,male,0,0.0,373450


In [1041]:
MEDIAN_AGE = {};
for i in range(1,4):
    MEDIAN_AGE['male' + str(i)]=df_full[(df_full['Pclass']==i) & (df_full['Sex']=='male')]['Age'].median()
    MEDIAN_AGE['female' + str(i)]=df_full[(df_full['Pclass']==i) & (df_full['Sex']=='female')]['Age'].median()

In [1042]:
def impute_age(cols):
    Age = cols[0]
    Pclass = cols[1]
    Sex = cols[2]
    if pd.isnull(Age):
        return MEDIAN_AGE[Sex + str(Pclass)]

    else:
        return Age

In [1043]:
def family_size(df,Sibsp,Parch):
    df['Family Size']=df[Sibsp] + df[Parch] + 1
    return df

In [1044]:
def fill_na_age(df):
    df['Age'] = df[['Age', 'Pclass', 'Sex']].apply(impute_age,axis=1) 
    return df

In [1045]:
def tranform_embarked(df):
    df[['Embarked_C','Emarbarked_Q','Emarked_S']]=pd.get_dummies(df['Embarked'])
    return df

In [1046]:
def transform_age(df):
    df[['female','male']]=pd.get_dummies(df['Sex'])
    return df

In [1047]:
def fill_na_fare(df):
    df['Fare'].fillna(df['Fare'].median(),inplace=True)
    return df

In [1048]:
def keep_numerical(df):
    df = df.drop(['Name','Sex', 'SibSp', 'Parch', 'Ticket','Cabin','Embarked'], axis=1)
    return df    

In [1049]:
df_final=(df_train.pipe(family_size,'SibSp' , 'Parch')
                 .pipe(fill_na_age)
                 .pipe(tranform_embarked)
                 .pipe(transform_age)
                 .pipe(fill_na_fare)
                 .pipe(keep_numerical))

In [1050]:
from sklearn.model_selection import train_test_split

In [1051]:
X = df_final.drop('Survived',axis=1)

In [1052]:
y = df_final['Survived']

In [1053]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [1054]:
from sklearn.ensemble import RandomForestClassifier

In [1055]:
rfc = RandomForestClassifier()

In [1056]:
rfc.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [1057]:
predictions=rfc.predict(X_train)

In [1058]:
print(classification_report(y_train,predictions))

             precision    recall  f1-score   support

          0       0.95      0.98      0.96       374
          1       0.96      0.92      0.94       222

avg / total       0.95      0.95      0.95       596



In [1059]:
print(confusion_matrix(y_train, predictions))

[[365   9]
 [ 18 204]]


In [1060]:
scores = cross_val_score(rfc,X_train,y_train,scoring="accuracy", cv=4)

In [1061]:
print(scores)

[ 0.79333333  0.81333333  0.77027027  0.81756757]


In [1062]:
predictions = rfc.predict(X_test)

In [1063]:
print(classification_report(y_test,predictions))

             precision    recall  f1-score   support

          0       0.79      0.83      0.81       175
          1       0.73      0.68      0.71       120

avg / total       0.77      0.77      0.77       295



In [1064]:
print(confusion_matrix(y_test, predictions))

[[145  30]
 [ 38  82]]


In [1065]:
scores = cross_val_score(rfc,X_test,y_test,scoring="accuracy", cv=4)

In [1066]:
print(scores)

[ 0.82432432  0.81081081  0.77027027  0.80821918]


### Model Tuning

In [1067]:
from sklearn.model_selection import cross_val_score, GridSearchCV

In [1068]:
param_grid = [
    {'n_estimators':[3,10,30,50,100], 'max_features':['auto','log2' ], 'criterion':['gini','entropy']},
    #{'bootstap':[False], 'n_estimators':[3,10,30,50,100], 'max_features':['auto','log2'], 'criterion':['gini','entropy']},
]

In [1069]:
grid_search = GridSearchCV(rfc, param_grid,cv=4)

In [1070]:
grid_search.fit(X_train,y_train)

GridSearchCV(cv=4, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid=[{'n_estimators': [3, 10, 30, 50, 100], 'max_features': ['auto', 'log2'], 'criterion': ['gini', 'entropy']}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [1071]:
scores = cross_val_score(grid_search,X_train,y_train,scoring="accuracy", cv=4)

In [1072]:
print(scores)

[ 0.79333333  0.78        0.77702703  0.82432432]


In [1073]:
grid_search.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=50, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [1074]:
grid_search.best_params_

{'criterion': 'gini', 'max_features': 'auto', 'n_estimators': 50}

In [1075]:
cvres = grid_search.cv_results_
#print(cvres)

In [1076]:
feature_importances = grid_search.best_estimator_.feature_importances_

In [1077]:
attributes = df_final_test.columns

In [1078]:
sorted(zip(feature_importances,attributes), reverse=True)

[(0.25773024552863943, 'Fare'),
 (0.25569324377271568, 'Age'),
 (0.15605761574573138, 'female'),
 (0.12930839383494777, 'male'),
 (0.084502161183319935, 'Pclass'),
 (0.075684552900089408, 'Family Size'),
 (0.019213041302342949, 'Embarked_C'),
 (0.013565377362248787, 'Emarked_S'),
 (0.0082453683699646504, 'Emarbarked_Q')]

In [1079]:
predictions=grid_search.predict(X_test)

In [1080]:
print(classification_report(y_test,predictions))

             precision    recall  f1-score   support

          0       0.81      0.81      0.81       175
          1       0.72      0.72      0.73       120

avg / total       0.78      0.78      0.78       295



In [1081]:
print(confusion_matrix(y_test, predictions))

[[142  33]
 [ 33  87]]


In [1082]:
#Final Prediction
df_final_test=(df_test.pipe(family_size,'SibSp' , 'Parch')
                 .pipe(fill_na_age)
                 .pipe(tranform_embarked)
                 .pipe(transform_age)
                 .pipe(fill_na_fare)
                 .pipe(keep_numerical))

In [1083]:
predictions=rfc.predict(df_final_test)

In [1084]:
df_results = pd.DataFrame(predictions,columns=['Survived'], index=df_final_test.index)

In [1085]:
pd.DataFrame.to_csv(df_results, path_or_buf='Titatic_rfc_v4.csv')

In [1086]:
predictions=grid_search.predict(df_final_test)

In [1087]:
df_results = pd.DataFrame(predictions,columns=['Survived'], index=df_final_test.index)

In [1088]:
pd.DataFrame.to_csv(df_results, path_or_buf='Titatic_gridsearch_v4.csv')