In [71]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
%matplotlib inline

titanic = pd.read_csv('titanic.csv')
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [92]:
def transform(inputdf):
    data = inputdf.copy()
    data['Age'].fillna(data['Age'].mean(), inplace=True)
    data['Family_cnt'] = data['SibSp'] + data['Parch']
    data.drop(['PassengerId', 'SibSp', 'Parch'], axis=1, inplace=True)
    data['Cabin_ind'] = np.where(data['Cabin'].isnull(), 0, 1)
    gender_num = {'male': 0, 'female': 1}
    data['Sex'] = data['Sex'].map(gender_num)
    data.drop(['Cabin', 'Embarked', 'Name', 'Ticket'], axis=1, inplace=True)
    data['Fare'].fillna(data['Fare'].mean(), inplace=True)
    return data

In [73]:
titanic = transform(titanic)
titanic.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Family_cnt,Cabin_ind
0,0,3,0,22.0,7.25,1,0
1,1,1,1,38.0,71.2833,1,1
2,1,3,1,26.0,7.925,0,0
3,1,1,1,35.0,53.1,1,1
4,0,3,0,35.0,8.05,0,0


In [74]:
features = titanic.drop('Survived', axis=1)
labels = titanic['Survived']

X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.4, random_state=42)
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

In [75]:
X_train.to_csv('../../../train_features.csv', index=False)
X_val.to_csv('../../../val_features.csv', index=False)
X_test.to_csv('../../../test_features.csv', index=False)

y_train.to_csv('../../../train_labels.csv', index=False)
y_val.to_csv('../../../val_labels.csv', index=False)
y_test.to_csv('../../../test_labels.csv', index=False)

In [76]:
tr_features = pd.read_csv('../../../train_features.csv')
tr_labels = pd.read_csv('../../../train_labels.csv')

val_features = pd.read_csv('../../../val_features.csv')
val_labels = pd.read_csv('../../../val_labels.csv')

te_features = pd.read_csv('../../../test_features.csv')
te_labels = pd.read_csv('../../../test_labels.csv')

In [77]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

In [78]:
rf = RandomForestClassifier()
scores = cross_val_score(rf, tr_features, tr_labels.values.ravel(), cv=5)
print(scores)

[0.82242991 0.82242991 0.79439252 0.81308411 0.82075472]


In [79]:
def print_results(results):
    print(f'BEST PARAMS: {results.best_params_}\n')

    means = results.cv_results_['mean_test_score']
    stds = results.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, results.cv_results_['params']):
        print('{} (+/-{}) for {}'.format(round(mean, 3), round(std * 2, 3), params))

In [80]:
rf = RandomForestClassifier()
parameters = {
    'n_estimators': [5, 50, 100],
    'max_depth': [2, 10, 20, None]
}

cv = GridSearchCV(rf, parameters, cv=5)
cv.fit(tr_features, tr_labels.values.ravel())

print_results(cv)

BEST PARAMS: {'max_depth': 10, 'n_estimators': 50}

0.775 (+/-0.13) for {'max_depth': 2, 'n_estimators': 5}
0.803 (+/-0.111) for {'max_depth': 2, 'n_estimators': 50}
0.802 (+/-0.116) for {'max_depth': 2, 'n_estimators': 100}
0.798 (+/-0.053) for {'max_depth': 10, 'n_estimators': 5}
0.82 (+/-0.051) for {'max_depth': 10, 'n_estimators': 50}
0.818 (+/-0.043) for {'max_depth': 10, 'n_estimators': 100}
0.788 (+/-0.048) for {'max_depth': 20, 'n_estimators': 5}
0.809 (+/-0.042) for {'max_depth': 20, 'n_estimators': 50}
0.813 (+/-0.033) for {'max_depth': 20, 'n_estimators': 100}
0.792 (+/-0.04) for {'max_depth': None, 'n_estimators': 5}
0.809 (+/-0.029) for {'max_depth': None, 'n_estimators': 50}
0.815 (+/-0.034) for {'max_depth': None, 'n_estimators': 100}


In [81]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

rf1 = RandomForestClassifier(n_estimators=10, max_depth=100)
rf1.fit(tr_features, tr_labels.values.ravel())

rf2 = RandomForestClassifier(n_estimators=20, max_depth=50)
rf2.fit(tr_features, tr_labels.values.ravel())

rf3 = RandomForestClassifier(n_estimators=10, max_depth=50)
rf3.fit(tr_features, tr_labels.values.ravel())

rf4 = RandomForestClassifier(n_estimators=10, max_depth=5)
rf4.fit(tr_features, tr_labels.values.ravel())

rf5 = RandomForestClassifier(n_estimators=10, max_depth=100)
rf5.fit(tr_features, tr_labels.values.ravel())

rf6 = RandomForestClassifier(n_estimators=5, max_depth=10)
rf6.fit(tr_features, tr_labels.values.ravel())

for mdl in [rf1, rf2, rf3, rf4, rf5, rf6]:
    y_pred = mdl.predict(val_features)
    accuracy = round(accuracy_score(val_labels, y_pred), 3)
    precision = round(precision_score(val_labels, y_pred), 3)
    recall = round(recall_score(val_labels, y_pred), 3)
    print('MAX DEPTH: {} / # OF EST: {} -- A: {} / P: {} / R: {}'.format(mdl.max_depth,
                                                                         mdl.n_estimators,
                                                                         accuracy,
                                                                         precision,
                                                                         recall))

MAX DEPTH: 100 / # OF EST: 10 -- A: 0.816 / P: 0.841 / R: 0.697
MAX DEPTH: 50 / # OF EST: 20 -- A: 0.816 / P: 0.841 / R: 0.697
MAX DEPTH: 50 / # OF EST: 10 -- A: 0.804 / P: 0.797 / R: 0.724
MAX DEPTH: 5 / # OF EST: 10 -- A: 0.804 / P: 0.836 / R: 0.671
MAX DEPTH: 100 / # OF EST: 10 -- A: 0.821 / P: 0.844 / R: 0.711
MAX DEPTH: 10 / # OF EST: 5 -- A: 0.788 / P: 0.771 / R: 0.711


In [82]:
for mdl in [rf1, rf2, rf3, rf4, rf5, rf6]:
    y_pred = mdl.predict(te_features)
    accuracy = round(accuracy_score(te_labels, y_pred), 3)
    precision = round(precision_score(te_labels, y_pred), 3)
    recall = round(recall_score(te_labels, y_pred), 3)
    print('MAX DEPTH: {} / # OF EST: {} -- A: {} / P: {} / R: {}'.format(mdl.max_depth,
                                                                         mdl.n_estimators,
                                                                         accuracy,
                                                                         precision,
                                                                         recall))

MAX DEPTH: 100 / # OF EST: 10 -- A: 0.77 / P: 0.707 / R: 0.631
MAX DEPTH: 50 / # OF EST: 20 -- A: 0.787 / P: 0.737 / R: 0.646
MAX DEPTH: 50 / # OF EST: 10 -- A: 0.764 / P: 0.702 / R: 0.615
MAX DEPTH: 5 / # OF EST: 10 -- A: 0.764 / P: 0.702 / R: 0.615
MAX DEPTH: 100 / # OF EST: 10 -- A: 0.792 / P: 0.741 / R: 0.662
MAX DEPTH: 10 / # OF EST: 5 -- A: 0.764 / P: 0.717 / R: 0.585


In [96]:
# rf4 and rf6

test = pd.read_csv('test.csv')
test.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [97]:
test_trans = transform(test)
test['Survived'] = rf4.predict(test_trans)
test[['PassengerId','Survived']].to_csv('gender_submission.csv', index=False)