# Titanic dataset from kaggle

In [160]:
# Data link: https://www.kaggle.com/c/titanic/data
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix


In [146]:
train = pd.read_csv('train.csv')
cat_features = ['Sex', 'Embarked']
num_features = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
X = train[cat_features+num_features]
X = pd.get_dummies(X, columns=cat_features)
y = train['Survived']
X['Age'] = X['Age'].fillna(X['Age'].mean())
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=4)


In [158]:
# Baseline accuracy. Predict no one survived. Predict y=0 for all
print('Baseline test accuracy', 1-y_test.mean())

Baseline test accuracy 0.6726457399103138


## No balancing

In [153]:
rf = RandomForestClassifier(n_estimators=100, criterion='gini', max_depth=5, min_samples_split=5, 
                            min_samples_leaf=5, random_state = 3)
rf.fit(X_train, y_train)      

y_pred_train = rf.predict_proba(X_train)[:, 1]
y_pred_test = rf.predict_proba(X_test)[:, 1]

# use mean of y_train to convert probabailities to binary label
y_train_labels = (y_pred_train>y_train.mean()).astype(int) 
y_test_labels = (y_pred_test>y_train.mean()).astype(int)

print('AUC on train', roc_auc_score(y_train, y_pred_train))
print('AUC on test', roc_auc_score(y_test, y_pred_test))

print('Accuracy on train', (y_train_labels==y_train).mean())
print('Accuracy on test', (y_test_labels==y_test).mean())

print('Confusion matric on train')
print(confusion_matrix(y_train, y_train_labels))
print('Confusion matric on test')
print(confusion_matrix(y_test, y_test_labels))

AUC on train 0.9056749680893684
AUC on test 0.8614155251141553
Accuracy on train 0.8278443113772455
Accuracy on test 0.8161434977578476
Confusion matric on train
[[338  61]
 [ 54 215]]
Confusion matric on test
[[127  23]
 [ 18  55]]


## Balancing by using class_weight='balanced' param in RF

In [154]:
# Do balancing with class_weight='balanced'
rf = RandomForestClassifier(n_estimators=100, criterion='gini', max_depth=5, min_samples_split=5, 
                            min_samples_leaf=5, class_weight='balanced', random_state = 3)
rf.fit(X_train, y_train)      

y_pred_train = rf.predict_proba(X_train)[:, 1]
y_pred_test = rf.predict_proba(X_test)[:, 1]

y_train_labels = (y_pred_train>0.5).astype(int)
y_test_labels = (y_pred_test>0.5).astype(int)

print('AUC on train', roc_auc_score(y_train, y_pred_train))
print('AUC on test', roc_auc_score(y_test, y_pred_test))

print('Accuracy on train', (y_train_labels==y_train).mean())
print('Accuracy on test', (y_test_labels==y_test).mean())

print('Confusion matric on train')
print(confusion_matrix(y_train, y_train_labels))
print('Confusion matric on test')
print(confusion_matrix(y_test, y_test_labels))

AUC on train 0.9058892584621405
AUC on test 0.8655707762557077
Accuracy on train 0.8323353293413174
Accuracy on test 0.8161434977578476
Confusion matric on train
[[344  55]
 [ 57 212]]
Confusion matric on test
[[128  22]
 [ 19  54]]
