In [1]:
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [2]:
train_features = pd.read_csv("dataset/titanic/train_features.csv")
train_labels = pd.read_csv("dataset/titanic/train_labels.csv", header=None)[:-1]

test_features = pd.read_csv("dataset/titanic/test_features.csv")
test_labels = pd.read_csv("dataset/titanic/test_labels.csv", header=None)[:-1]

val_features = pd.read_csv("dataset/titanic/val_features.csv")
val_labels = pd.read_csv("dataset/titanic/val_labels.csv", header=None)[:-1]

print(len(train_features))
print(len(train_labels))
print(len(test_features))
print(len(test_labels))
print(len(val_features))
print(len(val_labels))

534
534
178
178
179
179


In [3]:
def print_results(results):
    print('BEST PARAMS: {}\n'.format(results.best_params_))

    means = results.cv_results_['mean_test_score']
    stds = results.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, results.cv_results_['params']):
        print('{} (+/-{}) for {}'.format(round(mean, 3), round(std * 2, 3), params))

In [4]:
rf_clf = RandomForestClassifier()
parameters = {
    'n_estimators': [5, 50, 100],
    'max_depth': [2, 10, 20, None],
}

cv = GridSearchCV(rf_clf, parameters, cv=5, verbose=0)
cv.fit(train_features, train_labels.values.ravel())

print_results(cv)



BEST PARAMS: {'max_depth': 2, 'n_estimators': 50}

0.618 (+/-0.034) for {'max_depth': 2, 'n_estimators': 5}
0.624 (+/-0.012) for {'max_depth': 2, 'n_estimators': 50}
0.624 (+/-0.014) for {'max_depth': 2, 'n_estimators': 100}
0.592 (+/-0.033) for {'max_depth': 10, 'n_estimators': 5}
0.571 (+/-0.069) for {'max_depth': 10, 'n_estimators': 50}
0.554 (+/-0.063) for {'max_depth': 10, 'n_estimators': 100}
0.549 (+/-0.067) for {'max_depth': 20, 'n_estimators': 5}
0.551 (+/-0.047) for {'max_depth': 20, 'n_estimators': 50}
0.539 (+/-0.057) for {'max_depth': 20, 'n_estimators': 100}
0.508 (+/-0.092) for {'max_depth': None, 'n_estimators': 5}
0.547 (+/-0.098) for {'max_depth': None, 'n_estimators': 50}
0.536 (+/-0.063) for {'max_depth': None, 'n_estimators': 100}


In [5]:
rf_clf_1 = RandomForestClassifier(n_estimators=5, max_depth=2)
rf_clf_1.fit(train_features, train_labels.values.ravel())

rf_clf_2 = RandomForestClassifier(n_estimators=50, max_depth=10)
rf_clf_2.fit(train_features, train_labels.values.ravel())

rf_clf_3 = RandomForestClassifier(n_estimators=100, max_depth=None)
rf_clf_3.fit(train_features, train_labels.values.ravel())

In [6]:
for mdl in [rf_clf_1, rf_clf_2, rf_clf_3]:
    y_pred = mdl.predict(val_features)
    accuracy = round(accuracy_score(val_labels, y_pred), 3)
    precision = round(precision_score(
        val_labels, y_pred, average="weighted"), 3)
    recall = round(recall_score(val_labels, y_pred, average="weighted"), 3)
    print('Max Depth: {} / # of EST: {} -- Accuracy: {}, Precision: {}, Recall: {}'.format(mdl.max_depth, mdl.n_estimators, accuracy, precision, recall))
    

Max Depth: 2 / # of EST: 5 -- Accuracy: 0.581, Precision: 0.501, Recall: 0.581
Max Depth: 10 / # of EST: 50 -- Accuracy: 0.57, Precision: 0.535, Recall: 0.57
Max Depth: None / # of EST: 100 -- Accuracy: 0.514, Precision: 0.498, Recall: 0.514


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [8]:
y_pred = rf_clf_1.predict(test_features)
accuracy = round(accuracy_score(test_labels, y_pred), 3)
precision = round(precision_score(
    test_labels, y_pred, average="weighted"), 3)
recall = round(recall_score(test_labels, y_pred, average="weighted"), 3)
print('Max Depth: {} / # of EST: {} -- Accuracy: {}, Precision: {}, Recall: {}'.format(
    rf_clf_1.max_depth, rf_clf_1.n_estimators, accuracy, precision, recall))


Max Depth: 2 / # of EST: 5 -- Accuracy: 0.584, Precision: 0.428, Recall: 0.584


  _warn_prf(average, modifier, msg_start, len(result))
