# Titanic Notebook

## Random Forest Classifier

In this notebook, we will complete a Supervised Machine Learning project. 

In [8]:
# Import libraries
import pandas as pd

# Importing the machine learning model
from sklearn.ensemble import RandomForestClassifier

# Import GridSearchCV to find the model with the best parameters
from sklearn.model_selection import GridSearchCV

# Importing the the functions to measure metrics for the model
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [9]:
# Import training, validation, and test data sets
trainF = pd.read_csv('./train_features.csv')
trainL = pd.read_csv('./train_labels.csv')

valF = pd.read_csv('./validation_features.csv')
valL = pd.read_csv('./validation_labels.csv')

testF = pd.read_csv('./test_features.csv')
testL = pd.read_csv('./test_labels.csv')

### Tuning the Hyperparameters 

In [18]:
# We create this function to nicely format the
# results of GridSearchCV
def printResults(results):
    print('Best ML Params: {}\n'.format(results.best_params_))

    means = results.cv_results_['mean_test_score']
    stnDvs = results.cv_results_['std_test_score']

    for mean, stnDvs, params in zip(means, stnDvs, results.cv_results_['params']):
        print('{} (+/-{}) for {}'.format(
                round(mean, 3),
                round(stnDvs*2, 3),
                params
            )
        )


In [19]:
rfModel = RandomForestClassifier()

# (Hyper)parameters according to Sckit Learn docs
parameters = {
    'n_estimators': [5, 50, 100],
    'max_depth': [2, 10, 20, None]
}

gridSearch = GridSearchCV(rfModel, parameters, cv=5)
gridSearch.fit(trainF, trainL.values.ravel())

# Note:
# Accuracy is the result of precision and recall
printResults(gridSearch)

Best ML Params: {'max_depth': 10, 'n_estimators': 50}

0.799 (+/-0.039) for {'max_depth': 2, 'n_estimators': 5}
0.801 (+/-0.059) for {'max_depth': 2, 'n_estimators': 50}
0.799 (+/-0.057) for {'max_depth': 2, 'n_estimators': 100}
0.809 (+/-0.096) for {'max_depth': 10, 'n_estimators': 5}
0.833 (+/-0.047) for {'max_depth': 10, 'n_estimators': 50}
0.818 (+/-0.045) for {'max_depth': 10, 'n_estimators': 100}
0.811 (+/-0.04) for {'max_depth': 20, 'n_estimators': 5}
0.818 (+/-0.036) for {'max_depth': 20, 'n_estimators': 50}
0.824 (+/-0.042) for {'max_depth': 20, 'n_estimators': 100}
0.79 (+/-0.041) for {'max_depth': None, 'n_estimators': 5}
0.822 (+/-0.034) for {'max_depth': None, 'n_estimators': 50}
0.82 (+/-0.028) for {'max_depth': None, 'n_estimators': 100}


In [None]:
# The top 3 best hyperparameters:

# {'max_depth': 10, 'n_estimators': 50}
# {'max_depth': None, 'n_estimators': 100}
# {'max_depth': None, 'n_estimators': 50}

In [20]:
# Create 3 RandomForestClassifiers with the best hyperparameters
rfModel1 = RandomForestClassifier(n_estimators=50, max_depth=10)
rfModel1.fit(trainF, trainL.values.ravel())

rfModel2 = RandomForestClassifier(n_estimators=100, max_depth=None)
rfModel2.fit(trainF, trainL.values.ravel())

rfModel3 = RandomForestClassifier(n_estimators=50, max_depth=None)
rfModel3.fit(trainF, trainL.values.ravel())

RandomForestClassifier(n_estimators=50)

In [21]:
# Now look at the metric for validation data set
for mlModel in [rfModel1, rfModel2, rfModel3]:

    # Make each model predict the labels in the validations data set
    predLabel = mlModel.predict(valF)

    # In classificatio, your metrics are (1) accuracy, (2) precision, and (3) recall
    accuracy = round(accuracy_score(valL, predLabel), 3)
    precision = round(precision_score(valL, predLabel), 3)
    recall = round(recall_score(valL, predLabel), 3)

    print(
        'Max depth: {} and Estimators: {} ---> Accuracy: {}, Precision: {}, Recall: {}'
        .format(mlModel.max_depth, mlModel.n_estimators, accuracy, precision, recall)
    )

Max depth: 10 and Estimators: 50 ---> Accuracy: 0.775, Precision: 0.754, Recall: 0.671
Max depth: None and Estimators: 100 ---> Accuracy: 0.742, Precision: 0.696, Recall: 0.658
Max depth: None and Estimators: 50 ---> Accuracy: 0.747, Precision: 0.706, Recall: 0.658


In [22]:
# Use the test data set with the best available model
predLabel = rfModel1.predict(testF)

accuracy = round(accuracy_score(testL, predLabel), 3)
precision = round(precision_score(testL, predLabel), 3)
recall = round(recall_score(testL, predLabel), 3)

print(
        'Max depth: {} and Estimators: {} ---> Accuracy: {}, Precision: {}, Recall: {}'
        .format(rfModel1.max_depth, rfModel1.n_estimators, accuracy, precision, recall)
)

Max depth: 10 and Estimators: 50 ---> Accuracy: 0.831, Precision: 0.791, Recall: 0.768


In [23]:
testF.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Cabin_ind
0,3,0,16.0,4,1,39.6875,1.0,0
1,3,1,29.642093,1,0,14.4542,3.0,0
2,1,1,22.0,0,1,55.0,1.0,1
3,3,0,28.0,0,0,9.5,1.0,0
4,2,0,29.642093,0,0,0.0,1.0,0


In [24]:
# Append the predicted labels to the test data set and export.
finalOutput = testF
finalOutput['survived_predicted'] = predLabel
finalOutput.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Cabin_ind,survived_predicted
0,3,0,16.0,4,1,39.6875,1.0,0,0
1,3,1,29.642093,1,0,14.4542,3.0,0,0
2,1,1,22.0,0,1,55.0,1.0,1,1
3,3,0,28.0,0,0,9.5,1.0,0,0
4,2,0,29.642093,0,0,0.0,1.0,0,0


In [25]:
finalOutput.to_csv('./titanic_predicted.csv', index=False)