# Predicting Titanic Survivors with Random Forest

## Overview
This Jupyter Notebook demonstrates the use of Random Forest, a popular machine learning algorithm, to predict survival on the Titanic. The dataset used is a hypothetical dataset containing information about passengers on the Titanic, such as age, gender, ticket class, and whether they survived or not.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.model_selection import train_test_split

In [44]:
titanic=pd.read_csv("/Users/Titanic.csv")
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [45]:
titanic["Age"].fillna(titanic["Age"].mean(), inplace=True)

In [46]:
titanic["Family_cnt"]=titanic["SibSp"] + titanic["Parch"]

In [47]:
titanic.drop(["PassengerId","SibSp","Parch"], axis =1, inplace=True)

In [48]:
titanic.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Ticket,Fare,Cabin,Embarked,Family_cnt
0,0,3,"Braund, Mr. Owen Harris",male,22.0,A/5 21171,7.25,,S,1
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,PC 17599,71.2833,C85,C,1
2,1,3,"Heikkinen, Miss. Laina",female,26.0,STON/O2. 3101282,7.925,,S,0
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,113803,53.1,C123,S,1
4,0,3,"Allen, Mr. William Henry",male,35.0,373450,8.05,,S,0


In [50]:
titanic.to_csv("/Users/Features_out.csv", index=False)

In [51]:
titanic["Cabin_ind"]=np.where(titanic["Cabin"].isnull(),0,1)

In [52]:
gender_num = {"male":0, "female":1}
titanic["Sex"]  = titanic["Sex"].map(gender_num)

In [53]:
titanic.drop(["Cabin","Embarked","Name", "Ticket"], axis =1, inplace=True)

In [56]:
titanic.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Family_cnt,Cabin_ind
0,0,3,0,22.0,7.25,1,0
1,1,1,1,38.0,71.2833,1,1
2,1,3,1,26.0,7.925,0,0
3,1,1,1,35.0,53.1,1,1
4,0,3,0,35.0,8.05,0,0


In [55]:
titanic.to_csv("/Users/Titanic_cleaned.csv", index=False)

In [57]:
features = titanic.drop("Survived", axis=1)
labels = titanic["Survived"]

X_train, X_test, y_train, y_test=train_test_split(features, labels, test_size=0.4, random_state=42)
X_test, X_val, y_test, y_val=train_test_split(X_test, y_test, test_size=0.5, random_state=42)

In [58]:
for dataset in [y_train, y_test, y_val]:
    print(round(len(dataset)/len(labels), 2))

0.6
0.2
0.2


In [59]:
X_train.to_csv("/Users/Train_features.csv", index=False)
X_val.to_csv("/Users/Validate.csv", index=False)
X_test.to_csv("/Users/Test_features.csv", index=False)
y_train.to_csv("/Users/Train_labels.csv", index=False)
y_val.to_csv("/Users/Validate_labels.csv", index=False)
y_test.to_csv("/Users/Test_labels.csv", index=False)


In [62]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [65]:
tr_features= pd.read_csv("/Users/Features.csv")
tr_labels= pd.read_csv("/Users/Labels.csv")

In [67]:
rf = RandomForestClassifier()
scores= cross_val_score(rf, tr_features, tr_labels.values.ravel(), cv=5)

In [68]:
scores

array([0.82242991, 0.81308411, 0.78504673, 0.79439252, 0.83018868])

In [69]:
from sklearn.model_selection import GridSearchCV
tr_features= pd.read_csv("/Users/Train_features.csv")
tr_labels= pd.read_csv("/Users/Train_labels.csv")

In [71]:
def print_results(results):
    print("BEST PARAMS: {}\n".format(results.best_params))
          
    means = results.cv_results_["mean_test_score"]
    stds = results.cv_results_["std_test_score"]
    for mean, std, params in zip(means, stds, results.cv_results_["params"]):
        print("{} (+/-{}) for {}".format(round(mean, 3), round(std *2, 3), paramd))

In [75]:
rf = RandomForestClassifier()
parameters = {
    "n_estimators": [5, 50, 100],
    "max_depth": [2, 10, 20, None]
}
cv = GridSearchCV(rf, parameters, cv=5)
cv.fit(tr_features, tr_labels.values.ravel())
print_results(cv)

AttributeError: 'GridSearchCV' object has no attribute 'best_params'

In [76]:
tr_features= pd.read_csv("/Users/train_features.csv")
tr_labels= pd.read_csv("/Users/train_labels.csv")

val_features= pd.read_csv("/Users/validate_features.csv")
val_labels= pd.read_csv("/Users/validate_labels.csv")

te_features= pd.read_csv("/Users/test_features.csv")
te_labels= pd.read_csv("/Users/test_labels.csv")

In [77]:
rf1 = RandomForestClassifier(n_estimators=5, max_depth=10)
rf1.fit(tr_features, tr_labels.values.ravel())

rf2 = RandomForestClassifier(n_estimators=100, max_depth=10)
rf2.fit(tr_features, tr_labels.values.ravel())

rf3 = RandomForestClassifier(n_estimators=100, max_depth=None)
rf3.fit(tr_features, tr_labels.values.ravel())

RandomForestClassifier()

In [79]:
#calculate accuracy, precison and recall scores for multiple models using validation dataset
from sklearn.metrics import accuracy_score, precision_score, recall_score
for mdl in [rf1,rf2,rf3]:
    y_pred = mdl.predict(val_features)
    accuracy = round(accuracy_score(val_labels, y_pred), 3)
    precision = round(precision_score(val_labels, y_pred), 3)
    recall = round(recall_score(val_labels, y_pred), 3)
    print('MAX DEPTH: {} / # OF EST: {} -- A: {} / P: {} / R: {}'.format(mdl.max_depth,
                                                                        mdl.n_estimators,
                                                                        accuracy,
                                                                        precision,
                                                                        recall))

MAX DEPTH: 10 / # OF EST: 5 -- A: 0.832 / P: 0.838 / R: 0.75
MAX DEPTH: 10 / # OF EST: 100 -- A: 0.827 / P: 0.881 / R: 0.684
MAX DEPTH: None / # OF EST: 100 -- A: 0.827 / P: 0.846 / R: 0.724


In [81]:
#calculate accuracy, precison and recall scores for multiple models using testing dataset
y_pred = rf2.predict(te_features)
accuracy = round(accuracy_score(te_labels, y_pred), 3)
precision = round(precision_score(te_labels, y_pred), 3)
recall = round(recall_score(te_labels, y_pred), 3)
print('MAX DEPTH: {} / # OF EST: {} -- A: {} / P: {} / R: {}'.format(mdl.max_depth,
                                                                        mdl.n_estimators,
                                                                        accuracy,
                                                                        precision,
                                                                        recall))

MAX DEPTH: None / # OF EST: 100 -- A: 0.798 / P: 0.754 / R: 0.662


In [42]:
features = titanic.drop("Survived", axis=1)
labels = titanic["Survived"]

X_train, X_test, y_train, y_test=train_test_split(features, labels, test_size=0.4, random_state=42)
X_test, X_val, y_test, y_val=train_test_split(X_test, y_test, test_size=0.5, random_state=42)

In [43]:
print(len(labels), len(y_train),len(y_test),len(y_val))

891 534 178 179
