# Random Forest - Titanic Dataset
Predicting the survival of passengers in the Titanic dataset from the other features. For optimization's sake, we drop Name, Ticket and Cabin, as they do not contribute to the prediction. Our goal is to take a passenger's details as input and predict whether that person survived. PassengerId is a different case: Kaggle needs the PassengerIds to check the output, so we shouldn't drop it from the original table, but we can drop it from the training and testing feature data.

In [4]:
import zipfile
import os

In [5]:
# Location of the downloaded archive and the folder its contents go into.
zip_path = "titanic.zip"
extract_to = "titanic_data"

# Create the destination up front; exist_ok keeps the cell safe to re-run.
os.makedirs(extract_to, exist_ok=True)

# The context manager closes the archive even if extraction fails.
with zipfile.ZipFile(zip_path, mode="r") as archive:
    archive.extractall(path=extract_to)

print(f"Extracted files to: {extract_to}")

Extracted files to: titanic_data


In [6]:
import pandas as pd

# Read the labelled training split from the extracted archive folder.
train = pd.read_csv(filepath_or_buffer="titanic_data/train.csv")
# Preview the first five rows to see the raw columns.
train.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [7]:
# Read the unlabelled test split (no Survived column) from the same folder.
test = pd.read_csv(filepath_or_buffer="titanic_data/test.csv")
# Preview the first five rows to see the raw columns.
test.head(n=5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [8]:
# Summary statistics of the numeric training columns; a count below the
# number of rows means that column contains missing values.
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [9]:
# Count the missing entries of the three columns of interest in one pass.
train_nan_counts = train[['Embarked', 'Age', 'Cabin']].isna().sum()

print("Train:")
print(f"Unique values in Embarked: {train['Embarked'].unique()}")
print(f"Number of NaN in Embarked: {train_nan_counts['Embarked']}")
print(f"Number of NaN in Age     : {train_nan_counts['Age']}")
print(f"Number of NaN in Cabin   : {train_nan_counts['Cabin']}")
# Total number of rows, to put the Cabin NaN count into perspective.
print(f"Number of values in Cabin: {len(train['Cabin'])}")

Train:
Unique values in Embarked: ['S' 'C' 'Q' nan]
Number of NaN in Embarked: 2
Number of NaN in Age     : 177
Number of NaN in Cabin   : 687
Number of values in Cabin: 891


In [10]:
# Summary statistics of the numeric test columns; a count below the
# number of rows means that column contains missing values.
test.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,418.0,418.0,332.0,418.0,418.0,417.0
mean,1100.5,2.26555,30.27259,0.447368,0.392344,35.627188
std,120.810458,0.841838,14.181209,0.89676,0.981429,55.907576
min,892.0,1.0,0.17,0.0,0.0,0.0
25%,996.25,1.0,21.0,0.0,0.0,7.8958
50%,1100.5,3.0,27.0,0.0,0.0,14.4542
75%,1204.75,3.0,39.0,1.0,0.0,31.5
max,1309.0,3.0,76.0,8.0,9.0,512.3292


In [11]:
# Count the missing entries of the three columns of interest in one pass.
test_nan_counts = test[['Embarked', 'Age', 'Cabin']].isna().sum()

print("Test:")
print(f"Unique values in Embarked: {test['Embarked'].unique()}")
print(f"Number of NaN in Embarked: {test_nan_counts['Embarked']}")
print(f"Number of NaN in Age     : {test_nan_counts['Age']}")
print(f"Number of NaN in Cabin   : {test_nan_counts['Cabin']}")

Test:
Unique values in Embarked: ['Q' 'S' 'C']
Number of NaN in Embarked: 0
Number of NaN in Age     : 86
Number of NaN in Cabin   : 327


# Becoming one with data
Observations:


*   The training data has 891 rows, while the testing data has 418 rows.
*   The Sex column has string entries that have to be converted to 0s and 1s.
*   The Embarked column is text that has to be mapped to appropriate numbers.
*   Age, Cabin and Embarked have NaN values that are to be replaced; Fare also has one missing value in the test data (count of 417 out of 418).

Solutions:
*   Label-encode the Sex and Embarked columns.
*   The NaN values in Age are replaced with the median. The median is better than the mean as it isn't influenced by outliers in the data.
*   There are only a few NaN values in Embarked (2 of 891 in the training data), hence we can replace them with the mode of the column.
*   Since 77.1% of Cabin is NaN, we cannot replace it with the mode because that would be misleading. We could replace it with a different value like 'Unknown'. But Cabin doesn't contribute to the prediction of survivability, so we drop it.
*   We do not have to normalize the features: scaling matters for scale-sensitive models such as neural networks and KNNs, whereas trees pick the feature that best splits the data and a threshold to split on, which does not depend on the feature's scale.
*   Suggested by Gemini: Combine SibSp (Siblings/Spouses) and Parch (Parents/Children) into FamilySize (SibSp + Parch + 1, so the passenger is counted too). Passengers travelling alone (FamilySize of 1) have lower survival rates, and small families have higher survival rates compared to larger families.
*   Another suggestion from Gemini: Columns like name, ticket and cabin are not needed anyway and they can be dropped.

# Preprocessing

In [12]:
# Label Encoding Sex column
train['Sex'] = train['Sex'].map({'male': 0, 'female': 1})

# Label Encoding Embarked column.
# Every code is an integer (2, not the string '2') so the column stays
# numeric and uses the same encoding as the test data.
train['Embarked'] = train['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})

# Filling NaN in Age with median
train['Age'] = train['Age'].fillna(train['Age'].median())

# Filling NaN in Embarked with mode.
# mode() returns a pandas Series (there can be ties), so [0] extracts the
# first most-frequent code. The NaNs left by map() make the column float,
# so it is cast back to int once they are filled.
train['Embarked'] = (
    train['Embarked'].fillna(train['Embarked'].mode()[0]).astype(int)
)

# Creating a new column FamilySize to improve the accuracy of prediction.
# The +1 counts the passenger, so someone travelling alone has FamilySize 1.
train['FamilySize'] = train['SibSp'] + train['Parch'] + 1

# Dropping columns Cabin, Name, Ticket
train = train.drop(['Cabin', 'Name', 'Ticket'], axis=1)

In [13]:
# Label Encoding Sex column
test['Sex'] = test['Sex'].map({'male': 0, 'female': 1})

# Label Encoding Embarked column
test['Embarked'] = test['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})

# Filling NaN in Age with the median of the training data, so the test set
# is imputed with statistics learned from the training set only.
test['Age'] = test['Age'].fillna(train['Age'].median())

# Filling NaN in Fare with the training median. The test set has one missing
# Fare (describe() reports a count of 417 for 418 rows) that would otherwise
# reach the model as NaN.
test['Fare'] = test['Fare'].fillna(train['Fare'].median())

# Filling NaN in Embarked with the training mode, for the same reason as Age.
# mode() returns a Series, so [0] extracts the most frequent code.
test['Embarked'] = (
    test['Embarked'].fillna(train['Embarked'].mode()[0]).astype(int)
)

# Creating a new column FamilySize to improve the accuracy of prediction.
# The +1 counts the passenger, so someone travelling alone has FamilySize 1.
test['FamilySize'] = test['SibSp'] + test['Parch'] + 1

# Dropping columns Cabin, Name, Ticket
test = test.drop(['Cabin', 'Name', 'Ticket'], axis=1)

In [14]:
# Display the preprocessed training frame to check the encoded columns and
# the new FamilySize column.
train

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize
0,1,0,3,0,22.0,1,0,7.2500,0,2
1,2,1,1,1,38.0,1,0,71.2833,1,2
2,3,1,3,1,26.0,0,0,7.9250,0,1
3,4,1,1,1,35.0,1,0,53.1000,0,2
4,5,0,3,0,35.0,0,0,8.0500,0,1
...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,0,27.0,0,0,13.0000,0,1
887,888,1,1,1,19.0,0,0,30.0000,0,1
888,889,0,3,1,28.0,1,2,23.4500,0,4
889,890,1,1,0,26.0,0,0,30.0000,1,1


In [15]:
# Display the preprocessed test frame to check the encoded columns and
# the new FamilySize column.
test

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize
0,892,3,0,34.5,0,0,7.8292,2,1
1,893,3,1,47.0,1,0,7.0000,0,2
2,894,2,0,62.0,0,0,9.6875,2,1
3,895,3,0,27.0,0,0,8.6625,0,1
4,896,3,1,22.0,1,1,12.2875,0,3
...,...,...,...,...,...,...,...,...,...
413,1305,3,0,28.0,0,0,8.0500,0,1
414,1306,1,1,39.0,0,0,108.9000,1,1
415,1307,3,0,38.5,0,0,7.2500,0,1
416,1308,3,0,28.0,0,0,8.0500,0,1


# Creating training and testing data

In [16]:
"""Dropping PassengerId because it doesn't contribute to the training and
survived because it is the attribute to be predicted"""
# Target vector: the label the model learns to predict.
y_train = train['Survived']

# Feature matrices: everything except the identifier (and, for the training
# data, the label itself).
X_train = train.drop(columns=['PassengerId', 'Survived'])
X_test = test.drop(columns='PassengerId')

# What are we really working on?
Create a Random Forest with 100 decision trees (each limited to a depth of 5), train it on the given data and predict on the testing data. Create a dictionary with `submissions_ids` (the PassengerId column from the test table) and the predictions. Convert it to a DataFrame, write it to CSV and upload it to Kaggle.

In [23]:
from sklearn.ensemble import RandomForestClassifier

# n_estimators is the number of trees in the forest, max_depth caps the depth
# of each tree, and random_state fixes the seed so re-runs give the same model.
rstfore = RandomForestClassifier(
    n_estimators=100,
    max_depth=5,
    random_state=42,
)

# Fit on the training features and labels, then predict the test passengers.
rstfore.fit(X_train, y_train)
prediction = rstfore.predict(X_test)

In [24]:
# PassengerId is taken from the test table itself (not from X_test, where it
# was dropped) so every prediction is paired with its passenger.
submissions_ids = test['PassengerId']

# Assemble the two submission columns, then build the frame from them.
submission_columns = {'PassengerId': submissions_ids, 'Survived': prediction}
submission = pd.DataFrame(submission_columns)

# index=False keeps the DataFrame index out of the CSV.
submission.to_csv(path_or_buf='submission_random_forest.csv', index=False)