In [26]:
import numpy as np
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd

In [27]:
# Read the Titanic train and test CSVs from the local datasets folder
train_data_sample, test_data_sample = (
    pd.read_csv(csv_path)
    for csv_path in ("datasets/titanic/train.csv", "datasets/titanic/test.csv")
)


In [28]:
# Preview the first rows of the training set to see the columns and value formats
train_data_sample.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [29]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns
train_data_sample.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [30]:
# Column dtypes and non-null counts: shows which columns contain missing values
train_data_sample.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


# First Attempt:
Fit the data with a RandomForestClassifier. This is only meant to establish a baseline that later attempts can be compared against.

In [31]:
# Baseline data cleaning: keep only the number-like columns by dropping
# every object-dtype column. Deliberately naive, but it gives the model
# something it can consume right away.
numeric_samples = train_data_sample.select_dtypes(exclude=["object"])
numeric_samples.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Age          714 non-null    float64
 4   SibSp        891 non-null    int64  
 5   Parch        891 non-null    int64  
 6   Fare         891 non-null    float64
dtypes: float64(2), int64(5)
memory usage: 48.9 KB


In [32]:
# Age is the only numeric column with missing values. For this naive
# baseline, replace them with the mean of the observed ages.
# NOTE: the mean is computed over all rows, including those that are later
# held out for testing; acceptable for a rough baseline only.
mean_age = numeric_samples["Age"].mean()

# Fill at the frame level and rebind instead of calling
# `numeric_samples["Age"].fillna(..., inplace=True)`: in-place fills on a
# column selection are chained assignment, which warns on recent pandas and
# leaves the frame unchanged under Copy-on-Write.
numeric_samples = numeric_samples.fillna({"Age": mean_age})
numeric_samples.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Age          891 non-null    float64
 4   SibSp        891 non-null    int64  
 5   Parch        891 non-null    int64  
 6   Fare         891 non-null    float64
dtypes: float64(2), int64(5)
memory usage: 48.9 KB


In [33]:
# Hold out 20% of the rows for evaluation. The split is seeded so the
# reported accuracy is reproducible across kernel restarts.
# NOTE: despite their names, X_train and Y_test are the complete train and
# test frames (features + label), not a feature matrix and a label vector.
X_train, Y_test = train_test_split(numeric_samples, test_size=0.2, random_state=0)

# Select columns by name rather than by position so a change in column
# order cannot silently leak the label into the features.
# PassengerId is excluded as well, as the original positional slice did.
non_feature_columns = ["PassengerId", "Survived"]
x_train = X_train.drop(columns=non_feature_columns)
y_train = X_train["Survived"]

x_test = Y_test.drop(columns=non_feature_columns)
y_test = Y_test["Survived"]

x_train.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare
379,3,19.0,0,0,7.775
7,3,2.0,3,1,21.075
59,3,11.0,5,2,46.9
555,1,62.0,0,0,26.55
851,3,74.0,0,0,7.775


In [34]:
# Train a random forest on the training features; the fixed random_state
# makes the fit repeatable for a given training set.
model = RandomForestClassifier(random_state=0).fit(x_train, y_train)
model

In [35]:
# Score the baseline: share of held-out rows whose label was predicted correctly
test_predictions = model.predict(x_test)
accuracy_score(y_test, test_predictions)

0.6312849162011173