In [115]:
import numpy as np
import pandas as pd
import os 
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier

In [116]:
# Read train.csv into `df` and test.csv into `df_test` from the working directory.
df, df_test = (pd.read_csv(csv_path) for csv_path in ("./train.csv", "./test.csv"))

In [117]:
# Display the raw training frame (pandas truncates the middle rows).
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [118]:
df.info() # Check each column's dtype and non-null count

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [119]:
df.columns # Columns present in the dataset

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [120]:
# Separate the target labels (Survived) from the remaining feature columns.
x_train = df.drop("Survived", axis=1)
y_train = df.loc[:, "Survived"]

In [121]:
# Display the feature frame (every column of `df` except Survived).
x_train


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...
886,887,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [122]:
# Check dtypes and non-null counts of the feature columns.
x_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Pclass       891 non-null    int64  
 2   Name         891 non-null    object 
 3   Sex          891 non-null    object 
 4   Age          714 non-null    float64
 5   SibSp        891 non-null    int64  
 6   Parch        891 non-null    int64  
 7   Ticket       891 non-null    object 
 8   Fare         891 non-null    float64
 9   Cabin        204 non-null    object 
 10  Embarked     889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 76.7+ KB


In [123]:
# Count missing values per feature column.
x_train.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [124]:
# Cabin is missing for 687 of the 891 training rows, so the column is removed
# rather than imputed.
# The frame is rebuilt from `df` (not from `x_train` itself) so that re-running
# this cell does not raise KeyError on the already-removed column.
x_train = df.drop(columns=["Survived", "Cabin"])

In [125]:
# Verify that Cabin is no longer among the feature columns.
x_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Pclass       891 non-null    int64  
 2   Name         891 non-null    object 
 3   Sex          891 non-null    object 
 4   Age          714 non-null    float64
 5   SibSp        891 non-null    int64  
 6   Parch        891 non-null    int64  
 7   Ticket       891 non-null    object 
 8   Fare         891 non-null    float64
 9   Embarked     889 non-null    object 
dtypes: float64(2), int64(4), object(4)
memory usage: 69.7+ KB


In [126]:
# One-hot encode Sex, Embarked and Pclass, which are categorical features.

In [127]:
# Expand each categorical feature into one 0/1 integer indicator column per category.
categorical_cols = ["Pclass", "Sex", "Embarked"]
x_train_categ = pd.get_dummies(
    data=x_train,
    columns=categorical_cols,
    dtype=int,
)

In [128]:
# Display the one-hot encoded training frame.
x_train_categ

Unnamed: 0,PassengerId,Name,Age,SibSp,Parch,Ticket,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,1,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.2500,0,0,1,0,1,0,0,1
1,2,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,1,0,0,1,0,1,0,0
2,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.9250,0,0,1,1,0,0,0,1
3,4,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1000,1,0,0,1,0,0,0,1
4,5,"Allen, Mr. William Henry",35.0,0,0,373450,8.0500,0,0,1,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,"Montvila, Rev. Juozas",27.0,0,0,211536,13.0000,0,1,0,0,1,0,0,1
887,888,"Graham, Miss. Margaret Edith",19.0,0,0,112053,30.0000,1,0,0,1,0,0,0,1
888,889,"Johnston, Miss. Catherine Helen ""Carrie""",,1,2,W./C. 6607,23.4500,0,0,1,1,0,0,0,1
889,890,"Behr, Mr. Karl Howell",26.0,0,0,111369,30.0000,1,0,0,0,1,1,0,0


In [129]:
# Check the dtypes of the new indicator columns and see which columns still hold
# non-numeric (object) data or missing values.
x_train_categ.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Name         891 non-null    object 
 2   Age          714 non-null    float64
 3   SibSp        891 non-null    int64  
 4   Parch        891 non-null    int64  
 5   Ticket       891 non-null    object 
 6   Fare         891 non-null    float64
 7   Pclass_1     891 non-null    int64  
 8   Pclass_2     891 non-null    int64  
 9   Pclass_3     891 non-null    int64  
 10  Sex_female   891 non-null    int64  
 11  Sex_male     891 non-null    int64  
 12  Embarked_C   891 non-null    int64  
 13  Embarked_Q   891 non-null    int64  
 14  Embarked_S   891 non-null    int64  
dtypes: float64(2), int64(11), object(2)
memory usage: 104.5+ KB


In [130]:
# Keep only the columns used as model features. PassengerId, Name and Ticket are
# identifier/free-text columns, and Age has missing values in the training data.
# TODO: try using Fare next time; the test set has a row with a NaN Fare that
# would need to be imputed first.
dropped_cols = ["PassengerId", "Name", "Age", "Ticket", "Fare"]
# Reassign instead of using `inplace=True`, and tolerate already-removed columns,
# so the cell can be re-run without raising KeyError.
x_train_categ = x_train_categ.drop(columns=dropped_cols, errors="ignore")

In [131]:
# Remove the columns that are not used as features from the test set (the same
# ones removed from the training features, including Cabin).
# Reassigning with errors="ignore" keeps the cell re-runnable; the previous
# `inplace=True` drop raised KeyError when executed a second time.
test_dropped_cols = ["PassengerId", "Name", "Age", "Ticket", "Cabin", "Fare"]
df_test = df_test.drop(columns=test_dropped_cols, errors="ignore")

In [132]:
# One-hot encode the test features with the same categorical columns as training.
categorical_cols = ["Pclass", "Sex", "Embarked"]
x_test_categ = pd.get_dummies(df_test, columns=categorical_cols, dtype=int)
# Train and test are encoded independently, so a category that appears in only
# one of them would produce different columns. Align the test frame to the
# training columns (same set, same order); indicators absent from test become 0.
x_test_categ = x_test_categ.reindex(columns=x_train_categ.columns, fill_value=0)

In [133]:
# Display the encoded test features.
x_test_categ

Unnamed: 0,SibSp,Parch,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,0,0,0,0,1,0,1,0,1,0
1,1,0,0,0,1,1,0,0,0,1
2,0,0,0,1,0,0,1,0,1,0
3,0,0,0,0,1,0,1,0,0,1
4,1,1,0,0,1,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...
413,0,0,0,0,1,0,1,0,0,1
414,0,0,1,0,0,1,0,1,0,0
415,0,0,0,0,1,0,1,0,0,1
416,0,0,0,0,1,0,1,0,0,1


In [149]:
# Fit a random forest (50 trees, depth capped at 3, fixed seed) on the encoded
# training features.
rf_params = {"n_estimators": 50, "max_depth": 3, "random_state": 1}
model = RandomForestClassifier(**rf_params)
model.fit(x_train_categ, y_train)

In [150]:
# Read gender_submission.csv and keep everything except the PassengerId column.
df_submit = pd.read_csv("./gender_submission.csv").drop(columns=["PassengerId"])

In [151]:
# Accuracy of the model's test-set predictions measured against the labels
# loaded into `df_submit` from gender_submission.csv.
# NOTE(review): gender_submission.csv looks like a sample submission (a sex-based
# baseline), not ground-truth labels. If so, this number is agreement with that
# baseline rather than true test accuracy. Confirm, and prefer cross-validation
# on the training data for an honest estimate.
model.score(x_test_categ,df_submit)

0.9784688995215312

In [152]:
# Number of columns in the encoded training feature frame.
len(x_train_categ.columns)

10