# Kaggle's Titanic competition first submission

## Setup & load data

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
ID_NAME = "PassengerId"
TARGET_NAME = "Survived"

In [3]:
train = pd.read_csv("train.csv").set_index(ID_NAME)
test = pd.read_csv("test.csv").set_index(ID_NAME)

In [4]:
train.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
test.head()

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


## Feature engineering & selection

In [6]:
train["SibSp_bin"] = (train["SibSp"] > 0).astype(int)
test["SibSp_bin"] = (test["SibSp"] > 0).astype(int)
train["Parch_bin"] = (train["Parch"] > 0).astype(int)
test["Parch_bin"] = (test["Parch"] > 0).astype(int)

In [7]:
train["Title"] = train["Name"].apply(lambda x: x.split(", ")[1].split(".")[0])
test["Title"] = test["Name"].apply(lambda x: x.split(", ")[1].split(".")[0])

In [8]:
train["Title"].value_counts()

Mr              517
Miss            182
Mrs             125
Master           40
Dr                7
Rev               6
Col               2
Major             2
Mlle              2
the Countess      1
Sir               1
Mme               1
Don               1
Jonkheer          1
Capt              1
Ms                1
Lady              1
Name: Title, dtype: int64

In [9]:
test["Title"].value_counts()

Mr        240
Miss       78
Mrs        72
Master     21
Rev         2
Col         2
Dona        1
Dr          1
Ms          1
Name: Title, dtype: int64

In [10]:
num_features = [
    "Age",
    #"SibSp",
    #"Parch",
    "Fare",
]
cat_features = [
    "Pclass",
    "Sex",
    "SibSp_bin",
    "Parch_bin",
    #"Embarked",
]

## Data preprocessing

### Missing data

In [11]:
train[num_features + cat_features + [TARGET_NAME]].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Age        714 non-null    float64
 1   Fare       891 non-null    float64
 2   Pclass     891 non-null    int64  
 3   Sex        891 non-null    object 
 4   SibSp_bin  891 non-null    int64  
 5   Parch_bin  891 non-null    int64  
 6   Survived   891 non-null    int64  
dtypes: float64(2), int64(4), object(1)
memory usage: 55.7+ KB


In [12]:
test[num_features + cat_features].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 892 to 1309
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Age        332 non-null    float64
 1   Fare       417 non-null    float64
 2   Pclass     418 non-null    int64  
 3   Sex        418 non-null    object 
 4   SibSp_bin  418 non-null    int64  
 5   Parch_bin  418 non-null    int64  
dtypes: float64(2), int64(3), object(1)
memory usage: 22.9+ KB


In [13]:
title_age_dict = train.groupby("Title")["Age"].mean().to_dict()

In [14]:
train["Age"] = train["Age"].fillna(train["Title"].map(title_age_dict))

In [15]:
train[num_features + cat_features + [TARGET_NAME]].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Age        891 non-null    float64
 1   Fare       891 non-null    float64
 2   Pclass     891 non-null    int64  
 3   Sex        891 non-null    object 
 4   SibSp_bin  891 non-null    int64  
 5   Parch_bin  891 non-null    int64  
 6   Survived   891 non-null    int64  
dtypes: float64(2), int64(4), object(1)
memory usage: 55.7+ KB


In [16]:
test["Age"] = test["Age"].fillna(test["Title"].map(title_age_dict))

In [17]:
test[num_features + cat_features].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 892 to 1309
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Age        418 non-null    float64
 1   Fare       417 non-null    float64
 2   Pclass     418 non-null    int64  
 3   Sex        418 non-null    object 
 4   SibSp_bin  418 non-null    int64  
 5   Parch_bin  418 non-null    int64  
dtypes: float64(2), int64(3), object(1)
memory usage: 22.9+ KB


In [18]:
test[test["Fare"].isna()]

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,SibSp_bin,Parch_bin,Title
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1044,3,"Storey, Mr. Thomas",male,60.5,0,0,3701,,,S,0,0,Mr


In [19]:
pclass_fare_dict = train.groupby("Pclass")["Fare"].mean().to_dict()

In [20]:
pclass_fare_dict

{1: 84.15468749999992, 2: 20.66218315217391, 3: 13.675550101832997}

In [21]:
test["Fare"] = test["Fare"].fillna(test["Pclass"].map(pclass_fare_dict))

In [22]:
test[num_features + cat_features].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 892 to 1309
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Age        418 non-null    float64
 1   Fare       418 non-null    float64
 2   Pclass     418 non-null    int64  
 3   Sex        418 non-null    object 
 4   SibSp_bin  418 non-null    int64  
 5   Parch_bin  418 non-null    int64  
dtypes: float64(2), int64(3), object(1)
memory usage: 22.9+ KB


### Feature selection

In [23]:
train = train[num_features + cat_features + [TARGET_NAME]]
test = test[num_features + cat_features]

### Transform categorical data

In [24]:
cat_features

['Pclass', 'Sex', 'SibSp_bin', 'Parch_bin']

In [25]:
train = pd.get_dummies(train, columns=["Pclass"], drop_first=True)

In [26]:
train = pd.get_dummies(train, columns=["Sex"], drop_first=True)

In [27]:
train

Unnamed: 0_level_0,Age,Fare,SibSp_bin,Parch_bin,Survived,Pclass_2,Pclass_3,Sex_male
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,22.000000,7.2500,1,0,0,0,1,1
2,38.000000,71.2833,1,0,1,0,0,0
3,26.000000,7.9250,0,0,1,0,1,0
4,35.000000,53.1000,1,0,1,0,0,0
5,35.000000,8.0500,0,0,0,0,1,1
...,...,...,...,...,...,...,...,...
887,27.000000,13.0000,0,0,0,1,0,1
888,19.000000,30.0000,0,0,1,0,0,0
889,21.773973,23.4500,1,1,0,0,1,0
890,26.000000,30.0000,0,0,1,0,0,1


In [28]:
test = pd.get_dummies(test, columns=["Pclass"], drop_first=True)

In [29]:
test = pd.get_dummies(test, columns=["Sex"], drop_first=True)

In [30]:
test

Unnamed: 0_level_0,Age,Fare,SibSp_bin,Parch_bin,Pclass_2,Pclass_3,Sex_male
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
892,34.500000,7.8292,0,0,0,1,1
893,47.000000,7.0000,1,0,0,1,0
894,62.000000,9.6875,0,0,1,0,1
895,27.000000,8.6625,0,0,0,1,1
896,22.000000,12.2875,1,1,0,1,0
...,...,...,...,...,...,...,...
1305,32.368090,8.0500,0,0,0,1,1
1306,39.000000,108.9000,0,0,0,0,0
1307,38.500000,7.2500,0,0,0,1,1
1308,32.368090,8.0500,0,0,0,1,1


## Model

In [31]:
X_train, X_test, y_train, y_test = train_test_split(
    train.drop(columns="Survived"), 
    train["Survived"],
    test_size=0.3
)

In [32]:
rf = RandomForestClassifier()

In [33]:
rf.fit(X_train, y_train)

RandomForestClassifier()

In [34]:
predictions = rf.predict(X_test)

In [35]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.83      0.83      0.83       163
           1       0.74      0.74      0.74       105

    accuracy                           0.80       268
   macro avg       0.79      0.79      0.79       268
weighted avg       0.80      0.80      0.80       268



In [36]:
rf.feature_importances_

array([0.27945235, 0.3026928 , 0.02510076, 0.02608031, 0.02149866,
       0.06939746, 0.27577766])

In [37]:
dict(zip(X_train.columns, rf.feature_importances_))

{'Age': 0.2794523515683768,
 'Fare': 0.30269279967863066,
 'SibSp_bin': 0.02510076208613127,
 'Parch_bin': 0.026080306951129527,
 'Pclass_2': 0.021498660014111264,
 'Pclass_3': 0.06939746150256634,
 'Sex_male': 0.27577765819905425}

In [38]:
X_train = train.drop(columns="Survived")
X_test = test
y_train = train["Survived"]

In [39]:
rf.fit(X_train, y_train)

RandomForestClassifier()

In [40]:
predictions = rf.predict(X_test)

In [48]:
submission = test.reset_index()[[ID_NAME]]
submission[TARGET_NAME] = predictions

In [49]:
submission

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,1
3,895,1
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [50]:
submission.to_csv("first_submission.csv", index=False)