In [1]:
import pandas as pd
import os

from unicodedata2 import category

# Loading the data

In [2]:
# Project folders, resolved against the current working directory of the kernel.
_cwd = os.getcwd()
DIR_DATASET = os.path.join(_cwd, 'dataset')
DIR_SUBMISSIONS = os.path.join(_cwd, 'submissions')

# Read the training set from the dataset folder and render it.
train_csv_path = os.path.join(DIR_DATASET, 'train.csv')
train = pd.read_csv(train_csv_path)
display(train)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


# Fill missing values

In [None]:
# Work on an independent copy so the loaded `train` frame is left untouched
# by the imputation steps that follow.
train_filled = train.copy(deep=True)

## Age

In [4]:
# Count the gaps first so the effect of the fill can be checked afterwards.
num_missing_age = train_filled['Age'].isna().sum()
print(f"BEFORE - Number of rows with missing age: {num_missing_age}")

# Impute with the median of the known ages (median() skips missing values).
median_age = train_filled['Age'].median()
print(f"Filling missing values with median age: {median_age}")

# Assign the filled column back instead of mutating the frame in place.
train_filled['Age'] = train_filled['Age'].fillna(median_age)
print(f"AFTER - Number of rows with missing age: {train_filled['Age'].isnull().sum()}")

BEFORE - Number of rows with missing age: 177
Filling missing values with median age: 28.0
AFTER - Number of rows with missing age: 0


🌟 We could later improve the filling of missing **Age** values by using the median per Pclass, per gender, or per title extracted from the name

## Embarked

In [8]:
# Count the gaps first so the effect of the fill can be checked afterwards.
num_missing_embarked = train_filled['Embarked'].isna().sum()
print(f"BEFORE - Number of rows with missing embarked: {num_missing_embarked}")

# Impute with the most frequent value; mode() returns a Series, so take its first entry.
mode_embarked = train_filled['Embarked'].mode().iloc[0]
print(f"Filling missing values with mode embarked: {mode_embarked}")

# Assign the filled column back instead of mutating the frame in place.
train_filled['Embarked'] = train_filled['Embarked'].fillna(mode_embarked)
print(f"AFTER - Number of rows with missing embarked: {train_filled['Embarked'].isnull().sum()}")

BEFORE - Number of rows with missing embarked: 2
Filling missing values with mode embarked: S
AFTER - Number of rows with missing embarked: 0


## Cabin

The idea here is to add a column called **has_cabin** with values **1**: has a cabin number, or **0**: doesn't have a cabin number

In [11]:
num_missing_cabin = train_filled['Cabin'].isnull().sum()
print(f"BEFORE - Number of rows with missing Cabin: {num_missing_cabin}")

# Add a 0/1 indicator column: 1 when a cabin value is present, 0 when it is missing.
# The original `Cabin` column is kept unchanged.
# Vectorized `notna()` replaces the row-by-row `apply(lambda ...)`; the values are the same.
train_filled['has_cabin'] = train_filled['Cabin'].notna().astype('int64')

# NOTE: this counts missing values in the new `has_cabin` indicator, not in `Cabin`,
# whose missing values are intentionally left as they are.
print(f"AFTER - Number of rows with missing Cabin: {train_filled['has_cabin'].isnull().sum()}")

BEFORE - Number of rows with missing Cabin: 687
AFTER - Number of rows with missing Cabin: 0


# Feature engineering

## Title group

The idea here is to create a categorical feature including the title of the passenger

In [20]:
# Collect the distinct titles found in the names: the word that follows a space
# and ends with a period (e.g. "Mr." in "Braund, Mr. Owen Harris").
# The pattern is a raw string because `\.` is a regex escape, not a valid Python
# string escape; a non-raw literal triggers an invalid-escape-sequence warning.
titles = list(train_filled['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False).unique())
print(f"List of titles appearing in Names: {titles}")

List of titles appearing in Names: ['Mr', 'Mrs', 'Miss', 'Master', 'Don', 'Rev', 'Dr', 'Mme', 'Ms', 'Major', 'Lady', 'Sir', 'Mlle', 'Col', 'Capt', 'Countess', 'Jonkheer']


In [21]:
# Add the title of each passenger: the word that follows a space and ends with a
# period in `Name`. With expand=False the single capture group comes back as a Series.
# The pattern is a raw string because `\.` is a regex escape, not a valid Python
# string escape; a non-raw literal triggers an invalid-escape-sequence warning.
train_filled['title'] = train_filled['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

In [23]:
# Uniformizing titles across passengers to reduce diversity
def replace_title(x):
    """Map a passenger's raw title onto one of the common title groups.

    Parameters
    ----------
    x : row-like mapping
        Must provide ``x['title']``. ``x['Sex']`` is read only when the
        title is ``'Dr'``.

    Returns
    -------
    'Mr', 'Mrs' or 'Miss' for the rare titles handled below; any other
    title (e.g. 'Mr', 'Master') is returned unchanged.
    """
    title = x['title']
    if title in ('Don', 'Major', 'Capt', 'Jonkheer', 'Rev', 'Col', 'Sir'):
        return 'Mr'
    if title in ('Countess', 'Mme', 'Lady'):
        return 'Mrs'
    if title in ('Mlle', 'Ms'):
        return 'Miss'
    if title == 'Dr':
        # 'Dr' does not imply a gender, so the Sex value decides the group.
        # The comparison is case-insensitive: the Sex column holds lowercase
        # 'male'/'female', so comparing against 'Male' never matched and every
        # doctor was mapped to 'Mrs'.
        is_male = str(x['Sex']).strip().lower() == 'male'
        return 'Mr' if is_male else 'Mrs'
    return title

# axis=1 hands each row to replace_title, which reads its 'title' (and 'Sex') values.
title_groups = train_filled.apply(replace_title, axis=1)
train_filled['title_group'] = title_groups
print(f"List of titles groups appearing in Names: {list(train_filled['title_group'].unique())}")


List of titles groups appearing in Names: ['Mr', 'Mrs', 'Miss', 'Master']


## Traveling alone

The idea here is to create a variable that flags whether the passenger traveled alone or with other people such as parents, a spouse, etc.

In [27]:
# 1 when the passenger has no relatives counted in Parch or SibSp, else 0.
# The column-wise sum and comparison replace the row-by-row apply(axis=1);
# the resulting 0/1 values are the same.
train_filled['is_alone'] = (train_filled['Parch'] + train_filled['SibSp']).eq(0).astype('int64')

# Preprocess the data

To get the data ready to train ML models

In [36]:
# Independent copy: the preprocessing below must not alter `train_filled`.
train_prep = train_filled.copy(deep=True)

## Drop unnecessary columns

In [37]:
# Remove the columns that are not passed on to the encoding step:
# the raw text columns and the raw title (its grouped version, title_group, is kept).
train_prep = train_prep.drop(columns=['Name', 'Ticket', 'Cabin', 'title'])

## Apply one-hot encoding to categorical features

In [38]:
# Columns to expand into one indicator column per category, stored as integers.
categorical_col = ['Sex', 'Embarked', 'title_group']
train_ml = pd.get_dummies(data=train_prep, columns=categorical_col, dtype='int')

# Split the train dataset and save the data

In [41]:
# Separate the target column (Survived) from the feature columns
y_train = train_ml['Survived']
X_train = train_ml.drop(columns='Survived')

# Write both parts to CSV files in the dataset folder, without the row index
for frame, file_name in ((X_train, 'X_train.csv'), (y_train, 'y_train.csv')):
    frame.to_csv(os.path.join(DIR_DATASET, file_name), index=False)