In [731]:
import sys
import os
import numpy as np
import pandas as pd
import tensorflow as tf
import sklearn
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
import joblib

# Version register: record the interpreter and library versions used for this run
versions = {
    "Python": sys.version,
    "Pandas": pd.__version__,
    "TensorFlow": tf.__version__,
    "Scikit-learn": sklearn.__version__,
    "Joblib": joblib.__version__,
}
for library, version in versions.items():
    print(f"{library} version:", version)

Python version: 3.11.7 (tags/v3.11.7:fa7a6f2, Dec  4 2023, 19:24:49) [MSC v.1937 64 bit (AMD64)]
Pandas version: 2.1.4
TensorFlow version: 2.15.0
Scikit-learn version: 1.3.2
Joblib version: 1.3.2


## Dataset

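The first step in creating a machine learning model is to load the dataset used in the problem. The code cell below loads the CSV files containing the training and test data, sets `PassengerId` as the index of both DataFrames, adds an empty (`NaN`) `Survived` column to the test set, and concatenates the two into a single DataFrame so that they can be preprocessed together. The first five rows of the combined DataFrame are displayed.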

In [732]:
# Read the training and test CSV files, using 'PassengerId' as the row index
data_dir = 'data'
train_df = pd.read_csv(os.path.join(data_dir, 'train.csv'), index_col='PassengerId')
test_df = pd.read_csv(os.path.join(data_dir, 'test.csv'), index_col='PassengerId')

# The target is unknown for the test rows, so give them an all-NaN 'Survived' column
test_df['Survived'] = np.nan

# Stack the training rows on top of the test rows so preprocessing runs on one frame
df = pd.concat([train_df, test_df])

# Preview the combined frame
display(df.head())

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### Checking for 'NaN' values

It is good practice to always check for NaN values in your data. In my solution, I checked the percentage of NaN entries in each column to determine how I want to handle them.

As the output shows, the Age column contains quite a lot of NaN values. Because my hypothesis is that age is an important feature, I will keep the column and fill in the missing values with the mean. Filling missing values with the mean is a popular and simple approach for numerical data.

Cabin might be an interesting feature, but in its current state it is not usable. The first problem is that about 77.5% of the passengers have no cabin recorded. The second is that, even among the passengers who do have a cabin, the cabins are spread over decks on different levels of the ship, making escape from deck 'G' significantly harder than escaping from deck 'A'. To fix this, I will encode every passenger without a deck as 0, deck A as 1, deck B as 2, and so forth. This way, the general cabin location can be used as categorical data.

Embarked still needs to be studied further, but the port where a person boarded might be connected to their wealth. For this reason I will turn it into categorical data and fill the 'NaN' entries with the most frequent value (the mode).

In [733]:
# Fraction of missing entries per column, expressed as a percentage
nan_percentage = df.isnull().mean().mul(100)

# Report the result, one line per column
print("\nPercentage of rows with NaN values for each column:")
print(nan_percentage)


Percentage of rows with NaN values for each column:
Survived    31.932773
Pclass       0.000000
Name         0.000000
Sex          0.000000
Age         20.091673
SibSp        0.000000
Parch        0.000000
Ticket       0.000000
Fare         0.076394
Cabin       77.463713
Embarked     0.152788
dtype: float64


### Dropping Useless Information

Some information in a dataset might be unusable. In this case, I deemed the names of the passengers to be unnecessary, as they do not provide any value in predicting whether a person survives. The same goes for the ticket value, which seems to contain too much randomness to be preprocessed into a useful feature.

In [734]:
# Remove the columns that are not used as features, in a single call
df.drop(columns=['Name', 'Ticket'], inplace=True)

# Preview the remaining columns
display(df.head())

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,0.0,3,male,22.0,1,0,7.25,,S
2,1.0,1,female,38.0,1,0,71.2833,C85,C
3,1.0,3,female,26.0,0,0,7.925,,S
4,1.0,1,female,35.0,1,0,53.1,C123,S
5,0.0,3,male,35.0,0,0,8.05,,S


### Preprocessing 'Sex'

The first value that needs to be represented differently is sex. The sex should be represented as a categorical field where male is 0, female is 1. From inspecting the dataset, we can conclude that there are no missing values and that the sex is either male or female. 

In [735]:
# Binary-encode 'Sex': male -> 0, female -> 1.
# Series.map turns any value that is not a key of the dict into NaN.
sex_codes = {'male': 0, 'female': 1}
df['Sex'] = df['Sex'].map(sex_codes)

# Preview the encoded column
display(df.head())

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,0.0,3,0,22.0,1,0,7.25,,S
2,1.0,1,1,38.0,1,0,71.2833,C85,C
3,1.0,3,1,26.0,0,0,7.925,,S
4,1.0,1,1,35.0,1,0,53.1,C123,S
5,0.0,3,0,35.0,0,0,8.05,,S


### Preprocessing 'Cabin'

The approach for preprocessing Cabin, and the reasoning behind it, were described earlier. The result is stored in a new column, 'CabinDeck', and the original 'Cabin' column is dropped.

In [736]:
# Numeric code for each deck letter (A -> 1 ... G -> 7); 0 is used for everything else
deck_mapping = {deck: code for code, deck in enumerate('ABCDEFG', start=1)}

# The deck is the first character of 'Cabin' (NaN when no cabin is recorded)
deck_letter = df['Cabin'].str[0]

# Translate letters to codes; missing cabins and letters that are not keys of
# deck_mapping become NaN in the map step and are then set to 0
df['CabinDeck'] = deck_letter.map(deck_mapping).fillna(0).astype(int)

# 'Cabin' has been replaced by 'CabinDeck'
df.drop(columns='Cabin', inplace=True)

# Preview the new column
display(df.head())

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,CabinDeck
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,0.0,3,0,22.0,1,0,7.25,S,0
2,1.0,1,1,38.0,1,0,71.2833,C,3
3,1.0,3,1,26.0,0,0,7.925,S,0
4,1.0,1,1,35.0,1,0,53.1,S,3
5,0.0,3,0,35.0,0,0,8.05,S,0


### Preprocessing 'Embarked'

In this code cell, the missing values for 'Embarked' were handled as explained before. Now the data is ready to be one hot encoded.

In [737]:
# Most frequent value of 'Embarked' (the mode), used to fill the missing entries
mode_embarked = df['Embarked'].mode()[0]

# Fill the NaN entries and convert the column to categorical data.
# The result is assigned back to df['Embarked'] rather than calling
# fillna(..., inplace=True) on the column selection: in-place modification
# through df[col] is chained assignment, which newer pandas versions deprecate
# and which no longer updates df once copy-on-write is enabled.
df['Embarked'] = df['Embarked'].fillna(mode_embarked).astype('category')

# Display the updated DataFrame
display(df.head())

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,CabinDeck
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,0.0,3,0,22.0,1,0,7.25,S,0
2,1.0,1,1,38.0,1,0,71.2833,C,3
3,1.0,3,1,26.0,0,0,7.925,S,0
4,1.0,1,1,35.0,1,0,53.1,S,3
5,0.0,3,0,35.0,0,0,8.05,S,0


### Preprocessing 'Numerical Data'

Before scaling, I handled the missing values in Age.

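To scale the numerical data, I chose scikit-learn's `RobustScaler`. It centers each column on its median and scales it by the interquartile range, which makes it less sensitive to extreme values than mean/standard-deviation scaling. After scaling a column, I also saved the fitted scaler so that it can be reused when the model is applied to new data.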

In [738]:
# Fill in the NaN values in 'Age' with the column mean.
# The result is assigned back (instead of fillna(..., inplace=True) on the
# column selection) so the fill still takes effect under pandas copy-on-write.
df['Age'] = df['Age'].fillna(df['Age'].mean())

# 'Fare' also contains missing entries (see the NaN percentages printed earlier).
# Fill them with the median so that no NaN ends up in the feature matrix.
df['Fare'] = df['Fare'].fillna(df['Fare'].median())

# Specify the columns with numerical data
numerical = ['Age', 'SibSp', 'Parch', 'Fare']

# Specify the folder to store scaler instances
preprocessing_folder = 'preprocessors'

# Create the folder if it doesn't exist
os.makedirs(preprocessing_folder, exist_ok=True)

# Maps each numerical column name to the file its fitted scaler was saved to
scalers = {}

# Fit one RobustScaler per numerical column, scale the column, and persist the scaler.
# NOTE(review): the scalers (and the fill values above) are computed on df, which
# holds the training and test rows together - confirm this is intended.
for col in numerical:
    scaler = RobustScaler()
    # fit_transform returns an (n_rows, 1) array; flatten it to a 1-D column
    df[col] = scaler.fit_transform(df[[col]]).ravel()

    # Save the fitted scaler to a file in the preprocessing folder
    scaler_filename = os.path.join(preprocessing_folder, f'{col}_scaler.joblib')
    joblib.dump(scaler, scaler_filename)

    # Remember where this column's scaler was saved
    scalers[col] = scaler_filename

# Display the new header
display(df.head())

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,CabinDeck
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,0.0,3,0,-0.606241,1.0,0.0,-0.308146,S,0
2,1.0,1,1,0.624528,1.0,0.0,2.430755,C,3
3,1.0,3,1,-0.298549,0.0,0.0,-0.279274,S,0
4,1.0,1,1,0.393759,1.0,0.0,1.652999,S,3
5,0.0,3,0,0.393759,0.0,0.0,-0.273927,S,0


### Preprocessing 'Categorical Data'

To use categorical data in a neural network, it needs to be one-hot encoded. To do this, we use the `OneHotEncoder` from scikit-learn, which we also save to disk for later use.

In [739]:
# Columns that hold categorical data
categorical_columns = ['Pclass', 'Embarked', 'CabinDeck']

# Fit a one-hot encoder on the categorical columns; categories that were not
# seen during fitting are ignored at transform time instead of raising
onehot_encoder = OneHotEncoder(handle_unknown='ignore')
onehot_encoder.fit(df[categorical_columns])

# Persist the fitted encoder next to the scalers
onehot_encoder_filename = os.path.join(preprocessing_folder, 'onehot_encoder.joblib')
joblib.dump(onehot_encoder, onehot_encoder_filename)

# Encode every row of df; the sparse result is converted to a dense array
onehot_encoded = onehot_encoder.transform(df[categorical_columns]).toarray()

# Wrap the dense array in a DataFrame (default integer row and column labels)
df_encoded = pd.DataFrame(onehot_encoded)

# Preview the encoded columns
display(df_encoded.head())

# The raw categorical columns are now represented by df_encoded
df.drop(columns=categorical_columns, inplace=True)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Extracting the Features and Labels

The final step in preprocessing is to extract the Labels and Features and also to split them into train and test data. The split is done at the standard 70% to 30% ratio.

In [740]:
# Extract the 'Survived' column as the labels.
# The rows that came from the test set have no label, so their entry in Y is NaN.
Y = df['Survived'].to_numpy()
df.drop('Survived', axis=1, inplace=True)
print("Labels (Y) shape:", Y.shape)

# Concatenate the original and encoded DataFrames into a single NumPy array
X = np.concatenate([df.to_numpy(), df_encoded.to_numpy()], axis=1)
print("Features (X) shape:", X.shape)

# Only rows with a known label can be used for training and evaluation.
# Splitting all of X/Y would put NaN labels into Y_train and Y_test.
labeled_mask = ~np.isnan(Y)

# Set test_size according to your desired split ratio
X_train, X_test, Y_train, Y_test = train_test_split(
    X[labeled_mask], Y[labeled_mask], test_size=0.3, random_state=42
)

Labels (Y) shape: (1309,)
Features (X) shape: (1309, 19)
