In [2]:
import pandas as pd
import matplotlib.pyplot as mpl
import numpy as np

In [3]:
from pathlib import Path

train_dataset = pd.read_csv(Path("dataset/train.csv"))
test_dataset = pd.read_csv(Path("dataset/test.csv"))
female_surivor_only_example = pd.read_csv(Path("dataset/gender_submission.csv"))

In [4]:
train_dataset.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

# Fixing the data
1. Need to encode the categorical data, and potentially include the names as a numeric hash
2. Need to impute missing values, since it's a small dataset anyway we don't want to throw away data and instead to glean as much information from the datase as possible we'll impute with the median value for each column

In [105]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from hashlib import md5

def hash(input: str) -> int:
    return int(md5(input.encode('utf-8')).hexdigest(), 16)

embark_encoder = OneHotEncoder(sparse_output=False).set_output(transform="pandas")
cabin_level_encoder = OneHotEncoder(sparse_output=False).set_output(transform="pandas")
sex_encoder = OneHotEncoder(sparse_output=False).set_output(transform="pandas")
simpleimputer = SimpleImputer(strategy="median").set_output(transform="pandas")
standardscaler = StandardScaler().set_output(transform="pandas")

def preprocess_dataset(train_dataset: pd.DataFrame, fit=True) -> pd.DataFrame:
    cabin_data = train_dataset['Cabin'].str.extract(r'([A-Za-z])(\d*)', expand=True)
    room_numbers = pd.to_numeric(cabin_data[1]).to_frame(name="Room Numbers")

    def getTransformerFn(encoder):
        if fit:
            return encoder.fit_transform
        return encoder.transform
    
    filled_class_level = cabin_data[0].fillna(cabin_data[0].mode().iloc[0]) # fill in with mode class level
    cabin_levels = getTransformerFn(cabin_level_encoder)(filled_class_level.to_frame(name="Cabin Level"))
    
    filled_embarked = train_dataset["Embarked"].fillna(train_dataset["Embarked"].mode().iloc[0]) # fill in with mode embark location
    embark_data = getTransformerFn(embark_encoder)(filled_embarked.to_frame())

    filled_sex = train_dataset["Sex"].fillna(train_dataset["Sex"].mode().iloc[0]) # fill in with mode sex
    sex_data = getTransformerFn(sex_encoder)(filled_sex.to_frame(name="Sex"))
    
    hashed_names = train_dataset["Name"].apply(hash).to_frame("Name Hash")
    numeric_data = train_dataset.select_dtypes(include=[np.number])
    frame = pd.concat([hashed_names, numeric_data, room_numbers, sex_data, embark_data, cabin_levels], axis=1)
    imputed = getTransformerFn(simpleimputer)(frame)
    scaled = getTransformerFn(standardscaler)(imputed)
    return scaled

preprocess_dataset(train_dataset)

Unnamed: 0,Name Hash,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Room Numbers,Sex_female,...,Embarked_Q,Embarked_S,Cabin Level_A,Cabin Level_B,Cabin Level_C,Cabin Level_D,Cabin Level_E,Cabin Level_F,Cabin Level_G,Cabin Level_T
0,0.081711,-1.730108,-0.789272,0.827377,-0.565736,0.432793,-0.473674,-0.502445,-0.123210,-0.737695,...,-0.307562,0.615838,-0.130856,-0.235981,0.440874,-0.196116,-0.193009,-0.121681,-0.067153,-0.03352
1,-0.260810,-1.726220,1.266990,-1.566107,0.663861,0.432793,-0.473674,0.786845,2.472540,1.355574,...,-0.307562,-1.623803,-0.130856,-0.235981,0.440874,-0.196116,-0.193009,-0.121681,-0.067153,-0.03352
2,-0.403718,-1.722332,1.266990,0.827377,-0.258337,-0.474545,-0.473674,-0.488854,-0.123210,1.355574,...,-0.307562,0.615838,-0.130856,-0.235981,0.440874,-0.196116,-0.193009,-0.121681,-0.067153,-0.03352
3,-0.006703,-1.718444,1.266990,-1.566107,0.433312,0.432793,-0.473674,0.420730,4.689135,1.355574,...,-0.307562,0.615838,-0.130856,-0.235981,0.440874,-0.196116,-0.193009,-0.121681,-0.067153,-0.03352
4,-1.276577,-1.714556,-0.789272,0.827377,0.433312,-0.474545,-0.473674,-0.486337,-0.123210,-0.737695,...,-0.307562,0.615838,-0.130856,-0.235981,0.440874,-0.196116,-0.193009,-0.121681,-0.067153,-0.03352
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,-0.380495,1.714556,-0.789272,-0.369365,-0.181487,-0.474545,-0.473674,-0.386671,-0.123210,-0.737695,...,-0.307562,0.615838,-0.130856,-0.235981,0.440874,-0.196116,-0.193009,-0.121681,-0.067153,-0.03352
887,1.229184,1.718444,1.266990,-1.566107,-0.796286,-0.474545,-0.473674,-0.044381,-0.035712,1.355574,...,-0.307562,0.615838,-0.130856,4.237623,-2.268221,-0.196116,-0.193009,-0.121681,-0.067153,-0.03352
888,-0.826613,1.722332,-0.789272,0.827377,-0.104637,0.432793,2.008933,-0.176263,-0.123210,1.355574,...,-0.307562,0.615838,-0.130856,-0.235981,0.440874,-0.196116,-0.193009,-0.121681,-0.067153,-0.03352
889,0.681788,1.726220,1.266990,-1.566107,-0.258337,-0.474545,-0.473674,-0.044381,6.147422,-0.737695,...,-0.307562,-1.623803,-0.130856,-0.235981,0.440874,-0.196116,-0.193009,-0.121681,-0.067153,-0.03352
