In [1]:
import pandas as pd
import numpy as np

# Load the training CSV and print its schema (dtypes and non-null counts per column).
td = pd.read_csv('data/train.csv')
td.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [29]:
def enhance_csv(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of a raw passenger frame with engineered feature columns.

    The input frame is left untouched; all work happens on a copy.

    Reads the columns ``Cabin``, ``Embarked``, ``SibSp``, ``Parch``, ``Fare``,
    ``Name`` and ``Age``. Fills missing values in ``Embarked``, ``Fare`` and
    ``Age`` and adds ``CabinGroup``, ``CabinPresent``, ``FamilySize``,
    ``IsAlone``, ``FareBin``, ``Title`` and ``AgeGroup``.

    Note: medians and fare quartile edges are computed from ``df`` itself, so
    two frames processed separately (e.g. train and test) are imputed and
    binned with different statistics.

    Args:
        df: Raw passenger data containing the columns listed above.

    Returns:
        A new DataFrame with the original columns plus the engineered ones.
    """
    df = df.copy()

    # Cabin: deck letter (first character), "U" when the cabin is unknown,
    # plus a 0/1 flag for whether any cabin was recorded.
    df["CabinGroup"] = df["Cabin"].str[0].fillna("U")
    df["CabinPresent"] = df["Cabin"].notna().astype(int)

    # Missing embarkation ports are filled with the hard-coded value 'S'.
    df["Embarked"] = df["Embarked"].fillna("S")

    # Family: siblings/spouses + parents/children + the passenger themselves.
    df["FamilySize"] = df["SibSp"] + df["Parch"] + 1
    df["IsAlone"] = (df["FamilySize"] == 1).astype(int)

    # Fare: impute with the median, then bin into (up to) quartiles.
    # duplicates="drop" keeps qcut from raising when quartile edges coincide.
    df["Fare"] = df["Fare"].fillna(df["Fare"].median())
    df["FareBin"] = pd.qcut(df["Fare"], 4, labels=False, duplicates="drop")

    # Title: the word followed by a period in the name (e.g. "Mr", "Miss").
    df["Title"] = df["Name"].str.extract(r' ([A-Za-z]+)\.', expand=False)
    df["Title"] = df["Title"].replace(
        ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir'],
        'Noble',
    )
    df["Title"] = df["Title"].fillna("Rare")  # names with no recognisable title

    # Age: impute with the median age of the passenger's title group. The
    # built-in "median" aggregation yields NaN for a group whose ages are all
    # missing without emitting numpy's "Mean of empty slice" RuntimeWarning
    # that a per-group lambda calling Series.median() triggers.
    title_median_age = df.groupby("Title")["Age"].transform("median")
    df["Age"] = df["Age"].fillna(title_median_age)
    # Anything still missing (title group with no known ages) gets the overall median.
    df["Age"] = df["Age"].fillna(df["Age"].median())

    # include_lowest keeps age 0 in 'Child'; the open upper edge keeps ages
    # above 80 in 'Elderly' instead of turning them into NaN.
    df["AgeGroup"] = pd.cut(
        df["Age"],
        bins=[0, 12, 20, 40, 60, np.inf],
        labels=['Child', 'Teenager', 'Adult', 'Senior', 'Elderly'],
        include_lowest=True,
    )
    return df

In [30]:
# Run the feature engineering on the frame loaded above and inspect the resulting schema.
df_enhanced = enhance_csv(td)
df_enhanced.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 19 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   PassengerId   891 non-null    int64   
 1   Survived      891 non-null    int64   
 2   Pclass        891 non-null    int64   
 3   Name          891 non-null    object  
 4   Sex           891 non-null    object  
 5   Age           891 non-null    float64 
 6   SibSp         891 non-null    int64   
 7   Parch         891 non-null    int64   
 8   Ticket        891 non-null    object  
 9   Fare          891 non-null    float64 
 10  Cabin         204 non-null    object  
 11  Embarked      891 non-null    object  
 12  CabinGroup    891 non-null    object  
 13  CabinPresent  891 non-null    int64   
 14  FamilySize    891 non-null    int64   
 15  IsAlone       891 non-null    int64   
 16  FareBin       891 non-null    int64   
 17  Title         891 non-null    object  
 18  AgeGroup  

In [31]:
def sex_array(df: pd.DataFrame) -> np.ndarray:
    """Encode the ``Sex`` column as numbers: 'male' -> 0, 'female' -> 1.

    Values other than those two strings come out as NaN.
    """
    sex_codes = {'male': 0, 'female': 1}
    encoded = df['Sex'].map(sex_codes)
    return encoded.to_numpy()

def pclass_array(df: pd.DataFrame) -> np.ndarray:
    """Rescale the ``Pclass`` column with ``(Pclass - 1) / 2``.

    For classes 1, 2 and 3 this yields 0.0, 0.5 and 1.0 respectively.
    """
    classes = df['Pclass'].to_numpy()
    zero_based = classes - 1
    return zero_based / 2

def age_array(df: pd.DataFrame) -> np.ndarray:
    """Scale the ``Age`` column by a fixed divisor of 80.

    Ages between 0 and 80 land in [0, 1]; larger ages exceed 1.
    """
    ages = df['Age'].to_numpy()
    return ages / 80

def fare_array(df: pd.DataFrame, max_fare: "float | None" = None) -> np.ndarray:
    """Scale the ``Fare`` column by a maximum fare.

    Args:
        df: Frame containing a ``Fare`` column.
        max_fare: Divisor to scale by. Defaults to the largest fare in ``df``,
            which maps the column onto [0, 1]. Pass the training set's maximum
            when transforming another split so both are scaled identically.

    Returns:
        Array of ``Fare / max_fare``. When the divisor is 0 (every fare is 0)
        an all-zero array is returned instead of dividing by zero.
    """
    fares = df['Fare'].to_numpy()
    if max_fare is None:
        max_fare = df['Fare'].max()
    if max_fare == 0:
        # Avoid 0/0 -> NaN when every fare in the frame is zero.
        return np.zeros(len(fares), dtype=float)
    return fares / max_fare

def is_alone_array(df: pd.DataFrame) -> np.ndarray:
    """Return the ``IsAlone`` column as a NumPy array, values unchanged."""
    alone_flags = df['IsAlone']
    return alone_flags.to_numpy()

def is_cabin_present_array(df: pd.DataFrame) -> np.ndarray:
    """Return the ``CabinPresent`` column as a NumPy array, values unchanged."""
    cabin_flags = df['CabinPresent']
    return cabin_flags.to_numpy()

def title_array(df: pd.DataFrame, title_mapping: "dict | None" = None) -> np.ndarray:
    """Encode the ``Title`` column as evenly spaced numbers in [0, 1).

    The encoding uses a fixed vocabulary rather than the order in which titles
    happen to appear in ``df``. Deriving the mapping from each frame gives the
    same title different codes in different frames (e.g. train vs. test),
    which makes the feature meaningless to a model fitted on one and applied
    to the other.

    Args:
        df: Frame containing a ``Title`` column.
        title_mapping: Optional ``{title: code}`` dict with codes
            ``0 .. len(title_mapping) - 1``. Defaults to a built-in vocabulary
            of the common titles.

    Returns:
        Array of ``code / (len(title_mapping) + 1)``. Titles missing from the
        mapping (and NaN) all share the extra fallback code
        ``len(title_mapping)``.
    """
    if title_mapping is None:
        known_titles = ('Mr', 'Mrs', 'Miss', 'Master', 'Noble', 'Rare')
        title_mapping = {title: idx for idx, title in enumerate(known_titles)}

    # One extra code, shared by every title outside the vocabulary.
    other_code = len(title_mapping)
    codes = df['Title'].map(title_mapping).fillna(other_code)
    return codes.to_numpy(dtype=float) / (other_code + 1)


def create_matrix(df: pd.DataFrame) -> np.ndarray:
    """Assemble the feature matrix, one column per feature builder.

    Column order: sex, passenger class, age, fare, is-alone flag,
    cabin-present flag, title.
    """
    feature_builders = (
        sex_array,
        pclass_array,
        age_array,
        fare_array,
        is_alone_array,
        is_cabin_present_array,
        title_array,
    )
    feature_columns = tuple(build(df) for build in feature_builders)
    return np.column_stack(feature_columns)


In [32]:
# Training split: load, engineer features, then build the feature matrix and labels.
df_train = pd.read_csv('data/train.csv')
df_train_enhanced = enhance_csv(df_train)
X_train = create_matrix(df_train_enhanced)
y_train = df_train_enhanced['Survived'].to_numpy()

# Test split: report missing ages before imputation, then per-column NaN counts after it.
df_test = pd.read_csv('data/test.csv')
print(df_test['Age'].isna().sum())  # check for NaN values in Age column
df_test_enhanced = enhance_csv(df_test)
for column, nan_count in df_test_enhanced.isna().sum().items():
    print(f"{column}: {nan_count} NaN entries")
X_test = create_matrix(df_test_enhanced)

86
PassengerId: 0 NaN entries
Pclass: 0 NaN entries
Name: 0 NaN entries
Sex: 0 NaN entries
Age: 0 NaN entries
SibSp: 0 NaN entries
Parch: 0 NaN entries
Ticket: 0 NaN entries
Fare: 0 NaN entries
Cabin: 327 NaN entries
Embarked: 0 NaN entries
CabinGroup: 0 NaN entries
CabinPresent: 0 NaN entries
FamilySize: 0 NaN entries
IsAlone: 0 NaN entries
FareBin: 0 NaN entries
Title: 0 NaN entries
AgeGroup: 0 NaN entries


  return np.nanmean(a, axis, out=out, keepdims=keepdims)


In [33]:
# Persist the train/test feature matrices and the training labels as .npy files under data/.
np.save('data/Xe_train.npy', X_train)
np.save('data/ye_train.npy', y_train)
np.save('data/Xe_test.npy', X_test)