In [3]:
import pandas as pd
import numpy as np

In [4]:
# Load the training passenger table from disk; the cells below read it through `td`.
td = pd.read_csv('data/train.csv')

In [5]:
# Display the column labels of the loaded training frame.
td.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [6]:
def convert_data(td):
    """Encode a passenger table as a dense numeric feature matrix.

    Parameters
    ----------
    td : pandas.DataFrame
        Must provide the columns 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch',
        'Ticket', 'Fare', 'Cabin' and 'Embarked'. The frame is not modified.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(td), 9)``; one row per passenger, columns:

        0. ``Pclass / 3``
        1. ``Sex``: 'male' -> 0, 'female' -> 1 (any other value becomes NaN)
        2. ``Age / 100``, or -1 where the age is missing
        3. ``SibSp / 8``
        4. ``Parch / 6``
        5. number of rows in ``td`` sharing the passenger's ticket, divided by 7
        6. ``Fare`` divided by the largest fare in ``td``, or -1 where missing
        7. 1 where ``Cabin`` is missing, 0 where it is present
        8. ``Embarked``: 'C' -> 0, 'Q' -> 0.5, 'S' -> 1, anything else -> -1

    Notes
    -----
    The divisors 3, 100, 8, 6 and 7 are fixed constants, so a scaled value
    exceeds 1 whenever the raw value exceeds its divisor. The ticket counts
    and the fare maximum are computed from ``td`` itself, so two different
    frames are scaled independently of each other.
    """
    n_features = 9  # Pclass, Sex, Age, SibSp, Parch, Ticket, Fare, Cabin, Embarked
    X = np.zeros((td.shape[0], n_features))

    X[:, 0] = td['Pclass'] / 3
    X[:, 1] = td['Sex'].map({'male': 0, 'female': 1})

    # Missing ages get the sentinel -1 so they stay distinct from real ages.
    X[:, 2] = np.nan_to_num(td['Age'] / 100, nan=-1)

    X[:, 3] = td['SibSp'] / 8
    X[:, 4] = td['Parch'] / 6

    # Group size per ticket: how many rows of this frame carry the same ticket.
    tickets = td['Ticket']
    X[:, 5] = tickets.map(tickets.value_counts()) / 7

    # Scale by the frame's own maximum fare. The NaN replacement must read the
    # fare column itself (index 6); reading the age column here would discard
    # the fare feature entirely.
    X[:, 6] = td['Fare'] / td['Fare'].max()
    X[:, 6] = np.nan_to_num(X[:, 6], nan=-1)

    X[:, 7] = td['Cabin'].isna()  # 1 if no cabin info, else 0

    # Ports outside the mapping (including missing values) become NaN, then -1.
    X[:, 8] = td['Embarked'].map({'C': 0, 'Q': 0.5, 'S': 1})
    X[:, 8] = np.nan_to_num(X[:, 8], nan=-1)

    return X

In [7]:
# Encode the training frame and the test CSV with the first feature converter.
X_train = td.pipe(convert_data)
X_test = pd.read_csv('data/test.csv').pipe(convert_data)

In [8]:
# Target vector: the 'Survived' column of the training frame as a NumPy array.
y_train = td.loc[:, 'Survived'].to_numpy()

In [9]:
# Persist the first-version feature matrices and the target vector as .npy files.
np.save(file="data/X_train.npy", arr=X_train)
np.save(file="data/X_test.npy", arr=X_test)
np.save(file="data/y_train.npy", arr=y_train)

In [31]:
def convert_data_version2(td, title_mean_ages=None):
    """Encode a passenger table as features, imputing missing ages by title.

    Same column layout as the first converter, except that a missing age is
    filled with the mean age of the first title found in the passenger's name.

    Parameters
    ----------
    td : pandas.DataFrame
        Must provide the columns 'Name', 'Pclass', 'Sex', 'Age', 'SibSp',
        'Parch', 'Ticket', 'Fare', 'Cabin' and 'Embarked'. The frame is not
        modified.
    title_mean_ages : pandas.DataFrame, optional
        Lookup table with the columns 'Titles' and 'Mean Age'. When omitted it
        is read from 'data/title_mean_ages.csv'.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(td), 9)``; one row per passenger, columns:

        0. ``Pclass / 3``
        1. ``Sex``: 'male' -> 0, 'female' -> 1 (any other value becomes NaN)
        2. ``Age / 100``; a missing age is first replaced by the title's mean
           age, and is -1 if no title from the table occurs in the name
        3. ``SibSp / 8``
        4. ``Parch / 6``
        5. number of rows in ``td`` sharing the passenger's ticket, divided by 7
        6. ``Fare`` divided by the largest fare in ``td``, or -1 where missing
        7. 1 where ``Cabin`` is missing, 0 where it is present
        8. ``Embarked``: 'C' -> 0, 'Q' -> 0.5, 'S' -> 1, anything else -> -1

    Notes
    -----
    Titles are matched by plain substring search, in table order, and the
    first hit wins. A title that is a prefix of another (e.g. 'Mr' inside
    'Mrs') therefore shadows it if listed first.
    NOTE(review): the actual title strings live in the CSV, which is not
    visible here — confirm its ordering makes the matches unambiguous.
    """
    n_features = 9  # Pclass, Sex, Age, SibSp, Parch, Ticket, Fare, Cabin, Embarked
    X = np.zeros((td.shape[0], n_features))

    X[:, 0] = td['Pclass'] / 3
    X[:, 1] = td['Sex'].map({'male': 0, 'female': 1})

    if title_mean_ages is None:
        title_mean_ages = pd.read_csv('data/title_mean_ages.csv')
    title_ages = list(zip(title_mean_ages['Titles'], title_mean_ages['Mean Age']))

    # Work on a float copy so the caller's frame is never written to.
    ages = td['Age'].to_numpy(dtype=float, copy=True)
    names = td['Name'].to_numpy()
    for row in np.flatnonzero(np.isnan(ages)):
        for title, mean_age in title_ages:
            if title in names[row]:
                ages[row] = mean_age
                break

    # Ages that no title could fill would otherwise leak NaN into the matrix;
    # use the same -1 sentinel as the other missing-value columns.
    X[:, 2] = np.nan_to_num(ages / 100, nan=-1)

    X[:, 3] = td['SibSp'] / 8
    X[:, 4] = td['Parch'] / 6

    # Group size per ticket: how many rows of this frame carry the same ticket.
    tickets = td['Ticket']
    X[:, 5] = tickets.map(tickets.value_counts()) / 7

    # Scale by the frame's own maximum fare. The NaN replacement must read the
    # fare column itself (index 6); reading the age column here would discard
    # the fare feature entirely.
    X[:, 6] = td['Fare'] / td['Fare'].max()
    X[:, 6] = np.nan_to_num(X[:, 6], nan=-1)

    X[:, 7] = td['Cabin'].isna()  # 1 if no cabin info, else 0

    # Ports outside the mapping (including missing values) become NaN, then -1.
    X[:, 8] = td['Embarked'].map({'C': 0, 'Q': 0.5, 'S': 1})
    X[:, 8] = np.nan_to_num(X[:, 8], nan=-1)

    return X

In [32]:
# Encode both frames with the title-imputing converter and persist the results.
# NOTE: this rebinds X_train / X_test, replacing the first-version matrices
# that were assigned under the same names earlier in the notebook.
X_train = td.pipe(convert_data_version2)
X_test = pd.read_csv('data/test.csv').pipe(convert_data_version2)

np.save(file="data/X_train2.npy", arr=X_train)
np.save(file="data/X_test2.npy", arr=X_test)