In [76]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

In [77]:
# Load the training and test splits from the local ``data`` directory.
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ('data/train.csv', 'data/test.csv')
)

In [78]:
# Keep only the training rows that have a Cabin value.
train_data = train_data.dropna(subset=['Cabin'])

In [79]:
def get_X_y(data):
    """Split a labelled frame into features and the ``Transported`` target.

    Rows whose ``Cabin`` is missing are removed *before* the split, so ``X``
    and ``y`` always describe exactly the same rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing at least the ``Cabin`` and ``Transported`` columns.
        The frame passed in is not modified.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is ``data`` without the ``Transported`` column
        and ``y`` is the ``Transported`` column, both restricted to the rows
        that have a ``Cabin`` value.
    """
    # Work on the argument (not a notebook-level global) and filter first, so
    # the function gives the right answer for whichever frame it is handed.
    labelled = data.dropna(subset=['Cabin'])
    X = labelled.drop(columns=['Transported'])
    y = labelled['Transported']
    return X, y

In [80]:
def drop_pointless_columns(X):
    """Return a new frame equal to ``X`` without ``PassengerId`` and ``Name``.

    ``X`` itself is left unchanged; a ``KeyError`` is raised by pandas if
    either column is absent.
    """
    unused_columns = ['PassengerId', 'Name']
    return X.drop(columns=unused_columns)

In [81]:
def snake_case_columns(df):
    """Rename the CamelCase feature columns of ``df`` to snake_case, in place.

    Columns that are not listed in the mapping keep their names. The same
    (mutated) frame is returned so the call can be chained.
    """
    snake_case_names = {
        'CryoSleep': 'cryo_sleep',
        'HomePlanet': 'home_planet',
        'Destination': 'destination',
        'Cabin': 'cabin',
        'Age': 'age',
        'VIP': 'vip',
        'RoomService': 'room_service',
        'FoodCourt': 'food_court',
        'ShoppingMall': 'shopping_mall',
        'Spa': 'spa',
        'VRDeck': 'vr_deck',
    }
    df.rename(columns=snake_case_names, inplace=True)
    return df

In [82]:
def engineer_cabin_cols(df):
    """Split ``cabin`` (``deck/num/side``) into three feature columns, in place.

    Missing cabins are first replaced by the constant ``'F/1/P'``. The value is
    then split on ``/`` into:

    * ``cabin_deck`` - the text before the first slash,
    * ``cabin_num``  - the middle part, converted to a number,
    * ``cabin_side`` - the text after the second slash.

    A cabin that does not have the ``deck/num/side`` shape produces missing
    values in all three columns. The ``cabin`` column is removed afterwards.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``cabin`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame, with ``cabin`` replaced by the three derived columns.
    """
    # Fill on the Series itself so the default value actually reaches the
    # derived columns (an imputer whose output is not assigned has no effect).
    cabin = df['cabin'].fillna('F/1/P')

    # Split on the separator instead of slicing fixed character positions, so
    # multi-character parts are handled; extract() always yields 3 columns.
    parts = cabin.str.extract(r'^([^/]+)/([^/]+)/([^/]+)$')

    df['cabin_deck'] = parts[0]
    # Numeric dtype so the column can be scaled without implicit string parsing.
    df['cabin_num'] = pd.to_numeric(parts[1], errors='coerce')
    df['cabin_side'] = parts[2]
    df.drop('cabin', axis=1, inplace=True)
    return df

In [83]:
def impute_services(df):
    """Replace missing values in the five spending columns with 0, in place.

    The columns handled are ``room_service``, ``food_court``,
    ``shopping_mall``, ``spa`` and ``vr_deck``. Returns the same frame.
    """
    for service in ('room_service', 'food_court', 'shopping_mall', 'spa', 'vr_deck'):
        df[service] = df[service].fillna(0)
    return df

In [84]:
def calculate_service_total(df):
    """Add a ``service_total`` column holding the sum of the spending columns.

    The five columns are added left to right (``room_service``,
    ``food_court``, ``shopping_mall``, ``spa``, ``vr_deck``); a missing value
    in any of them makes the total missing for that row. The frame is
    modified in place and returned.
    """
    running_total = df['room_service']
    for service in ('food_court', 'shopping_mall', 'spa', 'vr_deck'):
        running_total = running_total + df[service]
    df['service_total'] = running_total
    return df

In [85]:
def impute_vip(df):
    """Fill missing ``vip`` flags from spending and return a boolean column.

    The threshold is the difference between the median ``service_total`` of
    passengers known to be VIP and the median of passengers known not to be.
    A passenger whose ``vip`` value is missing is marked VIP when their
    ``service_total`` is strictly greater than that threshold, otherwise
    non-VIP. Known values are kept as they are.

    If one of the two groups is empty the threshold is NaN, the comparison is
    False everywhere, and every missing value becomes ``False``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``vip`` and ``service_total`` columns. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame, with ``vip`` as a boolean column without missing values.
    """
    # Record which rows are missing *before* any boolean cast: casting NaN to
    # bool yields True, which would silently turn every unknown into a VIP.
    missing = df['vip'].isna()
    known = ~missing

    # ``eq(True)`` is False for NaN, so this is the flag for known rows and
    # False for unknown ones.
    is_vip = (df['vip'].eq(True) & known).astype(bool)

    # Medians are taken over known rows only, so unknowns do not bias them.
    vip_median = df.loc[known & is_vip, 'service_total'].median()
    non_vip_median = df.loc[known & ~is_vip, 'service_total'].median()
    median_diff = vip_median - non_vip_median

    inferred = df['service_total'] > median_diff
    df['vip'] = is_vip.where(known, inferred)
    return df

In [86]:
def impute_age(df):
    """Fill missing ``age`` values with the column median, in place.

    The median is computed by a ``SimpleImputer`` fitted on the ``age`` column
    of the frame that is passed in. Returns the same frame.
    """
    median_imputer = SimpleImputer(strategy='median')
    age_frame = df[['age']]
    df['age'] = median_imputer.fit_transform(age_frame)
    return df

In [87]:
def impute_cryo_sleep(df):
    """Fill missing ``cryo_sleep`` flags from spending; return a boolean column.

    Known values are kept. A passenger whose ``cryo_sleep`` value is missing is
    marked as being in cryo sleep when their ``service_total`` is exactly 0,
    and as awake otherwise.

    NOTE(review): the previous version never wrote an imputed value back (its
    result went to an unused local) and its condition set the flag to 1 when
    spending was positive. This rewrite assumes that direction was inverted,
    on the presumption that a passenger with recorded spending was awake.
    Confirm against the dataset description.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``cryo_sleep`` and ``service_total`` columns. Modified in
        place.

    Returns
    -------
    pandas.DataFrame
        The same frame, with ``cryo_sleep`` as a boolean column without
        missing values.
    """
    # Detect missing rows before casting: NaN cast to bool becomes True.
    missing = df['cryo_sleep'].isna()
    known = ~missing

    # ``eq(True)`` is False for NaN, so unknown rows start out as False.
    asleep = (df['cryo_sleep'].eq(True) & known).astype(bool)

    # Row-wise inference for the unknown rows only.
    inferred = df['service_total'].eq(0)
    df['cryo_sleep'] = asleep.where(known, inferred)
    return df

In [88]:
def scale_and_ohe(df):
    """Standardise the numeric columns and one-hot encode the categorical ones.

    ``age`` and ``cabin_num`` are replaced by the output of a
    ``StandardScaler`` fitted on the frame that is passed in. ``cabin_deck``,
    ``cabin_side``, ``home_planet`` and ``destination`` are then expanded with
    ``pd.get_dummies``, so the resulting dummy columns depend on the
    categories present in that frame. Returns the encoded frame.
    """
    numeric_cols = ['age', 'cabin_num']
    categorical_cols = ['cabin_deck', 'cabin_side', 'home_planet', 'destination']

    df[numeric_cols] = StandardScaler().fit_transform(df[numeric_cols])

    return pd.get_dummies(df, columns=categorical_cols)

In [89]:
def process_df(df):
    """Run the full feature-preparation pipeline on ``df`` and return the result.

    The steps are applied in this order, each receiving the previous step's
    output: drop unused columns, rename to snake_case, derive cabin features,
    fill spending columns, compute the spending total, impute ``vip``, impute
    ``age``, impute ``cryo_sleep``, then scale and one-hot encode.
    """
    pipeline = (
        drop_pointless_columns,
        snake_case_columns,
        engineer_cabin_cols,
        impute_services,
        calculate_service_total,
        impute_vip,
        impute_age,
        impute_cryo_sleep,
        scale_and_ohe,
    )
    for step in pipeline:
        df = step(df)
    return df

In [90]:
# Feature-engineer the test split.
proc_df = test_data.pipe(process_df)

In [91]:
# Separate the training features from the Transported target.
X, y = train_data.pipe(get_X_y)

In [92]:
# Feature-engineer the training features.
X_train = X.pipe(process_df)

In [93]:
model = XGBClassifier(learning_rate=0.02, max_depth=6, n_estimators=700)

model.fit(X_train, y)

# The training and test frames are one-hot encoded independently, so a
# category that appears in only one of them yields a different set (or order)
# of columns. Align the test features to the training columns before
# predicting: same names, same order, dummy columns absent from the test
# frame filled with 0, and columns unknown to the model dropped.
X_test = proc_df.reindex(columns=X_train.columns, fill_value=0)

y_pred = model.predict(X_test)


In [75]:
# Sanity check: number of missing values left in each training feature.
X_train.isna().sum(axis=0)

cryo_sleep       0
age              0
vip              0
room_service     0
food_court       0
shopping_mall    0
spa              0
vr_deck          0
cabin_num        0
service_total    0
cabin_deck_A     0
cabin_deck_B     0
cabin_deck_C     0
cabin_deck_D     0
cabin_deck_E     0
cabin_deck_F     0
cabin_deck_G     0
cabin_deck_T     0
cabin_side_P     0
cabin_side_S     0
dtype: int64

In [19]:
# Grab the passenger ids and cast the predicted labels to booleans.
ids = test_data.PassengerId
y_pred = y_pred.astype(bool)

In [23]:
# Pair each test passenger id with its predicted label.
submission = pd.DataFrame(
    {
        'PassengerId': test_data['PassengerId'],
        'Transported': y_pred,
    }
)

In [24]:
# Write the submission file without the index column.
submission.to_csv(path_or_buf='submission_4.csv', index=False)