In [5]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

In [6]:
train_data = pd.read_csv('data/train.csv')
test_data = pd.read_csv('data/test.csv')

In [3]:
train_data.dropna(axis=0, subset=['Cabin'], inplace=True)

In [7]:
def get_X_y(data):
    X = train_data.drop(['Transported'], axis=1)
    y = train_data['Transported']
    #data.dropna(axis=0, subset=['Cabin'], inplace=True)
    return X, y

In [17]:
def drop_pointless_columns(X):
    X = X.drop(['PassengerId', 'Name'], axis=1)
    return X

In [18]:
def snake_case_columns(df):
    df.rename(columns={
    'CryoSleep' : 'cryo_sleep', 
    'HomePlanet' : 'home_planet',
    'Destination' : 'destination',
    'Cabin' : 'cabin', 
    'Age' : 'age', 
    'VIP' : 'vip',
    'RoomService' : 'room_service',
    'FoodCourt' : 'food_court',
    'ShoppingMall' : 'shopping_mall',
    'Spa' : 'spa',
    'VRDeck' : 'vr_deck'
    }, inplace=True)

    return df

In [19]:
def engineer_cabin_cols(df):
    cabin_imputer = SimpleImputer(strategy='constant', fill_value='F/1/P')
    cabin_imputer.fit_transform(df[['cabin']])
    df['cabin_deck'] = df.cabin.str[0]
    df['cabin_num'] = df.cabin.str[2:-2]
    df['cabin_side'] = df.cabin.str[-1]
    df.drop('cabin', axis=1, inplace=True)
    return df

In [20]:
def impute_services(df):
    df['room_service'] = df.room_service.fillna(0)
    df['food_court'] = df.food_court.fillna(0)
    df['shopping_mall'] = df.shopping_mall.fillna(0)
    df['spa'] = df.spa.fillna(0)
    df['vr_deck'] = df.vr_deck.fillna(0)
    return df

In [21]:
def calculate_service_total(df):
    df['service_total'] = df.room_service + df.food_court + df.shopping_mall + df.spa + df.vr_deck
    return df

In [22]:
def impute_vip(df):
    df.vip = df.vip.astype(bool)
    media_diff = 1981.0
    media_diff = df[df.vip == True].service_total.median() - df[df.vip == False].service_total.median()
    df.vip = df.apply(lambda row: 1 if row.service_total > media_diff and pd.isna(row.vip) else row.vip, axis=1)
    df.vip.fillna(0, inplace=True)
    return df

In [23]:
def impute_age(df):
    age_imputer = SimpleImputer(strategy='median')
    df['age'] = age_imputer.fit_transform(df[['age']])
    return df

In [24]:
def impute_cryo_sleep(df):
    df.cryo_sleep = df.cryo_sleep.astype(bool)
    if df.cryo_sleep is None and df.service_total > 0:
        cryo_sleep = 1
    else:
        cryo_sleep = 0
    return df

In [25]:
def scale_and_ohe(df):
    scaler = StandardScaler()
    df[['age', 'cabin_num']] = scaler.fit_transform(df[['age', 'cabin_num']])

    df = pd.get_dummies(df, columns=['cabin_deck', 'cabin_side', 'home_planet', 'destination'])
    return df

In [26]:
def process_df(df):
    df = drop_pointless_columns(df)
    df = snake_case_columns(df)
    df = engineer_cabin_cols(df)
    df = impute_services(df)
    df = calculate_service_total(df)
    df = impute_vip(df)
    df = impute_age(df)
    df = impute_cryo_sleep(df)
    df = scale_and_ohe(df)
    return df

In [27]:
proc_df = process_df(test_data)

In [28]:
X, y = get_X_y(train_data)

In [29]:
X_train = process_df(X)

In [30]:
model = XGBClassifier(learning_rate=0.02, max_depth=6, n_estimators=700)

model.fit(X_train, y)

y_pred = model.predict(proc_df)


In [31]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 26 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   cryo_sleep                 8693 non-null   bool   
 1   age                        8693 non-null   float64
 2   vip                        8693 non-null   bool   
 3   room_service               8693 non-null   float64
 4   food_court                 8693 non-null   float64
 5   shopping_mall              8693 non-null   float64
 6   spa                        8693 non-null   float64
 7   vr_deck                    8693 non-null   float64
 8   cabin_num                  8494 non-null   float64
 9   service_total              8693 non-null   float64
 10  cabin_deck_A               8693 non-null   uint8  
 11  cabin_deck_B               8693 non-null   uint8  
 12  cabin_deck_C               8693 non-null   uint8  
 13  cabin_deck_D               8693 non-null   uint8

In [113]:
y_pred = y_pred.astype(bool)
ids = test_data['PassengerId']

In [114]:
submission = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Transported': y_pred})

In [115]:
submission.to_csv('submission_8.csv', index=False)