In [None]:
# Import libraries

import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.model_selection import KFold
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SequentialFeatureSelector

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier

from catboost import CatBoostClassifier

# Training data preparation

## Read data

In [None]:
# Load the raw training table from the CSV file in the working directory.
train_data = pd.read_csv(filepath_or_buffer='train.csv')

## Check missing values

In [None]:
# Number of missing entries per column (displayed as the cell output).
train_data.isna().sum()

## Preprocess data

In [None]:
def data_pipeline(df):
    """Turn a raw passenger table into an all-numeric feature table.

    Steps, in order:

    1. Split ``Cabin`` on ``'/'`` into ``CabinDeck``, ``CabinNum`` and
       ``CabinSide``; ``CabinNum`` is converted to a numeric column.
    2. Drop ``PassengerId``, ``Name`` and ``Cabin``.
    3. Add ``TotalExpense``, the row-wise sum of the five amenity columns
       (pandas skips missing amounts when summing).
    4. Ordinal-encode the categorical columns.
    5. Cast ``Transported`` to integers when that column is present.
    6. Replace every remaining missing value with the mean of its column.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data containing the columns named above. The frame passed in is
        left untouched; all work happens on a new frame.

    Returns
    -------
    pandas.DataFrame
        New frame with the original columns minus the dropped ones, followed
        by ``CabinDeck``, ``CabinNum``, ``CabinSide`` and ``TotalExpense``.

    Notes
    -----
    The encoder and the imputer are fitted on the frame passed in. Calling
    this function once for the training data and once for the test data
    therefore fits them independently on each set.
    NOTE(review): confirm that both files contain the same category values,
    otherwise the integer codes will not line up between train and test.
    """
    # Split column "Cabin" before dropping it. Working from the result of
    # drop() (a new frame) keeps the caller's DataFrame unchanged.
    cabin_parts = df['Cabin'].str.split('/', expand=True)
    out = df.drop(['PassengerId', 'Name', 'Cabin'], axis='columns')
    out[['CabinDeck', 'CabinNum', 'CabinSide']] = cabin_parts

    # str.split yields strings; make the cabin number numeric explicitly
    # instead of relying on the imputer to coerce it as a side effect.
    out['CabinNum'] = pd.to_numeric(out['CabinNum'])

    # Create a new feature "TotalExpense"
    amenities = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    out['TotalExpense'] = out[amenities].sum(axis=1)

    # Encode categorical data
    cat_cols = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'CabinDeck', 'CabinSide']
    out[cat_cols] = OrdinalEncoder().fit_transform(out[cat_cols])

    # The target only exists in the training file
    if 'Transported' in out.columns:
        out['Transported'] = out['Transported'].astype('int64')

    # Impute missing values with the column mean; skip the step entirely
    # when nothing is missing so the imputer never sees an empty selection.
    miss_cols = out.columns[out.isnull().any()].tolist()
    if miss_cols:
        tf = ColumnTransformer([("imp", SimpleImputer(strategy='mean'), miss_cols)])
        out[miss_cols] = tf.fit_transform(out[miss_cols])

    return out

train_data = data_pipeline(train_data)

In [None]:
# Separate the target column from the feature columns.
Y_train_data = train_data['Transported']
X_train_data = train_data.drop(columns='Transported')

# Testing data preparation

## Read data

In [None]:
# Load the raw test table from the CSV file in the working directory.
X_test_data = pd.read_csv(filepath_or_buffer='test.csv')

# Keep the passenger ids aside: they are needed for the submission file
PassengerIdTest = X_test_data.loc[:, 'PassengerId']

## Preprocess data

In [None]:
# Run the test table through the same preprocessing function as the training data.
X_test_data = X_test_data.pipe(data_pipeline)

# Training and testing

## CatBoost

In [None]:
# Feature selection
#
# Backward sequential feature selection with a CatBoost model, kept here
# (commented out) for reference:
#
# model_fs = CatBoostClassifier(verbose=False)
# sf = SequentialFeatureSelector(model_fs, scoring='accuracy', direction='backward', n_features_to_select='auto', tol=None)
# sf.fit(X_train_data, Y_train_data)
# best_features = list(sf.get_feature_names_out())

# Hard-coded feature subset so the search above does not have to be run again
# (presumably the output of that search -- re-run it to confirm after any
# change to the preprocessing).
best_features = [
    'CryoSleep',
    'RoomService',
    'Spa',
    'VRDeck',
    'CabinDeck',
    'CabinSide',
    'TotalExpense',
]

In [None]:
# Model training and prediction
#
# Both the fit and the prediction use only the selected feature subset.
train_features = X_train_data[best_features]
test_features = X_test_data[best_features]

clf = CatBoostClassifier(verbose=False)
clf.fit(train_features, Y_train_data)
prediction = clf.predict(test_features)

In [None]:
# List parameters of the model
# (last expression of the cell, so the notebook displays what
# CatBoost reports for the trained classifier)

clf.get_all_params()

In [None]:
# Write to submission file
#
# One row per test passenger; a predicted class equal to 1 is stored as True,
# anything else as False.
transported_flags = [(label == 1) for label in list(prediction)]
res = pd.DataFrame({'PassengerId': list(PassengerIdTest), 'Transported': transported_flags})
res.to_csv('submission.csv', index=False)

# Unused code

## k-fold cross validation

In [None]:
# learning_rate_list = [0.05, 0.1, 0.5, 1, 2]
# n_estimators_list = [100, 200, 500, 1000]
# subsample_list = [0.5, 0.75, 1]

# score_lists = []

# kf = KFold(n_splits=5, shuffle=True)
# folds_index = enumerate(kf.split(X_train_data))

# # for each parameter
# for l in learning_rate_list:
#     score_lists.append([])
#     for n in n_estimators_list:
#         score_lists[-1].append([])
#         for s in subsample_list:
#             print("-", l, n, s)
#             fold_score = []

#             # for each fold of a parameter
#             for i, (train_index, validate_index) in folds_index:
#                 X_fold_train_data = X_train_data.iloc[train_index]
#                 Y_fold_train_data = Y_train_data.iloc[train_index]

#                 X_fold_validate_data = X_train_data.iloc[validate_index]
#                 Y_fold_validate_data = Y_train_data.iloc[validate_index]

#                 clf = GradientBoostingClassifier(learning_rate=l, n_estimators=n, subsample=s)
#                 clf.fit(X_fold_train_data, Y_fold_train_data)
#                 fold_score.append(
#                     clf.score(X_fold_validate_data, Y_fold_validate_data)
#                 )
                
#             score_lists[-1][-1].append(np.mean(fold_score))

# score_lists

## Imputation by logic

In [None]:
# # train_data[['PassengerGroup', 'HomePlanet']].groupby('PassengerGroup').nunique()
# # sns.countplot(data=train_data, x='CabinDeck', hue='HomePlanet')

# def impute_homeplanet(df):

#     # LastName => HomePlanet
#     LastName_HomePlanet_dict = df.loc[(df['LastName'].notnull() & df['HomePlanet'].notnull()), ['LastName', 'HomePlanet']].set_index('LastName').to_dict()['HomePlanet']
#     df.loc[(df['LastName'].notnull() & df['HomePlanet'].isna()), 'HomePlanet'] = df.loc[(df['LastName'].notnull() & df['HomePlanet'].isna()), 'LastName'].map(LastName_HomePlanet_dict)

#     # PassengerGroup => HomePlanet
#     PassengerGroup_HomePlanet_dict = df.loc[df['HomePlanet'].notnull(), ['PassengerGroup', 'HomePlanet']].set_index('PassengerGroup').to_dict()['HomePlanet']
#     df.loc[df['HomePlanet'].isna(), 'HomePlanet'] = df.loc[df['HomePlanet'].isna(), 'PassengerGroup'].map(PassengerGroup_HomePlanet_dict)

#     # CabinDeck = A/B/C/T => HomePlanet = Europa
#     df.loc[df['CabinDeck']=='A', 'HomePlanet'] = df.loc[df['CabinDeck']=='A', 'HomePlanet'].fillna('Europa')
#     df.loc[df['CabinDeck']=='B', 'HomePlanet'] = df.loc[df['CabinDeck']=='B', 'HomePlanet'].fillna('Europa')
#     df.loc[df['CabinDeck']=='C', 'HomePlanet'] = df.loc[df['CabinDeck']=='C', 'HomePlanet'].fillna('Europa')
#     df.loc[df['CabinDeck']=='T', 'HomePlanet'] = df.loc[df['CabinDeck']=='T', 'HomePlanet'].fillna('Europa')

#     # CabinDeck = G => HomePlanet = Earth
#     df.loc[df['CabinDeck']=='G', 'HomePlanet'] = df.loc[df['CabinDeck']=='G', 'HomePlanet'].fillna('Earth')

#     return df

In [None]:
# # train_data['SumAmenitiesZero'] = (train_data[['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']].sum(axis=1) == 0)
# # sns.countplot(data=train_data, x='SumAmenitiesZero', hue='CryoSleep')

# def impute_cryosleep(df):
#     amenities = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

#     # Sum of amenities > 0 => missing CryoSleep = False
#     df.loc[df[amenities].sum(axis=1)>0, 'CryoSleep'] = df.loc[df[amenities].sum(axis=1)>0, 'CryoSleep'].fillna(False)

#     # Sum of amenities = 0 => missing CryoSleep = True
#     df.loc[df[amenities].sum(axis=1)==0, 'CryoSleep'] = df.loc[df[amenities].sum(axis=1)==0, 'CryoSleep'].fillna(True)
    
#     return df

In [None]:
# # train_data['RoomServiceZero'] = (train_data['RoomService'] == 0)
# # sns.countplot(data=train_data, x='RoomServiceZero', hue='CryoSleep')

# def impute_amenities(df):
#     amenities = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    
#     # CryoSleep = True => missing amenities = 0
#     df.loc[df['CryoSleep']==True, amenities] = df.loc[df['CryoSleep']==True, amenities].fillna(0)

#     # Sum of amenities = 0 => missing amenities = 0
#     df.loc[df[amenities].sum(axis=1)==0, amenities] = df.loc[df[amenities].sum(axis=1)==0, amenities].fillna(0)

#     # CryoSleep = False => missing amenities = mean of column
#     for a in amenities:
#         df.loc[df['CryoSleep']==False, a] = df.loc[df['CryoSleep']==False, a].fillna(df[a].mean())

#     return df

In [None]:
# # train_data['Under18'] = (train_data['Age'] < 18)
# # sns.countplot(data=train_data, x='Under18', hue='VIP')

# # train_data['FromEarth'] = (train_data['HomePlanet'] == 'Earth')
# # sns.countplot(data=train_data, x='FromEarth', hue='VIP')

# # train_data['DeckG'] = (train_data['CabinDeck'] == 'G')
# # sns.countplot(data=train_data, x='DeckG', hue='VIP')
# # train_data['DeckT'] = (train_data['CabinDeck'] == 'T')
# # sns.countplot(data=train_data, x='DeckT', hue='VIP')

# def impute_vip(df):

#     # Age < 18 => VIP = False
#     df.loc[df['Age']<18, 'VIP'] = df.loc[df['Age']<18, 'VIP'].fillna(False)

#     # HomePlanet = Earth => VIP = False
#     df.loc[df['HomePlanet']=='Earth', 'VIP'] = df.loc[df['HomePlanet']=='Earth', 'VIP'].fillna(False)

#     # CabinDeck = G or T => VIP False
#     df.loc[df['CabinDeck']=='G', 'VIP'] = df.loc[df['CabinDeck']=='G', 'VIP'].fillna(False)
#     df.loc[df['CabinDeck']=='T', 'VIP'] = df.loc[df['CabinDeck']=='T', 'VIP'].fillna(False)

#     # (CabinDeck = not A to D) and (CryoSleep = False) => VIP True
#     # df.loc[(df['CabinDeck'] not in ['A','B','C','D']) and (df['CryoSleep'] == False), 'VIP'] = df.loc[(df['CabinDeck'] not in ['A','B','C','D']) and (df['CryoSleep'] == False), 'VIP'].fillna(True)

#     # CryoSleep = True => VIP = False (*** NOT SURE ***)
#     # df.loc[df['CryoSleep']==True, 'VIP'] = df.loc[df['CryoSleep']==True, 'VIP'].fillna(False)

#     # to be continued...

#     return df

In [None]:
# def impute_age(df):
#     amenities = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

#     # Age by amenities expenses and CryoSleep
#     df.loc[df[amenities].sum(axis=1)>0, 'Age'] = df.loc[df[amenities].sum(axis=1)>0, 'Age'].fillna(df.loc[df[amenities].sum(axis=1)>0, 'Age'].median())
#     df.loc[(df[amenities].sum(axis=1)==0) & (df['CryoSleep']==False), 'Age'] = df.loc[(df[amenities].sum(axis=1)==0) & (df['CryoSleep']==False), 'Age'].fillna(df.loc[(df[amenities].sum(axis=1)==0) & (df['CryoSleep']==False), 'Age'].median())
#     df.loc[(df[amenities].sum(axis=1)==0) & (df['CryoSleep']==True), 'Age'] = df.loc[(df[amenities].sum(axis=1)==0) & (df['CryoSleep']==True), 'Age'].fillna(df.loc[(df[amenities].sum(axis=1)==0) & (df['CryoSleep']==True), 'Age'].median())

#     return df