In [64]:
import pandas as pd

df = pd.read_csv('train.csv')

In [65]:
df.describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,8514.0,8512.0,8510.0,8485.0,8510.0,8505.0
mean,28.82793,224.687617,458.077203,173.729169,311.138778,304.854791
std,14.489021,666.717663,1611.48924,604.696458,1136.705535,1145.717189
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0
75%,38.0,47.0,76.0,27.0,59.0,46.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0


In [66]:
df

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


In [67]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


# data preprocessing

In [68]:
df_transformed = df

## impute numeric data

In [69]:
numeric_list = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

In [70]:
from sklearn.impute import SimpleImputer
import numpy as np

ipt = SimpleImputer(missing_values=np.nan, strategy='median')
df_transformed[numeric_list] = ipt.fit_transform(df[numeric_list])

## drop na

In [71]:
df_transformed.dropna(inplace=True)

## data augmentation

In [72]:
# add group number
df_transformed[['group', 'id']] = df['PassengerId'].str.split('_', expand = True)
df_transformed.drop(columns=['id', 'PassengerId'], inplace=True)
df_transformed['group'] = df_transformed['group'].apply(int)

In [73]:
# split cabin number
df_transformed[['deck', 'cabin number', 'side']] = df_transformed['Cabin'].str.split('/', expand=True)
df_transformed['cabin number'] = df_transformed['cabin number'].apply(int)
df_transformed.drop(columns='Cabin', inplace=True)

In [74]:
# add total expenses column
df_transformed['total expenses'] = df_transformed['RoomService']+ df_transformed['FoodCourt']+ df_transformed['ShoppingMall']+ df_transformed['Spa']+ df_transformed['VRDeck']

In [75]:
# add number of family members column
df_transformed[['first name', 'last name']] = df_transformed['Name'].str.split(' ', expand=True)
df_family = pd.DataFrame(df_transformed.groupby(['group','last name']).size())
df_family.rename(columns={0: 'members'}, inplace=True)

df_transformed = df_transformed.merge(right=df_family, how='left', on=['group', 'last name'])
df_transformed.drop(columns=['first name', 'last name', 'Name'], inplace=True)

# encoding and standardisation

In [76]:
# mapping boolean values
boolean_list_transformed = ['CryoSleep', 'VIP']
bool_dict_transformed = {'True': 1, 'False': 0}

df_transformed[boolean_list_transformed] = df_transformed[boolean_list_transformed].astype(int)

In [77]:
# categorical data one hot encoding
categorical_list_transformed = ['HomePlanet', 'Destination', 'deck', 'side']

df_transformed = pd.get_dummies(df_transformed, columns=categorical_list_transformed)
df_transformed

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,group,...,deck_A,deck_B,deck_C,deck_D,deck_E,deck_F,deck_G,deck_T,side_P,side_S
0,0,39.0,0,0.0,0.0,0.0,0.0,0.0,False,1,...,0,1,0,0,0,0,0,0,1,0
1,0,24.0,0,109.0,9.0,25.0,549.0,44.0,True,2,...,0,0,0,0,0,1,0,0,0,1
2,0,58.0,1,43.0,3576.0,0.0,6715.0,49.0,False,3,...,1,0,0,0,0,0,0,0,0,1
3,0,33.0,0,0.0,1283.0,371.0,3329.0,193.0,False,3,...,1,0,0,0,0,0,0,0,0,1
4,0,16.0,0,303.0,70.0,151.0,565.0,2.0,True,4,...,0,0,0,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7554,0,41.0,1,0.0,6819.0,0.0,1643.0,74.0,False,9276,...,1,0,0,0,0,0,0,0,1,0
7555,1,18.0,0,0.0,0.0,0.0,0.0,0.0,False,9278,...,0,0,0,0,0,0,1,0,0,1
7556,0,26.0,0,0.0,0.0,1872.0,1.0,0.0,True,9279,...,0,0,0,0,0,0,1,0,0,1
7557,0,32.0,0,0.0,1049.0,0.0,353.0,3235.0,False,9280,...,0,0,0,0,1,0,0,0,0,1


In [78]:
# numeric data
std_list_transformed = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'total expenses']
norm_list_transformed = ['members']

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

numeric_colTrans = ColumnTransformer([
    ('normalisation', MinMaxScaler(), norm_list_transformed ),
    ('stabdardisation', StandardScaler(), std_list_transformed)
])

df_num_transformed = numeric_colTrans.fit_transform(df_transformed)
df_num_transformed = pd.DataFrame(df_num_transformed, columns=norm_list_transformed + std_list_transformed)
df_transformed[norm_list_transformed + std_list_transformed] = df_num_transformed

# test train split

In [79]:
from sklearn.model_selection import train_test_split

x = df_transformed.drop(columns='Transported')
y = df_transformed['Transported'].astype(int)

x_train, x_val, y_train, y_val = train_test_split(x,y, test_size=0.1, random_state=42)

In [80]:
y_train

437     1
1606    0
4271    1
149     1
4254    0
       ..
5191    0
5226    0
5390    0
860     1
7270    1
Name: Transported, Length: 6803, dtype: int64

# features evaluation

In [81]:
corr_mat = df_transformed.corr()
corr_mat['Transported'].abs().sort_values(ascending=False)

Transported                  1.000000
CryoSleep                    0.467912
RoomService                  0.244713
Spa                          0.216565
VRDeck                       0.207821
total expenses               0.193545
HomePlanet_Europa            0.181667
HomePlanet_Earth             0.170906
deck_B                       0.150519
Destination_55 Cancri e      0.113288
deck_C                       0.105577
side_P                       0.103000
side_S                       0.103000
Destination_TRAPPIST-1e      0.102016
deck_E                       0.097598
deck_F                       0.096792
members                      0.078607
Age                          0.078134
FoodCourt                    0.049384
cabin number                 0.042840
VIP                          0.037456
deck_D                       0.034651
group                        0.026825
deck_G                       0.022826
HomePlanet_Mars              0.015940
ShoppingMall                 0.011073
deck_A      

most of the features are pretty significant, a slight regularisation should be sufficient

# build models

In [82]:
def gs_eval(model):
    print(f'ROCAUC score: {model.best_score_}')
    print(model.best_params_)

In [83]:
def rocauc(model):
    from sklearn.metrics import roc_auc_score
    print('ROCAUC: ' + str(roc_auc_score(model.predict(x_val), y_val)))

## model 1 elastic net

In [84]:
# from sklearn.linear_model import LogisticRegression
# from sklearn.model_selection import GridSearchCV
#
# param_grid = [
#     {
#         'penalty': ['elastic_net'],
#         'l1_ratio': list(0.1 * x for x in range(0,10)),
#         'C': list(0.1 * x for x in range(0,20)),
#         'solver': ['saga']
#     },
#     {
#         'penalty': ['l2', 'l1'],
#         'C': list(0.1 * x for x in range(0,20))
#     }
# ]
#
# gs_elastic_net = GridSearchCV(LogisticRegression(), param_grid, cv=5,
#                               scoring='roc_auc',
#                               return_train_score=True)
#
# gs_elastic_net.fit(x_train, y_train)
# gs_eval(gs_elastic_net)

In [85]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(penalty='l2', C=1.4)
lr.fit(x_train, y_train)
rocauc(lr)

ROCAUC: 0.8


## model 2 random forest

In [86]:
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import GridSearchCV
#
# param_grid = {'n_estimators': list(range(10, 200, 10)),
#               'max_depth': list(range(10,30)),
#               }
#
# gs_rdf = GridSearchCV(RandomForestClassifier(n_jobs=-1), param_grid, cv=5,
#                       scoring='roc_auc', return_train_score=True)
# gs_rdf.fit(x_train, y_train)
# gs_eval(gs_rdf)

In [87]:
from sklearn.ensemble import RandomForestClassifier

rdf = RandomForestClassifier(n_estimators=140, max_depth=11)
rdf.fit(x_train, y_train)
rocauc(rdf)

ROCAUC: 0.810270680495826


## model 3 XGBoost

In [88]:
# from xgboost import XGBClassifier
# from sklearn.model_selection import RandomizedSearchCV
# import warnings
# from sklearn.metrics import roc_auc_score
#
# warnings.simplefilter(action='ignore')
#
# param_grid = {
#     'n_estimators': list(range(100,300)),
#     'max_depth': list(range(10, 30)),
#     'learning_rate': list(0.1* x for x in range(1, 8))
#     }
#
# gs_xgb = RandomizedSearchCV(XGBClassifier(n_jobs=-1, use_label_encoder=False,
#                                     eval_metric = 'auc'), param_grid,
#                                     cv=5, return_train_score=True)
# gs_xgb.fit(x_train, y_train)
# gs_eval(gs_xgb)

In [89]:
from xgboost import XGBClassifier
xgb = XGBClassifier(n_estimators=193, max_depth=18, learning_rate = 0.2)
xgb.fit(x_train, y_train)
rocauc(xgb)

ROCAUC: 0.8034055727554179


## model 4 SVC
too slow and poor performance, commented out

In [90]:
# from sklearn.svm import SVC
# from sklearn.model_selection import GridSearchCV
#
# param_grid = {
#     'degree': list(range(1,3)),
#     'C': list(x * 0.1 for x in range (1,5))
# }
#
# gs_SVC = GridSearchCV(SVC(kernel='poly', verbose=True), param_grid, cv=5,
#                       return_train_score=True, scoring='roc_auc')
# gs_SVC.fit(x_train, y_train)
# gs_eval(gs_SVC)

In [91]:
# from sklearn.svm import SVC
#
# svc = SVC(kernel='linear', C=0.4)
# svc.fit(x_train, y_train)
# rocauc(svc)

# Stacking the models

In [92]:
# from sklearn.linear_model import LogisticRegression
# from sklearn.ensemble import RandomForestClassifier
# from xgboost import XGBClassifier
# from mlxtend.classifier import StackingClassifier
# from sklearn.model_selection import cross_val_score
# from sklearn.svm import SVC
# import warnings
#
# warnings.simplefilter('ignore')
#
# clf1 = LogisticRegression(penalty='l2', C=1.4)
# clf2 = RandomForestClassifier(n_estimators=140, max_depth=12)
# clf3 = XGBClassifier(n_estimators=175, max_depth=18, learning_rate = 0.3, eval_metric='auc')
# clf4 = SVC(kernel='linear', C=0.4)
# lr = LogisticRegression()
#
# sclf = StackingClassifier(classifiers=[clf1, clf2, clf3, clf4], meta_classifier=lr)
#
# print('5-fold cross validation: \n')
#
# for clf, label in zip([clf1, clf2, clf3, clf4, sclf],
#                       ['Ridge','Random forest', 'XGBoost','SVC', 'Stacked']):
#     scores = cross_val_score(clf, x_train, y_train, cv=5, scoring='roc_auc')
#     print('ROCAUC: %.2f[%s]'% (scores.mean(), label))

5-fold cross validation score from training set:

ROCAUC: 0.88[Ridge]
ROCAUC: 0.89[Random forest]
ROCAUC: 0.89[XGBoost]
ROCAUC: 0.87[SVC]
ROCAUC: 0.85[Stacked]

# voting classifier

In [94]:
from sklearn.ensemble import VotingClassifier

clf1 = LogisticRegression(penalty='l2', C=1.4)
clf2 = RandomForestClassifier(n_estimators=140, max_depth=12)
clf3 = XGBClassifier(n_estimators=175, max_depth=18, learning_rate = 0.3, eval_metric='auc')

## finding the optimal weights

## first tuning

In [100]:
import warnings
warnings.simplefilter('ignore')

df = pd.DataFrame(columns=('w1', 'w2','w3', 'mean', 'sd') )

from sklearn.model_selection import cross_val_score

for w1 in range(1,4):
    for w2 in range(1,4):
        for w3 in range(1,4):
            if (w1 == w2 and w2 == w3 and w1!=1):
                continue

            vclf = VotingClassifier(estimators=[('lr',clf1), ('rf',clf2), ('xgb',clf3)],
                        weights=[w1,w2,w3], flatten_transform=True, voting='soft',
                        n_jobs=-1, verbose=True)

            scores = cross_val_score(estimator=vclf, X=x_train, y=y_train,
                                     scoring='roc_auc', cv=5, n_jobs=-1)

            df = df.append({'w1': w1, 'w2': w2, 'w3':w3,
                       'mean': scores.mean(), 'sd': scores.std()}, ignore_index=True)

In [104]:
df.sort_values(['mean'], ascending=False)

Unnamed: 0,w1,w2,w3,mean,sd
15,2.0,3.0,2.0,0.899465,0.00652
16,2.0,3.0,3.0,0.899367,0.006123
8,1.0,3.0,3.0,0.899245,0.006011
4,1.0,2.0,2.0,0.899224,0.006608
13,2.0,2.0,3.0,0.899093,0.006186
3,1.0,2.0,1.0,0.899003,0.006381
14,2.0,3.0,1.0,0.899,0.00657
24,3.0,3.0,2.0,0.898977,0.00634
6,1.0,3.0,1.0,0.898962,0.005962
22,3.0,2.0,3.0,0.898823,0.00589


## second tuning

In [107]:
import warnings
warnings.simplefilter('ignore')

df = pd.DataFrame(columns=('w1', 'w2','w3', 'mean', 'sd') )

from sklearn.model_selection import cross_val_score

for w1 in range(18,22):
    for w2 in range(28,32):
        for w3 in range(18,22):
            if (w1 == w2 and w2 == w3):
                continue

            vclf = VotingClassifier(estimators=[('lr',clf1), ('rf',clf2), ('xgb',clf3)],
                        weights=[w1,w2,w3], flatten_transform=True, voting='soft',
                        n_jobs=-1, verbose=True)

            scores = cross_val_score(estimator=vclf, X=x_train, y=y_train,
                                     scoring='roc_auc', cv=5, n_jobs=-1)

            df = df.append({'w1': w1*0.1, 'w2': w2*0.1, 'w3':w3*0.1,
                       'mean': scores.mean(), 'sd': scores.std()}, ignore_index=True)

In [108]:
df.sort_values(['mean'], ascending=False)

Unnamed: 0,w1,w2,w3,mean,sd
61,2.1,3.1,1.9,0.899873,0.006618
31,1.9,3.1,2.1,0.899784,0.006214
21,1.9,2.9,1.9,0.899748,0.006461
41,2.0,3.0,1.9,0.899724,0.006421
44,2.0,3.1,1.8,0.89972,0.006575
...,...,...,...,...,...
11,1.8,3.0,2.1,0.899045,0.006377
62,2.1,3.1,2.0,0.899008,0.005917
27,1.9,3.0,2.1,0.898974,0.006739
6,1.8,2.9,2.0,0.898947,0.006484


In [109]:
vclf = VotingClassifier(estimators=[('lr',clf1), ('rf',clf2), ('xgb',clf3)],
                        weights=[2.1,3.1,1.9], flatten_transform=True, voting='soft',
                        n_jobs=-1, verbose=True)
vclf.fit(x_train, y_train)
rocauc(vclf)

ROCAUC: 0.8134488188976379


In [129]:
from joblib import dump
dump(vclf, 'final_model.joblib')

['final_model.joblib']

since the stacked model perform worse than baseline models, voting classifier will be used instead

# test data preprocessing

In [224]:
test = pd.read_csv('test.csv')
test_transformed = test

## impute numeric data

In [225]:
test_transformed[numeric_list] = ipt.fit_transform(test[numeric_list])

## data augmentation

In [226]:
# add group number
test_transformed[['group', 'id']] = test['PassengerId'].str.split('_', expand = True)
test_transformed.drop(columns=['id', 'PassengerId'], inplace=True)
test_transformed['group'] = test_transformed['group'].apply(int)

### impute missing categorical values

In [227]:
test_transformed.isnull().any()

HomePlanet       True
CryoSleep        True
Cabin            True
Destination      True
Age             False
VIP              True
RoomService     False
FoodCourt       False
ShoppingMall    False
Spa             False
VRDeck          False
Name             True
group           False
dtype: bool

In [228]:
from sklearn.impute import SimpleImputer

missing = ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP']

ipt_cat = SimpleImputer(strategy='most_frequent', missing_values=np.nan)
test_transformed[missing] = ipt_cat.fit_transform(test_transformed[missing])

In [229]:
# split cabin number
test_transformed[['deck', 'cabin number', 'side']] = test_transformed['Cabin'].str.split('/', expand=True)
test_transformed['cabin number'] = test_transformed['cabin number'].apply(int)
test_transformed.drop(columns='Cabin', inplace=True)

In [230]:
# add total expenses column
test_transformed['total expenses'] = test_transformed['RoomService']+ test_transformed['FoodCourt']+ test_transformed['ShoppingMall']+ test_transformed['Spa']+ test_transformed['VRDeck']

In [231]:
#impute missing names
missing_name_df = test_transformed[test_transformed.isnull().any(axis=1)]
missing_name_df['Name'] = 'missing name'

In [232]:
# add number of family members column
test_transformed[['first name', 'last name']] = test_transformed['Name'].str.split(' ', expand=True)
test_family = pd.DataFrame(test_transformed.groupby(['group','last name']).size())
test_family.rename(columns={0: 'members'}, inplace=True)

test_transformed = test_transformed.merge(right=test_family, how='left', on=['group', 'last name'])
test_transformed.drop(columns=['first name', 'last name', 'Name'], inplace=True)

In [242]:
test_transformed = test_transformed.fillna(1)

## encoding and standardisation

In [244]:
# mapping boolean values
test_transformed[boolean_list_transformed] = test_transformed[boolean_list_transformed].astype(int)

In [245]:
# categorical data one hot encoding
test_transformed = pd.get_dummies(test_transformed, columns=categorical_list_transformed)
test_transformed

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,group,cabin number,...,deck_A,deck_B,deck_C,deck_D,deck_E,deck_F,deck_G,deck_T,side_P,side_S
0,1,27.0,0,0.0,0.0,0.0,0.0,0.0,13,3,...,0,0,0,0,0,0,1,0,0,1
1,0,19.0,0,0.0,9.0,0.0,2823.0,0.0,18,4,...,0,0,0,0,0,1,0,0,0,1
2,1,31.0,0,0.0,0.0,0.0,0.0,0.0,19,0,...,0,0,1,0,0,0,0,0,0,1
3,0,38.0,0,0.0,6652.0,0.0,181.0,585.0,21,1,...,0,0,1,0,0,0,0,0,0,1
4,0,20.0,0,10.0,0.0,635.0,0.0,0.0,23,5,...,0,0,0,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,1,34.0,0,0.0,0.0,0.0,0.0,0.0,9266,1496,...,0,0,0,0,0,0,1,0,0,1
4273,0,42.0,0,0.0,847.0,17.0,10.0,144.0,9269,160,...,0,0,0,0,0,0,1,0,1,0
4274,1,26.0,0,0.0,0.0,0.0,0.0,0.0,9271,296,...,0,0,0,1,0,0,0,0,1,0
4275,0,26.0,0,0.0,2680.0,0.0,0.0,523.0,9273,297,...,0,0,0,1,0,0,0,0,1,0


In [246]:
# numeric data
test_num_transformed = numeric_colTrans.fit_transform(test_transformed)
test_num_transformed = pd.DataFrame(test_num_transformed, columns=norm_list_transformed + std_list_transformed)
test_transformed[norm_list_transformed + std_list_transformed] = test_num_transformed

# make predictions

In [256]:
test = pd.read_csv('test.csv')

In [258]:
pred = vclf.predict(test_transformed)

In [264]:
PassengerId = test['PassengerId']
final = pd.DataFrame(pred, index=PassengerId, columns=[ 'Transported'])
final['Transported'] = final['Transported'].astype(bool)
final

Unnamed: 0_level_0,Transported
PassengerId,Unnamed: 1_level_1
0013_01,True
0018_01,False
0019_01,True
0021_01,True
0023_01,False
...,...
9266_02,True
9269_01,False
9271_01,True
9273_01,True


In [265]:
final.to_csv('prediction.csv')