In [105]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import KNNImputer

In [106]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
df_test['Transported'] = False
df = pd.concat([df_train, df_test], sort = False)
df = df.drop(columns = ['Name','PassengerId'], axis = 1)
df.head()

Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported
0,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False
1,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True
2,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False
3,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False
4,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True


In [107]:
df.shape[0] == df_train.shape[0] + df_test.shape[0]

True

In [120]:
df.isna().sum()

Transported                  0
Age                          0
VIP                          0
Num                          0
CryoSleep                    0
Side                         0
Deck                         0
RoomService                  0
FoodCourt                    0
ShoppingMall                 0
Spa                          0
VRDeck                       0
HomePlanet_Earth             0
HomePlanet_Europa            0
HomePlanet_Mars              0
HomePlanet_U                 0
Destination_55 Cancri e      0
Destination_PSO J318.5-22    0
Destination_TRAPPIST-1e      0
Destination_U                0
amount_spent                 0
std_amount_spent             0
mean_amount_spent            0
3_high_cols                  0
3_low_cols                   0
dtype: int64

In [109]:
df[['Deck', 'Num', 'Side']] = df['Cabin'].str.split('/', expand = True)

In [110]:
df = df.drop(columns = ['Cabin'])

In [111]:
df.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Deck,Num,Side
0,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False,B,0,P
1,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True,F,0,S
2,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False,A,0,S
3,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False,A,0,S
4,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True,F,1,S


In [112]:
df['Deck'] = df['Deck'].fillna('U')
df['Num'] = df['Num'].fillna(-1)
df['Side'] = df['Side'].fillna('U')

In [113]:
df['Side'].value_counts()

Side
S    6381
P    6290
U     299
Name: count, dtype: int64

In [114]:
df['Deck'] = df['Deck'].map({'F':1, 'G':0, 'E':2, 'B':5, 'C':4, 'D':3, 'A':6, 'U':7, 'T':8})
df['Side'] = df['Side'].map({'S':2, 'P':1, 'U':-1})

In [115]:
impute_list = ['Age', 'VIP', 'Num', 'CryoSleep', 'Side', 'Deck', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
rest = list(set(df.columns) - set(impute_list))
df_rest = df[rest]
imp = KNNImputer()
df_imputed = imp.fit_transform(df[impute_list])
df_imputed = pd.DataFrame(df_imputed, columns = impute_list)
df = pd.concat([df_rest.reset_index(drop = True), df_imputed.reset_index(drop = True)], axis = 1)

In [116]:
df['HomePlanet'] = df['HomePlanet'].fillna('U')
df['Destination'] = df['Destination'].fillna('U')
categorical = ['HomePlanet', 'Destination']

for col in categorical:
    df = pd.concat([df, pd.get_dummies(df[col], prefix = col)], axis = 1)

df = df.drop(columns = categorical)

In [117]:
money = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
df['amount_spent'] = df[money].sum(axis = 1)
df['std_amount_spent'] = df[money].std(axis = 1)
df['mean_amount_spent'] = df[money].mean(axis = 1)

df['3_high_cols'] = df['CryoSleep'] + df['HomePlanet_Europa'] + df['Destination_55 Cancri e']
df['3_low_cols'] = df['mean_amount_spent'] + df['amount_spent'] + df['HomePlanet_Earth']

In [118]:
df.head()

Unnamed: 0,Transported,Age,VIP,Num,CryoSleep,Side,Deck,RoomService,FoodCourt,ShoppingMall,...,HomePlanet_U,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,Destination_U,amount_spent,std_amount_spent,mean_amount_spent,3_high_cols,3_low_cols
0,False,39.0,0.0,0.0,0.0,1.0,5.0,0.0,0.0,0.0,...,False,False,False,True,False,0.0,0.0,0.0,1.0,0.0
1,True,24.0,0.0,0.0,0.0,2.0,1.0,109.0,9.0,25.0,...,False,False,False,True,False,736.0,227.807375,147.2,0.0,884.2
2,False,58.0,1.0,0.0,0.0,2.0,6.0,43.0,3576.0,0.0,...,False,False,False,True,False,10383.0,3013.383198,2076.6,1.0,12459.6
3,False,33.0,0.0,0.0,0.0,2.0,6.0,0.0,1283.0,371.0,...,False,False,False,True,False,5176.0,1373.410427,1035.2,1.0,6211.2
4,True,16.0,0.0,1.0,0.0,2.0,1.0,303.0,70.0,151.0,...,False,False,False,True,False,1091.0,223.988169,218.2,0.0,1310.2


In [119]:
df.corr()['Transported'].sort_values(ascending = False)

Transported                  1.000000
CryoSleep                    0.324335
3_high_cols                  0.284152
HomePlanet_Europa            0.131977
Destination_55 Cancri e      0.083625
Deck                         0.077959
Side                         0.059872
FoodCourt                    0.034746
HomePlanet_U                 0.006403
HomePlanet_Mars              0.005643
ShoppingMall                 0.004154
Destination_PSO J318.5-22    0.000760
Destination_U               -0.000554
VIP                         -0.018720
Num                         -0.035240
Age                         -0.050450
Destination_TRAPPIST-1e     -0.072731
HomePlanet_Earth            -0.119644
std_amount_spent            -0.121135
amount_spent                -0.140425
mean_amount_spent           -0.140425
3_low_cols                  -0.140448
VRDeck                      -0.142783
Spa                         -0.154759
RoomService                 -0.174781
Name: Transported, dtype: float64