In [179]:
# Import library

import pandas as pd
from sklearn.impute import KNNImputer, SimpleImputer
from category_encoders import HashingEncoder

In [180]:
# Load 'train.csv' into `train_data` and preview the first five rows.

train_data = pd.read_csv(filepath_or_buffer='train.csv')
train_data.head(n=5)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [181]:
## Missing value
# Columns are split by the imputation strategy applied to them below.
cat_col = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP']
num_col = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

# Categorical columns: replace each missing entry with that column's most
# frequent value (fit and transform in a single call).
cat_imputer = SimpleImputer(strategy="most_frequent")
train_data[cat_col] = cat_imputer.fit_transform(train_data[cat_col])

# Numerical columns: KNN imputation with 7 neighbours. The name `imputer`
# stays bound to this KNN imputer once the cell has run.
imputer = KNNImputer(n_neighbors=7)
train_data[num_col] = imputer.fit_transform(train_data[num_col])

# Cabin and Name are not imputed above, so any row that still contains a
# missing value is discarded.
train_data = train_data.dropna(axis=0, how='any')

train_data.shape

(8296, 14)

In [182]:
# Derive PassengerGroup, LastName and the three Cabin components, then drop
# the raw PassengerId, Name and Cabin columns they were extracted from.

# "PassengerId" is split on '_' and the first piece becomes the group.
passenger_id_parts = train_data['PassengerId'].str.split('_')

# "Name" is split on a single space and the second token becomes the last name.
name_parts = train_data['Name'].str.split(' ')

# "Cabin" is split on '/' into exactly three pieces; the column assignment
# raises if the split did not produce three columns.
cabin_parts = train_data['Cabin'].str.split('/', expand=True)
cabin_parts.columns = ['CabinDeck', 'CabinNum', 'CabinSide']

train_data = (
    train_data
    .assign(
        PassengerGroup=passenger_id_parts.str[0],
        LastName=name_parts.str[1],
    )
    .join(cabin_parts)
    .drop(columns=['PassengerId', 'Name', 'Cabin'])
)

train_data.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,PassengerGroup,LastName,CabinDeck,CabinNum,CabinSide
0,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False,1,Ofracculy,B,0,P
1,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True,2,Vines,F,0,S
2,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False,3,Susent,A,0,S
3,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False,3,Susent,A,0,S
4,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True,4,Santantines,F,1,S


In [183]:
# Convert the boolean flag columns to integers (0 = False, 1 = True).
# Each column is cast in one vectorised call and replaced as a whole.

for flag_col in ('CryoSleep', 'VIP', 'Transported'):
    train_data[flag_col] = train_data[flag_col].astype('int64')

train_data.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,PassengerGroup,LastName,CabinDeck,CabinNum,CabinSide
0,Europa,0,TRAPPIST-1e,39.0,0,0.0,0.0,0.0,0.0,0.0,0,1,Ofracculy,B,0,P
1,Earth,0,TRAPPIST-1e,24.0,0,109.0,9.0,25.0,549.0,44.0,1,2,Vines,F,0,S
2,Europa,0,TRAPPIST-1e,58.0,1,43.0,3576.0,0.0,6715.0,49.0,0,3,Susent,A,0,S
3,Europa,0,TRAPPIST-1e,33.0,0,0.0,1283.0,371.0,3329.0,193.0,0,3,Susent,A,0,S
4,Earth,0,TRAPPIST-1e,16.0,0,303.0,70.0,151.0,565.0,2.0,1,4,Santantines,F,1,S


In [184]:
# One-hot encode HomePlanet, Destination and CabinSide, then swap the original
# columns for their dummy columns.

one_hot_source_cols = ['HomePlanet', 'Destination', 'CabinSide']

# One dummy frame per source column, each prefixed with the column's own name.
HomePlanetDummies, DestinationDummies, CabinSideDummies = (
    pd.get_dummies(train_data[source_col], prefix=source_col)
    for source_col in one_hot_source_cols
)

# Remove the originals and append the dummy frames in the same left-to-right
# order: HomePlanet_*, Destination_*, CabinSide_*.
train_data = pd.concat(
    [
        train_data.drop(columns=one_hot_source_cols),
        HomePlanetDummies,
        DestinationDummies,
        CabinSideDummies,
    ],
    axis=1,
)

train_data.head()

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,PassengerGroup,...,CabinDeck,CabinNum,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,CabinSide_P,CabinSide_S
0,0,39.0,0,0.0,0.0,0.0,0.0,0.0,0,1,...,B,0,0,1,0,0,0,1,1,0
1,0,24.0,0,109.0,9.0,25.0,549.0,44.0,1,2,...,F,0,1,0,0,0,0,1,0,1
2,0,58.0,1,43.0,3576.0,0.0,6715.0,49.0,0,3,...,A,0,0,1,0,0,0,1,0,1
3,0,33.0,0,0.0,1283.0,371.0,3329.0,193.0,0,3,...,A,0,0,1,0,0,0,1,0,1
4,0,16.0,0,303.0,70.0,151.0,565.0,2.0,1,4,...,F,1,1,0,0,0,0,1,0,1


In [185]:
# Using feature hashing to encode PassengerGroup, CabinDeck and LastName


def _hash_encode(frame, column, n_components=5):
    """Hash-encode a single column of ``frame``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame that contains ``column``.
    column : str
        Name of the column to encode.
    n_components : int, default 5
        Number of hashed output columns requested from the encoder.

    Returns
    -------
    (HashingEncoder, pandas.DataFrame)
        The fitted encoder and the encoded columns, prefixed with
        ``"<column>_"`` and indexed exactly like ``frame``.

    Raises
    ------
    ValueError
        If the encoder returns a different number of rows than ``frame`` has.
    """
    column_encoder = HashingEncoder(cols=[column], n_components=n_components)
    hashed = pd.DataFrame(column_encoder.fit_transform(frame[column]))
    hashed = hashed.add_prefix(column + '_')
    # pd.concat(axis='columns') aligns rows on index labels, and `frame` has a
    # gapped index after the earlier dropna. Pin the encoded rows to the
    # caller's index so the join is row-for-row whether or not the encoder
    # preserved it (assumes the encoder keeps row order).
    hashed.index = frame.index
    return column_encoder, hashed


# `encoder` is rebound on every call, so after this cell it refers to the
# LastName encoder.
encoder, PassengerGroupDummies = _hash_encode(train_data, 'PassengerGroup')
encoder, CabinDeckDummies = _hash_encode(train_data, 'CabinDeck')
encoder, LastNameDummies = _hash_encode(train_data, 'LastName')

# Append the hashed columns in a fixed order, then drop the raw columns.
train_data = pd.concat(
    [train_data, PassengerGroupDummies, CabinDeckDummies, LastNameDummies],
    axis='columns',
)
train_data = train_data.drop(['PassengerGroup', 'CabinDeck', 'LastName'], axis='columns')

train_data.head()

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,CabinNum,...,CabinDeck_col_0,CabinDeck_col_1,CabinDeck_col_2,CabinDeck_col_3,CabinDeck_col_4,LastName_col_0,LastName_col_1,LastName_col_2,LastName_col_3,LastName_col_4
0,0,39.0,0,0.0,0.0,0.0,0.0,0.0,0,0,...,0,0,0,1,0,0,0,1,0,0
1,0,24.0,0,109.0,9.0,25.0,549.0,44.0,1,0,...,1,0,0,0,0,0,0,0,0,1
2,0,58.0,1,43.0,3576.0,0.0,6715.0,49.0,0,0,...,1,0,0,0,0,0,0,0,0,1
3,0,33.0,0,0.0,1283.0,371.0,3329.0,193.0,0,0,...,1,0,0,0,0,0,0,0,0,1
4,0,16.0,0,303.0,70.0,151.0,565.0,2.0,1,1,...,1,0,0,0,0,1,0,0,0,0
