In [18]:
# Import library

import pandas as pd
from sklearn.impute import KNNImputer, SimpleImputer
from category_encoders import HashingEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier

In [19]:
# Read data

train_data = pd.read_csv('train.csv')
train_data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [20]:
## Missing value
cat_col = ['HomePlanet','CryoSleep', 'Destination', 'VIP']
num_col = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

# Using Simple Imputer to deal with missing value of categorical variables
imputer = SimpleImputer(strategy="most_frequent")
imputer.fit(train_data[cat_col])
train_data[cat_col] = imputer.transform(train_data[cat_col])

# Using KNN Imputer to deal with missing value of numerical variables
imputer = KNNImputer(n_neighbors=7)
imputer.fit(train_data[num_col])
train_data[num_col] = imputer.transform(train_data[num_col])

# Remove missing value of cabin and name (because can not fill those missing value)
# train_data = train_data.dropna(axis='index')

In [21]:
""" scaler = MinMaxScaler()
# transform data
train_data[num_col] = scaler.fit_transform(train_data[num_col]) """

' scaler = MinMaxScaler()\n# transform data\ntrain_data[num_col] = scaler.fit_transform(train_data[num_col]) '

In [22]:
# Create a column "PassengerGroup" from "PassengerId" 
train_data['PassengerGroup'] = train_data['PassengerId'].str.split('_').str[0]

# Create column "LastName" from "Name"
train_data['LastName'] = train_data['Name'].str.split(' ').str[1]

# Split column "Cabin" into 3 columns "CabinDeck", "CabinNum", "CabinSide"
train_data[['CabinDeck', 'CabinNum', 'CabinSide']] = train_data.Cabin.str.split('/', expand = True)

# Drop 3 columns "PassengerId", "Name" and "Cabin"
train_data = train_data.drop(['PassengerId', 'Name', 'Cabin', 'CabinNum'], axis='columns')

train_data.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,PassengerGroup,LastName,CabinDeck,CabinSide
0,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False,1,Ofracculy,B,P
1,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True,2,Vines,F,S
2,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False,3,Susent,A,S
3,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False,3,Susent,A,S
4,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True,4,Santantines,F,S


In [23]:
# Convert boolean to integer (0 = False, 1 = True)

train_data['CryoSleep'] = list(map(int, train_data['CryoSleep']))
train_data['VIP'] = list(map(int, train_data['VIP']))
train_data['Transported'] = list(map(int, train_data['Transported']))
train_data.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,PassengerGroup,LastName,CabinDeck,CabinSide
0,Europa,0,TRAPPIST-1e,39.0,0,0.0,0.0,0.0,0.0,0.0,0,1,Ofracculy,B,P
1,Earth,0,TRAPPIST-1e,24.0,0,109.0,9.0,25.0,549.0,44.0,1,2,Vines,F,S
2,Europa,0,TRAPPIST-1e,58.0,1,43.0,3576.0,0.0,6715.0,49.0,0,3,Susent,A,S
3,Europa,0,TRAPPIST-1e,33.0,0,0.0,1283.0,371.0,3329.0,193.0,0,3,Susent,A,S
4,Earth,0,TRAPPIST-1e,16.0,0,303.0,70.0,151.0,565.0,2.0,1,4,Santantines,F,S


In [24]:
# Create dummy variables and drop original ones

HomePlanetDummies = pd.get_dummies(train_data['HomePlanet'], prefix='HomePlanet')
train_data = pd.concat([train_data, HomePlanetDummies], axis='columns')

DestinationDummies = pd.get_dummies(train_data['Destination'], prefix='Destination')
train_data = pd.concat([train_data, DestinationDummies], axis='columns')

CabinDeckDummies = pd.get_dummies(train_data['CabinDeck'], prefix='CabinDeck')
train_data = pd.concat([train_data, CabinDeckDummies], axis='columns')

CabinSideDummies = pd.get_dummies(train_data['CabinSide'], prefix='CabinSide')
train_data = pd.concat([train_data, CabinSideDummies], axis='columns')

train_data = train_data.drop(['HomePlanet', 'Destination', 'CabinDeck', 'CabinSide'], axis='columns')

In [25]:
# Using feature hashing to encode PassengerGroup, CabinDeck and LastName

encoder = HashingEncoder(cols='PassengerGroup',n_components=15)
PassengerGroupDummies = pd.DataFrame(encoder.fit_transform(train_data['PassengerGroup']))
PassengerGroupDummies = PassengerGroupDummies.add_prefix('PassengerGroup_')
train_data = pd.concat([train_data, PassengerGroupDummies], axis='columns')

# encoder = HashingEncoder(cols='CabinDeck',n_components=5)
# CabinDeckDummies = pd.DataFrame(encoder.fit_transform(train_data['CabinDeck']))
# CabinDeckDummies = CabinDeckDummies.add_prefix('CabinDeck_')
# train_data = pd.concat([train_data, CabinDeckDummies], axis='columns')

encoder = HashingEncoder(cols='LastName',n_components=15)
LastNameDummies = pd.DataFrame(encoder.fit_transform(train_data['LastName']))
LastNameDummies = LastNameDummies.add_prefix('LastName_')
train_data = pd.concat([train_data, LastNameDummies], axis='columns')

# train_data = train_data.drop(['PassengerGroup', 'CabinDeck', 'LastName'], axis='columns')
train_data = train_data.drop(['PassengerGroup', 'LastName'], axis='columns')

train_data.head()

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,HomePlanet_Earth,...,LastName_col_5,LastName_col_6,LastName_col_7,LastName_col_8,LastName_col_9,LastName_col_10,LastName_col_11,LastName_col_12,LastName_col_13,LastName_col_14
0,0,39.0,0,0.0,0.0,0.0,0.0,0.0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,24.0,0,109.0,9.0,25.0,549.0,44.0,1,1,...,0,0,0,0,1,0,0,0,0,0
2,0,58.0,1,43.0,3576.0,0.0,6715.0,49.0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0,33.0,0,0.0,1283.0,371.0,3329.0,193.0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,0,16.0,0,303.0,70.0,151.0,565.0,2.0,1,1,...,0,0,0,0,0,1,0,0,0,0


# Testing

In [26]:
clf = GradientBoostingClassifier().fit(train_data.drop(['Transported'], axis='columns'), train_data['Transported'])

In [27]:
# Read data

train_data = pd.read_csv('test.csv')

passengerIdTest = train_data['PassengerId']

train_data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez


In [28]:
## Missing value
cat_col = ['HomePlanet','CryoSleep', 'Destination', 'VIP']
num_col = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

# Using Simple Imputer to deal with missing value of categorical variables
imputer = SimpleImputer(strategy="most_frequent")
imputer.fit(train_data[cat_col])
train_data[cat_col] = imputer.transform(train_data[cat_col])

# Using KNN Imputer to deal with missing value of numerical variables
imputer = KNNImputer(n_neighbors=7)
imputer.fit(train_data[num_col])
train_data[num_col] = imputer.transform(train_data[num_col])

# Remove missing value of cabin and name (because can not fill those missing value)
# train_data = train_data.dropna(axis='index')

In [29]:
""" scaler = MinMaxScaler()
# transform data
train_data[num_col] = scaler.fit_transform(train_data[num_col]) """

' scaler = MinMaxScaler()\n# transform data\ntrain_data[num_col] = scaler.fit_transform(train_data[num_col]) '

In [30]:
# Create a column "PassengerGroup" from "PassengerId" 
train_data['PassengerGroup'] = train_data['PassengerId'].str.split('_').str[0]

# Create column "LastName" from "Name"
train_data['LastName'] = train_data['Name'].str.split(' ').str[1]

# Split column "Cabin" into 3 columns "CabinDeck", "CabinNum", "CabinSide"
train_data[['CabinDeck', 'CabinNum', 'CabinSide']] = train_data.Cabin.str.split('/', expand = True)

# Drop 3 columns "PassengerId", "Name" and "Cabin"
train_data = train_data.drop(['PassengerId', 'Name', 'Cabin', 'CabinNum'], axis='columns')

train_data.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,PassengerGroup,LastName,CabinDeck,CabinSide
0,Earth,True,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,13,Carsoning,G,S
1,Earth,False,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,18,Peckers,F,S
2,Europa,True,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,19,Unhearfus,C,S
3,Europa,False,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,21,Caltilter,C,S
4,Earth,False,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,23,Harperez,F,S


In [31]:
# Convert boolean to integer (0 = False, 1 = True)

train_data['CryoSleep'] = list(map(int, train_data['CryoSleep']))
train_data['VIP'] = list(map(int, train_data['VIP']))
train_data.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,PassengerGroup,LastName,CabinDeck,CabinSide
0,Earth,1,TRAPPIST-1e,27.0,0,0.0,0.0,0.0,0.0,0.0,13,Carsoning,G,S
1,Earth,0,TRAPPIST-1e,19.0,0,0.0,9.0,0.0,2823.0,0.0,18,Peckers,F,S
2,Europa,1,55 Cancri e,31.0,0,0.0,0.0,0.0,0.0,0.0,19,Unhearfus,C,S
3,Europa,0,TRAPPIST-1e,38.0,0,0.0,6652.0,0.0,181.0,585.0,21,Caltilter,C,S
4,Earth,0,TRAPPIST-1e,20.0,0,10.0,0.0,635.0,0.0,0.0,23,Harperez,F,S


In [32]:
# Create dummy variables and drop original ones

HomePlanetDummies = pd.get_dummies(train_data['HomePlanet'], prefix='HomePlanet')
train_data = pd.concat([train_data, HomePlanetDummies], axis='columns')

DestinationDummies = pd.get_dummies(train_data['Destination'], prefix='Destination')
train_data = pd.concat([train_data, DestinationDummies], axis='columns')

CabinDeckDummies = pd.get_dummies(train_data['CabinDeck'], prefix='CabinDeck')
train_data = pd.concat([train_data, CabinDeckDummies], axis='columns')

CabinSideDummies = pd.get_dummies(train_data['CabinSide'], prefix='CabinSide')
train_data = pd.concat([train_data, CabinSideDummies], axis='columns')

train_data = train_data.drop(['HomePlanet', 'Destination', 'CabinDeck', 'CabinSide'], axis='columns')

In [33]:
# Using feature hashing to encode PassengerGroup, CabinDeck and LastName

encoder = HashingEncoder(cols='PassengerGroup',n_components=15)
PassengerGroupDummies = pd.DataFrame(encoder.fit_transform(train_data['PassengerGroup']))
PassengerGroupDummies = PassengerGroupDummies.add_prefix('PassengerGroup_')
train_data = pd.concat([train_data, PassengerGroupDummies], axis='columns')

# encoder = HashingEncoder(cols='CabinDeck',n_components=5)
# CabinDeckDummies = pd.DataFrame(encoder.fit_transform(train_data['CabinDeck']))
# CabinDeckDummies = CabinDeckDummies.add_prefix('CabinDeck_')
# train_data = pd.concat([train_data, CabinDeckDummies], axis='columns')

encoder = HashingEncoder(cols='LastName',n_components=15)
LastNameDummies = pd.DataFrame(encoder.fit_transform(train_data['LastName']))
LastNameDummies = LastNameDummies.add_prefix('LastName_')
train_data = pd.concat([train_data, LastNameDummies], axis='columns')

# train_data = train_data.drop(['PassengerGroup', 'CabinDeck', 'LastName'], axis='columns')
train_data = train_data.drop(['PassengerGroup', 'LastName'], axis='columns')

train_data.head()

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,HomePlanet_Earth,HomePlanet_Europa,...,LastName_col_5,LastName_col_6,LastName_col_7,LastName_col_8,LastName_col_9,LastName_col_10,LastName_col_11,LastName_col_12,LastName_col_13,LastName_col_14
0,1,27.0,0,0.0,0.0,0.0,0.0,0.0,1,0,...,0,1,0,0,0,0,0,0,0,0
1,0,19.0,0,0.0,9.0,0.0,2823.0,0.0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,1,31.0,0,0.0,0.0,0.0,0.0,0.0,0,1,...,0,0,0,0,0,0,0,0,0,1
3,0,38.0,0,0.0,6652.0,0.0,181.0,585.0,0,1,...,0,0,0,0,0,1,0,0,0,0
4,0,20.0,0,10.0,0.0,635.0,0.0,0.0,1,0,...,0,0,0,0,0,1,0,0,0,0


In [34]:
res = pd.DataFrame(
        {

            'PassengerId': list(passengerIdTest),
            'Transported': [(x == 1) for x in list(clf.predict(train_data))]
        }
    )
res.to_csv('prediction.csv', index=False)