In [1]:
# Import library
#!pip install --upgrade category_encoders
from sklearn.impute import KNNImputer, SimpleImputer
from category_encoders import HashingEncoder
import pandas as pd
import numpy as np

In [2]:
# Read data
train_data = pd.read_csv('train.csv')
train_data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [4]:
## Missing value
cat_col = ['HomePlanet','CryoSleep', 'Destination', 'VIP']
num_col = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

# Using Simple Imputer to deal with missing value of categorical variables
imputer = SimpleImputer(strategy="most_frequent")
imputer.fit(train_data[cat_col])
train_data[cat_col] = imputer.transform(train_data[cat_col])

# Using KNN Imputer to deal with missing value of numerical variables
imputer = KNNImputer(n_neighbors=7)
imputer.fit(train_data[num_col])
train_data[num_col] = imputer.transform(train_data[num_col])

# Remove missing value of cabin and name (because can not fill those missing value)
#train_data = train_data.dropna(axis='index')

train_data.shape

(8693, 14)

In [11]:
# Create a column "PassengerGroup" from "PassengerId" 
train_data['PassengerGroup'] = train_data['PassengerId'].str.split('_').str[0]

# Create column "LastName" from "Name"
train_data['LastName'] = train_data['Name'].str.split(' ').str[1]

# Split column "Cabin" into 3 columns "CabinDeck", "CabinNum", "CabinSide"
train_data[['CabinDeck', 'CabinNum', 'CabinSide']] = train_data.Cabin.str.split('/', expand = True)

# Drop 3 columns "PassengerId", "Name" and "Cabin"
train_data = train_data.drop(['PassengerId', 'Name', 'Cabin'], axis='columns')

train_data.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,...,HomePlanet_Europa,HomePlanet_Mars,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,PassengerGroup,LastName,CabinDeck,CabinNum,CabinSide
0,Europa,0,TRAPPIST-1e,39.0,0,0.0,0.0,0.0,0.0,0.0,...,1,0,0,0,1,1,Ofracculy,B,0,P
1,Earth,0,TRAPPIST-1e,24.0,0,109.0,9.0,25.0,549.0,44.0,...,0,0,0,0,1,2,Vines,F,0,S
2,Europa,0,TRAPPIST-1e,58.0,1,43.0,3576.0,0.0,6715.0,49.0,...,1,0,0,0,1,3,Susent,A,0,S
3,Europa,0,TRAPPIST-1e,33.0,0,0.0,1283.0,371.0,3329.0,193.0,...,1,0,0,0,1,3,Susent,A,0,S
4,Earth,0,TRAPPIST-1e,16.0,0,303.0,70.0,151.0,565.0,2.0,...,0,0,0,0,1,4,Santantines,F,1,S


In [12]:
# Convert boolean to integer (0 = False, 1 = True)

train_data['CryoSleep'] = list(map(int, train_data['CryoSleep']))
train_data['VIP'] = list(map(int, train_data['VIP']))
train_data['Transported'] = list(map(int, train_data['Transported']))
train_data.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,...,HomePlanet_Europa,HomePlanet_Mars,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,PassengerGroup,LastName,CabinDeck,CabinNum,CabinSide
0,Europa,0,TRAPPIST-1e,39.0,0,0.0,0.0,0.0,0.0,0.0,...,1,0,0,0,1,1,Ofracculy,B,0,P
1,Earth,0,TRAPPIST-1e,24.0,0,109.0,9.0,25.0,549.0,44.0,...,0,0,0,0,1,2,Vines,F,0,S
2,Europa,0,TRAPPIST-1e,58.0,1,43.0,3576.0,0.0,6715.0,49.0,...,1,0,0,0,1,3,Susent,A,0,S
3,Europa,0,TRAPPIST-1e,33.0,0,0.0,1283.0,371.0,3329.0,193.0,...,1,0,0,0,1,3,Susent,A,0,S
4,Earth,0,TRAPPIST-1e,16.0,0,303.0,70.0,151.0,565.0,2.0,...,0,0,0,0,1,4,Santantines,F,1,S


In [13]:
# Create dummy variables and drop original ones

HomePlanetDummies = pd.get_dummies(train_data['HomePlanet'], prefix='HomePlanet')
train_data = pd.concat([train_data, HomePlanetDummies], axis='columns')

DestinationDummies = pd.get_dummies(train_data['Destination'], prefix='Destination')
train_data = pd.concat([train_data, DestinationDummies], axis='columns')

CabinSideDummies = pd.get_dummies(train_data['CabinSide'], prefix='CabinSide')
train_data = pd.concat([train_data, CabinSideDummies], axis='columns')

train_data = train_data.drop(['HomePlanet', 'Destination', 'CabinSide'], axis='columns')

train_data.head()

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,HomePlanet_Earth,...,CabinDeck,CabinNum,HomePlanet_Earth.1,HomePlanet_Europa,HomePlanet_Mars,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,CabinSide_P,CabinSide_S
0,0,39.0,0,0.0,0.0,0.0,0.0,0.0,0,0,...,B,0,0,1,0,0,0,1,1,0
1,0,24.0,0,109.0,9.0,25.0,549.0,44.0,1,1,...,F,0,1,0,0,0,0,1,0,1
2,0,58.0,1,43.0,3576.0,0.0,6715.0,49.0,0,0,...,A,0,0,1,0,0,0,1,0,1
3,0,33.0,0,0.0,1283.0,371.0,3329.0,193.0,0,0,...,A,0,0,1,0,0,0,1,0,1
4,0,16.0,0,303.0,70.0,151.0,565.0,2.0,1,1,...,F,1,1,0,0,0,0,1,0,1


In [14]:
#alternative for dealing with CabinNum
train_data['CabinNum'] = train_data['CabinNum'].replace(np.nan, 0)
train_data['CabinNum']=train_data['CabinNum'].astype(str).astype(int)

In [15]:
# Using feature hashing to encode PassengerGroup, CabinDeck and LastName

encoder = HashingEncoder(cols='PassengerGroup',n_components=5)
PassengerGroupDummies = pd.DataFrame(encoder.fit_transform(train_data['PassengerGroup']))
PassengerGroupDummies = PassengerGroupDummies.add_prefix('PassengerGroup_')
train_data = pd.concat([train_data, PassengerGroupDummies], axis='columns')

encoder = HashingEncoder(cols='CabinDeck',n_components=5)
CabinDeckDummies = pd.DataFrame(encoder.fit_transform(train_data['CabinDeck']))
CabinDeckDummies = CabinDeckDummies.add_prefix('CabinDeck_')
train_data = pd.concat([train_data, CabinDeckDummies], axis='columns')

encoder = HashingEncoder(cols='LastName',n_components=5)
LastNameDummies = pd.DataFrame(encoder.fit_transform(train_data['LastName']))
LastNameDummies = LastNameDummies.add_prefix('LastName_')
train_data = pd.concat([train_data, LastNameDummies], axis='columns')

train_data = train_data.drop(['PassengerGroup', 'CabinDeck', 'LastName'], axis='columns')

train_data.head()

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,HomePlanet_Earth,...,CabinDeck_col_0,CabinDeck_col_1,CabinDeck_col_2,CabinDeck_col_3,CabinDeck_col_4,LastName_col_0,LastName_col_1,LastName_col_2,LastName_col_3,LastName_col_4
0,0,39.0,0,0.0,0.0,0.0,0.0,0.0,0,0,...,0,0,0,1,0,0,0,1,0,0
1,0,24.0,0,109.0,9.0,25.0,549.0,44.0,1,1,...,1,0,0,0,0,0,0,0,0,1
2,0,58.0,1,43.0,3576.0,0.0,6715.0,49.0,0,0,...,1,0,0,0,0,0,0,0,0,1
3,0,33.0,0,0.0,1283.0,371.0,3329.0,193.0,0,0,...,1,0,0,0,0,0,0,0,0,1
4,0,16.0,0,303.0,70.0,151.0,565.0,2.0,1,1,...,1,0,0,0,0,1,0,0,0,0


# Some data exploration

In [33]:
train_data.columns

Index(['CryoSleep', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall',
       'Spa', 'VRDeck', 'Transported', 'HomePlanet_Earth', 'HomePlanet_Europa',
       'HomePlanet_Mars', 'Destination_55 Cancri e',
       'Destination_PSO J318.5-22', 'Destination_TRAPPIST-1e',
       'HomePlanet_Earth', 'HomePlanet_Europa', 'HomePlanet_Mars',
       'Destination_55 Cancri e', 'Destination_PSO J318.5-22',
       'Destination_TRAPPIST-1e', 'HomePlanet_Earth', 'HomePlanet_Europa',
       'HomePlanet_Mars', 'Destination_55 Cancri e',
       'Destination_PSO J318.5-22', 'Destination_TRAPPIST-1e', 'CabinNum',
       'HomePlanet_Earth', 'HomePlanet_Europa', 'HomePlanet_Mars',
       'Destination_55 Cancri e', 'Destination_PSO J318.5-22',
       'Destination_TRAPPIST-1e', 'CabinSide_P', 'CabinSide_S',
       'PassengerGroup_col_0', 'PassengerGroup_col_1', 'PassengerGroup_col_2',
       'PassengerGroup_col_3', 'PassengerGroup_col_4', 'CabinDeck_col_0',
       'CabinDeck_col_1', 'CabinDeck_col_2', '

In [43]:
train_data.LastName_col_3.value_counts()

0    7050
1    1643
Name: LastName_col_3, dtype: int64

In [35]:
original = pd.read_csv('train.csv')

In [40]:
original['LastName'] = original['Name'].str.split(' ').str[1]

In [58]:
original['LastName'].value_counts().head(60)

Casonston      18
Oneiles        16
Domington      15
Litthews       15
Browlerson     14
Garnes         14
Cartez         14
Fulloydez      14
Hinglendez     13
Distured       13
Briggston      13
Barbes         13
Moodman        13
Gibbsonton     13
Vandan         12
Fowlesterez    12
Loway          12
Vloaf          12
Gainney        12
Hanner         12
Georgasey      12
Wagnerray      11
Cofferson      11
Jenson         11
Gouldensen     11
Morelly        11
Fryersonis     11
Ingston        11
Mosteraked     11
Torrez         11
Buckentry      11
Chanan         11
Crité          11
Pirejus        11
Villenson      11
Ousious        11
Clugete        11
Kiling         11
Ainserfle      11
Coolerson      11
Fla            11
Clemondsey     11
Belley         11
Yorkland       11
Guerson        11
Willy          11
Haydenzier     10
Mclardson      10
Wilsoney       10
Minen          10
Navages        10
Wartyson       10
Sageng         10
Sysilstict     10
Josey          10
Braymon   

In [47]:
encoder = HashingEncoder(cols='LastName',n_components=5)
LastNameDummies = pd.DataFrame(encoder.fit_transform(original['LastName']))
LastNameDummies = LastNameDummies.add_prefix('LastName_')
original = pd.concat([original, LastNameDummies], axis='columns')

# Baseline Lgbm model training

In [18]:
train_data['CabinNum']=train_data['CabinNum'].astype(str).astype(int)

In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split
import lightgbm as lgb

In [20]:
X = train_data.drop('Transported', axis= 1)
y = train_data.Transported

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [22]:
clf = lgb.LGBMClassifier()
clf.fit(X_train, y_train)


In [23]:
y_pred=clf.predict(X_test)

In [24]:
from sklearn.metrics import confusion_matrix

In [26]:
confusion_matrix(y_test,y_pred)

array([[1108,  316],
       [ 249, 1196]], dtype=int64)

In [120]:
(1108+1196)/(1065+1132+256+285)

0.8414901387874361

In [28]:
y_pred_train=clf.predict(X_train)
confusion_matrix(y_train,y_pred_train)

array([[2586,  305],
       [ 252, 2681]], dtype=int64)

In [30]:
(2586+2681)/(2586+2681+252+305)

0.9043612637362637

### Hyperparameter tuning

In [154]:
from sklearn.model_selection import GridSearchCV

In [48]:
# Define the model and the parameters to be optimized
model = lgb.LGBMClassifier()
params = {'boosting_type': ['gbdt', 'dart', 'goss'],
          'num_leaves': [31, 63, 127],
          'learning_rate': [0.1, 0.05, 0.01, 0.005, 0.001],
          'n_estimators': [100, 200, 300, 500, 1000]}

# Conduct the grid search
grid = GridSearchCV(model, param_grid=params, cv=5)
grid.fit(X, y)

# Print the best parameters
print("Best parameters: ", grid.best_params_)

# Evaluate the model on the test set
test_score = grid.score(X, y)
print("Test score: ", test_score)

KeyboardInterrupt: 

### training with all the data

In [31]:
lgbm = lgb.LGBMClassifier(boosting_type ='gbdt', learning_rate = 0.005, n_estimators = 500, num_leaves = 31)
lgbm.fit(X, y)

### Training with categorical variable option from lightgbm and all the data

In [157]:
# Create a LightGBM Dataset
lgb_dataset = lgb.Dataset(train_data.drop(columns=['Transported']), label=train_data['Transported'], categorical_feature=cat_vars)
# Train a LightGBM model
model = lgb.train({'learning_rate': 0.005, 'n_estimators': 500, 'num_leaves': 31,'boosting_type': 'gbdt', 'objective': 'binary'},lgb_dataset, num_boost_round=100)



In [156]:
#Prediction making example
y_pred=model.predict(X_test)
y_pred = [round(p) for p in y_pred]

# Testing the predictions

In [158]:
# Read data
train_data = pd.read_csv('test.csv')
passengerIdTest = train_data['PassengerId']
train_data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez


In [159]:
## Missing value
cat_col = ['HomePlanet','CryoSleep', 'Destination', 'VIP']
num_col = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

# Using Simple Imputer to deal with missing value of categorical variables
imputer = SimpleImputer(strategy="most_frequent")
imputer.fit(train_data[cat_col])
train_data[cat_col] = imputer.transform(train_data[cat_col])

# Using KNN Imputer to deal with missing value of numerical variables
imputer = KNNImputer(n_neighbors=7)
imputer.fit(train_data[num_col])
train_data[num_col] = imputer.transform(train_data[num_col])

# Remove missing value of cabin and name (because can not fill those missing value)
#train_data = train_data.dropna(axis='index')

train_data.shape

(4277, 13)

In [160]:
# Create a column "PassengerGroup" from "PassengerId" 
train_data['PassengerGroup'] = train_data['PassengerId'].str.split('_').str[0]

# Create column "LastName" from "Name"
train_data['LastName'] = train_data['Name'].str.split(' ').str[1]

# Split column "Cabin" into 3 columns "CabinDeck", "CabinNum", "CabinSide"
train_data[['CabinDeck', 'CabinNum', 'CabinSide']] = train_data.Cabin.str.split('/', expand = True)

# Drop 3 columns "PassengerId", "Name" and "Cabin"
train_data = train_data.drop(['PassengerId', 'Name', 'Cabin'], axis='columns')

train_data.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,PassengerGroup,LastName,CabinDeck,CabinNum,CabinSide
0,Earth,True,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,13,Carsoning,G,3,S
1,Earth,False,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,18,Peckers,F,4,S
2,Europa,True,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,19,Unhearfus,C,0,S
3,Europa,False,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,21,Caltilter,C,1,S
4,Earth,False,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,23,Harperez,F,5,S


In [161]:
# Convert boolean to integer (0 = False, 1 = True)

train_data['CryoSleep'] = list(map(int, train_data['CryoSleep']))
train_data['VIP'] = list(map(int, train_data['VIP']))
#train_data['Transported'] = list(map(int, train_data['Transported']))
train_data.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,PassengerGroup,LastName,CabinDeck,CabinNum,CabinSide
0,Earth,1,TRAPPIST-1e,27.0,0,0.0,0.0,0.0,0.0,0.0,13,Carsoning,G,3,S
1,Earth,0,TRAPPIST-1e,19.0,0,0.0,9.0,0.0,2823.0,0.0,18,Peckers,F,4,S
2,Europa,1,55 Cancri e,31.0,0,0.0,0.0,0.0,0.0,0.0,19,Unhearfus,C,0,S
3,Europa,0,TRAPPIST-1e,38.0,0,0.0,6652.0,0.0,181.0,585.0,21,Caltilter,C,1,S
4,Earth,0,TRAPPIST-1e,20.0,0,10.0,0.0,635.0,0.0,0.0,23,Harperez,F,5,S


In [162]:
# Create dummy variables and drop original ones

HomePlanetDummies = pd.get_dummies(train_data['HomePlanet'], prefix='HomePlanet')
train_data = pd.concat([train_data, HomePlanetDummies], axis='columns')

DestinationDummies = pd.get_dummies(train_data['Destination'], prefix='Destination')
train_data = pd.concat([train_data, DestinationDummies], axis='columns')

CabinSideDummies = pd.get_dummies(train_data['CabinSide'], prefix='CabinSide')
train_data = pd.concat([train_data, CabinSideDummies], axis='columns')

train_data = train_data.drop(['HomePlanet', 'Destination', 'CabinSide'], axis='columns')

train_data.head()

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,PassengerGroup,LastName,CabinDeck,CabinNum,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,CabinSide_P,CabinSide_S
0,1,27.0,0,0.0,0.0,0.0,0.0,0.0,13,Carsoning,G,3,1,0,0,0,0,1,0,1
1,0,19.0,0,0.0,9.0,0.0,2823.0,0.0,18,Peckers,F,4,1,0,0,0,0,1,0,1
2,1,31.0,0,0.0,0.0,0.0,0.0,0.0,19,Unhearfus,C,0,0,1,0,1,0,0,0,1
3,0,38.0,0,0.0,6652.0,0.0,181.0,585.0,21,Caltilter,C,1,0,1,0,0,0,1,0,1
4,0,20.0,0,10.0,0.0,635.0,0.0,0.0,23,Harperez,F,5,1,0,0,0,0,1,0,1


In [163]:
# Using feature hashing to encode PassengerGroup, CabinDeck and LastName

encoder = HashingEncoder(cols='PassengerGroup',n_components=5)
PassengerGroupDummies = pd.DataFrame(encoder.fit_transform(train_data['PassengerGroup']))
PassengerGroupDummies = PassengerGroupDummies.add_prefix('PassengerGroup_')
train_data = pd.concat([train_data, PassengerGroupDummies], axis='columns')

encoder = HashingEncoder(cols='CabinDeck',n_components=5)
CabinDeckDummies = pd.DataFrame(encoder.fit_transform(train_data['CabinDeck']))
CabinDeckDummies = CabinDeckDummies.add_prefix('CabinDeck_')
train_data = pd.concat([train_data, CabinDeckDummies], axis='columns')

encoder = HashingEncoder(cols='LastName',n_components=5)
LastNameDummies = pd.DataFrame(encoder.fit_transform(train_data['LastName']))
LastNameDummies = LastNameDummies.add_prefix('LastName_')
train_data = pd.concat([train_data, LastNameDummies], axis='columns')

train_data = train_data.drop(['PassengerGroup', 'CabinDeck', 'LastName'], axis='columns')

train_data.head()

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,CabinNum,HomePlanet_Earth,...,CabinDeck_col_0,CabinDeck_col_1,CabinDeck_col_2,CabinDeck_col_3,CabinDeck_col_4,LastName_col_0,LastName_col_1,LastName_col_2,LastName_col_3,LastName_col_4
0,1,27.0,0,0.0,0.0,0.0,0.0,0.0,3,1,...,0,0,1,0,0,0,1,0,0,0
1,0,19.0,0,0.0,9.0,0.0,2823.0,0.0,4,1,...,1,0,0,0,0,1,0,0,0,0
2,1,31.0,0,0.0,0.0,0.0,0.0,0.0,0,0,...,1,0,0,0,0,0,0,0,0,1
3,0,38.0,0,0.0,6652.0,0.0,181.0,585.0,1,0,...,1,0,0,0,0,1,0,0,0,0
4,0,20.0,0,10.0,0.0,635.0,0.0,0.0,5,1,...,1,0,0,0,0,1,0,0,0,0


In [164]:
#alternative for dealing with CabinNum
train_data['CabinNum'] = train_data['CabinNum'].replace(np.nan, 0)
train_data['CabinNum']=train_data['CabinNum'].astype(str).astype(int)

In [165]:
train_data

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,CabinNum,HomePlanet_Earth,...,CabinDeck_col_0,CabinDeck_col_1,CabinDeck_col_2,CabinDeck_col_3,CabinDeck_col_4,LastName_col_0,LastName_col_1,LastName_col_2,LastName_col_3,LastName_col_4
0,1,27.000000,0,0.0,0.0,0.0,0.0,0.0,3,1,...,0,0,1,0,0,0,1,0,0,0
1,0,19.000000,0,0.0,9.0,0.0,2823.0,0.0,4,1,...,1,0,0,0,0,1,0,0,0,0
2,1,31.000000,0,0.0,0.0,0.0,0.0,0.0,0,0,...,1,0,0,0,0,0,0,0,0,1
3,0,38.000000,0,0.0,6652.0,0.0,181.0,585.0,1,0,...,1,0,0,0,0,1,0,0,0,0
4,0,20.000000,0,10.0,0.0,635.0,0.0,0.0,5,1,...,1,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,1,34.000000,0,0.0,0.0,0.0,0.0,0.0,1496,1,...,0,0,1,0,0,0,0,0,1,0
4273,0,42.000000,0,0.0,847.0,17.0,10.0,144.0,0,1,...,0,0,1,0,0,1,0,0,0,0
4274,1,13.571429,0,0.0,0.0,0.0,0.0,0.0,296,0,...,1,0,0,0,0,0,0,0,0,1
4275,0,28.285714,0,0.0,2680.0,0.0,0.0,523.0,297,0,...,1,0,0,0,0,1,0,0,0,0


# first atempt lightgbm testing

In [166]:
res = pd.DataFrame(
        {
            'PassengerId': list(passengerIdTest),
            'Transported': [(x == 1) for x in list(lgbm.predict(train_data))]
        }
    )
res.to_csv('prediction.csv', index=False)

In [167]:
passengerIdTest

0       0013_01
1       0018_01
2       0019_01
3       0021_01
4       0023_01
         ...   
4272    9266_02
4273    9269_01
4274    9271_01
4275    9273_01
4276    9277_01
Name: PassengerId, Length: 4277, dtype: object

In [168]:
train_data

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,CabinNum,HomePlanet_Earth,...,CabinDeck_col_0,CabinDeck_col_1,CabinDeck_col_2,CabinDeck_col_3,CabinDeck_col_4,LastName_col_0,LastName_col_1,LastName_col_2,LastName_col_3,LastName_col_4
0,1,27.000000,0,0.0,0.0,0.0,0.0,0.0,3,1,...,0,0,1,0,0,0,1,0,0,0
1,0,19.000000,0,0.0,9.0,0.0,2823.0,0.0,4,1,...,1,0,0,0,0,1,0,0,0,0
2,1,31.000000,0,0.0,0.0,0.0,0.0,0.0,0,0,...,1,0,0,0,0,0,0,0,0,1
3,0,38.000000,0,0.0,6652.0,0.0,181.0,585.0,1,0,...,1,0,0,0,0,1,0,0,0,0
4,0,20.000000,0,10.0,0.0,635.0,0.0,0.0,5,1,...,1,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,1,34.000000,0,0.0,0.0,0.0,0.0,0.0,1496,1,...,0,0,1,0,0,0,0,0,1,0
4273,0,42.000000,0,0.0,847.0,17.0,10.0,144.0,0,1,...,0,0,1,0,0,1,0,0,0,0
4274,1,13.571429,0,0.0,0.0,0.0,0.0,0.0,296,0,...,1,0,0,0,0,0,0,0,0,1
4275,0,28.285714,0,0.0,2680.0,0.0,0.0,523.0,297,0,...,1,0,0,0,0,1,0,0,0,0


# Testing using categorical variaables directly on Lightgmb

### With the Le approach

In [169]:
y_pred=model.predict(train_data)
y_pred = [round(p) for p in y_pred]

In [171]:
res = pd.DataFrame(
        {
            'PassengerId': list(passengerIdTest),
            'Transported': [(x == 1) for x in y_pred]
        }
    )
res.to_csv('prediction.csv', index=False)

### with the Direct aproach

In [193]:
# Import library
#!pip install --upgrade category_encoders
from sklearn.impute import KNNImputer, SimpleImputer
from category_encoders import HashingEncoder
from sklearn import preprocessing

import pandas as pd
import numpy as np

In [194]:
# Read data
train_data = pd.read_csv('train.csv')
train_data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [195]:
## Missing value
cat_col = ['HomePlanet','CryoSleep', 'Destination', 'VIP']
num_col = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

# Using Simple Imputer to deal with missing value of categorical variables
imputer = SimpleImputer(strategy="most_frequent")
imputer.fit(train_data[cat_col])
train_data[cat_col] = imputer.transform(train_data[cat_col])

# Using KNN Imputer to deal with missing value of numerical variables
imputer = KNNImputer(n_neighbors=7)
imputer.fit(train_data[num_col])
train_data[num_col] = imputer.transform(train_data[num_col])

# Remove missing value of cabin and name (because can not fill those missing value)
#train_data = train_data.dropna(axis='index')

train_data.shape


(8693, 14)

In [196]:
# Create a column "PassengerGroup" from "PassengerId" 
train_data['PassengerGroup'] = train_data['PassengerId'].str.split('_').str[0]

# Create column "LastName" from "Name"
train_data['LastName'] = train_data['Name'].str.split(' ').str[1]

# Split column "Cabin" into 3 columns "CabinDeck", "CabinNum", "CabinSide"
train_data[['CabinDeck', 'CabinNum', 'CabinSide']] = train_data.Cabin.str.split('/', expand = True)

# Drop 3 columns "PassengerId", "Name" and "Cabin"
train_data = train_data.drop(['PassengerId', 'Name', 'Cabin'], axis='columns')

train_data.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,PassengerGroup,LastName,CabinDeck,CabinNum,CabinSide
0,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False,1,Ofracculy,B,0,P
1,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True,2,Vines,F,0,S
2,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False,3,Susent,A,0,S
3,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False,3,Susent,A,0,S
4,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True,4,Santantines,F,1,S


In [197]:
# Convert boolean to integer (0 = False, 1 = True)

train_data['CryoSleep'] = list(map(int, train_data['CryoSleep']))
train_data['VIP'] = list(map(int, train_data['VIP']))
train_data['Transported'] = list(map(int, train_data['Transported']))
train_data.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,PassengerGroup,LastName,CabinDeck,CabinNum,CabinSide
0,Europa,0,TRAPPIST-1e,39.0,0,0.0,0.0,0.0,0.0,0.0,0,1,Ofracculy,B,0,P
1,Earth,0,TRAPPIST-1e,24.0,0,109.0,9.0,25.0,549.0,44.0,1,2,Vines,F,0,S
2,Europa,0,TRAPPIST-1e,58.0,1,43.0,3576.0,0.0,6715.0,49.0,0,3,Susent,A,0,S
3,Europa,0,TRAPPIST-1e,33.0,0,0.0,1283.0,371.0,3329.0,193.0,0,3,Susent,A,0,S
4,Earth,0,TRAPPIST-1e,16.0,0,303.0,70.0,151.0,565.0,2.0,1,4,Santantines,F,1,S


In [198]:
# Create dummy variables and drop original ones

HomePlanetDummies = pd.get_dummies(train_data['HomePlanet'], prefix='HomePlanet')
train_data = pd.concat([train_data, HomePlanetDummies], axis='columns')

DestinationDummies = pd.get_dummies(train_data['Destination'], prefix='Destination')
train_data = pd.concat([train_data, DestinationDummies], axis='columns')

CabinSideDummies = pd.get_dummies(train_data['CabinSide'], prefix='CabinSide')
train_data = pd.concat([train_data, CabinSideDummies], axis='columns')

train_data = train_data.drop(['HomePlanet', 'Destination', 'CabinSide'], axis='columns')

train_data.head()

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,PassengerGroup,...,CabinDeck,CabinNum,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,CabinSide_P,CabinSide_S
0,0,39.0,0,0.0,0.0,0.0,0.0,0.0,0,1,...,B,0,0,1,0,0,0,1,1,0
1,0,24.0,0,109.0,9.0,25.0,549.0,44.0,1,2,...,F,0,1,0,0,0,0,1,0,1
2,0,58.0,1,43.0,3576.0,0.0,6715.0,49.0,0,3,...,A,0,0,1,0,0,0,1,0,1
3,0,33.0,0,0.0,1283.0,371.0,3329.0,193.0,0,3,...,A,0,0,1,0,0,0,1,0,1
4,0,16.0,0,303.0,70.0,151.0,565.0,2.0,1,4,...,F,1,1,0,0,0,0,1,0,1


In [199]:
#alternative for dealing with CabinNum
train_data['CabinNum'] = train_data['CabinNum'].replace(np.nan, 0)
train_data['CabinNum']=train_data['CabinNum'].astype(str).astype(int)

In [200]:
# Using feature hashing to encode PassengerGroup, CabinDeck and LastName

encoder = HashingEncoder(cols='PassengerGroup',n_components=5)
PassengerGroupDummies = pd.DataFrame(encoder.fit_transform(train_data['PassengerGroup']))
PassengerGroupDummies = PassengerGroupDummies.add_prefix('PassengerGroup_')
train_data = pd.concat([train_data, PassengerGroupDummies], axis='columns')

encoder = HashingEncoder(cols='CabinDeck',n_components=5)
CabinDeckDummies = pd.DataFrame(encoder.fit_transform(train_data['CabinDeck']))
CabinDeckDummies = CabinDeckDummies.add_prefix('CabinDeck_')
train_data = pd.concat([train_data, CabinDeckDummies], axis='columns')

encoder = HashingEncoder(cols='LastName',n_components=10)
LastNameDummies = pd.DataFrame(encoder.fit_transform(train_data['LastName']))
LastNameDummies = LastNameDummies.add_prefix('LastName_')
train_data = pd.concat([train_data, LastNameDummies], axis='columns')

#encoder = preprocessing.LabelEncoder()
#LastNameDummies = pd.DataFrame(encoder.fit_transform(train_data['LastName']))
#LastNameDummies = LastNameDummies.add_prefix('LastName_')
#train_data = pd.concat([train_data, LastNameDummies], axis='columns')


train_data = train_data.drop(['PassengerGroup', 'CabinDeck','LastName'], axis='columns')

train_data.head()

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,CabinNum,...,LastName_col_0,LastName_col_1,LastName_col_2,LastName_col_3,LastName_col_4,LastName_col_5,LastName_col_6,LastName_col_7,LastName_col_8,LastName_col_9
0,0,39.0,0,0.0,0.0,0.0,0.0,0.0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0,24.0,0,109.0,9.0,25.0,549.0,44.0,1,0,...,0,0,0,0,0,0,0,0,0,1
2,0,58.0,1,43.0,3576.0,0.0,6715.0,49.0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,33.0,0,0.0,1283.0,371.0,3329.0,193.0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,16.0,0,303.0,70.0,151.0,565.0,2.0,1,1,...,0,0,0,0,0,1,0,0,0,0


In [202]:
# Create a LightGBM Dataset
#train_data = train_data.drop(['LastName'], axis='columns')
cat_vars=['CryoSleep', 'VIP', 'VRDeck', 'HomePlanet_Earth', 'HomePlanet_Europa',
       'HomePlanet_Mars', 'Destination_55 Cancri e',
       'Destination_PSO J318.5-22', 'Destination_TRAPPIST-1e',
       'CabinNum', 'HomePlanet_Earth', 'HomePlanet_Europa', 'HomePlanet_Mars',
       'Destination_55 Cancri e', 'Destination_PSO J318.5-22',
       'Destination_TRAPPIST-1e', 'CabinSide_P', 'CabinSide_S',
       'PassengerGroup_col_0', 'PassengerGroup_col_1', 'PassengerGroup_col_2',
       'PassengerGroup_col_3', 'PassengerGroup_col_4', 'CabinDeck_col_0',
       'CabinDeck_col_1', 'CabinDeck_col_2', 'CabinDeck_col_3',
       'CabinDeck_col_4','LastName_col_1', 'LastName_col_2', 'LastName_col_3', 'LastName_col_4',
       'LastName_col_5', 'LastName_col_6', 'LastName_col_7', 'LastName_col_8',
       'LastName_col_9']
lgb_dataset = lgb.Dataset(train_data.drop(columns=['Transported']), label=train_data['Transported'], categorical_feature=cat_vars)
# Train a LightGBM model
model = lgb.train({'learning_rate': 0.005, 'n_estimators': 500, 'num_leaves': 31,'boosting_type': 'gbdt', 'objective': 'binary'},lgb_dataset, num_boost_round=100)



In [201]:
train_data.columns

Index(['CryoSleep', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall',
       'Spa', 'VRDeck', 'Transported', 'CabinNum', 'HomePlanet_Earth',
       'HomePlanet_Europa', 'HomePlanet_Mars', 'Destination_55 Cancri e',
       'Destination_PSO J318.5-22', 'Destination_TRAPPIST-1e', 'CabinSide_P',
       'CabinSide_S', 'PassengerGroup_col_0', 'PassengerGroup_col_1',
       'PassengerGroup_col_2', 'PassengerGroup_col_3', 'PassengerGroup_col_4',
       'CabinDeck_col_0', 'CabinDeck_col_1', 'CabinDeck_col_2',
       'CabinDeck_col_3', 'CabinDeck_col_4', 'LastName_col_0',
       'LastName_col_1', 'LastName_col_2', 'LastName_col_3', 'LastName_col_4',
       'LastName_col_5', 'LastName_col_6', 'LastName_col_7', 'LastName_col_8',
       'LastName_col_9'],
      dtype='object')

In [176]:
train_data['Transported']

0       0
1       1
2       0
3       0
4       1
       ..
8688    0
8689    0
8690    1
8691    0
8692    1
Name: Transported, Length: 8693, dtype: int64

### Over test 

In [203]:
# Read data
train_data = pd.read_csv('test.csv')
passengerIdTest = train_data['PassengerId']
train_data.head()

## Missing value
cat_col = ['HomePlanet','CryoSleep', 'Destination', 'VIP']
num_col = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

# Using Simple Imputer to deal with missing value of categorical variables
imputer = SimpleImputer(strategy="most_frequent")
imputer.fit(train_data[cat_col])
train_data[cat_col] = imputer.transform(train_data[cat_col])

# Using KNN Imputer to deal with missing value of numerical variables
imputer = KNNImputer(n_neighbors=7)
imputer.fit(train_data[num_col])
train_data[num_col] = imputer.transform(train_data[num_col])

# Remove missing value of cabin and name (because can not fill those missing value)
#train_data = train_data.dropna(axis='index')

# Create a column "PassengerGroup" from "PassengerId" 
train_data['PassengerGroup'] = train_data['PassengerId'].str.split('_').str[0]

# Create column "LastName" from "Name"
train_data['LastName'] = train_data['Name'].str.split(' ').str[1]

# Split column "Cabin" into 3 columns "CabinDeck", "CabinNum", "CabinSide"
train_data[['CabinDeck', 'CabinNum', 'CabinSide']] = train_data.Cabin.str.split('/', expand = True)

# Drop 3 columns "PassengerId", "Name" and "Cabin"
train_data = train_data.drop(['PassengerId', 'Name', 'Cabin'], axis='columns')

# Convert boolean to integer (0 = False, 1 = True)

train_data['CryoSleep'] = list(map(int, train_data['CryoSleep']))
train_data['VIP'] = list(map(int, train_data['VIP']))
#train_data['Transported'] = list(map(int, train_data['Transported']))

# Create dummy variables and drop original ones

HomePlanetDummies = pd.get_dummies(train_data['HomePlanet'], prefix='HomePlanet')
train_data = pd.concat([train_data, HomePlanetDummies], axis='columns')

DestinationDummies = pd.get_dummies(train_data['Destination'], prefix='Destination')
train_data = pd.concat([train_data, DestinationDummies], axis='columns')

CabinSideDummies = pd.get_dummies(train_data['CabinSide'], prefix='CabinSide')
train_data = pd.concat([train_data, CabinSideDummies], axis='columns')

train_data = train_data.drop(['HomePlanet', 'Destination', 'CabinSide'], axis='columns')


#alternative for dealing with CabinNum
train_data['CabinNum'] = train_data['CabinNum'].replace(np.nan, 0)
train_data['CabinNum']=train_data['CabinNum'].astype(str).astype(int)

# Using feature hashing to encode PassengerGroup, CabinDeck and LastName

encoder = HashingEncoder(cols='PassengerGroup',n_components=5)
PassengerGroupDummies = pd.DataFrame(encoder.fit_transform(train_data['PassengerGroup']))
PassengerGroupDummies = PassengerGroupDummies.add_prefix('PassengerGroup_')
train_data = pd.concat([train_data, PassengerGroupDummies], axis='columns')

encoder = HashingEncoder(cols='CabinDeck',n_components=5)
CabinDeckDummies = pd.DataFrame(encoder.fit_transform(train_data['CabinDeck']))
CabinDeckDummies = CabinDeckDummies.add_prefix('CabinDeck_')
train_data = pd.concat([train_data, CabinDeckDummies], axis='columns')

encoder = HashingEncoder(cols='LastName',n_components=10)
LastNameDummies = pd.DataFrame(encoder.fit_transform(train_data['LastName']))
LastNameDummies = LastNameDummies.add_prefix('LastName_')
train_data = pd.concat([train_data, LastNameDummies], axis='columns')

#encoder = preprocessing.LabelEncoder()
#LastNameDummies = pd.DataFrame(encoder.fit_transform(train_data['LastName']))
#LastNameDummies = LastNameDummies.add_prefix('LastName_')
#train_data = pd.concat([train_data, LastNameDummies], axis='columns')

train_data = train_data.drop(['PassengerGroup', 'CabinDeck','LastName'], axis='columns')

train_data.head()


Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,CabinNum,HomePlanet_Earth,...,LastName_col_0,LastName_col_1,LastName_col_2,LastName_col_3,LastName_col_4,LastName_col_5,LastName_col_6,LastName_col_7,LastName_col_8,LastName_col_9
0,1,27.0,0,0.0,0.0,0.0,0.0,0.0,3,1,...,0,1,0,0,0,0,0,0,0,0
1,0,19.0,0,0.0,9.0,0.0,2823.0,0.0,4,1,...,1,0,0,0,0,0,0,0,0,0
2,1,31.0,0,0.0,0.0,0.0,0.0,0.0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0,38.0,0,0.0,6652.0,0.0,181.0,585.0,1,0,...,0,0,0,0,0,1,0,0,0,0
4,0,20.0,0,10.0,0.0,635.0,0.0,0.0,5,1,...,1,0,0,0,0,0,0,0,0,0


In [204]:
y_pred=model.predict(train_data)
y_pred = [round(p) for p in y_pred]

In [205]:
res = pd.DataFrame(
        {
            'PassengerId': list(passengerIdTest),
            'Transported': [(x == 1) for x in y_pred]
        }
    )
res.to_csv('prediction.csv', index=False)

In [157]:
train_data.columns

Index(['CryoSleep', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall',
       'Spa', 'VRDeck', 'CabinNum', 'HomePlanet_Earth', 'HomePlanet_Europa',
       'HomePlanet_Mars', 'Destination_55 Cancri e',
       'Destination_PSO J318.5-22', 'Destination_TRAPPIST-1e', 'CabinSide_P',
       'CabinSide_S', 'PassengerGroup_col_0', 'PassengerGroup_col_1',
       'PassengerGroup_col_2', 'PassengerGroup_col_3', 'PassengerGroup_col_4',
       'CabinDeck_col_0', 'CabinDeck_col_1', 'CabinDeck_col_2',
       'CabinDeck_col_3', 'CabinDeck_col_4'],
      dtype='object')

In [None]:
'CryoSleep', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall',
       'Spa', 'VRDeck', 'Transported', 'CabinNum', 'HomePlanet_Earth',
       'HomePlanet_Europa', 'HomePlanet_Mars', 'Destination_55 Cancri e',
       'Destination_PSO J318.5-22', 'Destination_TRAPPIST-1e', 'CabinSide_P',
       'CabinSide_S', 'PassengerGroup_col_0', 'PassengerGroup_col_1',
       'PassengerGroup_col_2', 'PassengerGroup_col_3', 'PassengerGroup_col_4',
       'CabinDeck_col_0', 'CabinDeck_col_1', 'CabinDeck_col_2',
       'CabinDeck_col_3', 'CabinDeck_col_4']