# Spaceship Titanic | Kaggle
#### September 2022

In [1]:
import pandas as pd
import numpy as np
import datetime as dt

from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split

In [2]:
# Read in data from https://www.kaggle.com/competitions/spaceship-titanic/data
train = pd.read_csv('data/train.csv', header=0)
val = pd.read_csv('data/test.csv', header=0)

In [3]:
# Perform some simple cleaning of the data
# Age
train['Age'] = train['Age'].fillna(train['Age'].mean())
val['Age'] = val['Age'].fillna(train['Age'].mean())
# HomePlanet
train['HomePlanet'] = train['HomePlanet'].fillna('Missing')
val['HomePlanet'] = val['HomePlanet'].fillna('Missing')
# Name
train['Name'] = train['Name'].fillna('Missing Missing')
val['Name'] = val['Name'].fillna('Missing Missing')
# VIP
train['VIP'] = train['VIP'].fillna(False)
val['VIP'] = val['VIP'].fillna(False)
# Cabin
train['Cabin'] = train['Cabin'].fillna('X/X/X')
val['Cabin'] = val['Cabin'].fillna('X/X/X')
# Destination
train['Destination'] = train['Destination'].fillna('Missing')
val['Destination'] = val['Destination'].fillna('Missing')
# CryoSleep
train['CryoSleep'] = train['CryoSleep'].fillna(False)
val['CryoSleep'] = val['CryoSleep'].fillna(False)
# RoomService
train['RoomService'] = train['RoomService'].fillna(0)
val['RoomService'] = val['RoomService'].fillna(0)
# FoodCourt
train['FoodCourt'] = train['FoodCourt'].fillna(0)
val['FoodCourt'] = val['FoodCourt'].fillna(0)
# ShoppingMall
train['ShoppingMall'] = train['ShoppingMall'].fillna(0)
val['ShoppingMall'] = val['ShoppingMall'].fillna(0)
# Spa
train['Spa'] = train['Spa'].fillna(0)
val['Spa'] = val['Spa'].fillna(0)
# VRDeck
train['VRDeck'] = train['VRDeck'].fillna(0)
val['VRDeck'] = val['VRDeck'].fillna(0)

In [4]:
train.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [5]:
val.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez


In [6]:
# Create a last name column based on intuition. Maybe all members of the same family are transported together.
train['LastName'] = train['Name'].str.split().str[-1]
val['LastName'] = val['Name'].str.split().str[-1]

# Split Cabin to separate zones based on intuition.
train['Cabin_1'] = train['Cabin'].str[0]
train['Cabin_2'] = train['Cabin'].str[2]
train['Cabin_3'] = train['Cabin'].str[4]
val['Cabin_1'] = val['Cabin'].str[0]
val['Cabin_2'] = val['Cabin'].str[2]
val['Cabin_3'] = val['Cabin'].str[4]

In [7]:
train.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,LastName,Cabin_1,Cabin_2,Cabin_3
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,Ofracculy,B,0,P
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,Vines,F,0,S
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False,Susent,A,0,S
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False,Susent,A,0,S
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True,Santantines,F,1,S


In [8]:
# Create an indicator for the target (dependent variable).
train['actual'] = np.where(train['Transported'] == False, 0, 1)

In [9]:
# Split the training set to prevent overfitting
train, test = train_test_split(train, test_size=0.2, random_state=1234)

In [10]:
print('train shape: ', train.shape)
print('test shape:  ', test.shape)

train shape:  (6954, 19)
test shape:   (1739, 19)


In [11]:
# Check distribution of the target (dependent variable) to make sure they are similar in the train and test sets.
print('Train - Distribution of Actual')
print(train['actual'].describe())
print()
print('Test - Distribution of Actual')
print(test['actual'].describe())

Train - Distribution of Actual
count    6954.000000
mean        0.505752
std         0.500003
min         0.000000
25%         0.000000
50%         1.000000
75%         1.000000
max         1.000000
Name: actual, dtype: float64

Test - Distribution of Actual
count    1739.000000
mean        0.495112
std         0.500120
min         0.000000
25%         0.000000
50%         0.000000
75%         1.000000
max         1.000000
Name: actual, dtype: float64


In [12]:
# Specify dependent and independent variables for the train and test samples
keep_vars = ['PassengerId', 'Age', 'LastName', 'Destination', 'CryoSleep', 'HomePlanet', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Cabin', 'Cabin_1', 'Cabin_2', 'Cabin_3']

train_y = train['Transported']
train_x = train[keep_vars].set_index('PassengerId')

test_y = test['Transported']
test_x = test[keep_vars].set_index('PassengerId')

val = val[keep_vars].set_index('PassengerId')

In [13]:
# Check data types
train_x.dtypes

Age             float64
LastName         object
Destination      object
CryoSleep          bool
HomePlanet       object
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Cabin            object
Cabin_1          object
Cabin_2          object
Cabin_3          object
dtype: object

In [14]:
# Determine categorical features
cat_features = np.where(train_x.dtypes != np.float64)
cat_features

(array([ 1,  2,  3,  4, 10, 11, 12, 13]),)

In [15]:
# Create the CatBoost Pool and specify categorical features for the model
cat_features = cat_features[0]
train_pool = Pool(train_x, label=train_y, cat_features=cat_features)
test_pool = Pool(test_x, label=test_y, cat_features=cat_features)

  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,


In [16]:
# Initialize the Classification model
model = CatBoostClassifier()

In [17]:
# Fit the model to the training set. Use the test set as the eval set to prevent overfitting to the training set.
model.fit(X=train_x, y=train_y, eval_set=test_pool, cat_features=cat_features, verbose=100)

Learning rate set to 0.051161
0:	learn: 0.6691023	test: 0.6682777	best: 0.6682777 (0)	total: 69.1ms	remaining: 1m 9s
100:	learn: 0.4071985	test: 0.4077079	best: 0.4077079 (100)	total: 1.14s	remaining: 10.2s
200:	learn: 0.3843252	test: 0.3947186	best: 0.3947186 (200)	total: 2.38s	remaining: 9.45s
300:	learn: 0.3639598	test: 0.3908268	best: 0.3908218 (291)	total: 3.49s	remaining: 8.1s
400:	learn: 0.3475413	test: 0.3890225	best: 0.3890225 (400)	total: 4.53s	remaining: 6.77s
500:	learn: 0.3323729	test: 0.3883406	best: 0.3882150 (484)	total: 5.59s	remaining: 5.57s
600:	learn: 0.3180035	test: 0.3872854	best: 0.3867516 (584)	total: 6.71s	remaining: 4.46s
700:	learn: 0.3062196	test: 0.3883877	best: 0.3867516 (584)	total: 7.77s	remaining: 3.31s
800:	learn: 0.2939907	test: 0.3896319	best: 0.3867516 (584)	total: 8.88s	remaining: 2.21s
900:	learn: 0.2834070	test: 0.3899998	best: 0.3867516 (584)	total: 9.94s	remaining: 1.09s
999:	learn: 0.2726594	test: 0.3911279	best: 0.3867516 (584)	total: 11s	rem

<catboost.core.CatBoostClassifier at 0x7fca0a4bd6a0>

In [18]:
# Get feature importance
varimportance = model.get_feature_importance(train_pool, fstr_type='FeatureImportance')
variables = list(train_x)
variable_importance = pd.DataFrame({'feature':variables, 'importance':varimportance}).sort_values('importance', ascending=False)
variable_importance.head(50)

Unnamed: 0,feature,importance
8,Spa,13.021371
9,VRDeck,12.189834
4,HomePlanet,9.835337
3,CryoSleep,8.282294
5,RoomService,8.150411
11,Cabin_1,7.906831
6,FoodCourt,7.142044
0,Age,7.125806
12,Cabin_2,6.10838
1,LastName,5.779194


In [19]:
# Score the validation set for submission to Kaggle
submission = pd.DataFrame(model.predict(val), index=val.index)
submission.columns = ['Transported']
submission.head()

  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,


Unnamed: 0_level_0,Transported
PassengerId,Unnamed: 1_level_1
0013_01,True
0018_01,False
0019_01,True
0021_01,True
0023_01,True


In [20]:
# Print CSV of scores
filename = 'submissions/submission_'+dt.datetime.now().strftime("%Y%m%d-%H%M%S")
submission.to_csv(filename, header=True, index=True)