# Spaceship Titanic
Predict which passengers are transported to an alternate dimension

Competition: https://www.kaggle.com/competitions/spaceship-titanic

Hugging Face: https://huggingface.co/spaces/alperugurcan/transported-prediction

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import KNNImputer

In [2]:
df_train=pd.read_csv('train.csv')

In [3]:
df_test=pd.read_csv('test.csv')

In [4]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [5]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4277 entries, 0 to 4276
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   4277 non-null   object 
 1   HomePlanet    4190 non-null   object 
 2   CryoSleep     4184 non-null   object 
 3   Cabin         4177 non-null   object 
 4   Destination   4185 non-null   object 
 5   Age           4186 non-null   float64
 6   VIP           4184 non-null   object 
 7   RoomService   4195 non-null   float64
 8   FoodCourt     4171 non-null   float64
 9   ShoppingMall  4179 non-null   float64
 10  Spa           4176 non-null   float64
 11  VRDeck        4197 non-null   float64
 12  Name          4183 non-null   object 
dtypes: float64(6), object(7)
memory usage: 434.5+ KB


In [6]:
df_test['Transported'] = False

In [7]:
df = pd.concat([df_train, df_test], sort=False)
df.drop(['Name'], axis=1, inplace=True)

In [8]:
df.shape[0]==df_train.shape[0]+df_test.shape[0]

True

In [9]:
df.isnull().sum()

Unnamed: 0,0
PassengerId,0
HomePlanet,288
CryoSleep,310
Cabin,299
Destination,274
Age,270
VIP,296
RoomService,263
FoodCourt,289
ShoppingMall,306


In [10]:
df[["deck", "num", "side"]] = df["Cabin"].str.split('/', expand=True)
df= df.drop(columns=['Cabin'])

In [11]:
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,deck,num,side
0,0001_01,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False,B,0,P
1,0002_01,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True,F,0,S
2,0003_01,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False,A,0,S
3,0003_02,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False,A,0,S
4,0004_01,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True,F,1,S


In [12]:
df['deck'] = df['deck'].fillna('U')
df['num'] = df['num'].fillna(-1)
df['side'] = df['side'].fillna('U')

In [13]:
df['HomePlanet'].value_counts()

Unnamed: 0_level_0,count
HomePlanet,Unnamed: 1_level_1
Earth,6865
Europa,3133
Mars,2684


In [14]:
df.deck.value_counts()

Unnamed: 0_level_0,count
deck,Unnamed: 1_level_1
F,4239
G,3781
E,1323
B,1141
C,1102
D,720
A,354
U,299
T,11


In [15]:
df['deck'] = df['deck'].map({'B': 0, 'F': 1, 'E': 2, 'A': 3, 'G': 4, 'D': 5, 'C': 6, 'U': 7})
df['side'] = df['side'].map({'U': -1,'P': 1, 'S': 2})

In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12970 entries, 0 to 4276
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   12970 non-null  object 
 1   HomePlanet    12682 non-null  object 
 2   CryoSleep     12660 non-null  object 
 3   Destination   12696 non-null  object 
 4   Age           12700 non-null  float64
 5   VIP           12674 non-null  object 
 6   RoomService   12707 non-null  float64
 7   FoodCourt     12681 non-null  float64
 8   ShoppingMall  12664 non-null  float64
 9   Spa           12686 non-null  float64
 10  VRDeck        12702 non-null  float64
 11  Transported   12970 non-null  bool   
 12  deck          12959 non-null  float64
 13  num           12970 non-null  object 
 14  side          12970 non-null  int64  
dtypes: bool(1), float64(7), int64(1), object(6)
memory usage: 1.5+ MB


In [17]:
impute_cols = ['Age', 'VIP', 'num', 'CryoSleep', 'side', 'deck', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
rest = list(set(df.columns) - set(impute_cols))
df_rest=df[rest]

In [18]:
imp = KNNImputer(n_neighbors=5)
df_imputed = imp.fit_transform(df[impute_cols])
df_imputed = pd.DataFrame(df_imputed, columns = impute_cols)
df = pd.concat([df_rest.reset_index(drop = True), df_imputed.reset_index(drop = True)], axis = 1)

In [19]:
df.isnull().sum()

Unnamed: 0,0
PassengerId,0
Destination,274
HomePlanet,288
Transported,0
Age,0
VIP,0
num,0
CryoSleep,0
side,0
deck,0


In [20]:
df['HomePlanet'] = df['HomePlanet'].fillna('U')
df['Destination'] = df['Destination'].fillna('U')
category_colls = ['HomePlanet', 'Destination']

for col in category_colls:
    df = pd.concat([df, pd.get_dummies(df[col], prefix = col)], axis = 1)

In [21]:
bill_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
df['amt_spent'] = df[bill_cols].sum(axis = 1)
df['std_amt_spent'] = df[bill_cols].std(axis = 1)
df['mean_amt_spent'] = df[bill_cols].mean(axis = 1)

df['3_high_cols'] = df['CryoSleep'] + df['HomePlanet_Europa'] + df['Destination_55 Cancri e']
df['3_low_cols'] = df['mean_amt_spent'] + df['amt_spent'] + df['HomePlanet_Earth']

In [22]:
df = df.drop(columns = category_colls)

In [23]:
df.corr()['Transported'].sort_values(ascending = False)

Unnamed: 0,Transported
Transported,1.0
CryoSleep,0.324501
3_high_cols,0.284204
HomePlanet_Europa,0.131977
Destination_55 Cancri e,0.083625
side,0.059872
FoodCourt,0.034897
deck,0.029526
PassengerId,0.014628
HomePlanet_U,0.006403


In [24]:
df_train, df_test = df[:df_train.shape[0]], df[df_train.shape[0]:]
df_test = df_test.drop(columns = 'Transported')
df_train.shape, df_test.shape

((8693, 26), (4277, 25))

In [25]:
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [26]:
X = df_train.drop(columns = 'Transported')
y = df_train['Transported']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

model_1 = LogisticRegression()
model_2 = DecisionTreeClassifier()
model_3 = RandomForestClassifier()
model_4 = XGBClassifier(enable_categorical=True)
model_5 = LGBMClassifier()

In [27]:
model_1.fit(X_train, y_train)
pred = model_1.predict(X_test)
accuracy_score(y_test, pred)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.7682576193214491

In [28]:
model_2.fit(X_train, y_train)
pred = model_2.predict(X_test)
accuracy_score(y_test, pred)

0.7406555491661875

In [29]:
model_3.fit(X_train, y_train)
pred = model_3.predict(X_test)
accuracy_score(y_test, pred)

0.7883841288096607

In [30]:
X_train['PassengerId'] = pd.to_numeric(X_train['PassengerId'], errors='ignore')

  X_train['PassengerId'] = pd.to_numeric(X_train['PassengerId'], errors='ignore')


In [31]:
X_train['PassengerId'] = X_train['PassengerId'].astype('category')
X_test['PassengerId'] = X_test['PassengerId'].astype('category')

In [32]:
model_4.fit(X_train, y_train)
pred = model_4.predict(X_test)
accuracy_score(y_test, pred)

0.7918343875790684

In [33]:
model_5.fit(X_train, y_train)
pred = model_5.predict(X_test)
accuracy_score(y_test, pred)

[LightGBM] [Info] Number of positive: 3500, number of negative: 3454
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001832 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2705
[LightGBM] [Info] Number of data points in the train set: 6954, number of used features: 25
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503307 -> initscore=0.013230
[LightGBM] [Info] Start training from score 0.013230


0.7947096032202415

In [34]:
df_dummy = pd.read_csv('test.csv')
pred = model_3.predict(df_test)

final = pd.DataFrame()
final['PassengerId'] = df_dummy['PassengerId']
final['Transported'] = pred

final.to_csv('submission.csv', index = False)

In [35]:
import joblib
joblib.dump(model_3, 'model_3.pkl')

['model_3.pkl']