Replicating what was done in:
https://www.kaggle.com/code/samuelcortinhas/spaceship-titanic-a-complete-guide#Introduction

In [None]:
# Core
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(style='darkgrid', font_scale=1.4)
# from imblearn.over_sampling import SMOTE
import itertools
import warnings
warnings.filterwarnings('ignore')
# import plotly.express as px
import time

# Sklearn
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, precision_score, f1_score
# from sklearn.metrics import roc_auc_score, plot_confusion_matrix, plot_roc_curve, roc_curve
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.feature_selection import mutual_info_classif
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
# import eli5
# from eli5.sklearn import PermutationImportance
from sklearn.utils import resample

# Models
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# from xgboost import XGBClassifier
# from lightgbm import LGBMClassifier
# from catboost import CatBoostClassifier
from sklearn.naive_bayes import GaussianNB

In [82]:
train = pd.read_csv("./data/train.csv")
test = pd.read_csv("./data/test.csv")
print('Train set shape:', train.shape)
print('Test set shape:', test.shape)
train.head()

Train set shape: (8693, 14)
Test set shape: (4277, 13)


Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [80]:
print(train.isna().sum())
print(test.isna().sum())

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64
PassengerId       0
HomePlanet       87
CryoSleep        93
Cabin           100
Destination      92
Age              91
VIP              93
RoomService      82
FoodCourt       106
ShoppingMall     98
Spa             101
VRDeck           80
Name             94
dtype: int64


In [None]:
print(train.nunique())
print(train.dtypes)

In [None]:
plt.pie(train['Transported'].value_counts(),labels=["Transported","Not Transported"], autopct='%1.1f%%')
plt.show()

In [None]:
sns.histplot(data=train, x='Age', hue='Transported', binwidth=1)
plt.show()

Likelihood: 0-20 ~ Yes / 20-40 ~ No / 40 > ~ evenly --> children / adults / elderly 

In [None]:
fig=plt.figure(figsize=(10,20))
luxuries = ['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']
for i, luxury in enumerate(luxuries):
    ax=fig.add_subplot(5,2,2*i+1)
    sns.histplot(data=train, x=luxury, hue='Transported', bins=20)
    ax.set_title(luxury)
fig.tight_layout()
plt.show()

People that spend money tend to be transported / maybe separate into 2 --> spent or no spent  

In [None]:
fig=plt.figure(figsize=(10,20))
categories=['HomePlanet', 'CryoSleep', 'Destination', 'VIP']
for i, cat in enumerate(categories):
    ax=fig.add_subplot(4,1,i+1)
    sns.countplot(data=train, x=cat, axes=ax, hue='Transported')
fig.tight_layout()
plt.show()

In [83]:
train[['PassengerId','Cabin','Name']].head()

Unnamed: 0,PassengerId,Cabin,Name
0,0001_01,B/0/P,Maham Ofracculy
1,0002_01,F/0/S,Juanna Vines
2,0003_01,A/0/S,Altark Susent
3,0003_02,A/0/S,Solam Susent
4,0004_01,F/1/S,Willy Santantines


- We'll keep the group for the passenger (gggg__xx) 
- for the cabin the deck and side d_x_s
- we'll use the last name 

## Feature Engineering

- Age: transform into 3 classes: children, adult, elderly
- Luxuries: transform into 2 classes: either spent money or not
- VIP does not seem to help much --> maybe drop it
- Add group, deck, side, last_name columns
- Drop ID, Cabin, Name and Luxuries

In [77]:
def preprocess(data):
    #TODO: handle missing values
    data['Age'].apply(lambda x : 0 if 0 <= x <= 20 else (1 if 20 < x <= 40 else 2)) #TODO: not do it in place
    data['Spent'] = data[luxuries].sum(axis=1)
    data['Spent'].apply(lambda x : 1 if x > 0 else 0)
    data['Group'] = data['PassengerId'].apply(lambda x: x.split('_')[0]).astype(int)
    data['Deck'] = data['Cabin'].apply(lambda x: x.split('/')[0])
    data['Side'] = data['Cabin'].apply(lambda x: x.split('/')[2])
    data['LName'] = data['Name'].apply(lambda x: x.split(' ')[1])
    data = data.drop(['PassengerId','Cabin', 'Name'] + luxuries)
    return data

In [91]:
train.dtypes

PassengerId      object
HomePlanet       object
CryoSleep        object
Cabin            object
Destination      object
Age             float64
VIP              object
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Name             object
Transported        bool
dtype: object

In [81]:
preprocess(train)
train.head()

AttributeError: 'float' object has no attribute 'split'