In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
# Read the competition's training split and test split into DataFrames.
train, test = map(
    pd.read_csv,
    (
        "../input/spaceship-titanic/train.csv",
        "../input/spaceship-titanic/test.csv",
    ),
)

In [3]:
# Preview the first rows of the training data.
# (train.info() and train.columns are alternative quick looks at dtypes / column names.)
train.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [4]:
# Count the missing values in each column of the training set.
train.isna().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

Take care of missing values first: numeric columns can be imputed with the mean, median, or mode, while categorical columns can instead be filled with a placeholder such as "NONE". (Most later steps cannot handle NaNs, so imputation has to come before anything else.)

In [5]:
# Detach the target column: `pop` returns it as a Series and removes it
# from `train` in one step, leaving only the feature columns behind.
y = train.pop("Transported")

In [6]:
# Column names treated as numeric; every other column is handled as categorical.
# (These lists could also be derived from the frame's dtypes with a comprehension.)
num = [
    'Age',
    'RoomService',
    'FoodCourt',
    'ShoppingMall',
    'Spa',
    'VRDeck',
    'Decks',
]

# Column names treated as categorical.
cat = [
    'HomePlanet',
    'CryoSleep',
    'Cabin',
    'Destination',
    'VIP',
    'Surname',
]

In [7]:
# List the columns currently present in the training frame.
train.columns

Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age',
       'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'Name'],
      dtype='object')

Handle NaN values

In [8]:
# Impute missing values column by column.
#   * numeric columns (those listed in `num`) -> most frequent training value (mode)
#   * every other column                      -> the placeholder string "NONE"
# The fill value is always derived from `train` and reused for `test`, so both
# splits go through exactly the same transformation.
for col in train.columns:
    if col in num:
        fill_value = train[col].mode()[0]
    else:
        fill_value = "NONE"

    # Assign the result back instead of calling `df[col].fillna(..., inplace=True)`:
    # the chained in-place form operates on a temporary under pandas Copy-on-Write
    # and would leave the DataFrame unchanged.
    train[col] = train[col].fillna(fill_value)
    if col in test.columns:  # tolerate columns that exist only in train
        test[col] = test[col].fillna(fill_value)

Now we check for missing values once more

In [9]:
# Re-count the missing values per column of the training set.
train.isna().sum()

PassengerId     0
HomePlanet      0
CryoSleep       0
Cabin           0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
Name            0
dtype: int64

do some feature engineering

In [10]:
def add_id_and_cabin_features(df):
    """Derive ``Decks`` and ``cabin_letters``, then drop the raw text columns.

    New columns
    -----------
    Decks : int64
        The first four characters of ``PassengerId`` converted to an integer
        (e.g. "0003_01" -> 3).
    cabin_letters : str
        The part of ``Cabin`` before the first "/" (e.g. "B/0/P" -> "B").
        A value without "/" (such as the "NONE" placeholder) is kept whole.

    ``Name`` and ``Cabin`` are removed afterwards.

    Returns a new DataFrame; the frame passed in is not modified.
    """
    # A "Surname" feature (last word of Name) is currently disabled.
    return (
        df.assign(
            Decks=df["PassengerId"].str[:4].astype("int64"),
            cabin_letters=df["Cabin"].str.split("/").str[0],
        )
        .drop(columns=["Name", "Cabin"])
    )


# Apply the identical transformation to both splits so their columns stay in step.
train = add_id_and_cabin_features(train)
test = add_id_and_cabin_features(test)

Separate the data into numeric and categorical columns to understand it better

In [11]:
#num = []
#cat = []

One-hot encoding, normalization and scaling

In [12]:
# Columns fed to the model (PassengerId is deliberately left out).
features = ['HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP',
       'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'Decks', 'cabin_letters']

# One-hot encode the non-numeric columns of the training features.
x = pd.get_dummies(train[features])

# Encode the test split, then force it onto exactly the training columns
# (same set, same order). A category that occurs in only one split would
# otherwise give the two frames different columns, and the scaler/model
# fitted on `x` would then fail or silently read misaligned features.
# Dummies missing from test are filled with 0; dummies unseen in train are dropped.
test_encoded = pd.get_dummies(test[features]).reindex(columns=x.columns, fill_value=0)

In [13]:
# Preview the first rows of the one-hot encoded training features.
x.head()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Decks,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,...,VIP_NONE,cabin_letters_A,cabin_letters_B,cabin_letters_C,cabin_letters_D,cabin_letters_E,cabin_letters_F,cabin_letters_G,cabin_letters_NONE,cabin_letters_T
0,39.0,0.0,0.0,0.0,0.0,0.0,1,False,True,False,...,False,False,True,False,False,False,False,False,False,False
1,24.0,109.0,9.0,25.0,549.0,44.0,2,True,False,False,...,False,False,False,False,False,False,True,False,False,False
2,58.0,43.0,3576.0,0.0,6715.0,49.0,3,False,True,False,...,False,True,False,False,False,False,False,False,False,False
3,33.0,0.0,1283.0,371.0,3329.0,193.0,3,False,True,False,...,False,True,False,False,False,False,False,False,False,False
4,16.0,303.0,70.0,151.0,565.0,2.0,4,True,False,False,...,False,False,False,False,False,False,True,False,False,False


In [14]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training features only,
# then apply that same fitted scaler to both splits.
ss = StandardScaler().fit(x)

x_scaled = ss.transform(x)
test_scaled = ss.transform(test_encoded)

In [15]:
#X_train, X_test, y_train, y_test = train_test_split(x_scaled, y, test_size=0.2, random_state=4)
#first test with simple algos and metrics tailored to algo type before cross-validation (at least typically)

In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV

# Hyper-parameter grid explored exhaustively by the search below.
rf_params = dict(
    bootstrap=[True, False],
    max_depth=[10, None],
    min_samples_leaf=[1, 2, 4],
    min_samples_split=[2, 5, 10],
    n_estimators=list(range(5, 31, 5)),  # 5, 10, 15, 20, 25, 30
)

# Base estimator with a fixed seed so the search is repeatable.
rf1 = RandomForestClassifier(random_state=1)

# 5-fold cross-validated grid search using all available cores.
rf_gs = GridSearchCV(estimator=rf1, param_grid=rf_params, cv=5, n_jobs=-1)
rf_gs.fit(x_scaled, y)

# Report the best mean CV score and the parameter combination that achieved it.
print(rf_gs.best_score_)
print(rf_gs.best_params_)

0.7730396753289293
{'bootstrap': True, 'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 30}


We then simply set the model's parameters to the best parameters found by the grid search

In [17]:
# Random forest configured with the parameter values reported by the grid search.
rf = RandomForestClassifier(
    n_estimators=30,
    max_depth=10,
    min_samples_split=2,
    min_samples_leaf=4,
    bootstrap=True,
    random_state=1,
)

# 5-fold cross-validation with the estimator's default scoring:
# print the per-fold scores first, then their mean.
cv = cross_val_score(estimator=rf, X=x_scaled, y=y, cv=5)
print(cv)
print(cv.mean())

[0.72685451 0.77055779 0.78780909 0.81070196 0.76927503]
0.7730396753289293


In [18]:
# Train the final model on the full (scaled) training set.
rf.fit(x_scaled, y)

# Predict on the test features transformed by the SAME fitted scaler.
# The model learned its split thresholds on standardized values, so feeding it
# the raw `test_encoded` frame would compare unscaled numbers against them.
predictions = rf.predict(test_scaled)



In [19]:
# Start from the competition's sample submission so the output has the expected layout.
submission = pd.read_csv("../input/spaceship-titanic/sample_submission.csv")

# Predictions are assigned by position, so make sure the sample file lists the
# passengers in exactly the same order as the test split they were computed from.
assert submission["PassengerId"].tolist() == test["PassengerId"].tolist(), (
    "sample_submission rows are not aligned with the test set"
)

submission["Transported"] = predictions
submission.to_csv('submission.csv', index= False)
