In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/spaceship-titanic/sample_submission.csv
/kaggle/input/spaceship-titanic/train.csv
/kaggle/input/spaceship-titanic/test.csv


In [2]:
train = pd.read_csv("/kaggle/input/spaceship-titanic/train.csv")
submit = pd.read_csv("/kaggle/input/spaceship-titanic/test.csv")
train_labels = train["Transported"]

In [3]:
submission_id = submit.PassengerId
submission_id

0       0013_01
1       0018_01
2       0019_01
3       0021_01
4       0023_01
         ...   
4272    9266_02
4273    9269_01
4274    9271_01
4275    9273_01
4276    9277_01
Name: PassengerId, Length: 4277, dtype: object

# Exploration

In [4]:
train.shape

(8693, 14)

In [5]:
train.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [6]:
[(c, train.columns.get_loc(c)) for c in train.columns if c in train]

[('PassengerId', 0),
 ('HomePlanet', 1),
 ('CryoSleep', 2),
 ('Cabin', 3),
 ('Destination', 4),
 ('Age', 5),
 ('VIP', 6),
 ('RoomService', 7),
 ('FoodCourt', 8),
 ('ShoppingMall', 9),
 ('Spa', 10),
 ('VRDeck', 11),
 ('Name', 12),
 ('Transported', 13)]

In [7]:
train.describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,8514.0,8512.0,8510.0,8485.0,8510.0,8505.0
mean,28.82793,224.687617,458.077203,173.729169,311.138778,304.854791
std,14.489021,666.717663,1611.48924,604.696458,1136.705535,1145.717189
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0
75%,38.0,47.0,76.0,27.0,59.0,46.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0


In [8]:
train["Destination"].value_counts()

TRAPPIST-1e      5915
55 Cancri e      1800
PSO J318.5-22     796
Name: Destination, dtype: int64

In [9]:
nan_counts = train.isna().sum()
nan_counts

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [10]:
train["Destination"].value_counts()

TRAPPIST-1e      5915
55 Cancri e      1800
PSO J318.5-22     796
Name: Destination, dtype: int64

Separate out deck, room number and side (Port or Starboard) for each Cabin, creating new attributes

In [11]:
cabins = train["Cabin"]
cabins.isna().sum()

199

In [12]:
train.columns

Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age',
       'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'Name', 'Transported'],
      dtype='object')

In [13]:
from sklearn import set_config
set_config(transform_output="pandas")



In [14]:
# Custom transformer to drop columns
class DropperTransformer():
    def __init__(self,columns):
        self.columns=columns

    def transform(self,X,y=None):
        return X.drop(self.columns,axis=1)

    def fit(self, X, y=None):
        return self 

In [15]:
# Create a custom transformer to parse out deck, room_number and side
from sklearn.base import BaseEstimator, TransformerMixin

class DeckTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, segment_cabin_feature=True):
        self.segment_cabin_feature = segment_cabin_feature
        
    def fit(self, X, y=None):
         return self
        
    def transform(self, X):
        temp = X["Cabin"].str.split("/", expand=True)
        X["Deck"] = temp[0]
        X["CabinNumber"] = temp[1]
        X["Side"] = temp[2]
        return X

In [16]:
# Custom transformer to add up luxury spending
class LuxuryTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
        
    def fit(self, X, y=None):
         return self
        
    def transform(self, X):
        X["Spending"] = X["RoomService"] + X["FoodCourt"] + X["ShoppingMall"] + X["Spa"] + X["VRDeck"]
        return X

In [17]:
# Custom transformer to convert boolean columns to 0 or 1?
class BoolToIntTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
         return self
        
    def transform(self, X):
        X["VIP"] = X["VIP"].replace({True: 1, False: 0})
        X["CryoSleep"] = X["CryoSleep"].replace({True: 1, False: 0})
        return X

One-hot encoding for the following columns: HomePlanet, Destination, Side

Spending = RoomService + FoodCourt + ShoppingMall + Spa + VRDeck
Drop these columns, keep Spending

Drop Name

Separate out Passenger_Id parts into FamilyID and IndividualID


Manually One-Hot encode columns due to deck transformer

In [18]:
deck_transformer = DeckTransformer(True)
train = deck_transformer.fit_transform(train)
submit = deck_transformer.fit_transform(submit)

In [19]:
# Get columns that need to be encoded
cols = ["HomePlanet", "Destination", "Deck", "Side"]

train = pd.get_dummies(data=train,columns=cols,dtype="float")
submit = pd.get_dummies(data=submit,columns=cols,dtype="float")

In [20]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

numeric_features = ["Age","RoomService","FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
cat_features = ["HomePlanet", "Destination"]
deck_features = ["Side", "Deck"]
boolean_features = ["VIP", "CryoSleep"]

numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median")),
           ("luxury_add", LuxuryTransformer())
])

boolean_transformer = Pipeline(
    steps=[("bool", BoolToIntTransformer())]
)

# Declare Column transformer
from sklearn.compose import ColumnTransformer

full_pipeline = ColumnTransformer([
    ("num", numeric_transformer, numeric_features),
    ("bool", boolean_transformer,boolean_features)
],remainder="passthrough")

train_prepared = full_pipeline.fit_transform(train)
submit_prepared = full_pipeline.fit_transform(submit)
train_prepared

Unnamed: 0,num__Age,num__RoomService,num__FoodCourt,num__ShoppingMall,num__Spa,num__VRDeck,num__Spending,bool__VIP,bool__CryoSleep,remainder__PassengerId,...,remainder__Deck_A,remainder__Deck_B,remainder__Deck_C,remainder__Deck_D,remainder__Deck_E,remainder__Deck_F,remainder__Deck_G,remainder__Deck_T,remainder__Side_P,remainder__Side_S
0,39.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0001_01,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,24.0,109.0,9.0,25.0,549.0,44.0,736.0,0.0,0.0,0002_01,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
2,58.0,43.0,3576.0,0.0,6715.0,49.0,10383.0,1.0,0.0,0003_01,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,33.0,0.0,1283.0,371.0,3329.0,193.0,5176.0,0.0,0.0,0003_02,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,16.0,303.0,70.0,151.0,565.0,2.0,1091.0,0.0,0.0,0004_01,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,41.0,0.0,6819.0,0.0,1643.0,74.0,8536.0,1.0,0.0,9276_01,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
8689,18.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,9278_01,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
8690,26.0,0.0,0.0,1872.0,1.0,0.0,1873.0,0.0,0.0,9279_01,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
8691,32.0,0.0,1049.0,0.0,353.0,3235.0,4637.0,0.0,0.0,9280_01,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [21]:
submit_prepared

Unnamed: 0,num__Age,num__RoomService,num__FoodCourt,num__ShoppingMall,num__Spa,num__VRDeck,num__Spending,bool__VIP,bool__CryoSleep,remainder__PassengerId,...,remainder__Deck_A,remainder__Deck_B,remainder__Deck_C,remainder__Deck_D,remainder__Deck_E,remainder__Deck_F,remainder__Deck_G,remainder__Deck_T,remainder__Side_P,remainder__Side_S
0,27.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0013_01,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,19.0,0.0,9.0,0.0,2823.0,0.0,2832.0,0.0,0.0,0018_01,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
2,31.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0019_01,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,38.0,0.0,6652.0,0.0,181.0,585.0,7418.0,0.0,0.0,0021_01,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,20.0,10.0,0.0,635.0,0.0,0.0,645.0,0.0,0.0,0023_01,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,34.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,9266_02,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4273,42.0,0.0,847.0,17.0,10.0,144.0,1018.0,0.0,0.0,9269_01,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4274,26.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,9271_01,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4275,26.0,0.0,2680.0,0.0,0.0,523.0,3203.0,0.0,0.0,9273_01,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [22]:
nan_counts = train_prepared.isna().sum()
nan_counts

num__Age                                  0
num__RoomService                          0
num__FoodCourt                            0
num__ShoppingMall                         0
num__Spa                                  0
num__VRDeck                               0
num__Spending                             0
bool__VIP                               203
bool__CryoSleep                         217
remainder__PassengerId                    0
remainder__Cabin                        199
remainder__Name                         200
remainder__Transported                    0
remainder__CabinNumber                  199
remainder__HomePlanet_Earth               0
remainder__HomePlanet_Europa              0
remainder__HomePlanet_Mars                0
remainder__Destination_55 Cancri e        0
remainder__Destination_PSO J318.5-22      0
remainder__Destination_TRAPPIST-1e        0
remainder__Deck_A                         0
remainder__Deck_B                         0
remainder__Deck_C               

In [23]:
nan_counts = submit_prepared.isna().sum()
nan_counts

num__Age                                  0
num__RoomService                          0
num__FoodCourt                            0
num__ShoppingMall                         0
num__Spa                                  0
num__VRDeck                               0
num__Spending                             0
bool__VIP                                93
bool__CryoSleep                          93
remainder__PassengerId                    0
remainder__Cabin                        100
remainder__Name                          94
remainder__CabinNumber                  100
remainder__HomePlanet_Earth               0
remainder__HomePlanet_Europa              0
remainder__HomePlanet_Mars                0
remainder__Destination_55 Cancri e        0
remainder__Destination_PSO J318.5-22      0
remainder__Destination_TRAPPIST-1e        0
remainder__Deck_A                         0
remainder__Deck_B                         0
remainder__Deck_C                         0
remainder__Deck_D               

In [24]:
# Fill missing values
train_prepared[['bool__VIP','bool__CryoSleep']] = train_prepared[['bool__VIP','bool__CryoSleep']].fillna(value=0.0)
submit_prepared[['bool__VIP','bool__CryoSleep']] = submit_prepared[['bool__VIP','bool__CryoSleep']].fillna(value=0.0)

train_prepared[['remainder__CabinNumber']] = train_prepared[['remainder__CabinNumber']].fillna(value="-1")
submit_prepared[['remainder__CabinNumber']] = submit_prepared[['remainder__CabinNumber']].fillna(value='-1')


In [25]:
nan_counts = submit_prepared.isna().sum()
nan_counts

num__Age                                  0
num__RoomService                          0
num__FoodCourt                            0
num__ShoppingMall                         0
num__Spa                                  0
num__VRDeck                               0
num__Spending                             0
bool__VIP                                 0
bool__CryoSleep                           0
remainder__PassengerId                    0
remainder__Cabin                        100
remainder__Name                          94
remainder__CabinNumber                    0
remainder__HomePlanet_Earth               0
remainder__HomePlanet_Europa              0
remainder__HomePlanet_Mars                0
remainder__Destination_55 Cancri e        0
remainder__Destination_PSO J318.5-22      0
remainder__Destination_TRAPPIST-1e        0
remainder__Deck_A                         0
remainder__Deck_B                         0
remainder__Deck_C                         0
remainder__Deck_D               

In [26]:
[c for c in train_prepared.columns if c in train_prepared]

['num__Age',
 'num__RoomService',
 'num__FoodCourt',
 'num__ShoppingMall',
 'num__Spa',
 'num__VRDeck',
 'num__Spending',
 'bool__VIP',
 'bool__CryoSleep',
 'remainder__PassengerId',
 'remainder__Cabin',
 'remainder__Name',
 'remainder__Transported',
 'remainder__CabinNumber',
 'remainder__HomePlanet_Earth',
 'remainder__HomePlanet_Europa',
 'remainder__HomePlanet_Mars',
 'remainder__Destination_55 Cancri e',
 'remainder__Destination_PSO J318.5-22',
 'remainder__Destination_TRAPPIST-1e',
 'remainder__Deck_A',
 'remainder__Deck_B',
 'remainder__Deck_C',
 'remainder__Deck_D',
 'remainder__Deck_E',
 'remainder__Deck_F',
 'remainder__Deck_G',
 'remainder__Deck_T',
 'remainder__Side_P',
 'remainder__Side_S']

In [27]:
len([c for c in train_prepared.columns if c in train_prepared])

30

In [28]:
train_prepared = train_prepared.drop(["num__RoomService", "num__FoodCourt", "num__ShoppingMall", "num__Spa", "num__VRDeck", "remainder__Name","remainder__Cabin"],axis=1)
submit_prepared = submit_prepared.drop(["num__RoomService", "num__FoodCourt", "num__ShoppingMall", "num__Spa", "num__VRDeck","remainder__Cabin", "remainder__Name"],axis=1)

# Model Exploration

In [29]:
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(train_prepared, test_size=0.2, random_state=42)
train_set.isna().sum()

num__Age                                0
num__Spending                           0
bool__VIP                               0
bool__CryoSleep                         0
remainder__PassengerId                  0
remainder__Transported                  0
remainder__CabinNumber                  0
remainder__HomePlanet_Earth             0
remainder__HomePlanet_Europa            0
remainder__HomePlanet_Mars              0
remainder__Destination_55 Cancri e      0
remainder__Destination_PSO J318.5-22    0
remainder__Destination_TRAPPIST-1e      0
remainder__Deck_A                       0
remainder__Deck_B                       0
remainder__Deck_C                       0
remainder__Deck_D                       0
remainder__Deck_E                       0
remainder__Deck_F                       0
remainder__Deck_G                       0
remainder__Deck_T                       0
remainder__Side_P                       0
remainder__Side_S                       0
dtype: int64

In [30]:
# Get Appropriate labels
train_set_labels = train_set["remainder__Transported"]
test_set_labels = test_set["remainder__Transported"]

#Drop from both
train_set = train_set.drop(["remainder__Transported"],axis=1)
test_set = test_set.drop(["remainder__Transported"],axis=1)

# Convert
train_set_labels = train_set_labels.replace({True: 1.0, False: 0.0})
test_set_labels = test_set_labels.replace({True: 1, False: 0})

train_set_labels = train_set_labels.values.ravel()
test_set_labels = test_set_labels.values.ravel()

In [31]:
train_set_labels

array([0., 0., 1., ..., 0., 0., 0.])

In [32]:
test_set_labels

array([1, 0, 0, ..., 0, 1, 0])

In [33]:
from sklearn.model_selection import cross_val_score

Model 1: Decision Tree

In [34]:
from sklearn.tree import DecisionTreeRegressor
tree_reg = DecisionTreeRegressor()

tree_reg.fit(train_set, train_set_labels)
predictions = tree_reg.predict(test_set)
predictions

array([0., 0., 1., ..., 0., 1., 1.])

In [35]:
cross_val_score(tree_reg, train_set, train_set_labels, cv=6, scoring="accuracy")

array([0.67385677, 0.69628991, 0.68334771, 0.66695427, 0.68507334,
       0.68679896])

 Model 2: RandomForestClassifier

In [36]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

param_grid = [
    {'n_estimators':[10,20,30], 'max_features':[12,16,21]},
    {'bootstrap':[False], 'n_estimators':[5,20], 'max_features':[12,21]}
]

forest_reg = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(forest_reg, param_grid, scoring='accuracy', return_train_score=True)
grid_search.fit(train_set,train_set_labels)

In [37]:
grid_search.best_params_

{'max_features': 16, 'n_estimators': 20}

In [38]:
grid_search.best_estimator_

In [39]:
best = grid_search.best_estimator_
cross_val_score(best, train_set, train_set_labels, cv=10, scoring="recall")

array([0.66857143, 0.67714286, 0.62285714, 0.62285714, 0.65142857,
       0.65142857, 0.67714286, 0.64285714, 0.69142857, 0.66571429])

In [40]:
cross_val_score(best, train_set, train_set_labels, cv=10, scoring="accuracy")

array([0.74425287, 0.72126437, 0.72126437, 0.72701149, 0.73381295,
       0.72661871, 0.75107914, 0.7352518 , 0.74820144, 0.76402878])

Model 3: Gradient Boost Classifier

In [41]:
from sklearn.ensemble import GradientBoostingClassifier

gbrt = GradientBoostingClassifier(random_state= 42)

cross_val_score(gbrt, train_set, train_set_labels, cv=10, scoring="accuracy")

array([0.73706897, 0.7658046 , 0.73132184, 0.75862069, 0.74532374,
       0.75107914, 0.77122302, 0.75683453, 0.76690647, 0.77985612])

In [42]:
cross_val_score(gbrt, train_set, train_set_labels, cv=10, scoring="recall")

array([0.66857143, 0.70571429, 0.62857143, 0.68571429, 0.66571429,
       0.69142857, 0.71142857, 0.7       , 0.73428571, 0.7       ])

In [43]:
param_grid = [
    {'n_estimators':[10,20,30], 'max_features':[12,16,21], 'learning_rate': [0.05, 0.1,0.2]},
    {'warm_start':[True], 'n_estimators':[5,20], 'max_features':[12,21], 'learning_rate': [0.05, 0.1,0.2]}
]

# Accuracy
grid_search = GridSearchCV(gbrt, param_grid, scoring='accuracy', return_train_score=True)
grid_search.fit(train_set,train_set_labels)

accuracy_best = grid_search.best_estimator_

In [44]:
# Recall
grid_search = GridSearchCV(gbrt, param_grid, scoring='recall', return_train_score=True)
grid_search.fit(train_set,train_set_labels)

recall_best = grid_search.best_estimator_

In [45]:
cross_val_score(accuracy_best, train_set, train_set_labels, cv=10, scoring="recall")

array([0.66285714, 0.70857143, 0.60571429, 0.69142857, 0.66571429,
       0.69428571, 0.70571429, 0.7       , 0.70571429, 0.69142857])

In [46]:
cross_val_score(accuracy_best, train_set, train_set_labels, cv=10, scoring="accuracy")

array([0.74137931, 0.77155172, 0.7183908 , 0.76867816, 0.75683453,
       0.74820144, 0.76978417, 0.75971223, 0.75683453, 0.77553957])

In [47]:
cross_val_score(accuracy_best, train_set, train_set_labels, cv=10, scoring="precision")

array([0.78911565, 0.81311475, 0.78518519, 0.82033898, 0.81754386,
       0.78135048, 0.8125    , 0.7980456 , 0.78913738, 0.83448276])

In [48]:
cross_val_score(recall_best, train_set, train_set_labels, cv=10, scoring="recall")

array([0.69142857, 0.69428571, 0.61142857, 0.68285714, 0.67142857,
       0.68857143, 0.68857143, 0.70285714, 0.72285714, 0.69428571])

In [49]:
cross_val_score(recall_best, train_set, train_set_labels, cv=10, scoring="accuracy")

array([0.7341954 , 0.74137931, 0.7112069 , 0.74425287, 0.74532374,
       0.73093525, 0.74820144, 0.74388489, 0.75683453, 0.76258993])

In [50]:
cross_val_score(recall_best, train_set, train_set_labels, cv=10, scoring="precision")

array([0.75862069, 0.76898734, 0.76702509, 0.78104575, 0.79124579,
       0.75548589, 0.78501629, 0.76875   , 0.77846154, 0.80730897])

# Predictions

In [51]:
accuracy_best.fit(train_set,train_set_labels)
predictions = accuracy_best.predict(test_set)
predictions

array([0., 0., 1., ..., 0., 1., 1.])

In [52]:
test_output = pd.DataFrame()
test_output["PassengerId"] = test_set.index
test_output["Predicted"] = predictions
test_output["Actual"] = test_set_labels
test_output["Correct"] = test_output["Predicted"] == test_output["Actual"]

In [53]:
test_output

Unnamed: 0,PassengerId,Predicted,Actual,Correct
0,304,0.0,1,False
1,2697,0.0,0,True
2,8424,1.0,0,False
3,1672,0.0,1,False
4,8458,1.0,1,True
...,...,...,...,...
1734,7175,1.0,0,False
1735,3187,1.0,1,True
1736,1302,0.0,0,True
1737,5934,1.0,1,True


In [54]:
test_output["Correct"].sum() / 1739

0.750431282346176

Submission

In [55]:
submission_predictions = accuracy_best.predict(submit_prepared)


output = pd.DataFrame({'PassengerId': submission_id,
                       'Transported': submission_predictions.squeeze()})

In [56]:
output.Transported = output.Transported.replace({1.0: True, 0.0: False})
output

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,False
4,0023_01,False
...,...,...
4272,9266_02,True
4273,9269_01,True
4274,9271_01,True
4275,9273_01,True


In [57]:
output.to_csv('/kaggle/working/submission.csv', index=False)