In [None]:
# Mount Google Drive under /content/drive (Colab-only); the CSV files read
# later in this notebook are loaded from a folder below that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np # linear algebra
import pandas as pd 

%matplotlib inline
import matplotlib.pyplot as plt


# Read data

In [None]:
import pathlib

# Folder on the mounted Drive that holds the train/test CSV files.
data_dir = pathlib.Path('/content/drive/MyDrive/ColabNotebooks/spaceship_titanic')
train_csv = data_dir / "train.csv"
test_csv = data_dir / "test.csv"

df_train = pd.read_csv(train_csv)
df_test = pd.read_csv(test_csv)
# Untouched second copy of the test file: the export cell reads the original
# PassengerId values from it after df_test has been encoded and scaled.
df_test_original = pd.read_csv(test_csv)

# Check Nulls

In [None]:
# Missing-value report for the training set: absolute count and percentage
# of rows per column, both sorted from most to least missing.
null_counts = df_train.isnull().sum()
total = null_counts.sort_values(ascending=False)
percent = (null_counts / len(df_train) * 100).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
print(missing_data)

              Total   Percent
CryoSleep       217  2.496261
ShoppingMall    208  2.392730
VIP             203  2.335212
HomePlanet      201  2.312205
Name            200  2.300702
Cabin           199  2.289198
VRDeck          188  2.162660
FoodCourt       183  2.105142
Spa             183  2.105142
Destination     182  2.093639
RoomService     181  2.082135
Age             179  2.059128
PassengerId       0  0.000000
Transported       0  0.000000


# Add Variables

In [None]:
col_to_sum = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

# SumSpends: total amount spent by each passenger. DataFrame.sum skips NaN,
# so a missing amenity value counts as 0 in the total.
df_train['SumSpends'] = df_train[col_to_sum].sum(axis=1)
df_test['SumSpends'] = df_test[col_to_sum].sum(axis=1)

# Cabin is formatted "deck/num/side": split it into three columns.
# `n` is passed by keyword because pandas >= 2.0 rejects it positionally;
# n=2 caps the result at exactly three parts.
df_train[['deck', 'num', 'side']] = df_train['Cabin'].str.split('/', n=2, expand=True)
df_test[['deck', 'num', 'side']] = df_test['Cabin'].str.split('/', n=2, expand=True)

# The cabin number is a real number. Left as a string it would be
# ordinal-encoded in lexicographic order ("0" < "1" < "10" < ... < "2"),
# which scrambles its ordering, so convert it before the encoding step.
df_train['num'] = pd.to_numeric(df_train['num'], errors='coerce')
df_test['num'] = pd.to_numeric(df_test['num'], errors='coerce')

df_train = df_train.drop(columns='Cabin')
df_test = df_test.drop(columns='Cabin')

# Encode categorical variables as integer codes so the models can consume them.
from sklearn.preprocessing import OrdinalEncoder
oc = OrdinalEncoder()

size_train = len(df_train)

# Train and test are encoded together so both share one category -> code mapping.
df_for_encode = pd.concat([df_train, df_test])

# Columns to encode: every object/category column, plus the target.
# NOTE(review): this also encodes PassengerId and Name, which are identifiers
# rather than categories; they are not used as model features further down.
object_cols = [col for col in df_train.columns if df_train[col].dtype == 'object' or df_train[col].dtype == 'category']
object_cols.append('Transported')

# Convert to category to save space, then replace the values with their codes.
df_for_encode[object_cols] = df_for_encode[object_cols].astype('category')
df_for_encode[object_cols] = oc.fit_transform(df_for_encode[object_cols])

# Split back by position. Both results are independent frames (explicit copy /
# non-inplace drop), so later column assignments do not raise
# SettingWithCopyWarning.
df_train = df_for_encode.iloc[:size_train, :].copy()
df_test = df_for_encode.iloc[size_train:, :].drop(columns='Transported')

del df_for_encode

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


# New Data Cleaning

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# Any remaining object/category columns are cast to category. After the
# ordinal-encoding cell this list is expected to be empty, making this a no-op.
object_cols = [col for col in df_train.columns if df_train[col].dtype == 'object' or df_train[col].dtype == 'category']

df_train[object_cols] = df_train[object_cols].astype('category')
df_test[object_cols] = df_test[object_cols].astype('category')

# Columns that still contain at least one null (informational only).
null_cols = df_train.isnull().sum().sort_values(ascending=False)
null_cols = list(null_cols[null_cols > 0].index)


# Replace null values with sklearn's SimpleImputer, choosing the strategy per
# column type:
#   * median        -> cabin number
#   * most frequent -> encoded categorical columns (a mean would produce a
#                      fractional, meaningless category code, which is why
#                      Destination belongs here and not in toMean)
#   * mean          -> genuinely continuous columns
toMedian = ['num']
toMode = ['CryoSleep','VIP','HomePlanet','Name','deck','side','Destination']
toMean = ['Age','SumSpends','RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']

# Each imputer is fitted on the training set only and then applied unchanged
# to the test set, so both sets are filled with the same statistics.
ct = ColumnTransformer([("imp", SimpleImputer(strategy='mean'), toMean)])
df_train[toMean] = ct.fit_transform(df_train[toMean])
df_test[toMean] = ct.transform(df_test[toMean])


ct2 = ColumnTransformer([("imp", SimpleImputer(strategy='median'), toMedian)])
df_train[toMedian] = ct2.fit_transform(df_train[toMedian])
df_test[toMedian] = ct2.transform(df_test[toMedian])


ct3 = ColumnTransformer([("imp", SimpleImputer(strategy='most_frequent'), toMode)])
df_train[toMode] = ct3.fit_transform(df_train[toMode])
df_test[toMode] = ct3.transform(df_test[toMode])



In [None]:
df_train.head(35)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,SumSpends,deck,num,side
0,0.0,1.0,0.0,2.0,39.0,0.0,0.0,0.0,0.0,0.0,0.0,7819.0,0.0,0.0,1.0,0.0,0.0
1,1.0,0.0,0.0,2.0,24.0,0.0,109.0,9.0,25.0,549.0,44.0,6688.0,1.0,736.0,5.0,0.0,1.0
2,2.0,1.0,0.0,2.0,58.0,1.0,43.0,3576.0,0.0,6715.0,49.0,669.0,0.0,10383.0,0.0,0.0,1.0
3,3.0,1.0,0.0,2.0,33.0,0.0,0.0,1283.0,371.0,3329.0,193.0,10688.0,0.0,5176.0,0.0,0.0,1.0
4,4.0,0.0,0.0,2.0,16.0,0.0,303.0,70.0,151.0,565.0,2.0,12400.0,1.0,1091.0,5.0,1.0,1.0
5,5.0,0.0,0.0,1.0,44.0,0.0,0.0,483.0,0.0,291.0,0.0,10283.0,1.0,774.0,5.0,0.0,0.0
6,6.0,0.0,0.0,2.0,26.0,0.0,42.0,1539.0,3.0,0.0,0.0,1733.0,1.0,1584.0,5.0,1006.0,1.0
7,7.0,0.0,1.0,2.0,28.0,0.0,0.0,0.0,0.0,0.0,304.854791,2182.0,1.0,0.0,6.0,0.0,1.0
8,8.0,0.0,0.0,2.0,35.0,0.0,0.0,785.0,17.0,216.0,0.0,842.0,1.0,1018.0,5.0,1117.0,1.0
9,9.0,1.0,1.0,0.0,14.0,0.0,0.0,0.0,0.0,0.0,0.0,4286.0,1.0,0.0,1.0,1.0,0.0


# Delete Outliers

In [None]:
import seaborn as sns 
from matplotlib.pyplot import figure

# Both steps are optional and switched off by default.
clean_outliers = False
print_boxplot = False

# Spend columns and, position by position, the value above which a row is
# treated as an outlier.
cols_with_outliers = ['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']
threshold_outliers = [10000, 25000, 10000, 15000, 18000]

if clean_outliers:
  for var, limit in zip(cols_with_outliers, threshold_outliers):
    print(df_train[var].shape)
    rows_above_limit = df_train.index[df_train[var] > limit]
    df_train.drop(rows_above_limit, axis=0, inplace=True)
    print(df_train[var].shape)

if print_boxplot:
  # One horizontal boxplot per column, all on the same 0-25000 x-range.
  for var in df_train.columns:
    figure(figsize=(8, 6), dpi=80)
    sns.boxplot(data=df_train[var], orient='h')
    plt.xlabel(var)
    plt.xlim((0,25000))
    plt.show()
    plt.clf()

# Log Transform

In [None]:
# Optional log transform of selected spend columns (off by default).
log_transform = False

if log_transform:
  cols_logs = ['FoodCourt','ShoppingMall','Spa','VRDeck']

  # log1p(x) = log(1 + x) is defined at 0, so zero spends need no placeholder
  # value. The transform is applied to BOTH train and test: transforming only
  # the training set would feed the models differently-scaled features at
  # prediction time.
  df_train[cols_logs] = np.log1p(df_train[cols_logs])
  df_test[cols_logs] = np.log1p(df_test[cols_logs])


In [None]:
df_train

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,SumSpends,deck,num,side
0,0.0,1.0,0.0,2.0,39.0,0.0,0.0,0.0,0.0,0.0,0.0,7819.0,0.0,0.0,1.0,0.0,0.0
1,1.0,0.0,0.0,2.0,24.0,0.0,109.0,9.0,25.0,549.0,44.0,6688.0,1.0,736.0,5.0,0.0,1.0
2,2.0,1.0,0.0,2.0,58.0,1.0,43.0,3576.0,0.0,6715.0,49.0,669.0,0.0,10383.0,0.0,0.0,1.0
3,3.0,1.0,0.0,2.0,33.0,0.0,0.0,1283.0,371.0,3329.0,193.0,10688.0,0.0,5176.0,0.0,0.0,1.0
4,4.0,0.0,0.0,2.0,16.0,0.0,303.0,70.0,151.0,565.0,2.0,12400.0,1.0,1091.0,5.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,12964.0,1.0,0.0,0.0,41.0,1.0,0.0,6819.0,0.0,1643.0,74.0,5252.0,0.0,8536.0,0.0,1872.0,0.0
8689,12966.0,0.0,1.0,1.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,7124.0,0.0,0.0,6.0,556.0,1.0
8690,12967.0,0.0,0.0,2.0,26.0,0.0,0.0,0.0,1872.0,1.0,0.0,4498.0,1.0,1873.0,6.0,559.0,1.0
8691,12968.0,1.0,0.0,0.0,32.0,0.0,0.0,1049.0,0.0,353.0,3235.0,2389.0,0.0,4637.0,4.0,1460.0,1.0


# Normalization



In [None]:
from sklearn.preprocessing import StandardScaler
# z-score normalization
scaler = StandardScaler()

# Work on independent copies so df_train / df_test keep their unscaled values.
df_train2 = df_train.copy()
df_test2 = df_test.copy()

# Scale every column present in the test frame except CryoSleep.
vars_to_normalize = list(df_test2.columns)
vars_to_normalize.remove('CryoSleep')
print(vars_to_normalize)

# Statistics are learned from the training set and reused for the test set.
df_train2[vars_to_normalize] = scaler.fit_transform(df_train2[vars_to_normalize])
df_test2[vars_to_normalize] = scaler.transform(df_test2[vars_to_normalize])

['PassengerId', 'HomePlanet', 'Destination', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Name', 'SumSpends', 'deck', 'num', 'side']


In [None]:

df_train2

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,SumSpends,deck,num,side
0,-1.740838,0.440385,0.0,0.636441,0.709437,-0.153063,-0.340590,-0.287314,-0.290817,-0.276663,-0.269023,0.440548,0.0,-0.514066,-1.886321,-1.895131,-1.032865
1,-1.740569,-0.817259,0.0,0.636441,-0.336717,-0.153063,-0.175364,-0.281669,-0.248968,0.211505,-0.230194,0.135368,1.0,-0.251479,0.385470,-1.895131,0.968181
2,-1.740301,0.440385,0.0,0.636441,2.034566,6.533255,-0.275409,1.955616,-0.290817,5.694289,-0.225782,-1.488746,0.0,3.190333,-2.454269,-1.895131,0.968181
3,-1.740032,0.440385,0.0,0.636441,0.290975,-0.153063,-0.340590,0.517406,0.330225,2.683471,-0.098708,1.214694,0.0,1.332604,-2.454269,-1.895131,0.968181
4,-1.739764,-0.817259,0.0,0.636441,-0.894666,-0.153063,0.118709,-0.243409,-0.038048,0.225732,-0.267258,1.676645,1.0,-0.124824,0.385470,-1.893235,0.968181
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,1.741038,0.440385,0.0,-1.827957,0.848924,6.533255,-0.340590,3.989682,-0.290817,1.184286,-0.203720,-0.252109,0.0,2.531369,-2.454269,1.653523,-1.032865
8689,1.741575,-0.817259,1.0,-0.595758,-0.755179,-0.153063,-0.340590,-0.287314,-0.290817,-0.276663,-0.269023,0.253015,0.0,-0.514066,0.953418,-0.841150,0.968181
8690,1.741844,-0.817259,0.0,0.636441,-0.197230,-0.153063,-0.340590,-0.287314,2.842851,-0.275774,-0.269023,-0.455562,1.0,0.154175,0.953418,-0.835463,0.968181
8691,1.742112,0.440385,0.0,-1.827957,0.221232,-0.153063,-0.340590,0.370637,-0.290817,0.037223,2.585740,-1.024637,0.0,1.140302,-0.182478,0.872516,0.968181


In [None]:
df_train2.isnull().sum()

PassengerId     0
HomePlanet      0
CryoSleep       0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
Name            0
Transported     0
SumSpends       0
deck            0
num             0
side            0
dtype: int64

In [None]:
import tensorflow as tf
class MyCallback(tf.keras.callbacks.Callback):
    """Stop training once validation accuracy exceeds a threshold.

    Args:
        threshold: validation accuracy (between 0 and 1) that must be
            exceeded for training to stop. Defaults to 0.81.
    """

    def __init__(self, threshold=0.81):
        super().__init__()
        self.threshold = threshold

    def on_epoch_end(self, epoch, logs=None):
        """Check `val_accuracy` in `logs` and request a stop when it is above the threshold."""
        # `logs=None` instead of a shared mutable `{}` default.
        logs = logs or {}
        val_accuracy = logs.get('val_accuracy')
        if val_accuracy is not None and val_accuracy > self.threshold:
            # The message reports the threshold actually used.
            print(f"Reached {self.threshold:.1%} val_accuracy so cancelling training!")
            self.model.stop_training = True


callbacks = MyCallback()

# Tensorflow - Training

In [None]:

import sklearn
from sklearn.model_selection import train_test_split

# Columns kept for modelling; 'Transported' is the target and is listed last.
vars_to_use_training = ['CryoSleep', 'RoomService', 'Spa', 'VRDeck', 'SumSpends','deck','side','Transported']
df_train2 = df_train2.loc[:, vars_to_use_training]

# Hold out 30% of the labelled rows; X_test / y_test serve as the validation set.
X_train, X_test, y_train, y_test = train_test_split(
    df_train2.drop(columns=['Transported']),
    df_train2['Transported'],
    test_size=0.30,
    random_state=45,
)

In [None]:

# Number of input features: every column of df_train2 except the target.
dim = len(df_train2.columns) - 1

# Fully connected binary classifier: two ReLU hidden layers, a small sigmoid
# hidden layer, and a single sigmoid output unit.
model = tf.keras.Sequential([
    tf.keras.layers.Dense(256, activation='relu', input_dim=dim),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(4, activation='sigmoid'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# The validation data feeds the callback that may stop training early.
model.fit(
    X_train,
    y_train,
    steps_per_epoch=80,
    epochs=25,
    validation_data=(X_test, y_test),
    callbacks=[callbacks],
)


Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x7f57d620d650>

# Ensemble

In [None]:
# Install the extra packages needed by the ensemble cells below.
# Note: despite its name, this single flag gates all three installs
# (catboost, ipywidgets and lightgbm), not only catboost.
install_catboost = True
if install_catboost:
  !pip install catboost
  !pip install ipywidgets
  !pip install lightgbm 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Downloading catboost-1.1.1-cp37-none-manylinux1_x86_64.whl (76.6 MB)
[K     |████████████████████████████████| 76.6 MB 1.1 MB/s 
Installing collected packages: catboost
Successfully installed catboost-1.1.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting jedi>=0.10
  Downloading jedi-0.18.1-py2.py3-none-any.whl (1.6 MB)
[K     |████████████████████████████████| 1.6 MB 4.8 MB/s 
Installing collected packages: jedi
Successfully installed jedi-0.18.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from catboost import CatBoostClassifier
import lightgbm
from lightgbm import LGBMClassifier
import time
from sklearn.model_selection import GridSearchCV

# When True, the evaluation cell stores positive-class probabilities instead
# of hard 0/1 predictions (and SVC is built with probability estimates on).
use_probabilities = False

# Classifiers, keyed by display name.
#  * RandomForest gets a fixed random_state so the validation scores are
#    reproducible from run to run.
#  * CatBoost is silenced (verbose=0); otherwise it prints one line per
#    boosting iteration and floods the cell output.
classifiers = {
    "LogisticRegression" : LogisticRegression(C=1, penalty='l1', solver='liblinear'),
    "KNN" : KNeighborsClassifier(n_neighbors=9, p=2),
    "SVC" : SVC(C= 1.25, gamma= 'scale', kernel= 'rbf',probability=use_probabilities),
    "RandomForest" : RandomForestClassifier(max_depth= 10, n_estimators= 250, random_state=45),
    "LGBM" : LGBMClassifier(learning_rate= 0.05, max_depth= 8, n_estimators= 100),
    "CatBoost" : CatBoostClassifier(learning_rate= 0.15, max_depth= 4, n_estimators= 100, verbose=0),
    "NaiveBayes": GaussianNB(var_smoothing= 1e-07)
}

# Parameter grids for an optional grid search (the GridSearchCV call in the
# evaluation cell is currently commented out).
LR_grid = {'penalty': ['l1','l2'],
           'C': [0.25, 0.5, 0.75, 1, 1.25, 1.5],
           'max_iter': [50, 100, 150]}

KNN_grid = {'n_neighbors': [3, 5, 7, 9],
            'p': [1, 2]}

SVC_grid = {'C': [0.25, 0.5, 0.75, 1, 1.25, 1.5],
            'kernel': ['linear', 'rbf'],
            'gamma': ['scale', 'auto']}

RF_grid = {'n_estimators': [50, 100, 150, 200, 250, 300],
        'max_depth': [4, 6, 8, 10, 12]}

boosted_grid = {'n_estimators': [50, 100, 150, 200],
        'max_depth': [4, 8, 12],
        'learning_rate': [0.05, 0.1, 0.15]}

NB_grid={'var_smoothing': [1e-10, 1e-9, 1e-8, 1e-7]}

# Dictionary of all grids, keyed like `classifiers`.
# NOTE(review): "XGBoost" has a grid but no entry in `classifiers`.
grid = {
    "LogisticRegression" : LR_grid,
    "KNN" : KNN_grid,
    "SVC" : SVC_grid,
    "RandomForest" : RF_grid,
    "XGBoost" : boosted_grid,
    "LGBM" : boosted_grid,
    "CatBoost" : boosted_grid,
    "NaiveBayes": NB_grid
}

In [None]:
#Evaluate models using accuracy, models are evaluated on X_test which is the validation dataset
from sklearn.metrics import accuracy_score

clf_best_params = classifiers.copy()
n_models = len(classifiers)

# One row per classifier: validation accuracy and fit+predict time in minutes.
valid_scores = pd.DataFrame({
    'Classifer': classifiers.keys(),
    'Validation accuracy': np.zeros(n_models),
    'Training time': np.zeros(n_models),
})
all_predictions = {}

for row, (key, classifier) in enumerate(classifiers.items()):
    start = time.time()

    # Train on the training split, then predict the validation split.
    classifier.fit(X_train, y_train)
    print(key)

    if use_probabilities:
        # Keep only the probability of the positive class.
        y_predictions = [proba[1] for proba in classifier.predict_proba(X_test)]
    else:
        y_predictions = classifier.predict(X_test)

    all_predictions[key] = y_predictions

    # Threshold at 0.5 so that both hard labels and probabilities can be scored.
    var_score = [int(prediction > 0.5) for prediction in y_predictions]
    valid_scores.iat[row, 1] = accuracy_score(y_test, var_score)

    stop = time.time()
    valid_scores.iat[row, 2] = np.round((stop - start) / 60, 2)

    print('Model:', key)
    print('Training time (mins):', valid_scores.iat[row, 2])
    print('')

print(y_predictions)
print(valid_scores)

LogisticRegression
Model: LogisticRegression
Training time (mins): 0.0

KNN
Model: KNN
Training time (mins): 0.0

SVC
Model: SVC
Training time (mins): 0.07

RandomForest
Model: RandomForest
Training time (mins): 0.04

LGBM
Model: LGBM
Training time (mins): 0.02

0:	learn: 0.6536250	total: 56.6ms	remaining: 5.61s
1:	learn: 0.6234058	total: 61.8ms	remaining: 3.03s
2:	learn: 0.5994085	total: 65.9ms	remaining: 2.13s
3:	learn: 0.5820586	total: 69.8ms	remaining: 1.68s
4:	learn: 0.5676408	total: 73.1ms	remaining: 1.39s
5:	learn: 0.5544358	total: 79.4ms	remaining: 1.24s
6:	learn: 0.5446050	total: 84.1ms	remaining: 1.12s
7:	learn: 0.5337131	total: 86.1ms	remaining: 991ms
8:	learn: 0.5253804	total: 89.9ms	remaining: 909ms
9:	learn: 0.5191622	total: 94ms	remaining: 846ms
10:	learn: 0.5116249	total: 98.7ms	remaining: 799ms
11:	learn: 0.5047054	total: 102ms	remaining: 749ms
12:	learn: 0.4986752	total: 106ms	remaining: 711ms
13:	learn: 0.4934857	total: 122ms	remaining: 750ms
14:	learn: 0.4896022	tot

In [None]:
valid_scores

Unnamed: 0,Classifer,Validation accuracy,Training time
0,LogisticRegression,0.793712,0.0
1,KNN,0.749617,0.0
2,SVC,0.800997,0.07
3,RandomForest,0.807132,0.04
4,LGBM,0.804448,0.02
5,CatBoost,0.803681,0.01
6,NaiveBayes,0.662577,0.0


In [None]:
all_predictions

{'LogisticRegression': array([1., 1., 1., ..., 1., 1., 0.]),
 'KNN': array([0., 1., 1., ..., 0., 1., 0.]),
 'SVC': array([1., 1., 1., ..., 1., 1., 0.]),
 'RandomForest': array([1., 1., 1., ..., 1., 1., 1.]),
 'LGBM': array([1., 1., 1., ..., 1., 1., 0.]),
 'CatBoost': array([1., 1., 1., ..., 1., 1., 0.]),
 'NaiveBayes': array([1., 1., 1., ..., 1., 1., 1.])}

In [None]:
# Majority-vote ensemble of 5 models, scored on the validation split.
y_predictions_tf = model.predict(X_test)
# Turn the network's sigmoid outputs into flat 0/1 labels.
all_predictions['tf'] = (y_predictions_tf > 0.5).astype(int).ravel()

# Every selected model casts a 0/1 vote per passenger; with 5 voters a sum of
# 3 or more means the majority predicted "transported".
voters = ('RandomForest', 'LGBM', 'tf', 'CatBoost', 'SVC')
best_predictions = {name: all_predictions[name] for name in voters}

sum_predictions = np.zeros(len(best_predictions['tf']))
for votes in best_predictions.values():
  sum_predictions += votes

ensemble_predictions = (sum_predictions >= 3).astype(int)
print('ensemble accuracy on validation set is ', str(accuracy_score(y_test, ensemble_predictions)))


ensemble accuracy on validation set 0.8052147239263804


In [None]:
sum_predictions

array([5., 5., 5., ..., 5., 5., 1.])

In [None]:
print('tensorflow accuracy on validation set is ', str(accuracy_score(y_test, all_predictions['tf'])))

tensorflow accuracy on validation set is  0.8044478527607362


In [None]:
# Predict on the "real" test set (df_test2), for which the true labels are
# not known.
predictions_ensemble_final = {}

# Feature columns only. A new list is built instead of removing 'Transported'
# from vars_to_use_training in place: that list is shared with the
# train/validation split cell, which would break on re-run if it were mutated.
feature_cols = [col for col in vars_to_use_training if col != 'Transported']
df_test2 = df_test2[feature_cols]

# The classifiers were already fitted on (X_train, y_train) in the evaluation
# cell. They are reused as-is so the models producing these predictions are
# exactly the ones whose validation accuracy was reported; re-fitting would be
# redundant and would re-randomise any unseeded model.
for key, classifier in classifiers.items():
    predictions_ensemble_final[key] = classifier.predict(df_test2)

# Neural network: threshold the sigmoid output at 0.5 to get 0/1 labels.
y_predictions_tf = model.predict(df_test2)
predictions_ensemble_final['tf'] = np.array([1 if prediction>0.5 else 0 for prediction in y_predictions_tf ])

0:	learn: 0.6536250	total: 1.5ms	remaining: 149ms
1:	learn: 0.6234058	total: 3.63ms	remaining: 178ms
2:	learn: 0.5994085	total: 5.29ms	remaining: 171ms
3:	learn: 0.5820586	total: 6.98ms	remaining: 168ms
4:	learn: 0.5676408	total: 8.52ms	remaining: 162ms
5:	learn: 0.5544358	total: 10.3ms	remaining: 161ms
6:	learn: 0.5446050	total: 12.1ms	remaining: 160ms
7:	learn: 0.5337131	total: 13.7ms	remaining: 158ms
8:	learn: 0.5253804	total: 15.4ms	remaining: 156ms
9:	learn: 0.5191622	total: 17.3ms	remaining: 155ms
10:	learn: 0.5116249	total: 18.9ms	remaining: 153ms
11:	learn: 0.5047054	total: 20.5ms	remaining: 151ms
12:	learn: 0.4986752	total: 22.2ms	remaining: 148ms
13:	learn: 0.4934857	total: 23.9ms	remaining: 147ms
14:	learn: 0.4896022	total: 25.3ms	remaining: 144ms
15:	learn: 0.4859399	total: 27.2ms	remaining: 143ms
16:	learn: 0.4826351	total: 30ms	remaining: 147ms
17:	learn: 0.4795551	total: 32ms	remaining: 146ms
18:	learn: 0.4769712	total: 36ms	remaining: 154ms
19:	learn: 0.4743895	total: 3

In [None]:
# Majority vote on the real test set: each of the 5 selected models
# contributes a 0/1 prediction, and a passenger is labelled 1 when at least
# 3 of them agree.
voters = ('RandomForest', 'LGBM', 'tf', 'CatBoost', 'SVC')
sub_predictions = {name: predictions_ensemble_final[name] for name in voters}

sum_predictions = np.zeros(len(sub_predictions['tf']))
for votes in sub_predictions.values():
  sum_predictions += votes

final_pred_ensemble = (sum_predictions >= 3).astype(int)
final_pred_ensemble

array([1, 0, 1, ..., 1, 1, 1])

In [None]:
if use_probabilities:
  # Probability averaging: instead of 0/1 votes, the positive-class
  # probabilities of the selected models are averaged and thresholded at 0.5.
  y_predictions_tf = model.predict(X_test)

  # Shallow copy, so storing the network's probabilities under 'tf' does not
  # overwrite the hard 0/1 'tf' votes kept in all_predictions.
  all_predictions_prob = dict(all_predictions)
  # Flatten the (n, 1) network output to one probability per row.
  all_predictions_prob['tf'] = y_predictions_tf.ravel()

  best_predictions_prob = {
      name: all_predictions_prob[name] for name in ('RandomForest', 'tf')
  }

  soma = np.zeros(len(best_predictions_prob['tf']))
  for pred in best_predictions_prob.values():
    soma += pred

  number_models = len(best_predictions_prob)
  pred_prob = [x / number_models for x in soma]
  print(pred_prob)

  ensemble_predictions_prob = np.array([1 if s>=0.5 else 0 for s in pred_prob])
  print(accuracy_score(y_test, ensemble_predictions_prob))


In [None]:
len(best_predictions)

5

# Testing

In [None]:
# Keep only the model's input features. The column list is derived without
# mutating vars_to_use_training, which the train/validation split cell relies on.
feature_cols = [col for col in vars_to_use_training if col != 'Transported']
df_test2 = df_test2[feature_cols]

# Raw network outputs for the real test set.
y_predictions = model.predict(df_test2)
print(df_test2.shape)



(4277, 7)


In [None]:
if use_probabilities:
  # Average the positive-class probabilities of the two models used by the
  # probability ensemble, computed on the real test set (df_test2).
  # ensemble_predictions_prob must not be used here: it holds predictions for
  # the validation split, i.e. different rows and a different length.
  prob_rf = classifiers['RandomForest'].predict_proba(df_test2)[:, 1]
  prob_tf = model.predict(df_test2).ravel()
  predictions_for_submission = (prob_rf + prob_tf) / 2
else:
  predictions_for_submission = final_pred_ensemble

# Convert 0/1 labels (or averaged probabilities) to the 'True'/'False' strings
# written to the output files.
var_score = ['True' if prediction>0.5 else 'False' for prediction in predictions_for_submission]
# Preview only; printing every label floods the cell output.
print(var_score[:20])

score_df = pd.DataFrame(var_score)
score_df.to_csv('score_df.csv')

['True', 'False', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'False', 'True', 'True', 'False', 'False', 'False', 'True', 'True', 'False', 'True', 'False', 'True', 'True', 'False', 'False', 'True', 'True', 'True', 'True', 'True', 'False', 'True', 'False', 'False', 'False', 'True', 'True', 'True', 'False', 'True', 'False', 'True', 'True', 'False', 'True', 'False', 'False', 'True', 'False', 'True', 'True', 'False', 'True', 'True', 'True', 'False', 'True', 'True', 'False', 'True', 'True', 'False', 'True', 'True', 'False', 'True', 'False', 'True', 'False', 'False', 'True', 'False', 'False', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'False', 'False', 'True', 'True', 'False', 'True', 'True', 'False', 'False', 'False', 'False', 'False', 'False', 'True', 'False', 'True', 'True', 'True', 'True', 'True', 'False', 'True', 'False', 'True', 'True', 'False', 'True', 'True', 'False', 'True', 'True', 'False', 'True', 'True', 'False', 'False',

In [None]:
score_df

Unnamed: 0,0
0,True
1,False
2,True
3,True
4,True
...,...
4272,True
4273,True
4274,True
4275,True


# Export

In [None]:
# Submission file: the PassengerId values from the untouched test CSV paired
# with the predicted labels, written without the index column.
submission_columns = {
    "PassengerId": df_test_original["PassengerId"],
    "Transported": var_score,
}
df_export = pd.DataFrame(submission_columns)
df_export.to_csv('results.csv', index=False)