In [205]:
import numpy as np
import pandas as pd
import seaborn as sns
import warnings
import os
import Directory
import transformation
import pickle
import matplotlib.pyplot as plt
%matplotlib inline

In [206]:
# Ignore warnings

def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

In [207]:
###### Imports ######

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from scipy.stats import uniform
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from xgboost import XGBClassifier

In [208]:
import importlib
importlib.reload(transformation)

<module 'transformation' from 'C:\\Users\\tanse\\Documents\\MyStuff\\Temp\\Kaggle\\titanic\\Code\\transformation.py'>

In [209]:
variables = 'Sex_pclass_cabinexist_isoap_ischild_age'

### Logistic Regression

In [210]:
###### Name of output File ######
file_name = 'LogisticRegression_' + variables + '.csv'

In [211]:
###### Get the data and transform #######

titanicTrain = pd.read_csv(os.path.join(Directory.dataPath,'train.csv'))
X_test = pd.read_csv(os.path.join(Directory.dataPath,'test.csv'))

titanicCleaned = titanicTrain.copy()
titanicCleaned = transformation.fillTitanicNa(titanicCleaned,titanicCleaned,True,True)

# This is the features data from the training set without the PassengerId
X = titanicCleaned.drop(['Survived','PassengerId'],axis=1)

# This is the response variable from the training set
y = titanicCleaned['Survived']

# This is the test data including the PassengerId
test = pd.read_csv(os.path.join(Directory.dataPath,'test.csv'))

# This is the test data without the PassengerId
test2 = test.drop(['PassengerId'],axis=1)

In [212]:
###### Show the columns ######
X.columns

Index(['AgeCleaned', 'CabinExistence', 'IsOAP', 'IsChild', 'male', 'pclass__2',
       'pclass__3'],
      dtype='object')

In [213]:
X

Unnamed: 0,AgeCleaned,CabinExistence,IsOAP,IsChild,male,pclass__2,pclass__3
0,22,False,0,0,1,0,1
1,38,True,0,0,0,0,0
2,26,False,0,0,0,0,1
3,35,True,0,0,0,0,0
4,35,False,0,0,1,0,1
...,...,...,...,...,...,...,...
886,27,False,0,0,1,1,0
887,19,True,0,0,0,0,0
888,32,False,0,0,0,0,1
889,26,True,0,0,1,0,0


In [214]:
###### Get best parameters ######

# The parameters of the Logistic Regression model which we want to iterate over
param_grid = {'penalty':['l2','l1',None], 'solver':['liblinear','newton-cg', 'lbfgs', 'sag', 'saga'],\
              'C':[10000000,1000,500,200,100,20,10,2,1,0.1,0.01,0.001],\
              'fit_intercept':[True,False]}

# Utilise GridSearchCV to run Linear Regression where for each set of parameters it will run cross validation 20 times.
# n_jobs is the number of parallel jobs
lg_search = GridSearchCV(LogisticRegression(), param_grid, cv=10, refit=True, verbose=1,n_jobs=4)
lg_search.fit(X,y)
lg_search.best_estimator_

Fitting 10 folds for each of 360 candidates, totalling 3600 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done 184 tasks      | elapsed:    0.9s
[Parallel(n_jobs=4)]: Done 2584 tasks      | elapsed:   10.4s
[Parallel(n_jobs=4)]: Done 3600 out of 3600 | elapsed:   13.5s finished


LogisticRegression(C=1, penalty='l1', solver='liblinear')

In [215]:
###### Compared optimised model with default model ######

# Default classifier
lg1 = LogisticRegression()

# Optimised classifier
lg2 = lg_search.best_estimator_

# Here we run cross validation for the default model and the optimised model. We run it 200 times
# with a randomised train and test set and get the mean accuracy score
lg1_scores = cross_val_score(lg1, X, y, cv=10, scoring='accuracy',n_jobs=4)
lg2_scores = cross_val_score(lg2, X, y, cv=10, scoring='accuracy',n_jobs=4)
print('The default classifier accuracy score is {}'.format(lg1_scores.mean()))
print('The optimised classifier accuracy score is {}'.format(lg2_scores.mean()))

The default classifier accuracy score is 0.7990761548064919
The optimised classifier accuracy score is 0.8035705368289637


In [216]:
###### Get and Save Predictions ######

# This is the features data from the training set without the PassengerId
X = titanicCleaned.drop(['Survived','PassengerId'],axis=1)

# This is the response variable from the training set
y = titanicCleaned['Survived']

# This is the test data including the PassengerId
test = pd.read_csv(os.path.join(Directory.dataPath,'test.csv'))

# This is the test data without the PassengerId
test2 = test.drop(['PassengerId'],axis=1)

lg2.fit(X,y)

titanicTestCleaned = transformation.fillTitanicNa(test,titanicTrain.copy(),True,True)
titanicTestCleaned.drop('PassengerId',inplace=True,axis=1)
transformation.makeColumnsEqual(titanicTestCleaned,X)

# lg2 was already trained, we use it here in the function to do the prediction
transformation.PredictAndSave2(test2,titanicTrain.copy(),X,lg2,os.path.join(Directory.outputPath,file_name),test).head()

Unnamed: 0_level_0,Survived,AgeCleaned,CabinExistence,IsOAP,IsChild,male,pclass__2,pclass__3
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
892,0,34,False,0,0,1,0,1
893,0,47,False,0,0,0,0,1
894,0,62,False,1,0,1,1,0
895,0,27,False,0,0,1,0,1
896,1,22,False,0,0,0,0,1


### Linear SVC

In [217]:
###### Name of output File ######
file_name = 'LSVC_' + variables + '.csv'

In [218]:
###### Get the data and transform #######

titanicTrain = pd.read_csv(os.path.join(Directory.dataPath,'train.csv'))
X_test = pd.read_csv(os.path.join(Directory.dataPath,'test.csv'))

titanicCleaned = titanicTrain.copy()
titanicCleaned = transformation.fillTitanicNa(titanicCleaned,titanicCleaned,True,True)

# This is the features data from the training set without the PassengerId
X = titanicCleaned.drop(['Survived','PassengerId'],axis=1)

# This is the response variable from the training set
y = titanicCleaned['Survived']

# This is the test data including the PassengerId
test = pd.read_csv(os.path.join(Directory.dataPath,'test.csv'))

# This is the test data without the PassengerId
test2 = test.drop(['PassengerId'],axis=1)

In [219]:
###### Show the columns ######
X.columns

Index(['AgeCleaned', 'CabinExistence', 'IsOAP', 'IsChild', 'male', 'pclass__2',
       'pclass__3'],
      dtype='object')

In [220]:
###### Get best parameters ######

# The parameters of the Logistic Regression model which we want to iterate over
param_grid = {'dual':[True,False], 'C':[0.00001,0.001,0.01,0.1,0.15,1,2,10,20],'penalty':['l2','l1',None]}

# Utilise GridSearchCV to run Linear Regression where for each set of parameters it will run cross validation 20 times.
# n_jobs is the number of parallel jobs
lsvc_search = GridSearchCV(LinearSVC(), param_grid, cv=10, refit=True, verbose=1,n_jobs=1)
lsvc_search.fit(X,y)
print(lsvc_search.best_estimator_)

Fitting 10 folds for each of 54 candidates, totalling 540 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


LinearSVC(C=0.15, dual=False, penalty='l1')


[Parallel(n_jobs=1)]: Done 540 out of 540 | elapsed:    4.1s finished


In [221]:
###### Compared optimised model with default model ######

# Default classifier
lg1 = LinearSVC()

# Optimised classifier
lg2 = lsvc_search.best_estimator_

# Here we run cross validation for the default model and the optimised model. We run it 200 times
# with a randomised train and test set and get the mean accuracy score
lg1_scores = cross_val_score(lg1, X, y, cv=10, scoring='accuracy',n_jobs=4)
lg2_scores = cross_val_score(lg2, X, y, cv=10, scoring='accuracy',n_jobs=4)
print('The default classifier accuracy score is {}'.format(lg1_scores.mean()))
print('The optimised classifier accuracy score is {}'.format(lg2_scores.mean()))

The default classifier accuracy score is 0.7597627965043695
The optimised classifier accuracy score is 0.7912359550561799


In [222]:
###### Get and Save Predictions ######

# This is the features data from the training set without the PassengerId
X = titanicCleaned.drop(['Survived','PassengerId'],axis=1)

# This is the response variable from the training set
y = titanicCleaned['Survived']

# This is the test data including the PassengerId
test = pd.read_csv(os.path.join(Directory.dataPath,'test.csv'))

# This is the test data without the PassengerId
test2 = test.drop(['PassengerId'],axis=1)

lg2.fit(X,y)

titanicTestCleaned = transformation.fillTitanicNa(test,titanicTrain.copy(),True,True)
titanicTestCleaned.drop('PassengerId',inplace=True,axis=1)
transformation.makeColumnsEqual(titanicTestCleaned,X)

# lg2 was already trained, we use it here in the function to do the prediction
transformation.PredictAndSave2(test2,titanicTrain.copy(),X,lg2,os.path.join(Directory.outputPath,file_name),test).head()

Unnamed: 0_level_0,Survived,AgeCleaned,CabinExistence,IsOAP,IsChild,male,pclass__2,pclass__3
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
892,0,34,False,0,0,1,0,1
893,1,47,False,0,0,0,0,1
894,0,62,False,1,0,1,1,0
895,0,27,False,0,0,1,0,1
896,1,22,False,0,0,0,0,1


### Random Forest

In [232]:
###### Name of output File ######
file_name = 'RF_' + variables + '.csv'

In [233]:
###### Get the data and transform #######

titanicTrain = pd.read_csv(os.path.join(Directory.dataPath,'train.csv'))
X_test = pd.read_csv(os.path.join(Directory.dataPath,'test.csv'))

titanicCleaned = titanicTrain.copy()
titanicCleaned = transformation.fillTitanicNa(titanicCleaned,titanicCleaned,True,True)

# This is the features data from the training set without the PassengerId
X = titanicCleaned.drop(['Survived','PassengerId'],axis=1)

# This is the response variable from the training set
y = titanicCleaned['Survived']

# This is the test data including the PassengerId
test = pd.read_csv(os.path.join(Directory.dataPath,'test.csv'))

# This is the test data without the PassengerId
test2 = test.drop(['PassengerId'],axis=1)

In [234]:
###### Show the columns ######
X.columns

Index(['AgeCleaned', 'CabinExistence', 'IsOAP', 'IsChild', 'male', 'pclass__2',
       'pclass__3'],
      dtype='object')

In [235]:
###### Get best parameters ######

# The parameters of the Logistic Regression model which we want to iterate over
param_grid = {'n_estimators':range(10,120,10),'max_depth':list(range(2,10,2))+[None],'criterion':['gini','entropy']}

# Utilise GridSearchCV to run Linear Regression where for each set of parameters it will run cross validation 20 times.
# n_jobs is the number of parallel jobs
rf_search = GridSearchCV(RandomForestClassifier(), param_grid, cv=10, refit=True, verbose=1,n_jobs=4)
rf_search.fit(X,y)
print(rf_search.best_estimator_)

Fitting 10 folds for each of 110 candidates, totalling 1100 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done 128 tasks      | elapsed:    4.8s
[Parallel(n_jobs=4)]: Done 608 tasks      | elapsed:   27.5s
[Parallel(n_jobs=4)]: Done 1036 tasks      | elapsed:   50.5s
[Parallel(n_jobs=4)]: Done 1093 out of 1100 | elapsed:   55.1s remaining:    0.3s


RandomForestClassifier(criterion='entropy', n_estimators=50)


[Parallel(n_jobs=4)]: Done 1100 out of 1100 | elapsed:   55.5s finished


In [236]:
###### Compared optimised model with default model ######

# Default classifier
lg1 = RandomForestClassifier()

# Optimised classifier
lg2 = rf_search.best_estimator_

# Here we run cross validation for the default model and the optimised model. We run it 200 times
# with a randomised train and test set and get the mean accuracy score
lg1_scores = cross_val_score(lg1, X, y, cv=10, scoring='accuracy',n_jobs=4)
lg2_scores = cross_val_score(lg2, X, y, cv=10, scoring='accuracy',n_jobs=4)
print('The default classifier accuracy score is {}'.format(lg1_scores.mean()))
print('The optimised classifier accuracy score is {}'.format(lg2_scores.mean()))

The default classifier accuracy score is 0.8126092384519351
The optimised classifier accuracy score is 0.8148564294631712


In [237]:
###### Get and Save Predictions ######

# This is the features data from the training set without the PassengerId
X = titanicCleaned.drop(['Survived','PassengerId'],axis=1)

# This is the response variable from the training set
y = titanicCleaned['Survived']

# This is the test data including the PassengerId
test = pd.read_csv(os.path.join(Directory.dataPath,'test.csv'))

# This is the test data without the PassengerId
test2 = test.drop(['PassengerId'],axis=1)

lg2.fit(X,y)

titanicTestCleaned = transformation.fillTitanicNa(test,titanicTrain.copy(),True,True)
titanicTestCleaned.drop('PassengerId',inplace=True,axis=1)
transformation.makeColumnsEqual(titanicTestCleaned,X)

# lg2 was already trained, we use it here in the function to do the prediction
transformation.PredictAndSave2(test2,titanicTrain.copy(),X,lg2,os.path.join(Directory.outputPath,file_name),test).head()

Unnamed: 0_level_0,Survived,AgeCleaned,CabinExistence,IsOAP,IsChild,male,pclass__2,pclass__3
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
892,0,34,False,0,0,1,0,1
893,0,47,False,0,0,0,0,1
894,1,62,False,1,0,1,1,0
895,1,27,False,0,0,1,0,1
896,1,22,False,0,0,0,0,1


### Extra Trees

In [238]:
###### Name of output File ######
file_name = 'ET_' + variables + '.csv'

In [239]:
###### Get the data and transform #######

titanicTrain = pd.read_csv(os.path.join(Directory.dataPath,'train.csv'))
X_test = pd.read_csv(os.path.join(Directory.dataPath,'test.csv'))

titanicCleaned = titanicTrain.copy()
titanicCleaned = transformation.fillTitanicNa(titanicCleaned,titanicCleaned,True,True)

# This is the features data from the training set without the PassengerId
X = titanicCleaned.drop(['Survived','PassengerId'],axis=1)

# This is the response variable from the training set
y = titanicCleaned['Survived']

# This is the test data including the PassengerId
test = pd.read_csv(os.path.join(Directory.dataPath,'test.csv'))

# This is the test data without the PassengerId
test2 = test.drop(['PassengerId'],axis=1)

In [240]:
###### Show the columns ######
X.columns

Index(['AgeCleaned', 'CabinExistence', 'IsOAP', 'IsChild', 'male', 'pclass__2',
       'pclass__3'],
      dtype='object')

In [243]:
###### Get best parameters ######

# The parameters of the Logistic Regression model which we want to iterate over
param_grid = {'n_estimators':range(1,120,10), 'criterion':['gini','entropy'],'max_depth':list(range(2,10,2))+[None]}

# Utilise GridSearchCV to run Linear Regression where for each set of parameters it will run cross validation 20 times.
# n_jobs is the number of parallel jobs
et_search = GridSearchCV(ExtraTreesClassifier(), param_grid, cv=10, refit=True, verbose=1,n_jobs=4)
et_search.fit(X,y)
print(et_search.best_estimator_)

Fitting 10 folds for each of 120 candidates, totalling 1200 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done 128 tasks      | elapsed:    3.6s
[Parallel(n_jobs=4)]: Done 728 tasks      | elapsed:   24.8s
[Parallel(n_jobs=4)]: Done 1193 out of 1200 | elapsed:   43.1s remaining:    0.2s


ExtraTreesClassifier(n_estimators=91)


[Parallel(n_jobs=4)]: Done 1200 out of 1200 | elapsed:   43.4s finished


In [244]:
###### Compared optimised model with default model ######

# Default classifier
lg1 = ExtraTreesClassifier()

# Optimised classifier
lg2 = et_search.best_estimator_

# Here we run cross validation for the default model and the optimised model. We run it 200 times
# with a randomised train and test set and get the mean accuracy score
lg1_scores = cross_val_score(lg1, X, y, cv=10, scoring='accuracy',n_jobs=4)
lg2_scores = cross_val_score(lg2, X, y, cv=10, scoring='accuracy',n_jobs=4)
print('The default classifier accuracy score is {}'.format(lg1_scores.mean()))
print('The optimised classifier accuracy score is {}'.format(lg2_scores.mean()))

The default classifier accuracy score is 0.8159800249687891
The optimised classifier accuracy score is 0.817103620474407


In [245]:
###### Get and Save Predictions ######

# This is the features data from the training set without the PassengerId
X = titanicCleaned.drop(['Survived','PassengerId'],axis=1)

# This is the response variable from the training set
y = titanicCleaned['Survived']

# This is the test data including the PassengerId
test = pd.read_csv(os.path.join(Directory.dataPath,'test.csv'))

# This is the test data without the PassengerId
test2 = test.drop(['PassengerId'],axis=1)

lg2.fit(X,y)

titanicTestCleaned = transformation.fillTitanicNa(test,titanicTrain.copy(),True,True)
titanicTestCleaned.drop('PassengerId',inplace=True,axis=1)
transformation.makeColumnsEqual(titanicTestCleaned,X)

# lg2 was already trained, we use it here in the function to do the prediction
transformation.PredictAndSave2(test2,titanicTrain.copy(),X,lg2,os.path.join(Directory.outputPath,file_name),test).head()

Unnamed: 0_level_0,Survived,AgeCleaned,CabinExistence,IsOAP,IsChild,male,pclass__2,pclass__3
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
892,0,34,False,0,0,1,0,1
893,0,47,False,0,0,0,0,1
894,1,62,False,1,0,1,1,0
895,1,27,False,0,0,1,0,1
896,1,22,False,0,0,0,0,1


### XGB Classifier

In [250]:
###### Name of output File ######
file_name = 'XGB_' + variables + '.csv'

In [251]:
###### Get the data and transform #######

titanicTrain = pd.read_csv(os.path.join(Directory.dataPath,'train.csv'))
X_test = pd.read_csv(os.path.join(Directory.dataPath,'test.csv'))

titanicCleaned = titanicTrain.copy()
titanicCleaned = transformation.fillTitanicNa(titanicCleaned,titanicCleaned,True,True)

# This is the features data from the training set without the PassengerId
X = titanicCleaned.drop(['Survived','PassengerId'],axis=1)

# This is the response variable from the training set
y = titanicCleaned['Survived']

# This is the test data including the PassengerId
test = pd.read_csv(os.path.join(Directory.dataPath,'test.csv'))

# This is the test data without the PassengerId
test2 = test.drop(['PassengerId'],axis=1)

In [252]:
###### Show the columns ######
X.columns

Index(['AgeCleaned', 'CabinExistence', 'IsOAP', 'IsChild', 'male', 'pclass__2',
       'pclass__3'],
      dtype='object')

In [254]:
###### Get best parameters ######

# The parameters of the Logistic Regression model which we want to iterate over
param_grid = {'n_estimators':range(1,120,10), 'max_depth':list(range(2,9,1))+[None],\
              'booster':['gbtree'],'gamma': np.arange(0,1,0.2), 'learning_rate': [0.05,0.1,0.15, 0.2,0.25, 0.3]}

# Utilise GridSearchCV to run Linear Regression where for each set of parameters it will run cross validation 20 times.
# n_jobs is the number of parallel jobs
xgb_search = GridSearchCV(XGBClassifier(), param_grid, cv=10, refit=True, verbose=1,n_jobs=16)
xgb_search.fit(X,y)
print(xgb_search.best_estimator_)

Fitting 10 folds for each of 2880 candidates, totalling 28800 fits


[Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    7.5s
[Parallel(n_jobs=16)]: Done 168 tasks      | elapsed:    9.2s
[Parallel(n_jobs=16)]: Done 667 tasks      | elapsed:   19.1s
[Parallel(n_jobs=16)]: Done 1367 tasks      | elapsed:   32.9s
[Parallel(n_jobs=16)]: Done 2267 tasks      | elapsed:   51.6s
[Parallel(n_jobs=16)]: Done 3367 tasks      | elapsed:  1.2min
[Parallel(n_jobs=16)]: Done 4411 tasks      | elapsed:  1.6min
[Parallel(n_jobs=16)]: Done 5911 tasks      | elapsed:  2.1min
[Parallel(n_jobs=16)]: Done 7611 tasks      | elapsed:  2.7min
[Parallel(n_jobs=16)]: Done 9511 tasks      | elapsed:  3.3min
[Parallel(n_jobs=16)]: Done 11604 tasks      | elapsed:  4.0min
[Parallel(n_jobs=16)]: Done 13904 tasks      | elapsed:  4.7min
[Parallel(n_jobs=16)]: Done 16404 tasks      | elapsed:  5.5min
[Parallel(n_jobs=16)]: Done 18664 tasks      | elapsed:  6.2min
[Parallel(n_jobs=16)]: Done 20114 tasks 

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0.2, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.25, max_delta_step=0, max_depth=7,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=41, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)


[Parallel(n_jobs=16)]: Done 28800 out of 28800 | elapsed:  9.5min finished


In [255]:
###### Compared optimised model with default model ######

# Default classifier
lg1 = XGBClassifier()

# Optimised classifier
lg2 = xgb_search.best_estimator_

# Here we run cross validation for the default model and the optimised model. We run it 200 times
# with a randomised train and test set and get the mean accuracy score
lg1_scores = cross_val_score(lg1, X, y, cv=10, scoring='accuracy',n_jobs=4)
lg2_scores = cross_val_score(lg2, X, y, cv=10, scoring='accuracy',n_jobs=4)
print('The default classifier accuracy score is {}'.format(lg1_scores.mean()))
print('The optimised classifier accuracy score is {}'.format(lg2_scores.mean()))

The default classifier accuracy score is 0.8204744069912608
The optimised classifier accuracy score is 0.8328339575530587


In [256]:
###### Get and Save Predictions ######

# This is the features data from the training set without the PassengerId
X = titanicCleaned.drop(['Survived','PassengerId'],axis=1)

# This is the response variable from the training set
y = titanicCleaned['Survived']

# This is the test data including the PassengerId
test = pd.read_csv(os.path.join(Directory.dataPath,'test.csv'))

# This is the test data without the PassengerId
test2 = test.drop(['PassengerId'],axis=1)

lg2.fit(X,y)

titanicTestCleaned = transformation.fillTitanicNa(test,titanicTrain.copy(),True,True)
titanicTestCleaned.drop('PassengerId',inplace=True,axis=1)
transformation.makeColumnsEqual(titanicTestCleaned,X)

# lg2 was already trained, we use it here in the function to do the prediction
transformation.PredictAndSave2(test2,titanicTrain.copy(),X,lg2,os.path.join(Directory.outputPath,file_name),test).head()

Unnamed: 0_level_0,Survived,AgeCleaned,CabinExistence,IsOAP,IsChild,male,pclass__2,pclass__3
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
892,0,34,False,0,0,1,0,1
893,0,47,False,0,0,0,0,1
894,0,62,False,1,0,1,1,0
895,0,27,False,0,0,1,0,1
896,1,22,False,0,0,0,0,1
