In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import warnings
import os
import Directory
import transformation
import pickle
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Ignore warnings

def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

In [3]:
###### Imports ######

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from scipy.stats import uniform
from sklearn.svm import LinearSVC

### Logistic Regression

In [164]:
import importlib
importlib.reload(transformation)

<module 'transformation' from 'C:\\Users\\tanse\\Documents\\MyStuff\\Temp\\Kaggle\\titanic\\Code\\transformation.py'>

In [165]:
###### Name of output File ######
file_name = 'LogisticRegression_Sex_pclass_cabinexist_age.csv'

In [166]:
###### Get the data and transform #######

titanicTrain = pd.read_csv(os.path.join(Directory.dataPath,'train.csv'))
X_test = pd.read_csv(os.path.join(Directory.dataPath,'test.csv'))

titanicCleaned = titanicTrain.copy()
titanicCleaned = transformation.fillTitanicNa(titanicCleaned,titanicCleaned,True,True)

# This is the features data from the training set without the PassengerId
X = titanicCleaned.drop(['Survived','PassengerId'],axis=1)

# This is the response variable from the training set
y = titanicCleaned['Survived']

# This is the test data including the PassengerId
test = pd.read_csv(os.path.join(Directory.dataPath,'test.csv'))

# This is the test data without the PassengerId
test2 = test.drop(['PassengerId'],axis=1)

In [167]:
###### Show the columns ######
X.columns

Index(['AgeCleaned', 'HasAge', 'CabinExistence', 'male', 'pclass__2',
       'pclass__3'],
      dtype='object')

In [168]:
X

Unnamed: 0,AgeCleaned,HasAge,CabinExistence,male,pclass__2,pclass__3
0,22.0,1,False,1,0,1
1,38.0,1,True,0,0,0
2,26.0,1,False,0,0,1
3,35.0,1,True,0,0,0
4,35.0,1,False,1,0,1
...,...,...,...,...,...,...
886,27.0,1,False,1,1,0
887,19.0,1,True,0,0,0
888,0.0,0,False,0,0,1
889,26.0,1,True,1,0,0


In [169]:
###### Get best parameters ######

# The parameters of the Logistic Regression model which we want to iterate over
param_grid = {'penalty':['l2','l1',None], 'solver':['liblinear','newton-cg', 'lbfgs', 'sag', 'saga'],\
              'C':[10000000,1000,500,200,100,20,10,2,1,0.1,0.01,0.001],\
              'fit_intercept':[True,False]}

# Utilise GridSearchCV to run Linear Regression where for each set of parameters it will run cross validation 20 times.
# n_jobs is the number of parallel jobs
lg_search = GridSearchCV(LogisticRegression(), param_grid, cv=10, refit=True, verbose=1,n_jobs=4)
lg_search.fit(X,y)
lg_search.best_estimator_

Fitting 10 folds for each of 360 candidates, totalling 3600 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done 152 tasks      | elapsed:    0.7s
[Parallel(n_jobs=4)]: Done 3546 tasks      | elapsed:   13.4s
[Parallel(n_jobs=4)]: Done 3600 out of 3600 | elapsed:   13.4s finished


LogisticRegression(C=10000000, solver='saga')

In [170]:
###### Compared optimised model with default model ######

# Default classifier
lg1 = LogisticRegression()

# Optimised classifier
lg2 = lg_search.best_estimator_

# Here we run cross validation for the default model and the optimised model. We run it 200 times
# with a randomised train and test set and get the mean accuracy score
lg1_scores = cross_val_score(lg1, X, y, cv=10, scoring='accuracy',n_jobs=4)
lg2_scores = cross_val_score(lg2, X, y, cv=10, scoring='accuracy',n_jobs=4)
print('The default classifier accuracy score is {}'.format(lg1_scores.mean()))
print('The optimised classifier accuracy score is {}'.format(lg2_scores.mean()))

The default classifier accuracy score is 0.7979650436953807
The optimised classifier accuracy score is 0.798039950062422


In [157]:
###### Get and Save Predictions ######

# This is the features data from the training set without the PassengerId
X = titanicCleaned.drop(['Survived','PassengerId'],axis=1)

# This is the response variable from the training set
y = titanicCleaned['Survived']

# This is the test data including the PassengerId
test = pd.read_csv(os.path.join(Directory.dataPath,'test.csv'))

# This is the test data without the PassengerId
test2 = test.drop(['PassengerId'],axis=1)

lg2.fit(X,y)

titanicTestCleaned = transformation.fillTitanicNa(test,titanicTrain.copy(),True,True)
titanicTestCleaned.drop('PassengerId',inplace=True,axis=1)
transformation.makeColumnsEqual(titanicTestCleaned,X)

# lg2 was already trained, we use it here in the function to do the prediction
transformation.PredictAndSave2(test2,titanicTrain.copy(),X,lg2,os.path.join(Directory.outputPath,file_name),test).head()

Unnamed: 0_level_0,Survived,AgeCleaned,CabinExistence,male,pclass__2,pclass__3
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
892,0,34.5,False,1,0,1
893,0,47.0,False,0,0,1
894,0,62.0,False,1,1,0
895,0,27.0,False,1,0,1
896,0,22.0,False,0,0,1


### Linear SVC

In [171]:
###### Name of output File ######
file_name = 'LSVC_Sex_pclass_cabinexist_age.csv'

In [172]:
###### Get the data and transform #######

titanicTrain = pd.read_csv(os.path.join(Directory.dataPath,'train.csv'))
X_test = pd.read_csv(os.path.join(Directory.dataPath,'test.csv'))

titanicCleaned = titanicTrain.copy()
titanicCleaned = transformation.fillTitanicNa(titanicCleaned,titanicCleaned,True,True)

# This is the features data from the training set without the PassengerId
X = titanicCleaned.drop(['Survived','PassengerId'],axis=1)

# This is the response variable from the training set
y = titanicCleaned['Survived']

# This is the test data including the PassengerId
test = pd.read_csv(os.path.join(Directory.dataPath,'test.csv'))

# This is the test data without the PassengerId
test2 = test.drop(['PassengerId'],axis=1)

In [173]:
###### Show the columns ######
X.columns

Index(['AgeCleaned', 'HasAge', 'CabinExistence', 'male', 'pclass__2',
       'pclass__3'],
      dtype='object')

In [174]:
###### Get best parameters ######

# The parameters of the Logistic Regression model which we want to iterate over
param_grid = {'dual':[True,False], 'C':[0.00001,0.001,0.01,0.1,0.15,1,2,10,20],'penalty':['l2','l1',None]}

# Utilise GridSearchCV to run Linear Regression where for each set of parameters it will run cross validation 20 times.
# n_jobs is the number of parallel jobs
lsvc_search = GridSearchCV(LinearSVC(), param_grid, cv=10, refit=True, verbose=1,n_jobs=1)
lsvc_search.fit(X,y)
print(lsvc_search.best_estimator_)

Fitting 10 folds for each of 54 candidates, totalling 540 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


LinearSVC(C=0.15)


[Parallel(n_jobs=1)]: Done 540 out of 540 | elapsed:    3.9s finished


In [175]:
###### Compared optimised model with default model ######

# Default classifier
lg1 = LinearSVC()

# Optimised classifier
lg2 = lg_search.best_estimator_

# Here we run cross validation for the default model and the optimised model. We run it 200 times
# with a randomised train and test set and get the mean accuracy score
lg1_scores = cross_val_score(lg1, X, y, cv=10, scoring='accuracy',n_jobs=4)
lg2_scores = cross_val_score(lg2, X, y, cv=10, scoring='accuracy',n_jobs=4)
print('The default classifier accuracy score is {}'.format(lg1_scores.mean()))
print('The optimised classifier accuracy score is {}'.format(lg2_scores.mean()))

The default classifier accuracy score is 0.724956304619226
The optimised classifier accuracy score is 0.798039950062422


In [163]:
###### Get and Save Predictions ######

# This is the features data from the training set without the PassengerId
X = titanicCleaned.drop(['Survived','PassengerId'],axis=1)

# This is the response variable from the training set
y = titanicCleaned['Survived']

# This is the test data including the PassengerId
test = pd.read_csv(os.path.join(Directory.dataPath,'test.csv'))

# This is the test data without the PassengerId
test2 = test.drop(['PassengerId'],axis=1)

lg2.fit(X,y)

titanicTestCleaned = transformation.fillTitanicNa(test,titanicTrain.copy(),True,True)
titanicTestCleaned.drop('PassengerId',inplace=True,axis=1)
transformation.makeColumnsEqual(titanicTestCleaned,X)

# lg2 was already trained, we use it here in the function to do the prediction
transformation.PredictAndSave2(test2,titanicTrain.copy(),X,lg2,os.path.join(Directory.outputPath,file_name),test).head()

Unnamed: 0_level_0,Survived,AgeCleaned,CabinExistence,male,pclass__2,pclass__3
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
892,0,34.5,False,1,0,1
893,0,47.0,False,0,0,1
894,0,62.0,False,1,1,0
895,0,27.0,False,1,0,1
896,0,22.0,False,0,0,1
