In [1]:
import numpy as np
import pandas as pd
import os
import sys
from cleanse_pipe import cleanse_data
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import math
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
import time
import pycaret
from pycaret.classification import *

In [2]:
# Read the training split from the working directory and preview its first rows.
train = pd.read_csv('train.csv')
train.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Run the project's cleansing pipeline on the raw training frame. It returns a
# pair: the model-ready frame (`data`) and a second object kept as `passengers`.
(data, passengers) = cleanse_data(train)
data.head(n=5)

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,Alone,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Ms,Sex_Male,Farebin,Agebin
0,0,3,22.0,1,0,7.25,0,0,1,0,0,0,1,0,0,1,2,2
1,1,1,38.0,1,0,71.2833,1,0,0,0,0,0,0,1,0,0,5,4
2,1,3,26.0,0,0,7.925,0,0,1,1,0,1,0,0,0,0,2,2
3,1,1,35.0,1,0,53.1,0,0,1,0,0,0,0,1,0,0,5,4
4,0,3,35.0,0,0,8.05,0,0,1,1,0,0,1,0,0,1,2,4


In [4]:
# Initialise the PyCaret experiment on the cleansed frame: 'Survived' is the
# target, 80% of the rows form the training split, and features are normalized
# since we are going to be using models that need normalization.
# session_id is pinned to the seed reported by the recorded run of this cell;
# without it PyCaret generates a new random seed each time, so the split and
# every score below would change on Restart & Run All.
exp_clf = setup(
    data,
    target='Survived',
    train_size=.8,
    normalize=True,
    log_experiment=False,
    session_id=8839,
)

Unnamed: 0,Description,Value
0,session_id,8839
1,Target,Survived
2,Target Type,Binary
3,Label Encoded,"0: 0, 1: 1"
4,Original Data,"(891, 18)"
5,Missing Values,False
6,Numeric Features,10
7,Categorical Features,7
8,Ordinal Features,False
9,High Cardinality Features,False


In [5]:
# Train and cross-validate PyCaret's library of classifiers on the experiment
# configured by setup(), display the ranked score grid, and keep the returned
# model in `best`.
# NOTE(review): `best` is not referenced again in the cells visible here; the
# next cell builds an 'rf' model instead — confirm that this is intentional.
best = compare_models()


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.8273,0.8781,0.712,0.8057,0.7546,0.6224,0.6263,0.019
catboost,CatBoost Classifier,0.8217,0.8705,0.6966,0.8036,0.7452,0.6092,0.6137,0.968
lightgbm,Light Gradient Boosting Machine,0.8189,0.86,0.7187,0.783,0.7478,0.607,0.6099,0.014
ridge,Ridge Classifier,0.8146,0.0,0.734,0.7675,0.7472,0.6014,0.6048,0.02
lda,Linear Discriminant Analysis,0.8146,0.8539,0.7342,0.7673,0.7472,0.6014,0.6048,0.021
svm,SVM - Linear Kernel,0.8133,0.0,0.716,0.7759,0.7415,0.5962,0.6002,0.023
ada,Ada Boost Classifier,0.8118,0.8465,0.7417,0.7571,0.7474,0.5977,0.5996,0.017
lr,Logistic Regression,0.8103,0.8542,0.7157,0.7667,0.7394,0.5907,0.5924,0.259
knn,K Neighbors Classifier,0.802,0.8474,0.6852,0.764,0.7212,0.5685,0.5716,0.137
xgboost,Extreme Gradient Boosting,0.8006,0.8556,0.7004,0.754,0.7251,0.569,0.5711,0.099


In [6]:
# Train and cross-validate the estimator with PyCaret model ID 'rf'.
# NOTE(review): despite the variable name, this is the Random Forest ID, not
# CatBoost. The name is kept because the tuning cell refers to it; rename both
# together if this is cleaned up.
catboost = create_model('rf')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.8472,0.8827,0.7407,0.8333,0.7843,0.6667,0.6694
1,0.7361,0.777,0.5926,0.6667,0.6275,0.4242,0.426
2,0.8028,0.8735,0.6538,0.7727,0.7083,0.561,0.5654
3,0.8028,0.8491,0.7692,0.7143,0.7407,0.582,0.583
4,0.7746,0.8564,0.6538,0.7083,0.68,0.5065,0.5075
5,0.831,0.8981,0.7778,0.7778,0.7778,0.6414,0.6414
6,0.7465,0.7685,0.6296,0.68,0.6538,0.4543,0.4551
7,0.8028,0.8742,0.7407,0.7407,0.7407,0.5816,0.5816
8,0.831,0.8944,0.7407,0.8,0.7692,0.6362,0.6374
9,0.8028,0.8375,0.8148,0.7097,0.7586,0.5933,0.5973


In [7]:
# Hyperparameter-tune the model created in the previous cell using 100 search
# iterations (n_iter) and keep the tuned estimator.
# NOTE(review): `catboost` / `tuned_cat` hold the model created with ID 'rf',
# not a CatBoost model — the names are misleading.
tuned_cat = tune_model(catboost, n_iter= 100)



Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.8611,0.9107,0.7407,0.8696,0.8,0.6947,0.6999
1,0.7917,0.7745,0.6667,0.75,0.7059,0.5455,0.5477
2,0.8451,0.8897,0.7692,0.8,0.7843,0.6635,0.6638
3,0.831,0.8684,0.8077,0.75,0.7778,0.6417,0.6429
4,0.831,0.9124,0.7308,0.7917,0.76,0.6299,0.6311
5,0.8732,0.9066,0.8148,0.8462,0.8302,0.7291,0.7295
6,0.8028,0.811,0.7037,0.76,0.7308,0.5756,0.5766
7,0.8028,0.8737,0.7778,0.7241,0.75,0.5876,0.5886
8,0.8873,0.9091,0.8148,0.88,0.8462,0.7575,0.7589
9,0.831,0.827,0.8519,0.7419,0.7931,0.6514,0.6558


In [None]:
# Score the held-out test file with the tuned model and write the submission CSV.
test = pd.read_csv('test.csv')
# Apply the same cleansing pipeline that was used on the training data.
# assumes cleanse_data copes with a frame that has no 'Survived' column — TODO confirm
test_data, passengers1 = cleanse_data(test)

preds = predict_model(tuned_cat, data = test_data)

ready_preds = pd.DataFrame(passengers1, columns = ['PassengerId'])
# Assign positionally instead of by index label: if `passengers1` and the
# predict_model output carry different indexes, label alignment would silently
# fill 'Survived' with NaN or misaligned rows. A length mismatch now raises
# instead of producing a corrupt file.
# assumes passengers1 and test_data are in the same row order — TODO confirm
# NOTE(review): 'Label' is the prediction column in PyCaret 2.x; newer releases
# name it 'prediction_label' — verify against the installed version.
ready_preds['Survived'] = preds['Label'].to_numpy()

ready_preds.to_csv('titanic_preds.csv', index = False)