In [1]:
import pandas as pd 
import numpy as np 

from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score 
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import  QuadraticDiscriminantAnalysis 
from sklearn.svm import LinearSVC 
from sklearn.neighbors import RadiusNeighborsClassifier 
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Load the pre-processed Titanic data set from disk and preview its first rows.
titanic_data = pd.read_csv('titanic/titanic_processed.csv')
titanic_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,0,2,1,30.0,0,0,13.0,0,0,1
1,0,3,1,18.0,0,0,7.775,0,0,1
2,0,2,1,25.0,0,0,13.0,0,0,1
3,0,3,1,7.0,4,1,39.6875,0,0,1
4,0,3,1,39.0,1,5,31.275,0,0,1


In [3]:
# Every column except the target is used as a feature.  Selecting by name
# (rather than by position) keeps 'Survived' out of the feature list even if
# the column order of the CSV changes.
Features = [column for column in titanic_data.columns if column != 'Survived']
Features

['Pclass',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Fare',
 'Embarked_C',
 'Embarked_Q',
 'Embarked_S']

In [4]:
# Dictionary that holds the results (metrics) of the built models.
# Each entry is keyed by a model label and stores the dict returned by build_model.
result_dict = {}

In [5]:
# function to summarize different scores

def summary_class(y_test, y_pred):
    """Collect the classification scores for one set of predictions.

    y_test : true labels
    y_pred : predicted labels

    Returns a dict with the keys 'accuracy' (fraction of correct
    predictions), 'precision', 'recall' and 'accuracy_count' (number of
    correct predictions), in that order.
    """
    scores = {
        'accuracy': accuracy_score(y_test, y_pred, normalize=True),
        'precision': precision_score(y_test, y_pred),
        'recall': recall_score(y_test, y_pred),
    }
    # normalize=False makes accuracy_score return a count instead of a fraction
    scores['accuracy_count'] = accuracy_score(y_test, y_pred, normalize=False)
    return scores

In [6]:
# function to build, train and evaluate different classification models

def build_model(classifier_fn,
                name_of_y_col,
                name_of_x_cols,
                dataset,
                test_frac = 0.2,
                random_state = 42):
    """Split the data, train a classifier and score it on both partitions.

    classifier_fn  : function taking (x_train, y_train) that instantiates an
                     estimator, fits it and returns the fitted model
    name_of_y_col  : name of the column in `dataset` that contains the target
    name_of_x_cols : list with the names of the feature columns
    dataset        : data frame that holds features and target
    test_frac      : fraction of the rows held out for testing
    random_state   : seed forwarded to train_test_split.  It is fixed by
                     default so that every model is evaluated on the same
                     partition and a re-run reproduces the split; pass None
                     to get a fresh random split on each call.

    Returns a dict with
      'training'         : summary_class scores on the training rows
      'test'             : summary_class scores on the held-out rows
      'confusion_matrix' : crosstab of predicted (rows) vs. actual (columns)
                           labels on the held-out rows
    """
    x = dataset[name_of_x_cols]
    y = dataset[name_of_y_col]

    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size = test_frac, random_state = random_state)

    model = classifier_fn(x_train, y_train)
    y_pred = model.predict(x_test)
    y_pred_train = model.predict(x_train)

    train_summary = summary_class(y_train, y_pred_train)
    test_summary = summary_class(y_test, y_pred)

    # y_test keeps its original row index; y_pred is positional and has the same length
    pred_results = pd.DataFrame({'y_test': y_test,
                                 'y_pred': y_pred})

    model_crosstab = pd.crosstab(pred_results.y_pred, pred_results.y_test)
    return {'training': train_summary,
            'test': test_summary,
            'confusion_matrix': model_crosstab}

In [7]:
def compare_results():
    """Print the training and test scores of every model stored in result_dict."""
    sections = (('training data', 'training'), ('test data', 'test'))
    for model_label, outcome in result_dict.items():
        print('classification :', model_label)

        for heading, partition in sections:
            # blank line, section heading, then one "name value" line per score
            print()
            print(heading)
            for metric, value in outcome[partition].items():
                print(metric, value)

        print()

### Logistic Regression

In [8]:
# Logistic regression
def logistic_fn(x_train, y_train):
    """Fit a LogisticRegression (liblinear solver) on the training data and return it."""
    # fit() returns the fitted estimator itself
    return LogisticRegression(solver='liblinear').fit(x_train, y_train)

In [26]:
# Train and score the logistic regression on all features.
result_dict['model~logistic'] = build_model(
    classifier_fn=logistic_fn,
    name_of_y_col='Survived',
    name_of_x_cols=Features,
    dataset=titanic_data,
)


### Linear Discriminant Analysis

In [10]:
# Linear discriminant analysis
def lda_fn(x_train, y_train, solver='svd'):
    """Fit a LinearDiscriminantAnalysis with the given solver and return it."""
    # fit() returns the fitted estimator itself
    return LinearDiscriminantAnalysis(solver=solver).fit(x_train, y_train)

In [11]:
# Train and score linear discriminant analysis on all features.
result_dict['model~ LDA'] = build_model(
    classifier_fn=lda_fn,
    name_of_y_col='Survived',
    name_of_x_cols=Features,
    dataset=titanic_data,
)

### quadratic Discriminant Analysis

In [13]:
def qda_fn(x_train, y_train, solver='svd'):
    """Fit a QuadraticDiscriminantAnalysis with default settings and return it.

    `solver` is accepted for signature parity with lda_fn but is not
    forwarded to the estimator.
    """
    # fit() returns the fitted estimator itself
    return QuadraticDiscriminantAnalysis().fit(x_train, y_train)

In [14]:
# Train and score quadratic discriminant analysis.
# The last one-hot-encoded column is dropped to avoid the "dummy trap".
result_dict['model~ QDA'] = build_model(
    classifier_fn=qda_fn,
    name_of_y_col='Survived',
    name_of_x_cols=Features[:-1],
    dataset=titanic_data,
)

### stochastic Gradient Descent 

In [19]:
# SGD model
def sgd_fn(x_train, y_train, max_iter=2000, tol=1e-3):
    """Fit an SGDClassifier with the given iteration cap and tolerance and return it."""
    # fit() returns the fitted estimator itself
    return SGDClassifier(max_iter=max_iter, tol=tol).fit(x_train, y_train)

In [27]:
# Train and score the SGD classifier on all features.
result_dict['model~ SGD'] = build_model(
    classifier_fn=sgd_fn,
    name_of_y_col='Survived',
    name_of_x_cols=Features,
    dataset=titanic_data,
)


### support vector classifier

In [21]:
def linear_scv_fn(x_train, y_train, C=1.0, max_iter=2000, tol=1e-3):
    """Fit a LinearSVC (dual=False) with the given C, max_iter and tol and return it."""
    svc_settings = {'C': C, 'max_iter': max_iter, 'tol': tol, 'dual': False}
    # fit() returns the fitted estimator itself
    return LinearSVC(**svc_settings).fit(x_train, y_train)

In [28]:
# Train and score the linear support vector classifier on all features.
result_dict['model~ SVC'] = build_model(
    classifier_fn=linear_scv_fn,
    name_of_y_col='Survived',
    name_of_x_cols=Features,
    dataset=titanic_data,
)


### KNN-classifier (Radius neighbors)

In [24]:
def Radius_neighbor_fn(x_train, y_train, radius=40.0):
    """Fit a RadiusNeighborsClassifier with the given radius and return it."""
    # fit() returns the fitted estimator itself
    return RadiusNeighborsClassifier(radius=radius).fit(x_train, y_train)

In [29]:
# Train and score the radius-neighbours classifier.  The builder passed here
# must be Radius_neighbor_fn; passing linear_scv_fn would store a second
# LinearSVC run under the radius-neighbours label.
# NOTE(review): RadiusNeighborsClassifier.predict raises ValueError for samples
# that have no training neighbour inside the radius unless outlier_label is
# set - confirm the default radius of 40.0 is wide enough for this data.
result_dict['model~ Radius_neighbors'] = build_model(Radius_neighbor_fn,
                                         'Survived',
                                        Features,
                                        titanic_data)


 ### Decision Tree

In [32]:
def Decision_tree(x_train, y_train, max_depth=None, max_features=None):
    """Fit a DecisionTreeClassifier with the given depth/feature limits and return it."""
    tree = DecisionTreeClassifier(max_depth=max_depth,
                                  max_features=max_features)
    # fit() returns the fitted estimator itself
    return tree.fit(x_train, y_train)

In [33]:
# Train and score an unconstrained decision tree on all features.
result_dict['model~ Decision Tree'] = build_model(
    classifier_fn=Decision_tree,
    name_of_y_col='Survived',
    name_of_x_cols=Features,
    dataset=titanic_data,
)

### Naive Bayes

In [36]:
def Naive_Bayes(x_train, y_train, priors=None):
    """Fit a GaussianNB with the given class priors and return it."""
    # fit() returns the fitted estimator itself
    return GaussianNB(priors=priors).fit(x_train, y_train)

In [37]:
# Train and score Gaussian naive Bayes on all features.
result_dict['model~ Naive Bayes'] = build_model(
    classifier_fn=Naive_Bayes,
    name_of_y_col='Survived',
    name_of_x_cols=Features,
    dataset=titanic_data,
)

## Results

In [38]:
# Display the training and test scores of all the models stored in result_dict.
compare_results()

classification : model~logistic

training data
accuracy 0.7943760984182777
precision 0.7873303167420814
recall 0.7131147540983607
accuracy_count 452

test data
accuracy 0.7622377622377622
precision 0.6041666666666666
recall 0.6590909090909091
accuracy_count 109

classification : model~ LDA

training data
accuracy 0.6942003514938488
precision 0.8085106382978723
recall 0.3275862068965517
accuracy_count 395

test data
accuracy 0.7272727272727273
precision 0.84
recall 0.375
accuracy_count 104

classification : model~ QDA

training data
accuracy 0.7943760984182777
precision 0.7669902912621359
recall 0.6960352422907489
accuracy_count 452

test data
accuracy 0.7692307692307693
precision 0.7692307692307693
recall 0.6557377049180327
accuracy_count 110

classification : model~ SGD

training data
accuracy 0.7082601054481547
precision 0.6024464831804281
recall 0.8454935622317596
accuracy_count 403

test data
accuracy 0.7062937062937062
precision 0.5802469135802469
recall 0.8545454545454545
accurac

# Hyperparameter Tuning 

###### using GridSearchCV on Decision Tree

In [58]:
# Preview the data again before setting up the tuning split.
titanic_data.head()


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,0,2,1,30.0,0,0,13.0,0,0,1
1,0,3,1,18.0,0,0,7.775,0,0,1
2,0,2,1,25.0,0,0,13.0,0,0,1
3,0,3,1,7.0,4,1,39.6875,0,0,1
4,0,3,1,39.0,1,5,31.275,0,0,1


In [59]:
# Separate the predictors from the target and hold out 20% of the rows for testing.
# The split is seeded so that the tuning results below are reproducible on a re-run.
X = titanic_data.drop('Survived', axis = 1)
Y = titanic_data['Survived']
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = 42)

In [60]:
def summary_classification (y_test, y_pres):
    """Print accuracy, precision and recall for one set of predictions.

    y_test : true labels
    y_pres : predicted labels (the parameter name is kept unchanged so
             existing callers keep working)

    The scores are computed from the `y_pres` argument itself, not from any
    notebook-level variable.  Nothing is returned; the scores are printed.
    """
    acc = accuracy_score(y_test, y_pres, normalize = True)
    # normalize=False makes accuracy_score return the number of correct predictions
    num_acc = accuracy_score(y_test, y_pres, normalize = False)
    prec = precision_score(y_test, y_pres)
    recall = recall_score(y_test, y_pres)

    print("Test data count :", len(y_test))
    print("accuracy_count:", num_acc)
    print("accuracy_score", acc)
    print("precision_score",prec)
    print("recall_score", recall)
    print()

In [61]:
from sklearn.model_selection import GridSearchCV

# Candidate tree depths to evaluate with 3-fold cross-validation.
parameters = {'max_depth': [2, 4, 5, 7, 9, 10]}

grid_search = GridSearchCV(
    estimator=DecisionTreeClassifier(),
    param_grid=parameters,
    cv=3,
    return_train_score=True,
)
grid_search.fit(x_train, y_train)
grid_search.best_params_

{'max_depth': 4}

In [63]:
# Examine and compare all of the candidate models built by GridSearchCV.
# The number of candidates is read from cv_results_ so the loop stays correct
# when the parameter grid changes.
for i in range(len(grid_search.cv_results_['params'])):
    print('Parameters:', grid_search.cv_results_['params'][i])
    print('Mean Test Score : ', grid_search.cv_results_['mean_test_score'][i])
    print('Rank:', grid_search.cv_results_['rank_test_score'][i])


Parameters: {'max_depth': 2}
Mean Test Score :  0.7908474890931032
Rank: 3
Parameters: {'max_depth': 4}
Mean Test Score :  0.7944026733500418
Rank: 1
Parameters: {'max_depth': 5}
Mean Test Score :  0.7943562610229277
Rank: 2
Parameters: {'max_depth': 7}
Mean Test Score :  0.7785203750116031
Rank: 4
Parameters: {'max_depth': 9}
Mean Test Score :  0.7539404065719856
Rank: 5
Parameters: {'max_depth': 10}
Mean Test Score :  0.7451591942820013
Rank: 6


##### Building the model with the best parameters returned by GridSearchCV

In [64]:
# Re-train a decision tree on the training split using the depth selected by the grid search.
DT_model = DecisionTreeClassifier(max_depth=grid_search.best_params_['max_depth'])
DT_model.fit(x_train, y_train)

In [65]:
# Predict the held-out rows with the tuned decision tree.
y_pred = DT_model.predict(x_test)

In [66]:
 # Print the held-out scores of the tuned decision tree.
 summary_classification(y_test,y_pred)

Test data count : 143
accuracy_count: 118
accuracy_score 0.8251748251748252
precision_score 0.8918918918918919
recall_score 0.6111111111111112



### hyperparameter tuning a logistic regression classifier

In [69]:
# Candidate penalties and regularisation strengths, evaluated with 3-fold cross-validation.
parameters = {
    'penalty': ['l1', 'l2'],
    'C': [0.1, 0.4, 0.8, 1, 2, 5],
}

grid_search = GridSearchCV(
    estimator=LogisticRegression(solver='liblinear'),
    param_grid=parameters,
    cv=3,
    return_train_score=True,
)
grid_search.fit(x_train, y_train)
grid_search.best_params_

{'C': 2, 'penalty': 'l1'}

In [70]:
# Examine and compare all of the candidate models built by GridSearchCV.
# The number of candidates is read from cv_results_ so the loop stays correct
# when the parameter grid changes.
for i in range(len(grid_search.cv_results_['params'])):
    print('Parameters:', grid_search.cv_results_['params'][i])
    print('Mean Test Score : ', grid_search.cv_results_['mean_test_score'][i])
    print('Rank:', grid_search.cv_results_['rank_test_score'][i])


Parameters: {'C': 0.1, 'penalty': 'l1'}
Mean Test Score :  0.7592128469321451
Rank: 12
Parameters: {'C': 0.1, 'penalty': 'l2'}
Mean Test Score :  0.7714842662211083
Rank: 11
Parameters: {'C': 0.4, 'penalty': 'l1'}
Mean Test Score :  0.785602896129212
Rank: 3
Parameters: {'C': 0.4, 'penalty': 'l2'}
Mean Test Score :  0.7768124013738049
Rank: 9
Parameters: {'C': 0.8, 'penalty': 'l1'}
Mean Test Score :  0.7820848417339645
Rank: 5
Parameters: {'C': 0.8, 'penalty': 'l2'}
Mean Test Score :  0.7785575048732943
Rank: 7
Parameters: {'C': 1, 'penalty': 'l1'}
Mean Test Score :  0.7838392276988769
Rank: 4
Parameters: {'C': 1, 'penalty': 'l2'}
Mean Test Score :  0.7785575048732943
Rank: 7
Parameters: {'C': 2, 'penalty': 'l1'}
Mean Test Score :  0.7873572820941241
Rank: 1
Parameters: {'C': 2, 'penalty': 'l2'}
Mean Test Score :  0.7732943469785575
Rank: 10
Parameters: {'C': 5, 'penalty': 'l1'}
Mean Test Score :  0.7873572820941241
Rank: 1
Parameters: {'C': 5, 'penalty': 'l2'}
Mean Test Score :  0.782

In [75]:
# Re-train a logistic regression on the training split using the penalty and C
# selected by the grid search.
LR_model = LogisticRegression(
    solver='liblinear',
    penalty=grid_search.best_params_['penalty'],
    C=grid_search.best_params_['C'],
)
LR_model.fit(x_train, y_train)

In [77]:
# Predict the held-out rows with the tuned logistic regression.
y_pred = LR_model.predict(x_test)

In [78]:
 # Print the held-out scores of the tuned logistic regression.
 summary_classification(y_test,y_pred)

Test data count : 143
accuracy_count: 116
accuracy_score 0.8111888111888111
precision_score 0.7288135593220338
recall_score 0.7962962962962963

