In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

In [2]:
titanic_data = pd.read_csv('archive/titanic_data_proccess.csv')
titanic_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,1,2,0,3.0,1,2,41.5792,1,0,0
1,0,3,0,45.0,1,4,27.9,0,0,1
2,0,3,0,2.0,0,1,10.4625,0,0,1
3,0,2,1,27.0,0,0,13.0,0,0,1
4,0,2,1,24.0,2,0,73.5,0,0,1


In [3]:
Features = list(titanic_data.columns[1:])
Target = 'Survived'
print(Features)

['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked_C', 'Embarked_Q', 'Embarked_S']


In [4]:
result_dict = {}

In [5]:
def summarise_classification(y_test,y_pred):
    acc = accuracy_score(y_test,y_pred,normalize=True)
    num_acc = accuracy_score(y_test,y_pred,normalize=False)
    
    prec = precision_score(y_test,y_pred)
    recall = recall_score(y_test,y_pred)
    return {'accuracy':acc,
            'precision':prec,
            'recall':recall,
            'accuracy_count':num_acc}

In [6]:
def build_model(classifier_fn,
                name_of_y_col,
                names_of_x_cols,
                dataset,
                test_frac=0.2):
    X = dataset[names_of_x_cols]
    y = dataset[name_of_y_col]
    
    X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=test_frac)
    model = classifier_fn(X_train,y_train)
    
    y_pred = model.predict(X_test)
    y_pred_train = model.predict(X_train)
    
    train_summary = summarise_classification(y_train,y_pred_train)
    test_summary = summarise_classification(y_test,y_pred)
    
    pred_results = pd.DataFrame({'y_test':y_test,
                                 'y_pred':y_pred})
    
    model_crosstab = pd.crosstab(pred_results.y_pred, pred_results.y_test)
    return {'training':train_summary,
            'test': test_summary,
            'confusion_matrix':model_crosstab}

In [7]:
def compare_results():
    for key in result_dict:
        print('Classification: ', key)
        print()
        
        print('Training data')
        for score in result_dict[key]['training']:
            print(score, result_dict[key]['training'][score])    
        print()
        
        print('Test data')
        for score in result_dict[key]['test']:
            print(score,result_dict[key]['test'][score])
        print()

In [8]:
def logistic_fn(X_train,y_train):
    model = LogisticRegression(solver='liblinear')
    model.fit(X_train,y_train)
    return model

In [9]:
result_dict['survived - logistic'] = build_model(logistic_fn,
                                                 Target,
                                                Features,
                                                titanic_data)
compare_results()

Classification:  survived - logistic

Training data
accuracy 0.8014059753954306
precision 0.7729468599033816
recall 0.7079646017699115
accuracy_count 456

Test data
accuracy 0.7832167832167832
precision 0.803921568627451
recall 0.6612903225806451
accuracy_count 112



In [10]:
#Linear_discriminant_analysis is more like PCA, it reduces dim on the best axis that best separate data into
#different classes
#there is many solvers we can use but svd is the default
def linear_discriminant_fn(x_train,y_train,solver='svd'):
    model = LinearDiscriminantAnalysis(solver=solver)
    model.fit(x_train, y_train)
    return model

In [11]:
result_dict['survived - LinearDiscriminantAnalysis'] = build_model(linear_discriminant_fn,
                                                                   Target,
                                                                   Features,
                                                                   titanic_data)
compare_results()

Classification:  survived - logistic

Training data
accuracy 0.8014059753954306
precision 0.7729468599033816
recall 0.7079646017699115
accuracy_count 456

Test data
accuracy 0.7832167832167832
precision 0.803921568627451
recall 0.6612903225806451
accuracy_count 112

Classification:  survived - LinearDiscriminantAnalysis

Training data
accuracy 0.7961335676625659
precision 0.7603686635944701
recall 0.7205240174672489
accuracy_count 453

Test data
accuracy 0.8041958041958042
precision 0.8163265306122449
recall 0.6779661016949152
accuracy_count 115



In [12]:
#quadratic_discriminant_fn is like linear but instead of linear axis we make quadratic decision boundary
#we use quadratic discriminant when features covariance to the target are different per feature
def quadratic_discriminant_fn(x_train,y_train):
    model = QuadraticDiscriminantAnalysis()
    model.fit(x_train,y_train)
    
    return model

In [13]:
#we removed last feature due to onehot encoding varaible collinearity warning from sklearn
result_dict['survived - QuadraticDiscriminantAnalysis'] = build_model(quadratic_discriminant_fn,
                                                                     Target,
                                                                     Features[0:-1],
                                                                     titanic_data)
compare_results()

Classification:  survived - logistic

Training data
accuracy 0.8014059753954306
precision 0.7729468599033816
recall 0.7079646017699115
accuracy_count 456

Test data
accuracy 0.7832167832167832
precision 0.803921568627451
recall 0.6612903225806451
accuracy_count 112

Classification:  survived - LinearDiscriminantAnalysis

Training data
accuracy 0.7961335676625659
precision 0.7603686635944701
recall 0.7205240174672489
accuracy_count 453

Test data
accuracy 0.8041958041958042
precision 0.8163265306122449
recall 0.6779661016949152
accuracy_count 115

Classification:  survived - QuadraticDiscriminantAnalysis

Training data
accuracy 0.789103690685413
precision 0.7570093457943925
recall 0.7043478260869566
accuracy_count 449

Test data
accuracy 0.7972027972027972
precision 0.7543859649122807
recall 0.7413793103448276
accuracy_count 114



In [26]:
#Stochastic Gradient Descent iteratively converges to the best model
#max_iteration tolerence
def sgd_fn(x_train,y_train,max_iter=10000, tol=1e-3):
    model = SGDClassifier(max_iter=max_iter, tol=tol)
    model.fit(x_train,y_train)
    return model

In [31]:
result_dict['survived - SGD'] = build_model(sgd_fn,
                                            Target,
                                            Features,
                                           titanic_data)
compare_results()

Classification:  survived - logistic

Training data
accuracy 0.8014059753954306
precision 0.7729468599033816
recall 0.7079646017699115
accuracy_count 456

Test data
accuracy 0.7832167832167832
precision 0.803921568627451
recall 0.6612903225806451
accuracy_count 112

Classification:  survived - LinearDiscriminantAnalysis

Training data
accuracy 0.7961335676625659
precision 0.7603686635944701
recall 0.7205240174672489
accuracy_count 453

Test data
accuracy 0.8041958041958042
precision 0.8163265306122449
recall 0.6779661016949152
accuracy_count 115

Classification:  survived - QuadraticDiscriminantAnalysis

Training data
accuracy 0.789103690685413
precision 0.7570093457943925
recall 0.7043478260869566
accuracy_count 449

Test data
accuracy 0.7972027972027972
precision 0.7543859649122807
recall 0.7413793103448276
accuracy_count 114

Classification:  survived - SGD

Training data
accuracy 0.7117750439367311
precision 0.6290909090909091
recall 0.7361702127659574
accuracy_count 405

Test data

In [36]:
#SVM classification can be extended to almost any data using something called the kernel trick
#SVM tries to seperate classes using hyperplane or seperation line
#C is value of penality applied to outliers in the wrong side of the hyperplane, small c values indicates
#hard hyperplanes which consider outliers, high values indicates soft hyperplance which is more 
#tolerante to outliers
#prefer dual=False when n_samples >n_features
def linear_svc_fn(x_train,y_train,C=1.0,max_iter=1000, tol=1e-3):
    model = LinearSVC(C=C, max_iter=max_iter, tol=tol, dual=False)
#   model = SVC(kernal='linear',C=C, max_iter=max_iter, tol=tol, dual=False)
    model.fit(x_train,y_train)
    return model

In [38]:
result_dict['survived - linear_svc'] = build_model(linear_svc_fn,
                                                   Target,
                                                   Features,
                                                   titanic_data)
compare_results()

Classification:  survived - logistic

Training data
accuracy 0.8014059753954306
precision 0.7729468599033816
recall 0.7079646017699115
accuracy_count 456

Test data
accuracy 0.7832167832167832
precision 0.803921568627451
recall 0.6612903225806451
accuracy_count 112

Classification:  survived - LinearDiscriminantAnalysis

Training data
accuracy 0.7961335676625659
precision 0.7603686635944701
recall 0.7205240174672489
accuracy_count 453

Test data
accuracy 0.8041958041958042
precision 0.8163265306122449
recall 0.6779661016949152
accuracy_count 115

Classification:  survived - QuadraticDiscriminantAnalysis

Training data
accuracy 0.789103690685413
precision 0.7570093457943925
recall 0.7043478260869566
accuracy_count 449

Test data
accuracy 0.7972027972027972
precision 0.7543859649122807
recall 0.7413793103448276
accuracy_count 114

Classification:  survived - SGD

Training data
accuracy 0.7117750439367311
precision 0.6290909090909091
recall 0.7361702127659574
accuracy_count 405

Test data

In [48]:
#Nearest Neighbors
def radius_neighbor_fn(x_train,y_train,radius=40.0):
    model = RadiusNeighborsClassifier(radius=radius)
    model.fit(x_train,y_train)
    
    return model

In [49]:
result_dict['survived - radius_neighbor'] = build_model(radius_neighbor_fn,
                                                        Target,
                                                        Features,
                                                       titanic_data)
compare_results()

Classification:  survived - logistic

Training data
accuracy 0.8014059753954306
precision 0.7729468599033816
recall 0.7079646017699115
accuracy_count 456

Test data
accuracy 0.7832167832167832
precision 0.803921568627451
recall 0.6612903225806451
accuracy_count 112

Classification:  survived - LinearDiscriminantAnalysis

Training data
accuracy 0.7961335676625659
precision 0.7603686635944701
recall 0.7205240174672489
accuracy_count 453

Test data
accuracy 0.8041958041958042
precision 0.8163265306122449
recall 0.6779661016949152
accuracy_count 115

Classification:  survived - QuadraticDiscriminantAnalysis

Training data
accuracy 0.789103690685413
precision 0.7570093457943925
recall 0.7043478260869566
accuracy_count 449

Test data
accuracy 0.7972027972027972
precision 0.7543859649122807
recall 0.7413793103448276
accuracy_count 114

Classification:  survived - SGD

Training data
accuracy 0.7117750439367311
precision 0.6290909090909091
recall 0.7361702127659574
accuracy_count 405

Test data

In [52]:
#decision tress set up a tree structure on training data which helps make decisions based on rules
def decision_tree_fn(x_train,y_train,max_depth=None,max_features=None):
    model = DecisionTreeClassifier(max_depth=max_depth,max_features=max_features)
    model.fit(x_train,y_train)
    return model

In [56]:
result_dict['survived - decision_tree'] = build_model(decision_tree_fn,
                                                      Target,
                                                      Features,
                                                      titanic_data)
compare_results()

Classification:  survived - logistic

Training data
accuracy 0.8014059753954306
precision 0.7729468599033816
recall 0.7079646017699115
accuracy_count 456

Test data
accuracy 0.7832167832167832
precision 0.803921568627451
recall 0.6612903225806451
accuracy_count 112

Classification:  survived - LinearDiscriminantAnalysis

Training data
accuracy 0.7961335676625659
precision 0.7603686635944701
recall 0.7205240174672489
accuracy_count 453

Test data
accuracy 0.8041958041958042
precision 0.8163265306122449
recall 0.6779661016949152
accuracy_count 115

Classification:  survived - QuadraticDiscriminantAnalysis

Training data
accuracy 0.789103690685413
precision 0.7570093457943925
recall 0.7043478260869566
accuracy_count 449

Test data
accuracy 0.7972027972027972
precision 0.7543859649122807
recall 0.7413793103448276
accuracy_count 114

Classification:  survived - SGD

Training data
accuracy 0.7117750439367311
precision 0.6290909090909091
recall 0.7361702127659574
accuracy_count 405

Test data

In [57]:
#naive bayes makes naive(strong) assumptions about independence of features
def naive_bayes_fn(x_train,y_train, priors=None):
    model = GaussianNB(priors=priors)
    model.fit(x_train,y_train)
    return model

In [58]:
result_dict['survived - naive_based'] = build_model(naive_bayes_fn,
                                                    Target,
                                                    Features,
                                                    titanic_data)
compare_results()

Classification:  survived - logistic

Training data
accuracy 0.8014059753954306
precision 0.7729468599033816
recall 0.7079646017699115
accuracy_count 456

Test data
accuracy 0.7832167832167832
precision 0.803921568627451
recall 0.6612903225806451
accuracy_count 112

Classification:  survived - LinearDiscriminantAnalysis

Training data
accuracy 0.7961335676625659
precision 0.7603686635944701
recall 0.7205240174672489
accuracy_count 453

Test data
accuracy 0.8041958041958042
precision 0.8163265306122449
recall 0.6779661016949152
accuracy_count 115

Classification:  survived - QuadraticDiscriminantAnalysis

Training data
accuracy 0.789103690685413
precision 0.7570093457943925
recall 0.7043478260869566
accuracy_count 449

Test data
accuracy 0.7972027972027972
precision 0.7543859649122807
recall 0.7413793103448276
accuracy_count 114

Classification:  survived - SGD

Training data
accuracy 0.7117750439367311
precision 0.6290909090909091
recall 0.7361702127659574
accuracy_count 405

Test data