In [3]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

In [10]:
# Read the Titanic CSV from the local archive/ folder into a DataFrame.
# NOTE(review): judging by the file name this is an already pre-processed
# export (numeric Sex, one-hot Embarked_*) -- confirm against the notebook
# that produced it.
titanic_data = pd.read_csv('archive/titanic_data_proccess.csv')
# Preview the first rows as a sanity check on the loaded columns.
titanic_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,1,2,0,3.0,1,2,41.5792,1,0,0
1,0,3,0,45.0,1,4,27.9,0,0,1
2,0,3,0,2.0,0,1,10.4625,0,0,1
3,0,2,1,27.0,0,0,13.0,0,0,1
4,0,2,1,24.0,2,0,73.5,0,0,1


In [20]:
# Name of the label column the classifiers are trained to predict.
Target = 'Survived'

# Select the feature columns by name instead of by position.  A positional
# slice such as columns[1:] only skips the first column, so with the frame
# previewed above ('Unnamed: 0', 'Survived', 'Pclass', ...) it would keep
# the label itself as a feature and leak the answer into the model.
# 'Unnamed: 0' is the row index written out by an earlier to_csv call and
# carries no predictive information, so it is excluded as well.
Features = [col for col in titanic_data.columns
            if col not in (Target, 'Unnamed: 0')]
print(Features)

['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked_C', 'Embarked_Q', 'Embarked_S']


In [11]:
# Registry of evaluation results, keyed by a free-form experiment label.
# Each value is the dict returned by build_model(); compare_results() reads
# this global to print every stored experiment.
result_dict = {}

In [28]:
def summarise_classification(y_test,y_pred):
    """Collect the headline classification metrics for one set of predictions.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels, in the same order as ``y_test``.

    Returns
    -------
    dict
        ``accuracy`` (fraction of correct predictions), ``precision``,
        ``recall`` and ``accuracy_count`` (absolute number of correct
        predictions), in that key order.
    """
    fraction_correct = accuracy_score(y_test, y_pred, normalize=True)
    # normalize=False switches accuracy_score from a ratio to a raw count.
    n_correct = accuracy_score(y_test, y_pred, normalize=False)

    summary = {}
    summary['accuracy'] = fraction_correct
    summary['precision'] = precision_score(y_test, y_pred)
    summary['recall'] = recall_score(y_test, y_pred)
    summary['accuracy_count'] = n_correct
    return summary

In [32]:
def build_model(classifier_fn,
                name_of_y_col,
                names_of_x_cols,
                dataset,
                test_frac=0.2,
                random_state=None):
    """Fit a classifier on a random train/test split and evaluate it.

    Parameters
    ----------
    classifier_fn : callable
        Called as ``classifier_fn(X_train, y_train)``; must return a fitted
        object that exposes ``predict``.
    name_of_y_col : str
        Column of ``dataset`` holding the label.
    names_of_x_cols : list of str
        Columns of ``dataset`` used as features.
    dataset : pandas.DataFrame
        Table containing both the feature columns and the label column.
    test_frac : float, default 0.2
        Forwarded to ``train_test_split`` as ``test_size``.
    random_state : int, RandomState instance or None, default None
        Forwarded to ``train_test_split``.  Pass an integer to get the same
        split (and therefore comparable metrics) on every run.  The default
        ``None`` keeps the previous behaviour of an unseeded split.

    Returns
    -------
    dict
        ``'training'`` and ``'test'`` hold the metric dicts produced by
        ``summarise_classification``; ``'confusion_matrix'`` is a crosstab
        with predicted labels as rows and true labels as columns, computed
        on the test split.
    """
    X = dataset[names_of_x_cols]
    y = dataset[name_of_y_col]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_frac, random_state=random_state)
    model = classifier_fn(X_train, y_train)

    y_pred = model.predict(X_test)
    y_pred_train = model.predict(X_train)

    # Scoring the training split as well makes over-fitting visible as a gap
    # between the two summaries.
    train_summary = summarise_classification(y_train, y_pred_train)
    test_summary = summarise_classification(y_test, y_pred)

    pred_results = pd.DataFrame({'y_test': y_test,
                                 'y_pred': y_pred})

    # Rows are predicted labels, columns are true labels.
    model_crosstab = pd.crosstab(pred_results.y_pred, pred_results.y_test)
    return {'training': train_summary,
            'test': test_summary,
            'confusion_matrix': model_crosstab}

In [14]:
def compare_results():
    """Print the training and test metrics of every entry in ``result_dict``.

    Reads the module-level ``result_dict``; for each stored experiment it
    prints the label followed by one ``metric value`` line per score, first
    for the training split and then for the test split.  Returns ``None``.
    """
    sections = (('Training data', 'training'),
                ('Test data', 'test'))

    for label, summaries in result_dict.items():
        print('Classification: ', label)
        print()

        for heading, split_key in sections:
            print(heading)
            for metric, value in summaries[split_key].items():
                print(metric, value)
            print()

In [15]:
def logistic_fn(X_train,y_train):
    """Return a logistic-regression classifier fitted on the given data.

    Uses the ``liblinear`` solver; every other hyper-parameter is left at
    the scikit-learn default.  The signature matches what ``build_model``
    expects of its ``classifier_fn`` argument.
    """
    # fit() returns the estimator itself, so the call can be chained.
    return LogisticRegression(solver='liblinear').fit(X_train, y_train)

In [33]:
# Seed NumPy's global RNG before splitting: train_test_split falls back to
# it when no random_state is given, so without a seed every re-run of this
# cell draws a different train/test split and reports different metrics.
np.random.seed(42)

# Fit and evaluate the logistic-regression baseline, store it under a
# descriptive label, then print every result collected so far.
result_dict['survived - logistic'] = build_model(logistic_fn,
                                                 Target,
                                                 Features,
                                                 titanic_data)
compare_results()

Classification:  survived - logistic

Training data
accuracy 0.8084358523725835
precision 0.7959183673469388
recall 0.6933333333333334
accuracy_count 460

Test data
accuracy 0.7832167832167832
precision 0.7962962962962963
recall 0.6825396825396826
accuracy_count 112

