In [3]:
# Importing the required libraries
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

In [4]:
# Reading the clean and preprocessed Titanic Dataset
def read_dataset(path='datasets/titanic_processed.csv'):
    """Load the preprocessed Titanic data into a DataFrame.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the CSV file. Defaults to the location this notebook
        has always read from, so existing ``read_dataset()`` calls keep
        working unchanged.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the CSV file.
    """
    return pd.read_csv(path)

In [5]:
# Load the preprocessed data once; later cells read columns from this frame
titanic_df = read_dataset()
# Bare last expression so the notebook renders a preview of the first rows
titanic_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,1,3,0,63.0,0,0,9.5875,0,0,1
1,0,3,1,26.0,2,0,8.6625,0,0,1
2,1,1,0,50.0,0,1,247.5208,1,0,0
3,1,1,1,48.0,1,0,52.0,0,0,1
4,1,1,0,39.0,1,1,83.1583,1,0,0


In [9]:
# Results of every fitted model are collected here, keyed by a descriptive label
result_dict = {}

# Select the feature columns by name rather than by position, so the target
# ('Survived') cannot slip into the inputs if the CSV column order changes
FEATURES = [column for column in titanic_df.columns if column != 'Survived']
FEATURES

['Pclass',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Fare',
 'Embarked_C',
 'Embarked_Q',
 'Embarked_S']

In [10]:
# Function to compute the Classification Metrics
def summarize_classification(y_test, y_pred):
    """Score a set of predictions against the true labels.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Labels predicted by a model for the same samples.

    Returns
    -------
    dict
        Keys ``'accuracy'`` (fraction of correct predictions),
        ``'accuracy_count'`` (``accuracy_score`` with ``normalize=False``,
        i.e. the number of correct predictions), ``'precision'`` and
        ``'recall'``.
    """
    fraction_correct = accuracy_score(y_test, y_pred)
    number_correct = accuracy_score(y_test, y_pred, normalize=False)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)

    metrics = dict(
        accuracy=fraction_correct,
        accuracy_count=number_correct,
        precision=precision,
        recall=recall,
    )
    return metrics

In [20]:
# Split the data, fit a classifier and score it on both partitions
def build_model(classifier_fn, name_of_y_col, names_of_x_cols, dataset, test_size=0.2, random_state=42):
    """Train a classifier on a train/test split and report how it performs.

    Parameters
    ----------
    classifier_fn : callable
        Called as ``classifier_fn(x_train, y_train)``; must return a fitted
        object exposing ``predict``.
    name_of_y_col : str
        Column of ``dataset`` used as the label.
    names_of_x_cols : list of str
        Columns of ``dataset`` used as the inputs.
    dataset : pandas.DataFrame
        Frame holding both the inputs and the label.
    test_size : float, optional
        Forwarded to ``train_test_split``.
    random_state : int, optional
        Forwarded to ``train_test_split`` so the split is repeatable.

    Returns
    -------
    dict
        ``'training'`` and ``'test'`` hold the metric dicts produced by
        ``summarize_classification``; ``'confusion_matrix'`` is a crosstab
        of actual vs. predicted labels on the test partition.
    """
    inputs = dataset[names_of_x_cols]
    labels = dataset[name_of_y_col]

    x_train, x_test, y_train, y_test = train_test_split(
        inputs, labels, test_size=test_size, random_state=random_state
    )

    fitted = classifier_fn(x_train, y_train)

    test_predictions = fitted.predict(x_test)
    train_predictions = fitted.predict(x_train)

    report = {
        'training': summarize_classification(y_train, train_predictions),
        'test': summarize_classification(y_test, test_predictions),
    }

    # Pair each held-out label with its prediction to tabulate hits and misses
    outcome = pd.DataFrame({'actual': y_test, 'predicted': test_predictions})
    report['confusion_matrix'] = pd.crosstab(outcome['actual'], outcome['predicted'])

    return report

In [25]:
def compare_results():
    """Print the training and test metrics of every model stored in ``result_dict``.

    Reads the module-level ``result_dict``; for each entry it prints the
    entry's key followed by the contents of its ``'training'`` and
    ``'test'`` metric dicts. Returns nothing.
    """
    sections = (('Training data', 'training'), ('Test data', 'test'))

    for label, outcome in result_dict.items():
        print('Classification: ', label)

        for heading, section in sections:
            print()
            print(heading)
            for metric, value in outcome[section].items():
                print(metric, value)

        print()

In [26]:
def logistic_fn(x_train, y_train):
    """Fit a logistic-regression classifier using the ``liblinear`` solver.

    Parameters
    ----------
    x_train : array-like
        Training inputs.
    y_train : array-like
        Training labels.

    Returns
    -------
    LogisticRegression
        The fitted estimator.
    """
    # fit() hands back the estimator itself, so construction and training chain
    return LogisticRegression(solver='liblinear').fit(x_train, y_train)

In [27]:
# Fit the logistic-regression baseline and store its metrics under its own key
result_dict['survived ~ logistic'] = build_model(
    logistic_fn, 'Survived', FEATURES, titanic_df
)

# Print the metrics of every model recorded so far
compare_results()

Classification:  survived ~ logistic

Training data
accuracy 0.804920913884007
accuracy_count 458
precision 0.7871287128712872
recall 0.7004405286343612

Test data
accuracy 0.7902097902097902
accuracy_count 113
precision 0.8297872340425532
recall 0.639344262295082

