In [1]:
# --- Defining Helper Functions to Train and Evaluate Classification Models --- #

In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

# The libraries we will use to implement different ML models
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

In [3]:
# Read the pre-processed Titanic data set (path is relative to the notebook's working directory)
titanic_df = pd.read_csv(filepath_or_buffer='datasets/titanic/processed.csv')

# Preview the first five rows as a quick sanity check of the loaded columns
titanic_df.head(n=5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,1,2,1,2.0,1,1,26.0,0,0,1
1,0,3,1,21.0,0,0,7.925,0,0,1
2,0,3,1,44.0,0,0,8.05,0,0,1
3,1,3,0,22.0,0,0,7.75,0,0,1
4,0,3,0,45.0,1,4,27.9,0,0,1


In [4]:
# Every column other than the predicted column (i.e., Survived) is a feature.
# The columns are selected by name rather than by position so the target can
# never end up among the features if the column order of the CSV changes.
FEATURES = [col for col in titanic_df.columns if col != 'Survived']

FEATURES

['Pclass',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Fare',
 'Embarked_C',
 'Embarked_Q',
 'Embarked_S']

In [5]:
# Registry of results collected while working through the notebook:
#   key   -> a label describing the model (e.g. 'survived ~ logistic')
#   value -> the evaluation metrics gathered for that model
result_dict = dict()

In [6]:
# A helper function that returns the summary metrics for a model
def summarize_classification(y_test, y_pred):
    """
    Compute the summary metrics for one set of predictions.

    :param y_test: The actual labels.
    :param y_pred: The labels predicted by the model.

    :return: A dict with the keys 'accuracy' (fraction of correctly predicted labels),
             'precision', 'recall' and 'accuracy_count' (number of correctly predicted labels).
    """
    return {
        # normalize=True reports accuracy as a fraction
        'accuracy': accuracy_score(y_test, y_pred, normalize=True),
        'precision': precision_score(y_test, y_pred),
        'recall': recall_score(y_test, y_pred),
        # normalize=False reports the count of correctly predicted labels instead
        'accuracy_count': accuracy_score(y_test, y_pred, normalize=False),
    }

In [7]:
def build_model(classifier_fn, name_of_y_col, names_of_x_cols, dataset, test_frac=0.2, random_state=None):
    """
    A helper function that generalizes the process of building and evaluating a model.

    :param classifier_fn: A fn that takes x_train and y_train, instantiates an estimator, trains it,
                          and returns the fitted model (the model must provide a predict method).
    :param name_of_y_col: A string name of the df column that contains the target labels to use for training.
    :param names_of_x_cols: A string list of the feature names to use for training.
    :param dataset: A df that contains both the feature columns and the target column.
    :param test_frac: The portion of the data that should be held out for testing.
    :param random_state: Optional seed forwarded to train_test_split. The default (None) keeps the
                         previous behaviour of a different random split on every call; pass an int
                         to get the same split, and therefore comparable metrics, on every run.

    :return: A dict with the keys 'training' and 'test' (the summary metrics for each split) and
             'confusion_matrix' (a crosstab with predicted labels as rows and actual labels as columns).
    """

    X = dataset[names_of_x_cols]
    Y = dataset[name_of_y_col]

    # Hold out test_frac of the rows; seeding the split makes the reported metrics reproducible
    x_train, x_test, y_train, y_test = train_test_split(
        X, Y, test_size=test_frac, random_state=random_state
    )

    model = classifier_fn(x_train, y_train)

    # Predictions on the unseen test data
    y_pred = model.predict(x_test)

    # Predictions on already seen train data
    y_pred_train = model.predict(x_train)

    # Get the summary metrics for the model against both train and test data
    train_summary = summarize_classification(y_train, y_pred_train)
    test_summary = summarize_classification(y_test, y_pred)

    # Actual vs predicted values
    pred_results = pd.DataFrame({
        'y_test': y_test,
        'y_pred': y_pred
    })

    # Confusion matrix: rows are the predicted labels, columns are the actual labels
    model_crosstab = pd.crosstab(pred_results.y_pred, pred_results.y_test)

    return {
        'training': train_summary,
        'test': test_summary,
        'confusion_matrix': model_crosstab
    }

In [8]:
# A helper function that prints the training and test data summaries from each model
def compare_results():
    """
    Print the training and test metric summaries of every model stored in result_dict.

    For each entry the model label is printed first, followed by a 'Training data'
    section and a 'Test data' section that list one metric name and value per line.
    """
    sections = (('Training data', 'training'), ('Test data', 'test'))

    for model_label, summaries in result_dict.items():
        print('Classification: ', model_label)

        for heading, split in sections:
            print()
            print(heading)
            for metric, value in summaries[split].items():
                print(metric, value)

        print()
In [9]:
# Build and train a logistic regression model
def logistic_fn(x_train, y_train):
    """
    Fit a logistic regression classifier that uses the 'liblinear' solver.

    :param x_train: The training features.
    :param y_train: The training labels.

    :return: The fitted LogisticRegression model.
    """
    # fit() hands back the fitted estimator itself, so it can be returned directly
    return LogisticRegression(solver='liblinear').fit(x_train, y_train)

In [10]:
# Train and evaluate the logistic regression model on all features, then show every result so far
result_dict['survived ~ logistic'] = build_model(
    classifier_fn=logistic_fn,
    name_of_y_col='Survived',
    names_of_x_cols=FEATURES,
    dataset=titanic_df,
)

compare_results()

Classification:  survived ~ logistic

Training data
accuracy 0.7996485061511424
precision 0.7881773399014779
recall 0.6926406926406926
accuracy_count 455

Test data
accuracy 0.8321678321678322
precision 0.8235294117647058
recall 0.7368421052631579
accuracy_count 119



In [11]:
### --- ( 4 ) Performing Classification Using Multiple Techniques --- ###

In [12]:
# - Implementing Linear Discriminant Analysis Classification (LDA) - #

In [13]:
# svd = singular value decomposition solver (the default)
# svd finds the best axes to fit the data without calculating the covariance matrix of features
# useful when we have many features or many rows in a dataset
def linear_discriminant_fn(x_train, y_train, solver='svd'):
    """
    Fit a Linear Discriminant Analysis classifier.

    :param x_train: The training features.
    :param y_train: The training labels.
    :param solver: The LDA solver to use; 'svd' (singular value decomposition) is the default.

    :return: The fitted LinearDiscriminantAnalysis model.
    """
    lda = LinearDiscriminantAnalysis(solver=solver)
    # fit() returns the estimator it was called on
    return lda.fit(x_train, y_train)

In [14]:
# Train and evaluate LDA on all features (including every one-hot encoded Embarked column)
result_dict['survived ~ linear_discriminant_analysis'] = build_model(
    classifier_fn=linear_discriminant_fn,
    name_of_y_col='Survived',
    names_of_x_cols=FEATURES,
    dataset=titanic_df,
)

compare_results()

"""
# The tutorial received a warning about collinearity that did not occur in my copy
# This problem is common when a dataset has one-hot encoded features and includes all of them in the training data
# It can cause a dummy trap, a perfect collinearity between 2 or more features
# This can often be solved through "dummy encoding" where we drop one of the one-hot encoded columns (see below)
# Some estimators automatically handle this, it's possible this version of scikit now handles it, thus no warning
# However, the tutorial was posted June 2019 and this scikit commit from July 2019 seems to just remove the warning:
# https://github.com/scikit-learn/scikit-learn/issues/14361
"""

Classification:  survived ~ logistic

Training data
accuracy 0.7996485061511424
precision 0.7881773399014779
recall 0.6926406926406926
accuracy_count 455

Test data
accuracy 0.8321678321678322
precision 0.8235294117647058
recall 0.7368421052631579
accuracy_count 119

Classification:  survived ~ linear_discriminant_analysis

Training data
accuracy 0.7855887521968365
precision 0.7476635514018691
recall 0.7017543859649122
accuracy_count 447

Test data
accuracy 0.8111888111888111
precision 0.7894736842105263
recall 0.75
accuracy_count 116





In [15]:
# FEATURES[:-1] leaves out the last column; the result replaces the earlier LDA entry
# because the same key is used
result_dict['survived ~ linear_discriminant_analysis'] = build_model(
    classifier_fn=linear_discriminant_fn,
    name_of_y_col='Survived',
    names_of_x_cols=FEATURES[:-1],
    dataset=titanic_df,
)

compare_results()

Classification:  survived ~ logistic

Training data
accuracy 0.7996485061511424
precision 0.7881773399014779
recall 0.6926406926406926
accuracy_count 455

Test data
accuracy 0.8321678321678322
precision 0.8235294117647058
recall 0.7368421052631579
accuracy_count 119

Classification:  survived ~ linear_discriminant_analysis

Training data
accuracy 0.8031634446397188
precision 0.7772511848341233
recall 0.7161572052401747
accuracy_count 457

Test data
accuracy 0.7482517482517482
precision 0.7090909090909091
recall 0.6610169491525424
accuracy_count 107



In [16]:
# - Implementing Quadratic Discriminant Analysis Classification (QDA) - #

In [17]:
# Find axes to best separate the classes such that all instances of a class are in the same quadrant 
# but the decision boundary is quadratic

# Useful when the X variables corresponding to different labels have different covariances
# i.e., covariances are different for X for all values of Y
def quadratic_discriminant_fn(x_train, y_train):
    """
    Fit a Quadratic Discriminant Analysis classifier with its default settings.

    :param x_train: The training features.
    :param y_train: The training labels.

    :return: The fitted QuadraticDiscriminantAnalysis model.
    """
    qda = QuadraticDiscriminantAnalysis()
    # fit() returns the estimator it was called on
    return qda.fit(x_train, y_train)

In [18]:
# Note: QDA is prone to the dummy trap, so the last column is left out here as well
result_dict['survived ~ quadratic_discriminant_analysis'] = build_model(
    classifier_fn=quadratic_discriminant_fn,
    name_of_y_col='Survived',
    names_of_x_cols=FEATURES[:-1],
    dataset=titanic_df,
)

compare_results()

Classification:  survived ~ logistic

Training data
accuracy 0.7996485061511424
precision 0.7881773399014779
recall 0.6926406926406926
accuracy_count 455

Test data
accuracy 0.8321678321678322
precision 0.8235294117647058
recall 0.7368421052631579
accuracy_count 119

Classification:  survived ~ linear_discriminant_analysis

Training data
accuracy 0.8031634446397188
precision 0.7772511848341233
recall 0.7161572052401747
accuracy_count 457

Test data
accuracy 0.7482517482517482
precision 0.7090909090909091
recall 0.6610169491525424
accuracy_count 107

Classification:  survived ~ quadratic_discriminant_analysis

Training data
accuracy 0.7943760984182777
precision 0.7757847533632287
recall 0.7208333333333333
accuracy_count 452

Test data
accuracy 0.8111888111888111
precision 0.723404255319149
recall 0.7083333333333334
accuracy_count 116



In [19]:
# --- Implementing Stochastic Gradient Descent Classifiers (SGD) --- #

"""
In Logistic Regression model:
Loss (Cost) Function is the cross entropy, it measures how well the estimated probabilities match the actual labels.
The training process will try to minimize cross entropy.

Cross entropy diagram with two variables but three lines:
- w (weights) - x axis
- b (biases) - diagonal from corner
- cross entropy - y axis

Stochastic: Randomly determined
"""

'\nIn Logistic Regression model:\nLoss (Cost) Function is the cross entropy, it measures how well the estimated probabilities match the actual labels.\nThe training process will try to minimize cross entropy.\n\nCross entropy diagram with two varibes but three lines:\n- w (weights) - x axis\n- b (biases) - diagnal from corner\n- cross entropy - y axis\n\nStochastic: Randomly determined\n'

In [24]:
# Stochastic gradient descent: iteratively converges to the best model
def sgd_fn(x_train, y_train, max_iter=10000, tol=1e-3):
    """
    Fit a stochastic gradient descent classifier.

    The estimator performs numerical optimization, one training instance at a time,
    to find the best model parameters. The hyperparameters below help design the
    right model for the use case.

    :param x_train: The training features.
    :param y_train: The training labels.
    :param max_iter: The maximum number of iterations for which the model should train.
    :param tol: The stopping criterion for training (once the change in loss falls below
                tol, the model is considered to be no longer improving).

    :return: The fitted SGDClassifier model.
    """
    # fit() hands back the fitted estimator itself, so it can be returned directly
    return SGDClassifier(max_iter=max_iter, tol=tol).fit(x_train, y_train)

In [25]:
# The tutorial started with sgd_fn max_iter=1000, which gave low accuracy;
# max_iter=10000 (the current default of sgd_fn) produced higher accuracy
result_dict['survived ~ sgd'] = build_model(
    classifier_fn=sgd_fn,
    name_of_y_col='Survived',
    names_of_x_cols=FEATURES,
    dataset=titanic_df,
)

compare_results()

Classification:  survived ~ logistic

Training data
accuracy 0.7996485061511424
precision 0.7881773399014779
recall 0.6926406926406926
accuracy_count 455

Test data
accuracy 0.8321678321678322
precision 0.8235294117647058
recall 0.7368421052631579
accuracy_count 119

Classification:  survived ~ linear_discriminant_analysis

Training data
accuracy 0.8031634446397188
precision 0.7772511848341233
recall 0.7161572052401747
accuracy_count 457

Test data
accuracy 0.7482517482517482
precision 0.7090909090909091
recall 0.6610169491525424
accuracy_count 107

Classification:  survived ~ quadratic_discriminant_analysis

Training data
accuracy 0.7943760984182777
precision 0.7757847533632287
recall 0.7208333333333333
accuracy_count 452

Test data
accuracy 0.8111888111888111
precision 0.723404255319149
recall 0.7083333333333334
accuracy_count 116

Classification:  survived ~ sgd

Training data
accuracy 0.7627416520210897
precision 0.6619217081850534
recall 0.8230088495575221
accuracy_count 434

Test

In [26]:
# --- Support Vector Machines --- #

In [27]:
# Find a hyperplane that separates points so all points on the same side belong to the same class
def linear_svc_fn(x_train, y_train, C=1.0, max_iter=1000, tol=1e-3):
    """
    Fit a linear support vector classifier.

    :param x_train: The training features.
    :param y_train: The training labels.
    :param C: The inverse strength of the regularization (smaller values indicate stronger
              regularization); it governs how points on the wrong side of the margin are penalized.
    :param max_iter: The maximum number of training iterations.
    :param tol: The tolerance used as the stopping criterion for training.

    :return: The fitted LinearSVC model.

    dual is an optimization setting that selects between the primal and the dual formulation
    of the problem; dual=False is preferred when n_samples > n_features, so it is fixed here.
    """
    # LinearSVC is similar to SVC(kernel="linear").
    # "Kernel" refers to the data transformation performed by the estimator so the model
    # is easier to optimize.
    svc = LinearSVC(C=C, max_iter=max_iter, tol=tol, dual=False)
    return svc.fit(x_train, y_train)

In [29]:
# Train and evaluate the linear support vector classifier on all features
result_dict['survived ~ linear_svc'] = build_model(
    classifier_fn=linear_svc_fn,
    name_of_y_col='Survived',
    names_of_x_cols=FEATURES,
    dataset=titanic_df,
)

compare_results()

Classification:  survived ~ logistic

Training data
accuracy 0.7996485061511424
precision 0.7881773399014779
recall 0.6926406926406926
accuracy_count 455

Test data
accuracy 0.8321678321678322
precision 0.8235294117647058
recall 0.7368421052631579
accuracy_count 119

Classification:  survived ~ linear_discriminant_analysis

Training data
accuracy 0.8031634446397188
precision 0.7772511848341233
recall 0.7161572052401747
accuracy_count 457

Test data
accuracy 0.7482517482517482
precision 0.7090909090909091
recall 0.6610169491525424
accuracy_count 107

Classification:  survived ~ quadratic_discriminant_analysis

Training data
accuracy 0.7943760984182777
precision 0.7757847533632287
recall 0.7208333333333333
accuracy_count 452

Test data
accuracy 0.8111888111888111
precision 0.723404255319149
recall 0.7083333333333334
accuracy_count 116

Classification:  survived ~ sgd

Training data
accuracy 0.7627416520210897
precision 0.6619217081850534
recall 0.8230088495575221
accuracy_count 434

Test

In [30]:
# --- Implementing K-nearest-neighbors classification --- #

"""
Uses training data to find what is most similar to the current sample
Predictions for a new sample involve figuring out which elements in the training data it is similar to
Distance (Euclidean)

1. K-nearest-neighbors (voting among K nearest neighbors)
2. Radius Neighbors (voting among all neighbors within radius)

Use hyperparameter tuning on K or radius to improve results
"""

'\nUses training data to find what is most similiar to the current sample\n'

In [31]:
# Look for neighbors within the specified radius
def radius_neighbor_fn(x_train, y_train, radius=40.0):
    """
    Fit a radius-neighbors classifier.

    :param x_train: The training features.
    :param y_train: The training labels.
    :param radius: The radius within which neighbors are looked for.

    :return: The fitted RadiusNeighborsClassifier model.
    """
    neighbors_model = RadiusNeighborsClassifier(radius=radius)
    # fit() returns the estimator it was called on
    return neighbors_model.fit(x_train, y_train)

In [32]:
# Train and evaluate the radius-neighbors classifier on all features
result_dict['survived ~ radius_neighbors'] = build_model(
    classifier_fn=radius_neighbor_fn,
    name_of_y_col='Survived',
    names_of_x_cols=FEATURES,
    dataset=titanic_df,
)

compare_results()

Classification:  survived ~ logistic

Training data
accuracy 0.7996485061511424
precision 0.7881773399014779
recall 0.6926406926406926
accuracy_count 455

Test data
accuracy 0.8321678321678322
precision 0.8235294117647058
recall 0.7368421052631579
accuracy_count 119

Classification:  survived ~ linear_discriminant_analysis

Training data
accuracy 0.8031634446397188
precision 0.7772511848341233
recall 0.7161572052401747
accuracy_count 457

Test data
accuracy 0.7482517482517482
precision 0.7090909090909091
recall 0.6610169491525424
accuracy_count 107

Classification:  survived ~ quadratic_discriminant_analysis

Training data
accuracy 0.7943760984182777
precision 0.7757847533632287
recall 0.7208333333333333
accuracy_count 452

Test data
accuracy 0.8111888111888111
precision 0.723404255319149
recall 0.7083333333333334
accuracy_count 116

Classification:  survived ~ sgd

Training data
accuracy 0.7627416520210897
precision 0.6619217081850534
recall 0.8230088495575221
accuracy_count 434

Test

In [33]:
# --- Implementing Decision Tree classifiers --- #
"""
Set up a tree structure on training data which helps make decisions based on rules

Can be prone to overfitting, seeing high accuracy for train data but not test
"""

In [34]:
# Fit a decision tree to training data using CART (Classification and Regression Tree) algorithm
def decision_tree_fn(x_train, y_train, max_depth=None, max_features=None):
    """
    Fit a decision tree classifier.

    :param x_train: The training features.
    :param y_train: The training labels.
    :param max_depth: Forwarded to DecisionTreeClassifier(max_depth=...); None is passed by default.
    :param max_features: Forwarded to DecisionTreeClassifier(max_features=...); None is passed by default.

    :return: The fitted DecisionTreeClassifier model.
    """
    tree = DecisionTreeClassifier(max_depth=max_depth, max_features=max_features)
    # fit() returns the estimator it was called on
    return tree.fit(x_train, y_train)

In [35]:
# Train and evaluate the decision tree classifier on all features
result_dict['survived ~ decision_tree'] = build_model(
    classifier_fn=decision_tree_fn,
    name_of_y_col='Survived',
    names_of_x_cols=FEATURES,
    dataset=titanic_df,
)

compare_results()

Classification:  survived ~ logistic

Training data
accuracy 0.7996485061511424
precision 0.7881773399014779
recall 0.6926406926406926
accuracy_count 455

Test data
accuracy 0.8321678321678322
precision 0.8235294117647058
recall 0.7368421052631579
accuracy_count 119

Classification:  survived ~ linear_discriminant_analysis

Training data
accuracy 0.8031634446397188
precision 0.7772511848341233
recall 0.7161572052401747
accuracy_count 457

Test data
accuracy 0.7482517482517482
precision 0.7090909090909091
recall 0.6610169491525424
accuracy_count 107

Classification:  survived ~ quadratic_discriminant_analysis

Training data
accuracy 0.7943760984182777
precision 0.7757847533632287
recall 0.7208333333333333
accuracy_count 452

Test data
accuracy 0.8111888111888111
precision 0.723404255319149
recall 0.7083333333333334
accuracy_count 116

Classification:  survived ~ sgd

Training data
accuracy 0.7627416520210897
precision 0.6619217081850534
recall 0.8230088495575221
accuracy_count 434

Test

In [37]:
# --- Implementing Naive Bayes Classifiers --- #
"""
A Priori Probabilities are general probabilities before knowing anything specific about a given sample
Conditional probabilities are specific to the sample

Makes naive (strong) assumptions about independence of features (doesn't take into account that features could be related)

Can create robust models
"""

"\nA Priori Probabilities are general probabilities before knowing anything specific about a given sample\nConditional probabilities are specific to the sample\n\nMakes naive (strong) assumptions about independence of features (doesn't take into account that features could be related)\n"

In [38]:
# Use Bayes' theorem to find which label is most likely given the attributes observed in the feature vector
# and given how often the different labels occur in the data
def naive_bayes_fn(x_train, y_train, priors=None):
    """
    Fit a Gaussian Naive Bayes classifier.

    :param x_train: The training features.
    :param y_train: The training labels.
    :param priors: Forwarded to GaussianNB(priors=...); None is passed by default.

    :return: The fitted GaussianNB model.
    """
    # fit() hands back the fitted estimator itself, so it can be returned directly
    return GaussianNB(priors=priors).fit(x_train, y_train)

In [40]:
# Train and evaluate the Gaussian Naive Bayes classifier on all features
result_dict['survived ~ naive_bayes'] = build_model(
    classifier_fn=naive_bayes_fn,
    name_of_y_col='Survived',
    names_of_x_cols=FEATURES,
    dataset=titanic_df,
)

compare_results()

Classification:  survived ~ logistic

Training data
accuracy 0.7996485061511424
precision 0.7881773399014779
recall 0.6926406926406926
accuracy_count 455

Test data
accuracy 0.8321678321678322
precision 0.8235294117647058
recall 0.7368421052631579
accuracy_count 119

Classification:  survived ~ linear_discriminant_analysis

Training data
accuracy 0.8031634446397188
precision 0.7772511848341233
recall 0.7161572052401747
accuracy_count 457

Test data
accuracy 0.7482517482517482
precision 0.7090909090909091
recall 0.6610169491525424
accuracy_count 107

Classification:  survived ~ quadratic_discriminant_analysis

Training data
accuracy 0.7943760984182777
precision 0.7757847533632287
recall 0.7208333333333333
accuracy_count 452

Test data
accuracy 0.8111888111888111
precision 0.723404255319149
recall 0.7083333333333334
accuracy_count 116

Classification:  survived ~ sgd

Training data
accuracy 0.7627416520210897
precision 0.6619217081850534
recall 0.8230088495575221
accuracy_count 434

Test

In [None]:
# --- (5) Hyperparameter Tuning for Classification Models --- #
# See notebooks/HyperparameterTuningWithGridSearch.ipynb
"""
Hyperparameters are model configuration properties that define a model and remain constant during training
They are part of the model design and do not change

Model Inputs - train data (this trains the parameters)
Model Parameters - found during training (these are learned, e.g., model coefficient and intercept)
Model Hyperparameters - part of the model design (e.g., depth of tree, k neighbors)

Grid search is a scikit utility that creates a grid of possible values for each hyperparameter, each cell is a candidate model
GridSearchCV evaluates each candidate model (using cross validation)
It is computationally very expensive (also actual cost in cloud can be expensive)
Does not differentiate between important and trivial hyperparameters

Alternatively, random search of hyperparameter space can be done
"""