In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import resample
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

In [2]:
# Load the appointments CSV into a DataFrame (path is relative to the working directory)
mod_data = pd.read_csv(filepath_or_buffer = 'appointments_clean.csv')

In [3]:
# Target vector: the NoShow column
y = mod_data['NoShow']

# Feature matrix: every column except the ones listed here
# (the list includes NoShow itself, so the target never leaks into X)
X = mod_data.drop(
    columns = [
        'PatientId',
        'AppointmentID',
        'ScheduledDay',
        'AppointmentDay',
        'Neighbourhood',
        'No-show',
        'NoShow',
    ]
)

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.3,
    random_state = 314,
)

In [4]:
# Scale data
# Cast to float64 up front so MinMaxScaler does not have to convert integer
# columns itself (NOTE(review): the stray `return self.partial_fit(X, y)` in this
# cell's saved output looks like the tail of sklearn's DataConversionWarning -- confirm).
# The scaler is fitted on the training split only and then applied to both
# splits, so no information from the test rows influences the scaling.
scaler  = MinMaxScaler()

X_train = scaler.fit_transform(X_train.astype('float64'))
X_test  = scaler.transform(X_test.astype('float64'))

  return self.partial_fit(X, y)


In [5]:
# K-nearest-neighbours classifier with the library's default settings
clf_kn = KNeighborsClassifier()
clf_kn.fit(X_train, y_train)

# Hard class predictions for the held-out rows
pred_kn = clf_kn.predict(X_test)

In [6]:
# Logistic Regression (L2)
# Pin the solver explicitly: scikit-learn's default changed from 'liblinear' to
# 'lbfgs' in version 0.22, so leaving it implicit makes the fitted model (and a
# FutureWarning on older releases) depend on the installed version.
clf_lr  = LogisticRegression(penalty = 'l2', solver = 'liblinear').fit(X_train, y_train)
pred_lr = clf_lr.predict(X_test)



In [7]:
# Logistic Regression (L1)
# The L1 penalty needs a solver that supports it. The current library default
# ('lbfgs') raises a ValueError for penalty='l1', so request 'liblinear' explicitly.
# liblinear shuffles the data internally, so seed it to keep the fit repeatable.
clf_rr  = LogisticRegression(
    penalty = 'l1', solver = 'liblinear', random_state = 314
).fit(X_train, y_train)
pred_rr = clf_rr.predict(X_test)



In [8]:
# Neural Net
# MLP training is stochastic (random weight initialisation and batch shuffling);
# seed it so the predictions are the same on every Restart & Run All.
clf_nn  = MLPClassifier(random_state = 314).fit(X_train, y_train)
pred_nn = clf_nn.predict(X_test)

In [9]:
# Check model performance - binary outcome
# Print one classification report per model, in the order the models were fitted
for model_label, model_pred in (
    ('KNN', pred_kn),
    ('L2 Regularized Logistic Regression', pred_lr),
    ('L1 Regularized Logistic Regression', pred_rr),
    ('Neural Network', pred_nn),
):
    print(model_label)
    print(classification_report(y_test, model_pred))

KNN
              precision    recall  f1-score   support

           0       0.95      0.93      0.94     26377
           1       0.75      0.81      0.78      6782

   micro avg       0.91      0.91      0.91     33159
   macro avg       0.85      0.87      0.86     33159
weighted avg       0.91      0.91      0.91     33159

L2 Regularized Logistic Regression
              precision    recall  f1-score   support

           0       0.87      0.96      0.91     26377
           1       0.74      0.47      0.57      6782

   micro avg       0.86      0.86      0.86     33159
   macro avg       0.81      0.71      0.74     33159
weighted avg       0.85      0.86      0.84     33159

L1 Regularized Logistic Regression
              precision    recall  f1-score   support

           0       0.89      0.95      0.92     26377
           1       0.74      0.53      0.62      6782

   micro avg       0.87      0.87      0.87     33159
   macro avg       0.81      0.74      0.77     33159


In [10]:
# Probability Assessment Function
def prob_summary(clf, data, actual):
    """Tabulate observed outcomes by predicted-probability bucket.

    Predicted probabilities are rounded to one decimal place. Within each
    rounded value the observed outcomes are counted and summed, so the
    predicted probability can be compared with the observed rate.

    Parameters
    ----------
    clf : object
        Fitted classifier exposing ``predict_proba``. Column index 1 of its
        output is used as the probability of interest.
    data : array-like
        Feature rows, passed straight to ``clf.predict_proba``.
    actual : array-like
        Observed outcomes, one per row of ``data``. Matched to the predictions
        by position; any index on a pandas object is ignored. The values are
        summed directly, so presumably 0/1 labels -- confirm against callers.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``RoundedProb`` with columns ``Actual_count`` (rows in the
        bucket), ``Actual_sum`` (sum of ``actual`` in the bucket) and
        ``Actual_percentage`` (sum / count rounded to 2 decimals, i.e. a
        fraction rather than a value out of 100).

    Raises
    ------
    ValueError
        If ``actual`` and the predicted probabilities differ in length.
    """
    # Slice the second predict_proba column directly instead of looping over rows
    positive_probs = np.asarray(clf.predict_proba(data))[:, 1]

    # np.asarray drops any index on `actual`, giving purely positional alignment
    # and letting plain lists / NumPy arrays work as well as a pandas Series
    probs_df = pd.DataFrame({
        'Prob': positive_probs,
        'Actual': np.asarray(actual),
    })

    # Round probabilities to nearest tenth
    probs_df['RoundedProb'] = probs_df['Prob'].round(1)

    # Summarize: aggregating the single column yields flat 'count'/'sum' columns,
    # so no MultiIndex flattening is needed
    summary = (
        probs_df.groupby('RoundedProb')['Actual']
        .agg(['count', 'sum'])
        .rename(columns = {'count': 'Actual_count', 'sum': 'Actual_sum'})
    )

    # Observed rate within each bucket
    summary['Actual_percentage'] = (summary['Actual_sum'] / summary['Actual_count']).round(2)
    return summary

In [11]:
# Probability Assessment
# Check model performance - probabilities
# Print one probability-bucket summary per fitted model, each followed by a blank line
for model_label, fitted_model in (
    ('KNN', clf_kn),
    ('L2 Regularized Logistic Regression', clf_lr),
    ('L1 Regularized Logistic Regression', clf_rr),
    ('Neural Network', clf_nn),
):
    print(model_label)
    print(prob_summary(fitted_model, X_test, y_test), '\n')

KNN
             Actual_count  Actual_sum  Actual_percentage
RoundedProb                                             
0.0                 23208         125               0.01
0.2                  1096         368               0.34
0.4                  1488         782               0.53
0.6                  2287        1519               0.66
0.8                  2839        2158               0.76
1.0                  2241        1830               0.82 

L2 Regularized Logistic Regression
             Actual_count  Actual_sum  Actual_percentage
RoundedProb                                             
0.0                 14179           0               0.00
0.1                  8313           0               0.00
0.2                   619          55               0.09
0.3                  2426        1194               0.49
0.4                  2296        1585               0.69
0.5                  1818        1420               0.78
0.6                   862         722          