In [1]:
import numpy as np
import pandas as pd
import pickle

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.utils import resample
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.externals import joblib

In [2]:
# Load the appointments table from the CSV in the working directory
mod_data = pd.read_csv(filepath_or_buffer='appointments_clean.csv')

In [3]:
# Display the column labels of the loaded table (used to choose features below)
mod_data.columns

Index(['PatientId', 'AppointmentID', 'Gender', 'ScheduledDay',
       'AppointmentDay', 'Age', 'Neighbourhood', 'Scholarship', 'Hipertension',
       'Diabetes', 'Alcoholism', 'Handcap', 'SMS_received', 'No-show',
       'DaysBetween', 'NoShow', 'PreviousMiss'],
      dtype='object')

In [4]:
# Feature matrix: everything except the identifier, date and neighbourhood
# columns and the 'No-show' / 'NoShow' columns
X = mod_data.drop(
    columns=[
        'PatientId', 'AppointmentID', 'ScheduledDay', 'AppointmentDay',
        'Neighbourhood', 'No-show', 'NoShow',
    ]
)

# Target: the NoShow column
y = mod_data['NoShow']

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=314,
)

In [5]:
# L2-penalised logistic regression: fit on the training split,
# then predict classes for the test split
clf_lr = LogisticRegression(penalty='l2')
clf_lr.fit(X_train, y_train)
pred_lr = clf_lr.predict(X_test)




In [6]:
# L1-penalised (lasso) logistic regression.
# The solver is named explicitly: scikit-learn's current default solver
# ('lbfgs') does not support penalty='l1' and raises a ValueError, whereas
# 'liblinear' (the former default) handles the L1 penalty.
clf_rr = LogisticRegression(penalty='l1', solver='liblinear').fit(X_train, y_train)
pred_rr = clf_rr.predict(X_test)




In [7]:
# Random forest with 100 trees; seeded so the fit is repeatable
clf_rf = RandomForestClassifier(n_estimators=100, random_state=314)
clf_rf.fit(X_train, y_train)

# Class predictions on the held-out rows
pred_rf = clf_rf.predict(X_test)


In [8]:
# Gradient-boosted trees with library defaults; seeded so the fit is repeatable
clf_gb = GradientBoostingClassifier(random_state=314)
clf_gb.fit(X_train, y_train)

# Class predictions on the held-out rows
pred_gb = clf_gb.predict(X_test)


In [9]:
# Multi-layer perceptron with library defaults; seeded so the fit is repeatable
clf_nn = MLPClassifier(random_state=314)
clf_nn.fit(X_train, y_train)
pred_nn = clf_nn.predict(X_test)


In [10]:
# Check model performance - binary outcome
# Print a titled classification report for each model's test-set predictions
for report_title, test_preds in [
    ('L2 Regularized Logistic Regression', pred_lr),
    ('L1 Regularized Logistic Regression', pred_rr),
    ('Random Forest', pred_rf),
    ('Gradient Boosting', pred_gb),
    ('Neural Network', pred_nn),
]:
    print(report_title)
    print(classification_report(y_test, test_preds))


L2 Regularized Logistic Regression
              precision    recall  f1-score   support

           0       0.89      0.95      0.92     26377
           1       0.74      0.54      0.62      6782

   micro avg       0.87      0.87      0.87     33159
   macro avg       0.81      0.74      0.77     33159
weighted avg       0.86      0.87      0.86     33159

L1 Regularized Logistic Regression
              precision    recall  f1-score   support

           0       0.89      0.95      0.92     26377
           1       0.74      0.54      0.62      6782

   micro avg       0.87      0.87      0.87     33159
   macro avg       0.81      0.74      0.77     33159
weighted avg       0.86      0.87      0.86     33159

Random Forest
              precision    recall  f1-score   support

           0       0.95      0.94      0.94     26377
           1       0.76      0.81      0.79      6782

   micro avg       0.91      0.91      0.91     33159
   macro avg       0.86      0.87      0.87 

In [11]:
def prob_summary(clf, data, actual):
    """Summarise how often the outcome occurs within each predicted-probability bin.

    Predicted probabilities are taken from column 1 of ``clf.predict_proba(data)``
    (the second class in the classifier's class ordering), rounded to the
    nearest tenth, and the observed outcomes are aggregated per rounded value.

    Parameters
    ----------
    clf : object
        Fitted classifier exposing ``predict_proba``.
    data : array-like
        Feature rows passed unchanged to ``clf.predict_proba``.
    actual : array-like
        Observed outcomes, one per row of ``data`` and in the same order.
        Values are matched to the probabilities by position, so any index
        carried by a pandas object is ignored. May be a Series, a
        single-column DataFrame, a NumPy array or a list.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``RoundedProb`` with columns ``Actual_count`` (rows in the
        bin), ``Actual_sum`` (sum of the outcomes in the bin) and
        ``Actual_percentage`` (``Actual_sum / Actual_count`` rounded to two
        decimals, i.e. a fraction rather than a 0-100 percentage).

    Raises
    ------
    ValueError
        If ``actual`` does not have one value per predicted probability
        (raised by pandas when the frame is assembled).
    """
    # Vectorised slice of the second probability column instead of a per-row loop
    positive_probs = np.asarray(clf.predict_proba(data))[:, 1]

    # Build the frame from plain arrays so rows pair up by position and the
    # function accepts Series, arrays and lists alike
    probs_df = pd.DataFrame(
        {
            'Prob': positive_probs,
            'Actual': np.asarray(actual).ravel(),
        }
    )

    # Round probabilities to the nearest tenth; note the 0.0 and 1.0 bins
    # therefore cover only half the width of the interior bins
    probs_df['RoundedProb'] = probs_df['Prob'].round(1)

    # Count rows and sum outcomes within each bin
    summary = probs_df.groupby('RoundedProb')['Actual'].agg(['count', 'sum'])
    summary.columns = ['Actual_count', 'Actual_sum']

    # Observed outcome rate per bin
    summary['Actual_percentage'] = (
        summary['Actual_sum'] / summary['Actual_count']
    ).round(2)
    return summary

In [12]:
# Check model performance - probabilities
# Print a titled probability-bin summary for each fitted model on the test split
for model_label, fitted_clf in [
    ('L2 Regularized Logistic Regression', clf_lr),
    ('L1 Regularized Logistic Regression', clf_rr),
    ('Random Forest', clf_rf),
    ('Gradient Boosting', clf_gb),
    ('Neural Network', clf_nn),
]:
    print(model_label)
    print(prob_summary(fitted_clf, X_test, y_test), '\n')

L2 Regularized Logistic Regression
             Actual_count  Actual_sum  Actual_percentage
RoundedProb                                             
0.0                 16869           0               0.00
0.1                  5835           0               0.00
0.2                   352          15               0.04
0.3                  1890         908               0.48
0.4                  2370        1499               0.63
0.5                  1828        1414               0.77
0.6                  1157         955               0.83
0.7                   458         400               0.87
0.8                   180         151               0.84
0.9                  1211         728               0.60
1.0                  1009         712               0.71 

L1 Regularized Logistic Regression
             Actual_count  Actual_sum  Actual_percentage
RoundedProb                                             
0.0                 16862           0               0.00
0.1             

In [None]:
# TODO: check variable importance of the best models (cell not yet implemented)




In [13]:
# Serialise the fitted gradient-boosting and neural-network models to pickle files
for pickle_path, fitted_model in (
    ("Schedule_gb00.pkl", clf_gb),
    ("Schedule_mlp00.pkl", clf_nn),
):
    with open(pickle_path, 'wb') as out_file:
        pickle.dump(fitted_model, out_file)