In [1]:
import numpy as np
import pandas as pd
import pickle

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.utils import resample
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.externals import joblib

In [2]:
# Load the cleaned appointments table from the working directory
mod_data = pd.read_csv(filepath_or_buffer='appointments_clean.csv')

In [3]:
# Show the column labels of the loaded frame (bare expression -> cell output)
mod_data.columns

Index(['PatientId', 'AppointmentID', 'Gender', 'ScheduledDay',
       'AppointmentDay', 'Age', 'Neighbourhood', 'Scholarship', 'Hipertension',
       'Diabetes', 'Alcoholism', 'Handcap', 'SMS_received', 'No-show',
       'DaysBetween', 'NoShow', 'PreviousMiss'],
      dtype='object')

In [29]:
# Feature matrix (three predictor columns) and target vector
feature_cols = ['DaysBetween', 'Age', 'Gender']
X = mod_data[feature_cols]
y = mod_data['NoShow']

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=314,
)

In [30]:
# L2-penalized logistic regression.
# The solver is named explicitly: scikit-learn changed its default solver
# from liblinear to lbfgs in 0.22, so leaving it implicit makes the fitted
# coefficients depend on the installed version. liblinear also matches the
# solver used for the L1 model, keeping the two fits comparable.
clf_lr = LogisticRegression(penalty = 'l2', solver = 'liblinear').fit(X_train, y_train)
pred_lr = clf_lr.predict(X_test)




In [31]:
# L1-penalized (lasso-style) logistic regression.
# The L1 penalty is not supported by the lbfgs solver, which is the default
# from scikit-learn 0.22 onward; without an explicit solver this line raises
# a ValueError on current releases. liblinear supports L1 and was the default
# on older releases.
clf_rr = LogisticRegression(penalty = 'l1', solver = 'liblinear').fit(X_train, y_train)
pred_rr = clf_rr.predict(X_test)




In [32]:
# Random forest: 100 trees, seeded so the fit is repeatable
clf_rf = RandomForestClassifier(n_estimators=100, random_state=314)
clf_rf.fit(X_train, y_train)

# Hard class predictions on the held-out rows
pred_rf = clf_rf.predict(X_test)


In [33]:
# Gradient boosting with library-default hyperparameters and a fixed seed
clf_gb = GradientBoostingClassifier(random_state=314)
clf_gb.fit(X_train, y_train)

# Hard class predictions on the held-out rows
pred_gb = clf_gb.predict(X_test)


In [34]:
# Multi-layer perceptron with library-default architecture and a fixed seed
clf_nn = MLPClassifier(random_state=314)
clf_nn.fit(X_train, y_train)
pred_nn = clf_nn.predict(X_test)


In [35]:
# Check model performance - binary outcome
# Each entry pairs a display title with that model's test-set predictions;
# the loop prints the title followed by its classification report.
binary_predictions = [
    ('L2 Regularized Logistic Regression', pred_lr),
    ('L1 Regularized Logistic Regression', pred_rr),
    ('Random Forest', pred_rf),
    ('Gradient Boosting', pred_gb),
    ('Neural Network', pred_nn),
]

for model_label, predicted in binary_predictions:
    print(model_label)
    print(classification_report(y_test, predicted))


L2 Regularized Logistic Regression
              precision    recall  f1-score   support

           0       0.80      0.99      0.88     26377
           1       0.37      0.02      0.03      6782

   micro avg       0.79      0.79      0.79     33159
   macro avg       0.59      0.51      0.46     33159
weighted avg       0.71      0.79      0.71     33159

L1 Regularized Logistic Regression
              precision    recall  f1-score   support

           0       0.80      0.99      0.88     26377
           1       0.37      0.02      0.03      6782

   micro avg       0.79      0.79      0.79     33159
   macro avg       0.59      0.51      0.46     33159
weighted avg       0.71      0.79      0.71     33159

Random Forest
              precision    recall  f1-score   support

           0       0.81      0.94      0.87     26377
           1       0.37      0.13      0.20      6782

   micro avg       0.78      0.78      0.78     33159
   macro avg       0.59      0.54      0.53 

  'precision', 'predicted', average, warn_for)


In [36]:
def prob_summary(clf, data, actual):
    """Tabulate observed outcomes against rounded predicted probabilities.

    The second column of ``clf.predict_proba(data)`` is rounded to one
    decimal place, and the observed outcomes are counted and summed within
    each rounded value.

    Parameters
    ----------
    clf : object
        Fitted estimator exposing ``predict_proba``.
    data : array-like
        Rows to score; passed unchanged to ``clf.predict_proba``.
    actual : array-like
        Observed outcomes, one per scored row and in the same order. Only
        the values are used; a pandas index, if present, is ignored.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``RoundedProb`` with columns ``Actual_count`` (number of
        non-null outcomes), ``Actual_sum`` (sum of outcomes) and
        ``Actual_percentage`` (sum / count, rounded to two decimals).

    Raises
    ------
    ValueError
        If the number of outcomes differs from the number of scored rows.
    """
    # Column 1 of predict_proba is the probability of the second class in
    # clf.classes_ -- assumed to be the positive class of a binary target.
    positive_probs = np.asarray(clf.predict_proba(data))[:, 1]

    # Work on raw values so Series, arrays and lists are all accepted and no
    # index alignment can silently introduce NaN rows.
    outcomes = np.asarray(actual).ravel()
    if len(outcomes) != len(positive_probs):
        raise ValueError(
            'actual has {} values but predict_proba returned {} rows'.format(
                len(outcomes), len(positive_probs)
            )
        )

    probs_df = pd.DataFrame({'Prob': positive_probs, 'Actual': outcomes})

    # Round probabilities to nearest tenth
    probs_df['RoundedProb'] = probs_df['Prob'].round(1)

    # Count and sum outcomes per rounded probability; aggregating the single
    # column yields flat 'count'/'sum' labels that only need a prefix.
    summary = (
        probs_df.groupby('RoundedProb')['Actual']
        .agg(['count', 'sum'])
        .add_prefix('Actual_')
    )

    # Observed outcome rate within each rounded-probability group
    summary['Actual_percentage'] = (
        summary['Actual_sum'] / summary['Actual_count']
    ).round(2)
    return summary

In [37]:
# Check model performance - probabilities
# Each entry pairs a display title with a fitted classifier; the loop prints
# the title and then that model's probability summary on the test set.
fitted_models = [
    ('L2 Regularized Logistic Regression', clf_lr),
    ('L1 Regularized Logistic Regression', clf_rr),
    ('Random Forest', clf_rf),
    ('Gradient Boosting', clf_gb),
    ('Neural Network', clf_nn),
]

for model_label, fitted_clf in fitted_models:
    print(model_label)
    print(prob_summary(fitted_clf, X_test, y_test), '\n')

L2 Regularized Logistic Regression
             Actual_count  Actual_sum  Actual_percentage
RoundedProb                                             
0.1                  7647         753               0.10
0.2                 19898        4051               0.20
0.3                  3973        1391               0.35
0.4                  1069         383               0.36
0.5                   403         151               0.37
0.6                   122          39               0.32
0.7                    24           7               0.29
0.8                    10           4               0.40
0.9                    13           3               0.23 

L1 Regularized Logistic Regression
             Actual_count  Actual_sum  Actual_percentage
RoundedProb                                             
0.1                  7647         753               0.10
0.2                 19899        4051               0.20
0.3                  3972        1391               0.35
0.4             

In [40]:
# Persist the fitted gradient-boosting and neural-net classifiers to disk,
# one pickle file per model.
models_to_save = (
    ("Calendar_gb00.pkl", clf_gb),
    ("Calendar_mlp00.pkl", clf_nn),
)

for pkl_path, fitted_model in models_to_save:
    with open(pkl_path, 'wb') as fh:
        pickle.dump(fitted_model, fh)