In [5]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.utils import resample
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier

In [6]:
# Read in data
mod_data = pd.read_csv('appointments_clean.csv')

In [8]:
# Split into X and y
X = mod_data.drop(
    ['PatientId', 'AppointmentID', 'ScheduledDay', 'AppointmentDay', 
     'Neighbourhood', 'No-show', 'NoShow'], 
    axis = 1
)
y = mod_data.NoShow

# Test/train split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.3, random_state = 314
)

# Scale data
scaler = MinMaxScaler().fit(X_train)

X_train = scaler.transform(X_train)
X_test  = scaler.transform(X_test)


  return self.partial_fit(X, y)


In [18]:
# Reconnect x and y for each set to make resampling easier
train = pd.DataFrame(X_train.copy())
train['NoShow'] = y_train

test = pd.DataFrame(X_test.copy())
test['NoShow'] = y_test

In [19]:
# Separate majority and minority classes
train_majority = train[train['NoShow'] == 0]
train_minority = train[train['NoShow'] == 1]

In [23]:
# Down sample major class
train_majority_downsampled = resample(
    train_majority,
    replace = False,
    n_samples = train_minority.shape[0],
    random_state = 314
)

# Combine minority class with downsampled majority class
train_downsampled = pd.concat([train_minority, train_majority_downsampled])

# Major downsample logistic regression
X_train = train_downsampled.drop('NoShow', axis = 1).values
y_train = train_downsampled['NoShow']


In [25]:
# Random forest
clf_rf = RandomForestClassifier(
    n_estimators = 100,
    random_state = 314
).fit(X_train, y_train)

pred_rf = clf_rf.predict(X_test)

# Gradient boosting
clf_gb = GradientBoostingClassifier(
    random_state = 314
).fit(X_train, y_train)

pred_gb = clf_gb.predict(X_test)

# Neural net
clf_nn = MLPClassifier(random_state = 314).fit(X_train, y_train)
pred_nn = clf_nn.predict(X_test)

In [27]:
# Check model performance - binary outcome
print('Random Forest')
print(classification_report(y_test, pred_rf))

print('Gradient Boosting')
print(classification_report(y_test, pred_gb))

print('Neural Network')
print(classification_report(y_test, pred_nn))

Random Forest
              precision    recall  f1-score   support

           0       0.79      0.49      0.60     26377
           1       0.19      0.48      0.28      6782

   micro avg       0.49      0.49      0.49     33159
   macro avg       0.49      0.48      0.44     33159
weighted avg       0.66      0.49      0.54     33159

Gradient Boosting
              precision    recall  f1-score   support

           0       0.77      0.40      0.53     26377
           1       0.18      0.52      0.27      6782

   micro avg       0.43      0.43      0.43     33159
   macro avg       0.47      0.46      0.40     33159
weighted avg       0.65      0.43      0.48     33159

Neural Network
              precision    recall  f1-score   support

           0       0.74      0.39      0.51     26377
           1       0.16      0.47      0.24      6782

   micro avg       0.40      0.40      0.40     33159
   macro avg       0.45      0.43      0.38     33159
weighted avg       0.62    

In [28]:
def prob_summary(clf, data, actual):
    
    # Generate predicted probabilities
    probs = clf.predict_proba(data)
    probs = pd.Series([i[1] for i in probs])
    
    # Add probabilities to actuals in dataframe
    probs_df = pd.concat([probs, actual.reset_index()], axis = 1).reset_index()
    
    # Clean dataframe
    probs_df.columns = ['level_0', 'Prob', 'index', 'Actual']
    probs_df         = probs_df.drop(['level_0', 'index'], axis = 1)
    
    # Round probabilities to nearest tenth
    probs_df['RoundedProb'] = (probs_df['Prob']).round(1)
    
    # Summarize
    summary = probs_df.groupby(
        'RoundedProb'
    ).agg(
        {
            'Actual':['count', 'sum']
        }
    )
    
    # Fix column names
    summary.columns = ["_".join(i) for i in summary.columns.ravel()]
    
    # Calculate predicted percentages
    summary['Actual_percentage'] = (summary['Actual_sum'] / summary['Actual_count']).round(2)
    return(summary)

In [29]:
# Check model performance - probabilities
print('Random Forest')
print(prob_summary(clf_rf, X_test, y_test), '\n')

print('Gradient Boosting')
print(prob_summary(clf_gb, X_test, y_test), '\n')

print('Neural Network')
print(prob_summary(clf_nn, X_test, y_test), '\n')

Random Forest
             Actual_count  Actual_sum  Actual_percentage
RoundedProb                                             
0.0                   622         107               0.17
0.1                  1661         388               0.23
0.2                  2598         661               0.25
0.3                  3427         810               0.24
0.4                  4899        1005               0.21
0.5                  7247        1042               0.14
0.6                  4615         996               0.22
0.7                  3119         721               0.23
0.8                  2616         603               0.23
0.9                  1718         348               0.20
1.0                   637         101               0.16 

Gradient Boosting
             Actual_count  Actual_sum  Actual_percentage
RoundedProb                                             
0.2                    21           4               0.19
0.3                    72          30               0.