In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import accuracy_score as acc
from sklearn.metrics import mean_squared_error as mse

In [2]:
def to_seconds(X):
    return X.hour * 3600 + X.minute * 60 + X.second

In [40]:
data = pd.read_csv("medical_no_shows.csv")
data["No-show"] = data["No-show"] == 'Yes'
data["No-show"] = y.astype(int)

In [4]:
X = data.drop(columns = ['PatientId','No-show','AppointmentID'])
X["Gender"] = X["Gender"] == "M"
X["Gender"] = X["Gender"].astype(int)

X["ScheduledDay"] = pd.to_datetime(X["ScheduledDay"])
X["Schedule_Weekday"] = X["ScheduledDay"].dt.weekday
X["Schedule_Seconds"] = X["ScheduledDay"].dt.time.apply(to_seconds)

X["AppointmentDay"] = pd.to_datetime(X["AppointmentDay"])
X["Appointment_Weekday"] = X["AppointmentDay"].dt.weekday

X = pd.concat((X, pd.get_dummies(X["Neighbourhood"])), axis = 1)

X = X.drop(columns = ['ScheduledDay', 'AppointmentDay', 'Neighbourhood'])

y = data["No-show"] == 'Yes'
y = y.astype(int)

In [35]:
for col in data.drop(columns = 'No-show').columns:
    print(data.groupby(col)["No-show"].value_counts(normalize = True))

PatientId     No-show
3.921784e+04  No         1.0
4.374176e+04  No         1.0
9.377953e+04  No         1.0
1.417242e+05  No         1.0
5.376153e+05  No         1.0
                        ... 
9.999320e+14  No         1.0
9.999350e+14  No         1.0
9.999465e+14  No         1.0
9.999686e+14  No         1.0
9.999816e+14  No         1.0
Name: No-show, Length: 71817, dtype: float64
AppointmentID  No-show
5030230        No         1.0
5122866        Yes        1.0
5134197        Yes        1.0
5134220        No         1.0
5134223        No         1.0
                         ... 
5790461        No         1.0
5790464        No         1.0
5790466        No         1.0
5790481        No         1.0
5790484        No         1.0
Name: No-show, Length: 110527, dtype: float64
Gender  No-show
F       No         0.796854
        Yes        0.203146
M       No         0.800321
        Yes        0.199679
Name: No-show, dtype: float64
ScheduledDay          No-show
2015-11-10T07:13:56Z  No   

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [7]:

rf_clf = RandomForestClassifier(n_estimators = 100, n_jobs = -1, 
                               max_depth = 5, random_state = 42)
# svc = SVC(gamma = 0.1, C = 100)
ada_clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth = 1), n_estimators = 100, 
                             learning_rate = 1, random_state = 42)

voting_clf = VotingClassifier(
                estimators = [('rf', rf_clf), ('ada', ada_clf)], 
                voting = 'soft', n_jobs = -1)

In [8]:
voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('rf',
                              RandomForestClassifier(bootstrap=True,
                                                     ccp_alpha=0.0,
                                                     class_weight=None,
                                                     criterion='gini',
                                                     max_depth=5,
                                                     max_features='auto',
                                                     max_leaf_nodes=None,
                                                     max_samples=None,
                                                     min_impurity_decrease=0.0,
                                                     min_impurity_split=None,
                                                     min_samples_leaf=1,
                                                     min_samples_split=2,
                                                     min_weight_fraction_leaf=0.0,
           

In [49]:
pred = voting_clf.predict(X_test)
print("""
Acc: {}
MSE: {}
""".format(acc(y_test, pred), mse(y_test, pred)))


Acc: 0.7986754487550666
MSE: 0.2013245512449334



In [48]:
from sklearn.dummy import DummyClassifier
dummy = DummyClassifier()
dummy.fit(X_test, y_test)
pred = dummy.predict(X_test)
print("""
Acc: {}
MSE: {}
""".format(acc(y_test, pred), mse(y_test, pred)))


Acc: 0.6771134916039374
MSE: 0.32288650839606253



