In [39]:
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.utils import resample
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

%matplotlib inline

In [10]:
# Load the appointment records from the CSV export and report their size
raw_data = pd.read_csv("KaggleV2-May-2016.csv")
print("Rows: {}\nColumns: {}".format(*raw_data.shape))

# Replace the source headers with snake_case names. The assignment is
# positional, so the list must stay in the same order as the CSV columns.
raw_data.columns = [
    "patient_id",
    "appointment_id",
    "gender",
    "scheduled_day",
    "appointment_day",
    "age",
    "neighborhood",
    "scholarship",
    "hypertension",
    "diabetes",
    "alcoholism",
    "handicap",
    "sms_received",
    "no_show",
]
raw_data.head()

Rows: 110527
Columns: 14


Unnamed: 0,patient_id,appointment_id,gender,scheduled_day,appointment_day,age,neighborhood,scholarship,hypertension,diabetes,alcoholism,handicap,sms_received,no_show
0,29872500000000.0,5642903,F,2016-04-29T18:38:08Z,2016-04-29T00:00:00Z,62,JARDIM DA PENHA,0,1,0,0,0,0,No
1,558997800000000.0,5642503,M,2016-04-29T16:08:27Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,0,0,0,0,0,No
2,4262962000000.0,5642549,F,2016-04-29T16:19:04Z,2016-04-29T00:00:00Z,62,MATA DA PRAIA,0,0,0,0,0,0,No
3,867951200000.0,5642828,F,2016-04-29T17:29:31Z,2016-04-29T00:00:00Z,8,PONTAL DE CAMBURI,0,0,0,0,0,0,No
4,8841186000000.0,5642494,F,2016-04-29T16:07:23Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,1,1,0,0,0,No


In [11]:
# Get data details: per-column non-null counts and dtypes, plus memory usage
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110527 entries, 0 to 110526
Data columns (total 14 columns):
patient_id         110527 non-null float64
appointment_id     110527 non-null int64
gender             110527 non-null object
scheduled_day      110527 non-null object
appointment_day    110527 non-null object
age                110527 non-null int64
neighborhood       110527 non-null object
scholarship        110527 non-null int64
hypertension       110527 non-null int64
diabetes           110527 non-null int64
alcoholism         110527 non-null int64
handicap           110527 non-null int64
sms_received       110527 non-null int64
no_show            110527 non-null object
dtypes: float64(1), int64(8), object(5)
memory usage: 11.8+ MB


In [12]:
# Check for missing values by counting the null entries in each column
raw_data.isnull().sum()

patient_id         0
appointment_id     0
gender             0
scheduled_day      0
appointment_day    0
age                0
neighborhood       0
scholarship        0
hypertension       0
diabetes           0
alcoholism         0
handicap           0
sms_received       0
no_show            0
dtype: int64

In [13]:
# Count repeated appointment identifiers; every appointment should appear once
raw_data["appointment_id"].duplicated().sum()

0

In [14]:
# Drop the identifier columns. Reassigning the result (instead of
# inplace=True) together with errors="ignore" keeps this cell safe to re-run
# once the columns are already gone.
raw_data = raw_data.drop(["patient_id", "appointment_id"], axis=1, errors="ignore")

# Convert scheduled_day and appointment_day to datetime format
raw_data["scheduled_day"] = pd.to_datetime(raw_data["scheduled_day"])
raw_data["appointment_day"] = pd.to_datetime(raw_data["appointment_day"])

# Convert qualitative columns to category type
def convert_to_category(df, cols):
    """Cast the given columns of ``df`` to the pandas ``category`` dtype.

    The frame is modified directly: each listed column is reassigned with
    its categorical version.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are converted.
    cols : str or iterable of str
        Column label(s) to convert. A single label may be given as a bare
        string.

    Returns
    -------
    None
    """
    # A bare string would otherwise be iterated character by character
    if isinstance(cols, str):
        cols = [cols]
    for col in cols:
        df[col] = df[col].astype("category")

# Cast every qualitative column (including the 0/1 flags and the target)
# to the category dtype, then confirm the resulting dtypes
convert_to_category(
    raw_data,
    [
        "gender",
        "neighborhood",
        "scholarship",
        "hypertension",
        "diabetes",
        "alcoholism",
        "handicap",
        "sms_received",
        "no_show",
    ],
)
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110527 entries, 0 to 110526
Data columns (total 12 columns):
gender             110527 non-null category
scheduled_day      110527 non-null datetime64[ns]
appointment_day    110527 non-null datetime64[ns]
age                110527 non-null int64
neighborhood       110527 non-null category
scholarship        110527 non-null category
hypertension       110527 non-null category
diabetes           110527 non-null category
alcoholism         110527 non-null category
handicap           110527 non-null category
sms_received       110527 non-null category
no_show            110527 non-null category
dtypes: category(9), datetime64[ns](2), int64(1)
memory usage: 3.5 MB


In [15]:
# Create new feature that represents the elapsed time (in days) between
# making a medical appointment and the appointment itself

def calc_time_between(cols):
    """Return the number of calendar days from scheduling to the appointment.

    Parameters
    ----------
    cols : sequence
        Ordered pair whose first element is the appointment timestamp and
        whose second is the scheduling timestamp; both must provide a
        ``.date()`` method. A row Series (as passed by
        ``DataFrame.apply(..., axis=1)``), a tuple or a list all work.

    Returns
    -------
    int
        ``appointment_day.date() - scheduled_day.date()`` in days; negative
        when the appointment date precedes the scheduling date.
    """
    # Use explicit positional access when the input offers it (pandas Series):
    # integer keys on a string-labelled Series rely on a positional fallback
    # that recent pandas versions deprecate.
    positional = getattr(cols, "iloc", cols)
    appointment_day = positional[0]
    scheduled_day = positional[1]
    # Compare calendar dates only, so the time of day is ignored
    return (appointment_day.date() - scheduled_day.date()).days

raw_data["scheduled_to_appointment"] = raw_data[["appointment_day", "scheduled_day"]].apply(calc_time_between, axis=1)
# Keep only rows whose appointment date is on or after the scheduling date.
# The explicit copy makes the filtered frame independent of the original, so
# assigning new columns to it later does not raise SettingWithCopyWarning.
raw_data = raw_data[raw_data["scheduled_to_appointment"] >= 0].copy()

print("##### scheduled_to_appointment #####")
raw_data["scheduled_to_appointment"].describe()

##### scheduled_to_appointment #####


count    110522.000000
mean         10.184253
std          15.255115
min           0.000000
25%           0.000000
50%           4.000000
75%          15.000000
max         179.000000
Name: scheduled_to_appointment, dtype: float64

In [18]:
# Create new feature for the day of the week any given appointment occurred on
def get_weekday(date):
    """Return the lowercase English weekday name for ``date``.

    ``date`` must provide a ``weekday()`` method; its result is looked up
    with 0 mapping to "monday" through 6 mapping to "sunday".
    """
    names = ("monday", "tuesday", "wednesday", "thursday",
             "friday", "saturday", "sunday")
    lookup = dict(enumerate(names))
    return lookup[date.weekday()]

# Name the weekday of every appointment and store it as a categorical column
raw_data["appointment_weekday"] = (
    raw_data["appointment_day"]
    .apply(get_weekday)
    .astype("category")
)
raw_data["appointment_weekday"].describe()

count        110522
unique            6
top       wednesday
freq          25866
Name: appointment_weekday, dtype: object

In [19]:
# Drop values below zero and equal 115. Copy the filtered frame so that it is
# independent of the original and later column assignments do not raise
# SettingWithCopyWarning.
raw_data = raw_data[(raw_data["age"] >= 0) & (raw_data["age"] != 115)].copy()
print("##### age #####\n")
print("Rows: {}\nColumns: {}\n".format(raw_data.shape[0], raw_data.shape[1]))
print(raw_data["age"].describe())

##### age #####

Rows: 110516
Columns: 14

count    110516.000000
mean         37.085861
std          23.104465
min           0.000000
25%          18.000000
50%          37.000000
75%          55.000000
max         102.000000
Name: age, dtype: float64


In [20]:
# Create age group feature
def age_groups(age):
    """Bucket a numeric age into "child", "adult" or "senior".

    Ages below 18 give "child", ages from 18 up to (but not including) 65
    give "adult", and every other value gives "senior".
    """
    if age < 18:
        return "child"
    if 18 <= age < 65:
        return "adult"
    return "senior"

# Label every row with its age bucket and store it as a categorical column
raw_data["age_group"] = (
    raw_data["age"]
    .apply(age_groups)
    .astype("category")
)
print(raw_data["age_group"].describe())

count     110516
unique         3
top        adult
freq       68742
Name: age_group, dtype: object


In [21]:
# Create same-day appointment feature: 1 when the appointment falls on the
# day it was scheduled (zero days elapsed), otherwise 0
raw_data["same_day_appointment"] = (
    raw_data["scheduled_to_appointment"]
    .eq(0)
    .astype("int64")
    .astype("category")
)
print(raw_data["same_day_appointment"].describe())

count     110516
unique         2
top            0
freq       71955
Name: same_day_appointment, dtype: int64


In [22]:
# Review column dtypes and non-null counts after cleaning and feature creation
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 110516 entries, 0 to 110526
Data columns (total 16 columns):
gender                      110516 non-null category
scheduled_day               110516 non-null datetime64[ns]
appointment_day             110516 non-null datetime64[ns]
age                         110516 non-null int64
neighborhood                110516 non-null category
scholarship                 110516 non-null category
hypertension                110516 non-null category
diabetes                    110516 non-null category
alcoholism                  110516 non-null category
handicap                    110516 non-null category
sms_received                110516 non-null category
no_show                     110516 non-null category
scheduled_to_appointment    110516 non-null int64
appointment_weekday         110516 non-null category
age_group                   110516 non-null category
same_day_appointment        110516 non-null category
dtypes: category(12), datetime64[ns

In [24]:
# Target: the no_show label of every appointment
target = raw_data["no_show"]

# Features: select the model inputs and create dummy variables in one step
# (the first level of each categorical column is dropped)
features = pd.get_dummies(
    raw_data[[
        "gender",
        "age",
        "age_group",
        "scholarship",
        "hypertension",
        "diabetes",
        "alcoholism",
        "handicap",
        "sms_received",
        "neighborhood",
        "appointment_weekday",
        "scheduled_to_appointment",
        "same_day_appointment",
    ]],
    drop_first=True,
)
features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 110516 entries, 0 to 110526
Data columns (total 100 columns):
age                                         110516 non-null int64
scheduled_to_appointment                    110516 non-null int64
gender_M                                    110516 non-null uint8
age_group_child                             110516 non-null uint8
age_group_senior                            110516 non-null uint8
scholarship_1                               110516 non-null uint8
hypertension_1                              110516 non-null uint8
diabetes_1                                  110516 non-null uint8
alcoholism_1                                110516 non-null uint8
handicap_1                                  110516 non-null uint8
handicap_2                                  110516 non-null uint8
handicap_3                                  110516 non-null uint8
handicap_4                                  110516 non-null uint8
sms_received_1                

### No sampling

In [27]:
# Feature matrix as a plain array
X = features.values

# Encode the string target labels as integers
le = LabelEncoder()
y = le.fit_transform(np.array(target.values))

# Hold out 20% of the rows for testing, keeping the class ratio in both parts
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Show which integer each label was mapped to
print(le.classes_)
print(le.transform(["No", "Yes"]))

['No' 'Yes']
[0 1]


In [28]:
# Null accuracy rate: the score obtained by predicting class 0 for every row
y_pred = np.zeros(len(y))
(y_pred == y).mean()

0.79811972926996999

### Up-sample

In [40]:
# Up-sample to balance target class: draw class-1 training rows with
# replacement until there are as many of them as class-0 rows.
# random_state pins the draw so the balanced training set is identical on
# every run (same seed as the train/test split).
X_upsampled, y_upsampled = resample(X_train[y_train == 1], y_train[y_train == 1], replace=True,
                                   n_samples=X_train[y_train == 0].shape[0],
                                   random_state=42)

# Stack the untouched class-0 rows on top of the resampled class-1 rows
X_train_up = np.vstack((X_train[y_train == 0], X_upsampled))
y_train_up = np.hstack((y_train[y_train == 0], y_upsampled))

# Null accuracy on the balanced training labels
y_pred = np.zeros(y_train_up.shape[0])
np.mean(y_pred == y_train_up)

0.5

## Deep learning model

In [41]:
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import SGD, Adam

In [51]:
# Time the whole build / train / evaluate sequence
start_time = time.time()

# NOTE(review): X_train_up is built from features.values with no scaling step,
# so the raw age and scheduled_to_appointment counts sit next to 0/1 dummies.
# Standardising them may help the tanh layers -- worth trying before tuning.
# NOTE(review): later Keras releases spell Adam's `lr` argument
# `learning_rate` -- confirm against the installed version.
model = Sequential()
# Size the input layer from the training matrix instead of hard-coding 100,
# so the model still builds if the number of dummy columns changes
model.add(Dense(4, input_shape=(X_train_up.shape[1],), activation="tanh"))
model.add(Dense(2, activation="tanh"))
model.add(Dense(1, activation="sigmoid"))
model.compile(Adam(lr=0.05), "binary_crossentropy", metrics=["accuracy"])
# Train on the up-sampled data, then score on the untouched test split
model.fit(X_train_up, y_train_up, epochs=50, verbose=0)
results = model.evaluate(X_test, y_test)

elapsed_time = time.time() - start_time



In [52]:
# Wall-clock seconds spent building, training and evaluating the model
print("------ {:.6f} seconds ------".format(elapsed_time))

------ 521.323253 seconds ------


In [53]:
# results[1] is read as the accuracy metric requested in model.compile;
# presumably results[0] is the loss -- confirm against the Keras version in use
print("accuracy score: {:0.3f}".format(results[1]))

accuracy score: 0.515


For the capstone project I was getting a testing accuracy of 0.58 on average for ML models with up-sampled data. This seems to be doing worse. Model tuning and layer additions may improve this, but at the cost of higher run-times. I'm skeptical that it's worth it. Deep learning doesn't appear to be a solution for this dataset. And that's a shame, because I really wanted to get this to work.