In [None]:
import numpy as np
import pandas as pd 
import sklearn
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score, roc_curve 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_predict
from sklearn.ensemble import RandomForestClassifier

%matplotlib inline

# Load the raw appointment records into the working frame.
df = pd.read_csv(filepath_or_buffer="Appointment-No-Show-Data.csv")

# Show the per-column dtypes, a divider line, and finally (rows, columns)
# as the cell's displayed value.
print(df.dtypes, "------------------------------------------", sep="\n")
df.shape

In [None]:
# Clean the target column and the two date columns, then derive the
# scheduling features used later (WeekdayScheduled, awaiting_time_days).

# Rename the hyphenated target column and encode it: 'No' -> 0, 'Yes' -> 1.
# NOTE: this mapping is not idempotent -- re-running the cell on an already
# encoded column turns every value into NaN, so re-run from the load cell.
df = df.rename(columns={"No-show": "no_show"})
df["no_show"] = df["no_show"].map({ 'No': 0, 'Yes': 1 })

# Parse both date columns as UTC and then drop the timezone so that both are
# tz-naive. Previously ScheduledDay was forced tz-naive (via .dt.date) while
# AppointmentDay kept whatever to_datetime returned; if the CSV timestamps
# carry a 'Z'/UTC offset (assumption -- the data is not visible here), pandas
# parses them as tz-aware and the subtraction below raises TypeError.
# ScheduledDay is truncated to midnight so the difference counts whole
# calendar days.
df["ScheduledDay"] = (
    pd.to_datetime(df["ScheduledDay"], utc=True)
    .dt.tz_localize(None)
    .dt.normalize()
)
df["AppointmentDay"] = pd.to_datetime(df["AppointmentDay"], utc=True).dt.tz_localize(None)

# Day of week on which the appointment was booked (Monday=0 ... Sunday=6).
df["WeekdayScheduled"] = df["ScheduledDay"].dt.weekday

# Whole days between booking and appointment; rows with a negative wait
# (appointment before booking) are discarded.
df["awaiting_time_days"] = (df["AppointmentDay"] - df["ScheduledDay"]).dt.days
df = df[df["awaiting_time_days"] >= 0]

In [None]:
# Keep only rows whose Age lies in the range 0..99 (both bounds exclusive
# of -1 and 100, exactly as before).
df = df[(df["Age"] > -1) & (df["Age"] < 100)]

# Question 1
# no_show holds integers 0/1 after the earlier mapping, so it must be compared
# with the integer 1. The previous query compared against the string "1",
# which never matches an integer column and therefore counted zero no-shows.
no_show = int((df["no_show"] == 1).sum())

# Share of no-shows expressed as a whole-number percentage.
no_show_ratio = int(round(no_show/len(df)*100))

print("Total proportion of patients with no show:",no_show_ratio)


In [None]:
# Performing dummies operation
# One-hot encode Gender and Handcap. Handcap is cast to str first so that
# get_dummies treats each distinct value as its own category.
df = df.assign(Handcap=df['Handcap'].astype('str'))

# dtype=int pins the indicator columns to integers. pandas >= 2.0 defaults to
# bool, and a frame mixing int and bool columns turns into an object array
# when `.values` is taken for modelling.
df_cat = pd.get_dummies(df[['Gender', 'Handcap']], drop_first = True, dtype = int)
cols_all_cat = list(df_cat.columns)
print(df_cat.head())

# Drop indicator columns left over from a previous run of this cell before
# attaching the fresh ones, so re-running does not create duplicate columns.
df = pd.concat([df.drop(columns = cols_all_cat, errors = 'ignore'), df_cat], axis = 1)

# Base predictors plus the target; the one-hot columns are appended below.
cols_input = ['Scholarship','Hipertension', 'Diabetes', 'Alcoholism',
       'SMS_received', 'Age', 'awaiting_time_days', 'WeekdayScheduled','no_show']

df_final = df[cols_input+cols_all_cat]

In [None]:
# Train and test data

# Shuffle every row with a fixed seed and renumber the index from 0, so the
# index labels are unique and can be used to separate the two partitions.
df_final = df_final.sample(frac = 1, random_state = 20).reset_index(drop = True)

# Hold out 20% of the shuffled rows for testing; whatever is not in the
# test partition becomes the training partition.
df_test = df_final.sample(frac = 0.20, random_state = 20)
df_train = df_final.drop(index = df_test.index)

In [None]:
def calc_prevalence(y_actual):
    """Return the fraction of positive labels in ``y_actual``.

    Parameters
    ----------
    y_actual : array-like
        One-dimensional sequence of 0/1 (or boolean) labels.

    Returns
    -------
    float
        Sum of the labels divided by their count, or ``nan`` when
        ``y_actual`` is empty (previously this raised ZeroDivisionError).
    """
    labels = np.asarray(y_actual)
    if len(labels) == 0:
        return float("nan")
    # Vectorised sum instead of the built-in sum(), which walks a NumPy
    # array one element at a time.
    return labels.sum() / len(labels)

# Report the share of no-shows in each partition before any rebalancing.
train_prevalence = calc_prevalence(df_train.no_show.values)
test_prevalence = calc_prevalence(df_test.no_show.values)

print('Train prevalence(n = %d):%.3f'%(len(df_train), train_prevalence))
print('Test prevalence(n = %d):%.3f'%(len(df_test), test_prevalence))

In [None]:
# Balancing the data

# Partition the training rows by label with a boolean mask.
rows_pos = df_train.no_show == 1
df_train_pos = df_train[rows_pos]
df_train_neg = df_train[~rows_pos]

# Both classes are sampled down to the size of the smaller one.
n = min(len(df_train_pos), len(df_train_neg))
balanced_parts = [
    df_train_pos.sample(n = n, random_state = 20),
    df_train_neg.sample(n = n, random_state = 20),
]

# Stack the two equally sized samples under a fresh 0..2n-1 index.
df_train = pd.concat(balanced_parts, axis = 0, ignore_index = True)

# Shuffle so the classes are interleaved rather than stacked, then renumber.
df_train = df_train.sample(frac = 1, random_state = 20).reset_index(drop = True)

balanced_prevalence = calc_prevalence(df_train.no_show.values)
print('Train balanced prevalence(n = %d):%.3f'%(len(df_train), balanced_prevalence))

In [None]:
# Build the feature matrices and label vectors for modelling.
# The feature list is the base predictors followed by the one-hot columns;
# the target column is deliberately left out.
cols_input = [
    'Scholarship', 'Hipertension', 'Diabetes', 'Alcoholism',
    'SMS_received', 'Age', 'awaiting_time_days', 'WeekdayScheduled',
    *cols_all_cat,
]

X_train, y_train = df_train[cols_input].values, df_train['no_show'].values
X_test, y_test = df_test[cols_input].values, df_test['no_show'].values

print('Training shapes:',X_train.shape, y_train.shape)
print('Testing shapes:',X_test.shape, y_test.shape)
