In [40]:
import glob as glob
from datetime import datetime

import numpy as np
import pandas as pd
import scipy as sp

def subset_permute(n_subset, n_total, seed=None):
    """Return ``n_total`` shuffled labels drawn as evenly as possible from ``0..n_subset-1``.

    The labels are built by repeating ``0, 1, ..., n_subset-1`` until
    ``n_total`` entries exist (the last repetition may be truncated), so the
    count of any two labels differs by at most one. The array is then shuffled.
    This gives a far more even split than drawing each label independently
    with ``np.random.randint``.

    Parameters
    ----------
    n_subset : int
        Number of distinct labels (e.g. number of cross-validation folds).
    n_total : int
        Length of the returned array (e.g. number of samples).
    seed : int or None, optional
        Seed for the shuffle. ``None`` (the default) draws fresh entropy on
        every call; pass an integer to get a reproducible assignment.

    Returns
    -------
    numpy.ndarray
        1-D integer array of length ``n_total``, e.g. ``[2, 0, 1, 0, 2, 1, 0]``
        for ``n_subset=3, n_total=7``.
    """
    # Integer ceiling of n_total / n_subset; avoids going through floats.
    n_repeats = -(-n_total // n_subset)
    pm = np.tile(np.arange(n_subset), n_repeats)[:n_total]

    # A private generator keeps the caller's global NumPy random state intact.
    # One shuffle is already a uniform permutation, so repeating it adds nothing.
    rng = np.random.RandomState(seed)
    rng.shuffle(pm)
    return pm

def parse_datetime(raw_datetime):
    """Derive weekday and time-of-day features from ``'M/D/Y H:M'`` strings.

    Each entry is split on a single space into a date token and a time token.
    The date token is read as ``month/day/year`` and the time token as
    ``hour:minute``; anything after the minute (seconds, extra tokens) is
    ignored. Entries that are not strings (e.g. NaN) or that do not contain
    both tokens are treated as missing.

    Parameters
    ----------
    raw_datetime : pandas.Series or array-like of str
        Raw timestamp strings. When a Series is given, its index is carried
        over to the three returned Series so they align with the input.

    Returns
    -------
    weekday : pandas.Series (float64)
        ``datetime.weekday()`` of each entry (Monday == 0), NaN when missing.
        Kept as float so missing values fit and the column stays numeric.
    timeofday : pandas.Series (object)
        ``'AM'`` for hours 8-11, ``'PM'`` for hours 12-17, ``'OFF'``
        otherwise; NaN when missing.
    datetime_list : pandas.Series (object)
        The parsed ``datetime`` of each entry, NaN when missing.

    Raises
    ------
    ValueError, IndexError
        If an entry has both tokens but they are not in the expected
        ``M/D/Y`` / ``H:M`` numeric form.
    """
    weekdays = []
    day_parts = []
    stamps = []

    # Iterate by position, not by label, so any Series index is accepted.
    for entry in raw_datetime:
        # Non-strings (NaN, None, ...) cannot be split; treat them as missing.
        tokens = entry.split(' ') if isinstance(entry, str) else []

        if len(tokens) < 2:
            weekdays.append(np.nan)
            day_parts.append(np.nan)
            stamps.append(np.nan)
            continue

        date_parts = tokens[0].split('/')
        time_parts = tokens[1].split(':')
        stamp = datetime(
            int(date_parts[2]),  # year
            int(date_parts[0]),  # month
            int(date_parts[1]),  # day
            int(time_parts[0]),  # hour
            int(time_parts[1]),  # minute
        )

        stamps.append(stamp)
        weekdays.append(stamp.weekday())
        if 8 <= stamp.hour < 12:
            day_parts.append('AM')
        elif 12 <= stamp.hour <= 17:
            day_parts.append('PM')
        else:
            day_parts.append('OFF')

    # Build each Series once instead of enlarging it element by element.
    index = raw_datetime.index if isinstance(raw_datetime, pd.Series) else None
    weekday = pd.Series(weekdays, index=index, dtype='float64')
    timeofday = pd.Series(day_parts, index=index, dtype=object)
    datetime_list = pd.Series(stamps, index=index, dtype=object)
    return weekday, timeofday, datetime_list


In [None]:
# Load the raw dataset.
data_raw_fname = 'be223a_dataset.csv'
data_raw = pd.read_csv(data_raw_fname)

# Derive weekday / time-of-day features from the scheduled timestamp column.
raw_datetime = data_raw['ScheduledDTTM_D']
weekday, timeofday, datetimeobj = parse_datetime(raw_datetime)

# Put the selected raw columns and the derived columns side by side
# (pd.concat with axis=1 aligns the two frames on their row index).
raw_columns = data_raw[['Gender', 'Age', 'OrgCode', 'Modality']]
derived_columns = pd.DataFrame({'Weekday': weekday, 'Timeofday': timeofday})
features = pd.concat([raw_columns, derived_columns], axis=1)

# One-hot encode the features and save them. The row index is written as the
# first CSV column.
features_encoded = pd.get_dummies(features)
features_encoded.to_csv('features_encoded.csv')



In [147]:
# Build a class-balanced, fold-annotated table from the encoded features.
data_raw_fname = 'be223a_dataset.csv'
data_raw = pd.read_csv(data_raw_fname)

cancel_list = data_raw['ReasonDesc']
ct = cancel_list.value_counts()  # frequency of each reason, for inspection

# Label == 1 when the reason is one of `toinclude`, else 0 (NaN reasons -> 0).
toinclude = np.array(['CANCELLED BY PT', 'PT NO SHOW'])
labels = cancel_list.isin(toinclude).astype(int)

# The first CSV column is the row index saved with the encoded features;
# keep it as 'orig_index' so rows can be traced back after resampling.
features_encoded = pd.read_csv('features_encoded.csv')
features_encoded = features_encoded.rename(columns={features_encoded.columns[0]: 'orig_index'})
features_encoded = pd.concat([features_encoded, pd.DataFrame({'Labels': labels})], axis=1)
show = features_encoded[features_encoded['Labels'] == 0]
noshow = features_encoded[features_encoded['Labels'] == 1]

# Balance the classes by randomly down-sampling the larger one. Taking the
# smaller class size means this works whichever class is the majority, and the
# fixed seed makes the selected rows reproducible across runs.
RANDOM_SEED = 0
n_per_class = min(show.shape[0], noshow.shape[0])
show_subset = show.sample(n=n_per_class, random_state=RANDOM_SEED)
if noshow.shape[0] > n_per_class:
    noshow_subset = noshow.sample(n=n_per_class, random_state=RANDOM_SEED)
else:
    noshow_subset = noshow  # already the smaller class: keep every row
features_final = pd.concat([noshow_subset, show_subset], axis=0)
features_final = features_final.reset_index(drop=True)

# Assign each row to one of n_fold folds with near-equal fold sizes.
n_fold = 5
foldlist = subset_permute(n_fold, features_final.shape[0])  # fold number per row
features_final = pd.concat([features_final, pd.DataFrame({'Folds': foldlist})], axis=1)

# Move the first column ('orig_index') to the end of the frame.
features_final = pd.concat([features_final.iloc[:, 1:], features_final.iloc[:, 0]], axis=1)
display(features_final)
features_final.to_csv('features_encoded_processed.csv', index=False)


Unnamed: 0,Age,Weekday,Gender_F,Gender_M,Gender_U,OrgCode_ASM,OrgCode_AWW,OrgCode_CCHS,OrgCode_CKHC,OrgCode_JSMO,...,Modality_RG,Modality_SR,Modality_US,Modality_XA,Timeofday_AM,Timeofday_OFF,Timeofday_PM,Labels,Folds,orig_index
0,57,1.0,1,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,1,0,39
1,72,2.0,1,0,0,0,0,0,0,0,...,0,0,1,0,0,0,1,1,2,40
2,72,1.0,1,0,0,0,0,0,0,0,...,0,0,1,0,0,0,1,1,2,41
3,58,4.0,0,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,4,57
4,44,0.0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,1,66
5,17,5.0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,3,74
6,65,1.0,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,77
7,65,1.0,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,1,78
8,65,1.0,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,1,79
9,63,3.0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,96
