# Medical Appointment No Shows Capstone

### 4. Pre-Processing And Training Data Development

In [1]:
import os

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# SKLEARN
from sklearn.preprocessing import MinMaxScaler, StandardScaler, PowerTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

In [3]:
# Read the output of the previous step, parsing both timestamp columns
# as datetimes so the .dt accessor can be used on them later.
path = 'data/step2_output.csv'
date_columns = ["ScheduledDay", "AppointmentDay"]
df = pd.read_csv(path, index_col=None, parse_dates=date_columns)
df.head()

Unnamed: 0,PatientId,AppointmentId,Gender,ScheduledDay,AppointmentDay,Age,Neighborhood,Scholarship,Hypertension,Diabetes,Alcoholism,Handicap,SMSSent,NoShow,WaitingDays
0,29872499824296,5642903,F,2016-04-29 18:38:08,2016-04-29,62,JARDIM DA PENHA,0,1,0,0,0,0,No,-1
1,558997776694438,5642503,M,2016-04-29 16:08:27,2016-04-29,56,JARDIM DA PENHA,0,0,0,0,0,0,No,-1
2,4262962299951,5642549,F,2016-04-29 16:19:04,2016-04-29,62,MATA DA PRAIA,0,0,0,0,0,0,No,-1
3,867951213174,5642828,F,2016-04-29 17:29:31,2016-04-29,8,PONTAL DE CAMBURI,0,0,0,0,0,0,No,-1
4,8841186448183,5642494,F,2016-04-29 16:07:23,2016-04-29,56,JARDIM DA PENHA,0,1,1,0,0,0,No,-1


In [4]:
# Remove the two identifier columns from the working frame.
df = df.drop(columns=["PatientId", "AppointmentId"])

In [5]:
# Expand each timestamp column into integer year / month / day columns
# (suffixes _Y, _M, _D) and then remove the original datetime column.
for date_col in ('ScheduledDay', 'AppointmentDay'):
    date_parts = df[date_col].dt
    df[date_col + '_Y'] = date_parts.year
    df[date_col + '_M'] = date_parts.month
    df[date_col + '_D'] = date_parts.day
    df.drop([date_col], axis=1, inplace=True)

In [6]:
# DataFrame.info() writes its report to stdout itself and returns None,
# so it is called directly; wrapping it in print() would append a stray "None".
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110517 entries, 0 to 110516
Data columns (total 17 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   Gender            110517 non-null  object
 1   Age               110517 non-null  int64 
 2   Neighborhood      110517 non-null  object
 3   Scholarship       110517 non-null  int64 
 4   Hypertension      110517 non-null  int64 
 5   Diabetes          110517 non-null  int64 
 6   Alcoholism        110517 non-null  int64 
 7   Handicap          110517 non-null  int64 
 8   SMSSent           110517 non-null  int64 
 9   NoShow            110517 non-null  object
 10  WaitingDays       110517 non-null  int64 
 11  ScheduledDay_Y    110517 non-null  int64 
 12  ScheduledDay_M    110517 non-null  int64 
 13  ScheduledDay_D    110517 non-null  int64 
 14  AppointmentDay_Y  110517 non-null  int64 
 15  AppointmentDay_M  110517 non-null  int64 
 16  AppointmentDay_D  110517 non-null  int

In [7]:
# List the object-dtype columns, i.e. the ones that still need encoding.
df.select_dtypes(include=['object'])

Unnamed: 0,Gender,Neighborhood,NoShow
0,F,JARDIM DA PENHA,No
1,M,JARDIM DA PENHA,No
2,F,MATA DA PRAIA,No
3,F,PONTAL DE CAMBURI,No
4,F,JARDIM DA PENHA,No
...,...,...,...
110512,F,MARIA ORTIZ,No
110513,F,MARIA ORTIZ,No
110514,F,MARIA ORTIZ,No
110515,F,MARIA ORTIZ,No


## LabelEncoder 

### df_le1

In [8]:
# Independent frame holding only the three string columns to be label-encoded.
df_le1 = df[['Gender', 'Neighborhood', 'NoShow']].copy()

In [9]:
le = LabelEncoder()

# Replace each string column with integer codes. The single encoder is
# refit on every column, so afterwards it only holds the NoShow classes.
for column in ('Gender', 'Neighborhood', 'NoShow'):
    df_le1[column] = le.fit_transform(df_le1[column])

df_le1.head()

Unnamed: 0,Gender,Neighborhood,NoShow
0,0,38,0
1,1,38,0
2,0,44,0
3,0,53,0
4,0,38,0


In [10]:
# Display the first rows of df after df_le1 was label-encoded above.
df.head()

Unnamed: 0,Gender,Age,Neighborhood,Scholarship,Hypertension,Diabetes,Alcoholism,Handicap,SMSSent,NoShow,WaitingDays,ScheduledDay_Y,ScheduledDay_M,ScheduledDay_D,AppointmentDay_Y,AppointmentDay_M,AppointmentDay_D
0,F,62,JARDIM DA PENHA,0,1,0,0,0,0,No,-1,2016,4,29,2016,4,29
1,M,56,JARDIM DA PENHA,0,0,0,0,0,0,No,-1,2016,4,29,2016,4,29
2,F,62,MATA DA PRAIA,0,0,0,0,0,0,No,-1,2016,4,29,2016,4,29
3,F,8,PONTAL DE CAMBURI,0,0,0,0,0,0,No,-1,2016,4,29,2016,4,29
4,F,56,JARDIM DA PENHA,0,1,1,0,0,0,No,-1,2016,4,29,2016,4,29


### df_le2

In [11]:
# Take an explicit, independent copy of the full frame. A bare full slice
# (df.loc[:]) is not guaranteed to be decoupled from df, and the encoding
# cell below overwrites columns while later cells still need the raw df.
df_le2 = df.copy()

In [12]:
le = LabelEncoder()

# Integer-encode only the three string columns of the full frame; the
# numeric columns are left as they are. The encoder is refit per column.
for column in ('Gender', 'Neighborhood', 'NoShow'):
    df_le2[column] = le.fit_transform(df_le2[column])

df_le2.head()

Unnamed: 0,Gender,Age,Neighborhood,Scholarship,Hypertension,Diabetes,Alcoholism,Handicap,SMSSent,NoShow,WaitingDays,ScheduledDay_Y,ScheduledDay_M,ScheduledDay_D,AppointmentDay_Y,AppointmentDay_M,AppointmentDay_D
0,0,62,38,0,1,0,0,0,0,0,-1,2016,4,29,2016,4,29
1,1,56,38,0,0,0,0,0,0,0,-1,2016,4,29,2016,4,29
2,0,62,44,0,0,0,0,0,0,0,-1,2016,4,29,2016,4,29
3,0,8,53,0,0,0,0,0,0,0,-1,2016,4,29,2016,4,29
4,0,56,38,0,1,1,0,0,0,0,-1,2016,4,29,2016,4,29


In [13]:
# Display the first rows of df after df_le2 was label-encoded above.
df.head()

Unnamed: 0,Gender,Age,Neighborhood,Scholarship,Hypertension,Diabetes,Alcoholism,Handicap,SMSSent,NoShow,WaitingDays,ScheduledDay_Y,ScheduledDay_M,ScheduledDay_D,AppointmentDay_Y,AppointmentDay_M,AppointmentDay_D
0,F,62,JARDIM DA PENHA,0,1,0,0,0,0,No,-1,2016,4,29,2016,4,29
1,M,56,JARDIM DA PENHA,0,0,0,0,0,0,No,-1,2016,4,29,2016,4,29
2,F,62,MATA DA PRAIA,0,0,0,0,0,0,No,-1,2016,4,29,2016,4,29
3,F,8,PONTAL DE CAMBURI,0,0,0,0,0,0,No,-1,2016,4,29,2016,4,29
4,F,56,JARDIM DA PENHA,0,1,1,0,0,0,No,-1,2016,4,29,2016,4,29


## Pandas Get_Dummies Approach

In [14]:
features = ['Gender','Neighborhood','NoShow']

# One indicator column per category level, dropping the first level of each
# feature. Prefixes follow the order of `features`, so NoShow becomes the
# single column `Show_Yes`, which is 1 when NoShow == 'Yes'.
dummy_columns = pd.get_dummies(df[features], prefix=['G','N','Show'], drop_first=True)

# Put the indicators next to the original frame, then remove the raw
# string columns they replace.
combined = pd.concat([df, dummy_columns], axis=1)
df_gd = combined.drop(features, axis=1)

print(df_gd.shape)
df_gd.head()

(110517, 95)


Unnamed: 0,Age,Scholarship,Hypertension,Diabetes,Alcoholism,Handicap,SMSSent,WaitingDays,ScheduledDay_Y,ScheduledDay_M,...,N_SEGURANÇA DO LAR,N_SOLON BORGES,N_SÃO BENEDITO,N_SÃO CRISTÓVÃO,N_SÃO JOSÉ,N_SÃO PEDRO,N_TABUAZEIRO,N_UNIVERSITÁRIO,N_VILA RUBIM,Show_Yes
0,62,0,1,0,0,0,0,-1,2016,4,...,0,0,0,0,0,0,0,0,0,0
1,56,0,0,0,0,0,0,-1,2016,4,...,0,0,0,0,0,0,0,0,0,0
2,62,0,0,0,0,0,0,-1,2016,4,...,0,0,0,0,0,0,0,0,0,0
3,8,0,0,0,0,0,0,-1,2016,4,...,0,0,0,0,0,0,0,0,0,0
4,56,0,1,1,0,0,0,-1,2016,4,...,0,0,0,0,0,0,0,0,0,0


## Sklearn's OneHotEncoder Approach

In [15]:
categorical_features = ['Gender','NoShow','Neighborhood']

# The `sparse=` keyword was renamed to `sparse_output=` in scikit-learn 1.2
# and removed in 1.4. Sparse output is the default in every release, so the
# argument is simply omitted to keep this cell working across versions.
ohe = OneHotEncoder(drop='first', handle_unknown='error')
encoded_matrix = ohe.fit_transform(df[categorical_features])

# `get_feature_names` was removed in scikit-learn 1.2; prefer its replacement
# `get_feature_names_out` and fall back only on older installations.
if hasattr(ohe, 'get_feature_names_out'):
    encoded_names = ohe.get_feature_names_out(categorical_features)
else:
    encoded_names = ohe.get_feature_names(categorical_features)

# Reuse df's index so the column-wise concat below lines rows up by label
# even if df does not carry a default 0..n-1 index.
encoded = pd.DataFrame(encoded_matrix.toarray(),
                       columns=encoded_names,
                       index=df.index)

dropped_cols = df.drop(columns=categorical_features)

# Non-categorical columns first, followed by the one-hot indicator columns.
df_ohe = pd.concat([dropped_cols, encoded], axis=1)

df_ohe

Unnamed: 0,Age,Scholarship,Hypertension,Diabetes,Alcoholism,Handicap,SMSSent,WaitingDays,ScheduledDay_Y,ScheduledDay_M,...,Neighborhood_SANTOS REIS,Neighborhood_SEGURANÇA DO LAR,Neighborhood_SOLON BORGES,Neighborhood_SÃO BENEDITO,Neighborhood_SÃO CRISTÓVÃO,Neighborhood_SÃO JOSÉ,Neighborhood_SÃO PEDRO,Neighborhood_TABUAZEIRO,Neighborhood_UNIVERSITÁRIO,Neighborhood_VILA RUBIM
0,62,0,1,0,0,0,0,-1,2016,4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,56,0,0,0,0,0,0,-1,2016,4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,62,0,0,0,0,0,0,-1,2016,4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,8,0,0,0,0,0,0,-1,2016,4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,56,0,1,1,0,0,0,-1,2016,4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110512,56,0,0,0,0,0,1,34,2016,5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
110513,51,0,0,0,0,0,1,34,2016,5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
110514,21,0,0,0,0,0,1,40,2016,4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
110515,38,0,0,0,0,0,1,40,2016,4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
# Show the column labels generated by the fitted encoder.
# `get_feature_names` was removed in scikit-learn 1.2, so use
# `get_feature_names_out` when it exists and fall back otherwise.
feature_names_fn = getattr(ohe, 'get_feature_names_out', None) or ohe.get_feature_names
feature_names_fn(input_features=categorical_features)

array(['Gender_M', 'NoShow_Yes', 'Neighborhood_ANDORINHAS',
       'Neighborhood_ANTÔNIO HONÓRIO', 'Neighborhood_ARIOVALDO FAVALESSA',
       'Neighborhood_BARRO VERMELHO', 'Neighborhood_BELA VISTA',
       'Neighborhood_BENTO FERREIRA', 'Neighborhood_BOA VISTA',
       'Neighborhood_BONFIM', 'Neighborhood_CARATOÍRA',
       'Neighborhood_CENTRO', 'Neighborhood_COMDUSA',
       'Neighborhood_CONQUISTA', 'Neighborhood_CONSOLAÇÃO',
       'Neighborhood_CRUZAMENTO', 'Neighborhood_DA PENHA',
       'Neighborhood_DE LOURDES', 'Neighborhood_DO CABRAL',
       'Neighborhood_DO MOSCOSO', 'Neighborhood_DO QUADRO',
       'Neighborhood_ENSEADA DO SUÁ', 'Neighborhood_ESTRELINHA',
       'Neighborhood_FONTE GRANDE', 'Neighborhood_FORTE SÃO JOÃO',
       'Neighborhood_FRADINHOS', 'Neighborhood_GOIABEIRAS',
       'Neighborhood_GRANDE VITÓRIA', 'Neighborhood_GURIGICA',
       'Neighborhood_HORTO', 'Neighborhood_ILHA DAS CAIEIRAS',
       'Neighborhood_ILHA DE SANTA MARIA', 'Neighborhood_ILHA DO BOI'

## Export Data

In [17]:
# LabelEncoder 1: write df_le1 (only the three label-encoded columns
# Gender, Neighborhood and NoShow) to CSV without the row index.
df_le1.to_csv('data/df_le1.csv',index=False)

In [18]:
# LabelEncoder 2: write df_le2 (all columns, with Gender, Neighborhood and
# NoShow label-encoded) to CSV without the row index.
df_le2.to_csv('data/df_le2.csv',index=False)

In [19]:
# get_dummies data: write df_gd (the pandas dummy-encoded frame) to CSV
# without the row index.
df_gd.to_csv('data/df_gd.csv',index=False)

In [20]:
# OneHotEncoder data: write df_ohe (the scikit-learn one-hot encoded frame)
# to CSV without the row index.
df_ohe.to_csv('data/df_ohe.csv',index=False)