In [38]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime as dt

# You can configure the format of the images: ‘png’, ‘retina’, ‘jpeg’, ‘svg’, ‘pdf’.
%config InlineBackend.figure_format = 'svg'
# this statement allows the visuals to render within your Jupyter Notebook
%matplotlib inline 

# Data Cleaning & Feature Engineering

In [2]:
### Load the raw appointment records from the Kaggle CSV export into `df`
df = pd.read_csv(filepath_or_buffer='../data/KaggleV2-May-2016.csv')

In [3]:
### Preview the raw frame (the rendered output is truncated to the first and last rows)
df

Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show
0,2.987250e+13,5642903,F,2016-04-29T18:38:08Z,2016-04-29T00:00:00Z,62,JARDIM DA PENHA,0,1,0,0,0,0,No
1,5.589978e+14,5642503,M,2016-04-29T16:08:27Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,0,0,0,0,0,No
2,4.262962e+12,5642549,F,2016-04-29T16:19:04Z,2016-04-29T00:00:00Z,62,MATA DA PRAIA,0,0,0,0,0,0,No
3,8.679512e+11,5642828,F,2016-04-29T17:29:31Z,2016-04-29T00:00:00Z,8,PONTAL DE CAMBURI,0,0,0,0,0,0,No
4,8.841186e+12,5642494,F,2016-04-29T16:07:23Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,1,1,0,0,0,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110522,2.572134e+12,5651768,F,2016-05-03T09:15:35Z,2016-06-07T00:00:00Z,56,MARIA ORTIZ,0,0,0,0,0,1,No
110523,3.596266e+12,5650093,F,2016-05-03T07:27:33Z,2016-06-07T00:00:00Z,51,MARIA ORTIZ,0,0,0,0,0,1,No
110524,1.557663e+13,5630692,F,2016-04-27T16:03:52Z,2016-06-07T00:00:00Z,21,MARIA ORTIZ,0,0,0,0,0,1,No
110525,9.213493e+13,5630323,F,2016-04-27T15:09:23Z,2016-06-07T00:00:00Z,38,MARIA ORTIZ,0,0,0,0,0,1,No


In [4]:
### Summary statistics for the numeric columns.
### Things to note in the output: the minimum Age is -1 and the maximum Handcap is 4.
df.describe()

Unnamed: 0,PatientId,AppointmentID,Age,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received
count,110527.0,110527.0,110527.0,110527.0,110527.0,110527.0,110527.0,110527.0,110527.0
mean,147496300000000.0,5675305.0,37.088874,0.098266,0.197246,0.071865,0.0304,0.022248,0.321026
std,256094900000000.0,71295.75,23.110205,0.297675,0.397921,0.258265,0.171686,0.161543,0.466873
min,39217.84,5030230.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,4172614000000.0,5640286.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,31731840000000.0,5680573.0,37.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,94391720000000.0,5725524.0,55.0,0.0,0.0,0.0,0.0,0.0,1.0
max,999981600000000.0,5790484.0,115.0,1.0,1.0,1.0,1.0,4.0,1.0


In [5]:
df.info() ### check the dtypes and look for missing values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110527 entries, 0 to 110526
Data columns (total 14 columns):
PatientId         110527 non-null float64
AppointmentID     110527 non-null int64
Gender            110527 non-null object
ScheduledDay      110527 non-null object
AppointmentDay    110527 non-null object
Age               110527 non-null int64
Neighbourhood     110527 non-null object
Scholarship       110527 non-null int64
Hipertension      110527 non-null int64
Diabetes          110527 non-null int64
Alcoholism        110527 non-null int64
Handcap           110527 non-null int64
SMS_received      110527 non-null int64
No-show           110527 non-null object
dtypes: float64(1), int64(8), object(5)
memory usage: 11.8+ MB


In [6]:
### Remove the invalid negative age (-1) by marking it as missing.
### The result is assigned back instead of calling replace(..., inplace=True) on
### the column selection: that chained in-place call is not guaranteed to write
### through to df (it is a no-op under pandas Copy-on-Write).
### np.nan is used because the np.NaN alias was removed in NumPy 2.0.
df['Age'] = df['Age'].replace(-1, np.nan)

In [7]:
### Cast the target and the gender feature as 0/1 flags
### No-show: Yes = 1, No = 0
### Gender:  F = 1,   M = 0
### Explicit comparisons replace the earlier trick of assigning the two-column
### pd.get_dummies() frame to a single column (which relied on pandas silently
### keeping only the first dummy column and had to be applied twice to flip the
### target; current pandas raises a ValueError for that assignment).
### Run this cell once, on the raw string labels.
df['No-show'] = (df['No-show'] == 'Yes').astype(int)
df['Gender'] = (df['Gender'] == 'F').astype(int)

In [86]:
### Separate the date part from the time part of both ISO timestamps
### (the text before / after the 'T'), splitting each column only once.

scheduled_parts = df['ScheduledDay'].str.split(pat='T', expand = True)
appointment_parts = df['AppointmentDay'].str.split(pat='T', expand = True)

df['ScheduledDay_Day'] = scheduled_parts[0]
df['AppointmentDay_Day'] = appointment_parts[0]

df['AppointmentDay_Time'] = appointment_parts[1]
df['ScheduledDay_Time'] = scheduled_parts[1]

In [9]:
df['AppointmentDay_Time'].value_counts() ### there are no time stamps for appointment day (every value is 00:00:00Z)

00:00:00Z    110527
Name: AppointmentDay_Time, dtype: int64

In [82]:
### Remove the trailing Z from the scheduled time.
### rstrip only touches the end of the string, which is what is intended here;
### translate({ord('Z'): None}) would delete every 'Z' wherever it appears.
### This must run while the column still holds the string pieces produced by
### the split on 'T' (the .str accessor only works on string values).
df['ScheduledDay_Time'] = df['ScheduledDay_Time'].str.rstrip('Z')

AttributeError: Can only use .str accessor with string values!

In [87]:
### Hour of the day at which the appointment was scheduled:
### the first two characters of the HH:MM:SS time string (kept as a string)
df['ScheduledDay_Hours'] = df['ScheduledDay_Time'].str[:2]

In [80]:
### Cast the date strings (YYYY-MM-DD, the part before the 'T') to datetime.
### An explicit format is passed instead of infer_datetime_format=True, which
### is deprecated in recent pandas; the explicit format is also strict about
### malformed dates.
df['ScheduledDay_Day'] = pd.to_datetime(df['ScheduledDay_Day'], format='%Y-%m-%d')
df['AppointmentDay_Day'] = pd.to_datetime(df['AppointmentDay_Day'], format='%Y-%m-%d')

AttributeError: 'Series' object has no attribute 'time'

In [45]:
### Time lag between booking and appointment: AppointmentDay minus ScheduledDay (a timedelta)
df['day_difference'] = df['AppointmentDay_Day'].sub(df['ScheduledDay_Day'])

In [46]:
df['day_difference'].describe() ### there seem to be negative day differences (min is -6 days)...

count                     110527
mean     10 days 04:24:31.828602
std      15 days 06:07:11.673762
min            -6 days +00:00:00
25%              0 days 00:00:00
50%              4 days 00:00:00
75%             15 days 00:00:00
max            179 days 00:00:00
Name: day_difference, dtype: object

In [14]:
### A difference of -1 means less than a day, but when I checked the entries with
### differences below -1, the scheduled date is clearly after the appointment date.
### I suspect that the negative values arise from data-entry errors
### where the scheduled and appointment days were swapped.

In [47]:
### Cast the day lag into absolute whole days.
### The lag is recomputed from the two datetime columns so the cell can be
### re-run safely, .dt.days turns the timedelta into an integer day count, and
### .abs() actually applies the "absolute value" step: negative lags are
### treated as swapped scheduled/appointment dates.
df['day_difference'] = (df['AppointmentDay_Day'] - df['ScheduledDay_Day']).dt.days.abs()
df['day_difference']

0          0
1          0
2          0
3          0
4          0
          ..
110522    35
110523    35
110524    41
110525    41
110526    41
Name: day_difference, Length: 110527, dtype: int64

In [17]:
df.Neighbourhood.value_counts().describe()

### there are way too many distinct neighbourhoods (81) to create dummies.
### couldn't find additional info about the neighbourhoods to append

count      81.000000
mean     1364.530864
std      1369.199539
min         1.000000
25%       413.000000
50%       851.000000
75%      2018.000000
max      7717.000000
Name: Neighbourhood, dtype: float64

In [18]:
### This feature is supposed to be true/false, so collapse the levels 2, 3 and 4 into 1.
### The result is assigned back rather than using replace(..., inplace=True) on
### the column selection, which is not guaranteed to modify df (it is a no-op
### under pandas Copy-on-Write).
df['Handcap'] = df['Handcap'].replace([2, 3, 4], 1)

In [19]:
df.AppointmentID.value_counts() ### all appointment ids are unique (110527 distinct ids for 110527 rows)

5769215    1
5731652    1
5707080    1
5702986    1
5715276    1
          ..
5586290    1
5584243    1
5598584    1
5602682    1
5771266    1
Name: AppointmentID, Length: 110527, dtype: int64

In [20]:
### count the appointments per patient (a count above 1 means it's a repeat patient)

patient_visits = df['PatientId'].value_counts()

In [21]:
### Build a lookup table with the number of appointments per patient.
### Note: this counts every appointment of the patient in the dataset,
### not only the ones that happened before a given row.
total_appointments = (
    df.groupby('PatientId')['AppointmentID']
    .count()
    .rename('total_appointments')
    .reset_index()
)

In [22]:
### Attach the per-patient appointment count to every row.
### Any existing 'total_appointments' column is dropped first so that
### re-running this cell does not create total_appointments_x / _y duplicates.
df = (
    df.drop(columns='total_appointments', errors='ignore')
    .merge(total_appointments, how = 'left', on = 'PatientId')
)

In [56]:
### Flag regular patients: 1 when the patient has more than one appointment, else 0
### (the threshold can be changed later)
df['regular_patient'] = (df['total_appointments'] > 1).astype(int)

In [67]:
### Day of the week for the appointment and for the booking,
### using ISO numbering (isoweekday: Monday = 1 ... Sunday = 7)

df['appointment_weekday'] = [stamp.isoweekday() for stamp in df['AppointmentDay_Day']]
df['schedule_weekday'] = [stamp.isoweekday() for stamp in df['ScheduledDay_Day']]

In [71]:
df['appointment_weekday'].value_counts() ### some Saturday appointments (isoweekday 6 = Saturday)... are these special cases?

3    25867
2    25640
1    22715
5    19019
4    17247
6       39
Name: appointment_weekday, dtype: int64

In [70]:
df['schedule_weekday'].value_counts() ### hmmm, almost no Saturday (6) bookings; presumably appointments have to be made at the clinic

2    26168
3    24262
1    23085
5    18915
4    18073
6       24
Name: schedule_weekday, dtype: int64

In [90]:
### Let's blank out the Saturday entries (isoweekday 6 = Saturday, not Sunday);
### they seem to be special cases.
### The results are assigned back instead of using replace(..., inplace=True) on
### a column selection, which is not guaranteed to modify df (it is a no-op
### under pandas Copy-on-Write). np.nan replaces the np.NaN alias that was
### removed in NumPy 2.0.
df['appointment_weekday'] = df['appointment_weekday'].replace(6, np.nan)
df['schedule_weekday'] = df['schedule_weekday'].replace(6, np.nan)

In [102]:
### Number of afflictions per row: row-wise sum of the four condition flags
### (skipna=False keeps the NaN-propagating behaviour of plain `+`)

df['total_condition'] = df[['Hipertension', 'Diabetes', 'Handcap', 'Alcoholism']].sum(axis=1, skipna=False)

In [105]:
df['total_condition'].value_counts() ### how many rows have 0, 1, 2, ... conditions

0    84115
1    18123
2     7658
3      618
4       13
Name: total_condition, dtype: int64