In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime

# You can configure the format of the images: ‘png’, ‘retina’, ‘jpeg’, ‘svg’, ‘pdf’.
%config InlineBackend.figure_format = 'svg'
# this statement allows the visuals to render within your Jupyter Notebook
%matplotlib inline 

In [2]:
df = pd.read_csv('../data/KaggleV2-May-2016.csv')

In [3]:
df

Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show
0,2.987250e+13,5642903,F,2016-04-29T18:38:08Z,2016-04-29T00:00:00Z,62,JARDIM DA PENHA,0,1,0,0,0,0,No
1,5.589978e+14,5642503,M,2016-04-29T16:08:27Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,0,0,0,0,0,No
2,4.262962e+12,5642549,F,2016-04-29T16:19:04Z,2016-04-29T00:00:00Z,62,MATA DA PRAIA,0,0,0,0,0,0,No
3,8.679512e+11,5642828,F,2016-04-29T17:29:31Z,2016-04-29T00:00:00Z,8,PONTAL DE CAMBURI,0,0,0,0,0,0,No
4,8.841186e+12,5642494,F,2016-04-29T16:07:23Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,1,1,0,0,0,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110522,2.572134e+12,5651768,F,2016-05-03T09:15:35Z,2016-06-07T00:00:00Z,56,MARIA ORTIZ,0,0,0,0,0,1,No
110523,3.596266e+12,5650093,F,2016-05-03T07:27:33Z,2016-06-07T00:00:00Z,51,MARIA ORTIZ,0,0,0,0,0,1,No
110524,1.557663e+13,5630692,F,2016-04-27T16:03:52Z,2016-06-07T00:00:00Z,21,MARIA ORTIZ,0,0,0,0,0,1,No
110525,9.213493e+13,5630323,F,2016-04-27T15:09:23Z,2016-06-07T00:00:00Z,38,MARIA ORTIZ,0,0,0,0,0,1,No


In [4]:
df.describe()

Unnamed: 0,PatientId,AppointmentID,Age,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received
count,110527.0,110527.0,110527.0,110527.0,110527.0,110527.0,110527.0,110527.0,110527.0
mean,147496300000000.0,5675305.0,37.088874,0.098266,0.197246,0.071865,0.0304,0.022248,0.321026
std,256094900000000.0,71295.75,23.110205,0.297675,0.397921,0.258265,0.171686,0.161543,0.466873
min,39217.84,5030230.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,4172614000000.0,5640286.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,31731840000000.0,5680573.0,37.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,94391720000000.0,5725524.0,55.0,0.0,0.0,0.0,0.0,0.0,1.0
max,999981600000000.0,5790484.0,115.0,1.0,1.0,1.0,1.0,4.0,1.0


In [5]:
df.info() ### check dtypes and for missing values 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110527 entries, 0 to 110526
Data columns (total 14 columns):
PatientId         110527 non-null float64
AppointmentID     110527 non-null int64
Gender            110527 non-null object
ScheduledDay      110527 non-null object
AppointmentDay    110527 non-null object
Age               110527 non-null int64
Neighbourhood     110527 non-null object
Scholarship       110527 non-null int64
Hipertension      110527 non-null int64
Diabetes          110527 non-null int64
Alcoholism        110527 non-null int64
Handcap           110527 non-null int64
SMS_received      110527 non-null int64
No-show           110527 non-null object
dtypes: float64(1), int64(8), object(5)
memory usage: 11.8+ MB


In [6]:
###remove negative value for age 
df['Age'].replace(-1,np.NaN, inplace=True)

In [7]:
### Cast datetime features  
df['ScheduledDay'] = pd.to_datetime(df['ScheduledDay'], infer_datetime_format=True)
df['AppointmentDay'] = pd.to_datetime(df['AppointmentDay'], infer_datetime_format=True)

In [8]:
### Cast target and gender feature as binaries
### Yes = 1
### F = 1
df['No-show'] = pd.get_dummies(df['No-show'])
df['No-show'] = pd.get_dummies(df['No-show']) ### repeating because I want Yes to be '1'
df['Gender'] = pd.get_dummies(df['Gender'])

In [9]:
### Calculate time lag between ScheduleDay and AppointmentDay
df['time_lag'] = (df['AppointmentDay'] - df['ScheduledDay'])

In [10]:
df['time_lag'].describe() ### there seems to be negative date differences...

count                      110527
mean       9 days 17:08:34.161960
std       15 days 05:51:27.891504
min             -7 days +10:10:40
25%      -1 days +15:41:31.500000
50%               3 days 11:22:18
75%       14 days 07:41:34.500000
max             178 days 13:19:01
Name: time_lag, dtype: object

In [27]:
df['time_lag_days'] = df['time_lag'].apply(lambda x:x.days)
df['time_lag_days'].astype(int)

### -1 means less than a day, but when I checked the entries for time differences
### of less than -1, the scheduled date is clearly after appointment date 

0         -1
1         -1
2         -1
3         -1
4         -1
          ..
110522    34
110523    34
110524    40
110525    40
110526    40
Name: time_lag_days, Length: 110527, dtype: int64

In [28]:
df['time_lag_days'].apply(abs) ### cast day lag into absolute values

### I suspect that the negative days are arising from data entry errors, 
### where the scheduled and appointment days were mixed up

0          1
1          1
2          1
3          1
4          1
          ..
110522    34
110523    34
110524    40
110525    40
110526    40
Name: time_lag_days, Length: 110527, dtype: int64

In [30]:
df.Neighbourhood.value_counts().describe()

### there are way too many variables to create dummies.
### Let's try to reduce them by setting a minimum category count

count      81.000000
mean     1364.530864
std      1369.199539
min         1.000000
25%       413.000000
50%       851.000000
75%      2018.000000
max      7717.000000
Name: Neighbourhood, dtype: float64

In [47]:
x = df.Neighbourhood.value_counts()

In [49]:
pd.options.display.max_rows = 4000

JARDIM CAMBURI                 7717
MARIA ORTIZ                    5805
RESISTÊNCIA                    4431
JARDIM DA PENHA                3877
ITARARÉ                        3514
CENTRO                         3334
TABUAZEIRO                     3132
SANTA MARTHA                   3131
JESUS DE NAZARETH              2853
BONFIM                         2773
SANTO ANTÔNIO                  2746
SANTO ANDRÉ                    2571
CARATOÍRA                      2565
JABOUR                         2509
SÃO PEDRO                      2448
ILHA DO PRÍNCIPE               2266
NOVA PALESTINA                 2264
ANDORINHAS                     2262
DA PENHA                       2217
ROMÃO                          2215
GURIGICA                       2018
SÃO JOSÉ                       1977
BELA VISTA                     1907
MARUÍPE                        1902
FORTE SÃO JOÃO                 1889
ILHA DE SANTA MARIA            1885
SÃO CRISTÓVÃO                  1836
REDENÇÃO                    