In [25]:
# Import required modules
import numpy as np
from __future__ import print_function
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
from pprint import pprint
from IPython.display import display

# Load the data into variable 'df'
df = pd.read_csv('https://raw.githubusercontent.com/cdmorgan103/7331DataMiningNoShow/master/data/updated.csv')
# Get an overview of the raw data
df.info(null_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110527 entries, 0 to 110526
Data columns (total 18 columns):
PatientId         110527 non-null int64
AppointmentID     110527 non-null int64
Gender            110527 non-null object
ScheduledDay      110527 non-null object
ScheduledTime     110527 non-null object
AppointmentDay    110527 non-null object
Age               110527 non-null int64
Neighbourhood     110527 non-null object
Scholarship       110527 non-null int64
Hypertension      110527 non-null int64
Diabetes          110527 non-null int64
Alcoholism        110527 non-null int64
Handicap          110527 non-null int64
SMSReceived       110527 non-null int64
NoShow            110527 non-null object
DaysInAdvance     110527 non-null int64
ScheduledDOW      110527 non-null object
AppointmentDOW    110527 non-null object
dtypes: int64(10), object(8)
memory usage: 15.2+ MB


In [26]:
# 1. Remove attributes not usefull
#del df['PatientId'] not sure if we need to remove, might be helpful with identifying multiple appointments by Patient 

# 3. Computed discrete features agains now with the newest values
df['age_range'] = pd.cut(df.Age,[0,16,24,65,1e6],4,labels=[0,1,2,3]) # this creates a new variable

# 4. drop rows that still had missing values after grouped imputation
df.dropna(inplace=True)
df.age_range = df.age_range.astype(np.int)

df.info

<bound method DataFrame.info of               PatientId  AppointmentID Gender ScheduledDay ScheduledTime  \
0        29872499824296        5642903      F   2016-04-29      18:38:08   
1       558997776694438        5642503      M   2016-04-29      16:08:27   
2         4262962299951        5642549      F   2016-04-29      16:19:04   
3          867951213174        5642828      F   2016-04-29      17:29:31   
4         8841186448183        5642494      F   2016-04-29      16:07:23   
5        95985133231274        5626772      F   2016-04-27      08:36:51   
6       733688164476661        5630279      F   2016-04-27      15:05:12   
7         3449833394123        5630575      F   2016-04-27      15:39:58   
8        56394729949972        5638447      F   2016-04-29      08:02:16   
9        78124564369297        5629123      F   2016-04-27      12:48:25   
10      734536231958495        5630213      F   2016-04-27      14:58:11   
11        7542951368435        5620163      M   2016-04-

In [27]:
# NOTE: calling describe when not all the data is categorical will cause the 
# categorical variables to be removed
df[['Gender','Handicap']].describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Handicap,106988.0,0.020937,0.143174,0.0,0.0,0.0,0.0,1.0


In [28]:
# perform one-hot encoding of the categorical data "embarked"
tmp_df = pd.get_dummies(df.Handicap,prefix='Handicap')
df = pd.concat((df,tmp_df),axis=1) # add back into the dataframe

# replace the current Sex atribute with something slightly more intuitive and readable
df['IsMale'] = df.Gender=='male' 
df.IsMale = df.IsMale.astype(np.int)

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 106988 entries, 0 to 110526
Data columns (total 22 columns):
PatientId         106988 non-null int64
AppointmentID     106988 non-null int64
Gender            106988 non-null object
ScheduledDay      106988 non-null object
ScheduledTime     106988 non-null object
AppointmentDay    106988 non-null object
Age               106988 non-null int64
Neighbourhood     106988 non-null object
Scholarship       106988 non-null int64
Hypertension      106988 non-null int64
Diabetes          106988 non-null int64
Alcoholism        106988 non-null int64
Handicap          106988 non-null int64
SMSReceived       106988 non-null int64
NoShow            106988 non-null object
DaysInAdvance     106988 non-null int64
ScheduledDOW      106988 non-null object
AppointmentDOW    106988 non-null object
age_range         106988 non-null int32
Handicap_0        106988 non-null uint8
Handicap_1        106988 non-null uint8
IsMale            106988 non-null int32
d

In [29]:
# Now let's clean up the dataset
if 'Gender' in df:
    del df['Gender'] # if 'Sex' column still exists, delete it (as we created an ismale column)
    
if 'Handicap' in df:    
    del df['Handicap'] # get reid of the original category as it is now one-hot encoded
    
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 106988 entries, 0 to 110526
Data columns (total 20 columns):
PatientId         106988 non-null int64
AppointmentID     106988 non-null int64
ScheduledDay      106988 non-null object
ScheduledTime     106988 non-null object
AppointmentDay    106988 non-null object
Age               106988 non-null int64
Neighbourhood     106988 non-null object
Scholarship       106988 non-null int64
Hypertension      106988 non-null int64
Diabetes          106988 non-null int64
Alcoholism        106988 non-null int64
SMSReceived       106988 non-null int64
NoShow            106988 non-null object
DaysInAdvance     106988 non-null int64
ScheduledDOW      106988 non-null object
AppointmentDOW    106988 non-null object
age_range         106988 non-null int32
Handicap_0        106988 non-null uint8
Handicap_1        106988 non-null uint8
IsMale            106988 non-null int32
dtypes: int32(2), int64(9), object(7), uint8(2)
memory usage: 14.9+ MB


In [30]:
from sklearn.model_selection import ShuffleSplit

# we want to predict the X and y data as follows:
if 'NoShow' in df:
    y = df['NoShow'].values # get the labels we want
    del df['NoShow'] # get rid of the class label
    X = df.values # use everything else to predict!

    ## X and y are now numpy matrices, by calling 'values' on the pandas data frames we
    #    have converted them into simple matrices to use with scikit learn
    
    
# to use the cross validation object in scikit learn, we need to grab an instance
#    of the object and set it up. This object will be able to split our data into 
#    training and testing splits
num_cv_iterations = 3
num_instances = len(y)
cv_object = ShuffleSplit(n_splits=num_cv_iterations,
                         test_size  = 0.2)
                         
print(cv_object)

ShuffleSplit(n_splits=3, random_state=None, test_size=0.2, train_size=None)
