# Medical Appointment No Shows

### Import Libraries

In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score

### Import Data and Explore

In [2]:
ma = pd.read_csv('KaggleV2-May-2016.csv')

In [3]:
ma.head()

Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show
0,29872500000000.0,5642903,F,2016-04-29T18:38:08Z,2016-04-29T00:00:00Z,62,JARDIM DA PENHA,0,1,0,0,0,0,No
1,558997800000000.0,5642503,M,2016-04-29T16:08:27Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,0,0,0,0,0,No
2,4262962000000.0,5642549,F,2016-04-29T16:19:04Z,2016-04-29T00:00:00Z,62,MATA DA PRAIA,0,0,0,0,0,0,No
3,867951200000.0,5642828,F,2016-04-29T17:29:31Z,2016-04-29T00:00:00Z,8,PONTAL DE CAMBURI,0,0,0,0,0,0,No
4,8841186000000.0,5642494,F,2016-04-29T16:07:23Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,1,1,0,0,0,No


In [4]:
ma.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110527 entries, 0 to 110526
Data columns (total 14 columns):
PatientId         110527 non-null float64
AppointmentID     110527 non-null int64
Gender            110527 non-null object
ScheduledDay      110527 non-null object
AppointmentDay    110527 non-null object
Age               110527 non-null int64
Neighbourhood     110527 non-null object
Scholarship       110527 non-null int64
Hipertension      110527 non-null int64
Diabetes          110527 non-null int64
Alcoholism        110527 non-null int64
Handcap           110527 non-null int64
SMS_received      110527 non-null int64
No-show           110527 non-null object
dtypes: float64(1), int64(8), object(5)
memory usage: 11.8+ MB


In [5]:
# Check for missing values
ma.isnull().sum(axis = 0)

PatientId         0
AppointmentID     0
Gender            0
ScheduledDay      0
AppointmentDay    0
Age               0
Neighbourhood     0
Scholarship       0
Hipertension      0
Diabetes          0
Alcoholism        0
Handcap           0
SMS_received      0
No-show           0
dtype: int64

### Preprocessing

In [6]:
# Select Features and Target
X = ma[['Gender','Age','Scholarship','Hipertension','Diabetes','Alcoholism','Handcap','SMS_received']]
y = ma['No-show']

In [7]:
# Need to encode y (No-show) and Gender
encoder = LabelEncoder()
y = encoder.fit_transform(y)
X['Gender'] = encoder.fit_transform(X['Gender'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [8]:
# Split the data, 80% training, 10% test, 10% validation
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=0)
X_test, X_val, y_test, y_val = train_test_split(X_train,y_train,test_size=0.5,random_state=1)

### Decision Tree Classifier

In [9]:
# Try with entropy
model = DecisionTreeClassifier(criterion='entropy',random_state=0).fit(X_train,y_train)
score = model.score(X_test,y_test)
print('accuracy score: ',score)
y_pred = model.predict(X_test)
conf = confusion_matrix(y_test,y_pred)
print(conf)

accuracy score:  0.8002714318027595
[[35135    76]
 [ 8754   245]]


In [10]:
# Try with gini
model = DecisionTreeClassifier(criterion='gini',random_state=0).fit(X_train,y_train)
score = model.score(X_test,y_test)
print('accuracy score: ',score)
y_pred = model.predict(X_test)
conf = confusion_matrix(y_test,y_pred)
print(conf)

accuracy score:  0.8002714318027595
[[35135    76]
 [ 8754   245]]


### Random Forest Classifier

In [11]:
# Try with entropy
model = RandomForestClassifier(n_estimators=25,criterion='entropy',random_state=0).fit(X_train,y_train)
score = model.score(X_test,y_test)
print('accuracy score: ',score)
y_pred = model.predict(X_test)
conf = confusion_matrix(y_test,y_pred)
print(conf)

accuracy score:  0.8000904772675865
[[35062   149]
 [ 8689   310]]


In [12]:
# Try with gini
model = RandomForestClassifier(n_estimators=25,criterion='gini',random_state=0).fit(X_train,y_train)
score = model.score(X_test,y_test)
print('accuracy score: ',score)
y_pred = model.predict(X_test)
conf = confusion_matrix(y_test,y_pred)
print(conf)

accuracy score:  0.8001130965844832
[[35064   147]
 [ 8690   309]]


In [13]:
# search for optimal number of estimators
i_scores = []

for i in range(10,100,20):
    RFC = RandomForestClassifier(n_estimators=i)
    scores = cross_val_score(RFC,X,y,cv=10,scoring='accuracy')
    i_scores.append(scores.mean())

print(i_scores)
print('best i:',10+(np.asarray(i_scores,dtype=float)).argmax()*20)

[0.7941317697340996, 0.7942584387280381, 0.7944122693227437, 0.7944936788063716, 0.7946203510746164]
best i: 90


In [14]:
# Best Model
model = RandomForestClassifier(n_estimators=90,criterion='gini',random_state=0).fit(X_train,y_train)
score = model.score(X_test,y_test)
print('accuracy score: ',score)
y_pred = model.predict(X_test)
conf = confusion_matrix(y_test,y_pred)
print(conf)

accuracy score:  0.8003845283872427
[[35073   138]
 [ 8687   312]]
