Modified and improved from base code from IBM Coursera: Machine Learning with Python

In [2]:
import numpy as np 
import pandas as pd
from sklearn.tree import DecisionTreeClassifier

In [3]:
import requests

# store url of dataset
url = 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-ML0101EN-SkillsNetwork/labs/Module%203/data/drug200.csv'
# Create requests variable 
r = requests.get(url, allow_redirects=True)
# Write content of request to 'drug200.csv' file
open('drug200.csv', 'wb').write(r.content)

5827

In [4]:
df = pd.read_csv("drug200.csv")

# take a look at the dataset
df.head()

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,drugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,drugY


In [20]:
df.shape

(200, 6)

#### Data Pre-processing

In [5]:
X = np.asanyarray(df.loc[:, df.columns!='Drug'])
X[0:5]

array([[23, 'F', 'HIGH', 'HIGH', 25.355],
       [47, 'M', 'LOW', 'HIGH', 13.093],
       [47, 'M', 'LOW', 'HIGH', 10.114],
       [28, 'F', 'NORMAL', 'HIGH', 7.798],
       [61, 'F', 'LOW', 'HIGH', 18.043]], dtype=object)

In [6]:
# Convert categorical values to numerical dummy/indicator values
from sklearn import preprocessing
le_sex = preprocessing.LabelEncoder()
le_sex.fit(['F','M'])
X[:,1] = le_sex.transform(X[:,1])

le_BP = preprocessing.LabelEncoder()
le_BP.fit([ 'LOW', 'NORMAL', 'HIGH'])
X[:,2] = le_BP.transform(X[:,2])

le_Chol = preprocessing.LabelEncoder()
le_Chol.fit([ 'NORMAL', 'HIGH'])
X[:,3] = le_Chol.transform(X[:,3]) 

X[0:5]

array([[23, 0, 0, 0, 25.355],
       [47, 1, 1, 0, 13.093],
       [47, 1, 1, 0, 10.114],
       [28, 0, 2, 0, 7.798],
       [61, 0, 1, 0, 18.043]], dtype=object)

In [8]:
# Set target variable
y = df["Drug"]
y[0:5]

0    drugY
1    drugC
2    drugC
3    drugX
4    drugY
Name: Drug, dtype: object

### Decision Tree Model, Prediction, Evaluation

In [9]:
from sklearn.model_selection import train_test_split
X_trainset, X_testset, y_trainset, y_testset = train_test_split(X, y, test_size=0.3, random_state=3)

In [11]:
# Create decision tree classifier model
clf = DecisionTreeClassifier(criterion="entropy", max_depth = 4)
# Fit to training data
clf.fit(X_trainset,y_trainset)

DecisionTreeClassifier(criterion='entropy', max_depth=4)

In [12]:
# Prediction
preds = clf.predict(X_testset)

In [15]:
print (preds[0:5])
print (y_testset[0:5].tolist())

['drugY' 'drugX' 'drugX' 'drugX' 'drugX']
['drugY', 'drugX', 'drugX', 'drugX', 'drugX']


In [16]:
# Evaluation
from sklearn import metrics
import matplotlib.pyplot as plt
print("Decision Tree Classifier's Accuracy: ", metrics.accuracy_score(y_testset, preds))

Decision Tree Classifier's Accuracy:  0.9833333333333333


### Test different max_depth values

In [19]:
def best_clf(max_depth, X_trainset, X_testset, y_trainset, y_testset):
    # Create decision tree classifier model
    clf = DecisionTreeClassifier(criterion="entropy", max_depth = 4)
    # Fit to training data
    clf.fit(X_trainset,y_trainset)
    # Prediction
    preds = clf.predict(X_testset)
    # Evaluation
    accuracy = metrics.accuracy_score(y_testset, preds)
    
    return accuracy

In [21]:
max_depth_ls = [2,3,4,5,10,15,20,50,100,150]

max_depth_best = 0
acc_best = 0
acc_curr = 0

for max_depth in max_depth_ls:
    acc_curr = best_clf(max_depth, X_trainset, X_testset, y_trainset, y_testset)
    if acc_curr > acc_best:
        acc_best = acc_curr
        max_depth_best = max_depth
        
print("Best Max Depth: ", max_depth_best, "\tAccuracy:", acc_best)

Best Max Depth:  2 	Accuracy: 0.9833333333333333
