In [2]:
# Classification algorithm to build a model.

# Import numpy(as np), pandas, and DecisionTreeClassifier from sklearn.tree

import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier


In [3]:
# Now read data using pandas dataframe:

my_data = pd.read_csv('drug200.csv')
my_data[0:5]

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,drugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,drugY


In [4]:
# Practice 
# What is the size of data?
#import os

#def getSize(drug200):
#    my_data = os.my_data(drug200)
#    return my_data.my_data_size
#print(my_data)

my_data.size

1200

In [12]:
#        Pre-processing      

# Using my_data as the Drug.csv read by pandas, declare the following
# variables:

#  . X as the Feature Matrix(data of my_data)
#  . y as the response vector(target)

# Remove the column containing the target name since it doesn't contain
#numeric values.

X = my_data[['Age','Sex','BP','Cholesterol','Na_to_K']].values
X[0:5]

array([[23, 'F', 'HIGH', 'HIGH', 25.355],
       [47, 'M', 'LOW', 'HIGH', 13.093],
       [47, 'M', 'LOW', 'HIGH', 10.113999999999999],
       [28, 'F', 'NORMAL', 'HIGH', 7.797999999999999],
       [61, 'F', 'LOW', 'HIGH', 18.043]], dtype=object)

In [13]:
#Some feature in this dataset is categorical such as SEX or BP. Sklearn
#Decision Trees do not handle categorical variables. So convert these
#features to numerical values. pandas.get_dummies()converts categorical
#variable into dummy/indicator variables

from sklearn import preprocessing
le_sex = preprocessing.LabelEncoder()
le_sex.fit(['F','M'])
X[:,1] = le_sex.transform(X[:,1])


le_BP = preprocessing.LabelEncoder()
le_BP.fit(['LOW','NORMAL','HIGH'])
X[:,2] = le_BP.transform(X[:,2])


le_Chol = preprocessing.LabelEncoder()
le_Chol.fit(['NORMAL', 'HIGH'])
X[:, 3] = le_Chol.transform(X[:, 3])

X[0:5]

array([[23, 0, 0, 0, 25.355],
       [47, 1, 1, 0, 13.093],
       [47, 1, 1, 0, 10.113999999999999],
       [28, 0, 2, 0, 7.797999999999999],
       [61, 0, 1, 0, 18.043]], dtype=object)

In [14]:
# Now we can fill the target variable

y = my_data["Drug"]
y[0:5]

0    drugY
1    drugC
2    drugC
3    drugX
4    drugY
Name: Drug, dtype: object

In [15]:
#     SETTING UP THE DECISION TREE

#We will be using train/test split or our Decision Tree. 
# Let's import train_test_split from sklearn.cross_validation

from sklearn.model_selection import train_test_split

In [17]:
#Now train_test_split will return 4 different parameters. we'll name
#them:

# X_trainset, X_testset, y_trainset, y_testset

#The train_test_split will need the parameters:
# X, y, test_size = 0.3, and random_state = 3

#The X and y are the arrays required before the split, the test_size
#represents the ratio of the testing dataset, and the random_state
#ensures that we obtain the same splits.

X_trainset, X_testset, y_trainset, y_testset = train_test_split(X,y,test_size=0.3, random_state=3)


In [18]:
#   PRACTICE 

# print the shape of X_trainset and y_trainset. Ensure that the dimensions
#match

print ('Train set: ', X_trainset.shape, y_trainset.shape)


Train set:  (140, 5) (140,)


In [19]:
# PRACTICE

# Print the shape of X_testset and y_testset. Ensure that the dimension
# match

print ('Test set: ', X_testset.shape, y_testset.shape)

Test set:  (60, 5) (60,)


In [20]:
#            MODELING

#We'll first create an instance of the DecisionTreeClassifier called
# drugTree.
#Inside of the classifier, specify criterion='entropy' so we can see
#the information gain of each node.

drugTree = DecisionTreeClassifier(criterion="entropy", max_depth=4)
drugTree  # it shows the default parameters


DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=4,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [21]:
# Next, we'll fit the data with the training feature X-trainset and 
# training response vector y_trainset

drugTree.fit(X_trainset, y_trainset)


DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=4,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [25]:
#        PREDICTION 

#Let's make some predictions on the testing dataset and store it into 
# a variable called predTree.

predTree = drugTree.predict(X_testset)

#You can print out predTree and y_testset if you want to visually 
#compare the prediction to the actual values

print(predTree [0:5])
print(y_testset [0:5])

['drugY' 'drugX' 'drugX' 'drugX' 'drugX']
40     drugY
51     drugX
139    drugX
197    drugX
170    drugX
Name: Drug, dtype: object


In [24]:
#           EVALUATION
#Next, let's import metrics from sklearn and check the accuracy of our
#model.

from sklearn import metrics
import matplotlib.pyplot as plt
print("DecisionTree's Accuracy: ", metrics.accuracy_score(y_testset, predTree))


DecisionTree's Accuracy:  0.983333333333


In [None]:
#           PRACTICE

#Can you calculate the accuracy score without sklearn?

accuracy_score = (y_testset - predTree)


In [None]:
#              VISUALIZATION

# Let's visualize the tree

#NOTICE: You might need to uncomment and install the pydotplus and 
#graphviz libraries if you have not installed

#!conda install -c conda-forge pydotplus -y
#!conda install -c conda-forge python-graphviz -y

#!conda update -n base -c defaults conda


In [None]:
from sklearn.externals.six import StringIO
import pydotplus
import matplotlib.image as mpimg
from sklearn import tree
%matplotlib inline

In [None]:
dot_data = StringIO()
filename = "drugtree.png"
featureNames = my_data.columns[0:5]
targetNames = my_data["Drug"].unique().tolist()
out=tree.export_graphviz(drugTree,feature_names=featureNames, out_file=dot_data, class_names=np.unique(y_trainset),filled=True,special_characters=True, rotate=False)
graph = pydotplus.graph_from_dot_data(dot_data.getValue())
graph.write_png(filename)
img = mpimg.imread(filename)
plt.figure(figsize =(100, 200))
plt.imshow(img, interpolation='nearest')
#plt.show()