In [1]:
%matplotlib inline
%matplotlib notebook

In [2]:
#import warnings
#warnings.filterwarnings('ignore')

from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt

from sklearn import datasets
import numpy as np
import pandas as pd
from sklearn import tree
from sklearn import metrics

### Loading the HD_labelled_50 dataset

In [3]:
# The path is relative, so the CSV is looked up in the kernel's working directory.
dataset = pd.read_csv(filepath_or_buffer='HD_labelled_50.csv')

The `head` function returns the first n rows of the object, selected by position; here n = 5.

In [4]:
# Preview the first five rows to check that the file was parsed as expected.
dataset.head(n=5)

Unnamed: 0,Age,Sex,CP,TRestBPS,Chol,FBS,RestECG,ThalAch,ExAng,OldPeak,Slope,CA,Thal,Label
0,63,1,1,145,233,1,2,150,0,2.3,3,0,6,0
1,67,1,4,160,286,0,2,108,1,1.5,2,3,3,1
2,67,1,4,120,229,0,2,129,1,2.6,2,2,7,1
3,37,1,3,130,250,0,0,187,0,3.5,3,0,3,0
4,41,0,2,130,204,0,2,172,0,1.4,1,0,3,0


### Renaming the attribute names

In [5]:
# Replace the original headers with single letters 'A'..'M' for the thirteen
# attribute columns; the target column keeps the name 'Label'.
dataset.columns = list('ABCDEFGHIJKLM') + ['Label']

The `drop` function removes rows or columns from the table; here we drop the columns 'A' and 'B'.

In [6]:
# drop() returns a new frame, so `dataset` itself still holds columns 'A' and 'B'.
z = dataset.drop(columns=['A', 'B'])

Displaying the values after dropping the columns 'A' and 'B'.

In [7]:
# Bare expression: the notebook renders z (dataset without 'A' and 'B') as the cell output.
z

Unnamed: 0,C,D,E,F,G,H,I,J,K,L,M,Label
0,1,145,233,1,2,150,0,2.3,3,0,6,0
1,4,160,286,0,2,108,1,1.5,2,3,3,1
2,4,120,229,0,2,129,1,2.6,2,2,7,1
3,3,130,250,0,0,187,0,3.5,3,0,3,0
4,2,130,204,0,2,172,0,1.4,1,0,3,0
5,2,120,236,0,0,178,0,0.8,1,0,3,0
6,4,140,268,0,2,160,0,3.6,3,2,3,1
7,4,120,354,0,0,163,1,0.6,1,0,3,0
8,4,130,254,0,2,147,0,1.4,2,1,7,1
9,4,140,203,1,2,155,1,3.1,3,0,7,1


In [8]:
# List the distinct class values present in the target column.
pd.unique(dataset['Label'])

array([0, 1], dtype=int64)

In [9]:
# Positional column selections converted to NumPy arrays.
# x takes columns 2,3,4,6,7,9,10,11,12 (C, D, E, G, H, J, K, L, M).
# y takes columns 1,5,8,3 (B, F, I, D) - these are attribute columns, not 'Label'.
# NOTE(review): neither x nor y is referenced by any later cell visible in this
# notebook; confirm whether they are still needed.
x = dataset.iloc[:, [2, 3, 4, 6, 7, 9, 10, 11, 12]].to_numpy()
y = dataset.iloc[:, [1, 5, 8, 3]].to_numpy()

In [10]:
# Target vector: the 'Label' column for every row (the split happens in a later cell).
y_train = dataset.loc[:, 'Label']

In [11]:
# Feature frame: every column except the target (the split happens in a later cell).
x_train = dataset.drop(columns='Label')

In [12]:
# Bare expression: display the feature frame (all columns except 'Label').
x_train

Unnamed: 0,A,B,C,D,E,F,G,H,I,J,K,L,M
0,63,1,1,145,233,1,2,150,0,2.3,3,0,6
1,67,1,4,160,286,0,2,108,1,1.5,2,3,3
2,67,1,4,120,229,0,2,129,1,2.6,2,2,7
3,37,1,3,130,250,0,0,187,0,3.5,3,0,3
4,41,0,2,130,204,0,2,172,0,1.4,1,0,3
5,56,1,2,120,236,0,0,178,0,0.8,1,0,3
6,62,0,4,140,268,0,2,160,0,3.6,3,2,3
7,57,0,4,120,354,0,0,163,1,0.6,1,0,3
8,63,1,4,130,254,0,2,147,0,1.4,2,1,7
9,53,1,4,140,203,1,2,155,1,3.1,3,0,7


In [13]:
# Bare expression: display the target Series taken from the 'Label' column.
y_train

0     0
1     1
2     1
3     0
4     0
5     0
6     1
7     0
8     1
9     1
10    0
11    0
12    1
13    0
14    0
15    0
16    1
17    0
18    0
19    0
20    0
21    0
22    1
23    1
24    1
25    0
26    0
27    0
28    0
29    1
30    0
31    1
32    1
33    0
34    0
35    0
36    1
37    1
38    1
39    0
40    1
41    0
42    0
43    0
44    1
45    1
46    0
47    1
48    0
49    0
Name: Label, dtype: int64

Normalize data: the units of measurement may differ between attributes, so let's normalize the data before building the model.

In [14]:
from sklearn.preprocessing import StandardScaler

# Learn the per-column scaling statistics and apply them in a single call;
# x_train is rebound from the feature DataFrame to the scaled NumPy array.
# NOTE(review): the scaler is fitted on every row here, before the train/test
# split in the following cell, so rows that end up held out for testing also
# contribute to the scaling statistics.
sc = StandardScaler()
x_train = sc.fit_transform(x_train)

Split data into train and test.  

In [15]:
# Split the data into train and test sets, then standardise both using
# statistics learned from the training rows only.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Start again from `dataset` instead of re-splitting x_train / y_train: the cell
# then yields the same 70/30 split every time it is executed, rather than
# splitting an already-split training set when it is run a second time.
features = dataset.drop('Label', axis=1)
labels = dataset['Label']
features_train, features_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.3, random_state=0)

# Fit the scaler on the training rows only, so that the held-out rows do not
# influence the preprocessing, and apply that same transform to the test rows.
sc = StandardScaler()
x_train = sc.fit_transform(features_train)
x_test = sc.transform(features_test)

Generate the decision tree using the training data and the entropy-based information gain.

In [16]:
# Entropy criterion, at least 3 training samples in every leaf, and a fixed
# seed so that repeated runs build the same tree.
clf = tree.DecisionTreeClassifier(
    min_samples_leaf=3,
    random_state=0,
    criterion='entropy',
)
# Kept as the last expression: fit() returns the estimator, whose repr becomes
# the cell output.
clf.fit(x_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=3, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=0, splitter='best')

Calculate the performance metrics (accuracy, confusion matrix and classification report) for the train and test data.

In [17]:
# generate evaluation metrics
def report_split(split_name, y_true, y_pred):
    """Print accuracy, confusion matrix and classification report for one split.

    Parameters
    ----------
    split_name : str
        Prefix for the printed labels, e.g. "Train" or "Test".
    y_true : array-like
        Ground-truth labels of the split.
    y_pred : array-like
        Labels predicted by the classifier for the same rows.
    """
    print(split_name + " - Accuracy :", metrics.accuracy_score(y_true, y_pred))
    print(split_name + " - Confusion matrix :\n", metrics.confusion_matrix(y_true, y_pred))
    print(split_name + " - classification report :\n", metrics.classification_report(y_true, y_pred))


# Predict once per split and reuse the result for all three metrics.
report_split("Train", y_train, clf.predict(x_train))
report_split("Test", y_test, clf.predict(x_test))

Train - Accuracy : 0.9142857142857143
Train - Confusion matrix :
 [[18  1]
 [ 2 14]]
Train - classification report :
               precision    recall  f1-score   support

           0       0.90      0.95      0.92        19
           1       0.93      0.88      0.90        16

    accuracy                           0.91        35
   macro avg       0.92      0.91      0.91        35
weighted avg       0.92      0.91      0.91        35

Test - Accuracy : 0.8
Test - Confusion matrix :
 [[11  0]
 [ 3  1]]
Test - classification report :
               precision    recall  f1-score   support

           0       0.79      1.00      0.88        11
           1       1.00      0.25      0.40         4

    accuracy                           0.80        15
   macro avg       0.89      0.62      0.64        15
weighted avg       0.84      0.80      0.75        15



### Visualize Decision Tree 

In [18]:
# Plain DOT export of the fitted tree, written to 'tree.dot'.
tree.export_graphviz(clf, out_file='tree.dot')

# sklearn.externals.six is no longer shipped with scikit-learn; the standard
# library's io.StringIO provides the same in-memory text buffer.
from io import StringIO
import pydot
from IPython.display import IFrame

# Second export goes to an in-memory buffer, this time with readable labels.
out_data = StringIO()
tree.export_graphviz(clf, out_file=out_data,
                     # first 13 columns of `dataset` are the attributes 'A'..'M'
                     feature_names=list(dataset.columns[:13]),
                     # class labels rendered as the strings of their integer values
                     class_names=[str(int(label)) for label in clf.classes_],
                     filled=True, rounded=True,
                     special_characters=True,
                     node_ids=True)
# graph_from_dot_data returns a list of graphs; the tree is the first entry.
graph = pydot.graph_from_dot_data(out_data.getvalue())
graph[0].write_pdf("dataset_eval.pdf")  # save to pdf

# Last expression: embed the generated PDF in the notebook output.
IFrame("dataset_eval.pdf", width=600, height=600)

