# Decision Trees: Comparison between ID3 and CART

### Download the data

In [1]:
!wget https://gist.githubusercontent.com/netj/8836201/raw/6f9306ad21398ea43cba4f7d537619d0e07d5ae3/iris.csv

--2022-02-10 13:44:38--  https://gist.githubusercontent.com/netj/8836201/raw/6f9306ad21398ea43cba4f7d537619d0e07d5ae3/iris.csv
Resolving gist.githubusercontent.com (gist.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.109.133, ...
Connecting to gist.githubusercontent.com (gist.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3975 (3.9K) [text/plain]
Saving to: ‘iris.csv.1’


2022-02-10 13:44:39 (1.96 MB/s) - ‘iris.csv.1’ saved [3975/3975]



### Importing libraries

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

### Load the dataset

In [3]:
df = pd.read_csv('iris.csv')
df.head()

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,3.1,1.5,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa


### Add a column 'class' representing the 'variety' as discrete values

In [4]:
df['class'] = df['variety'].map({
    'Setosa': 0,
    'Versicolor': 1,
    'Virginica': 2
})

### Get the independent (X) data

In [5]:
X = df[['sepal.length', 'sepal.width', 'petal.length', 'petal.width']]
X

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


### Get the dependent (Y) data

In [6]:
Y = df[['class']]
Y

Unnamed: 0,class
0,0
1,0
2,0
3,0
4,0
...,...
145,2
146,2
147,2
148,2


### Split the dataset into training and testing data

In [7]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3)

## ID3

### Create classifiers and models and test them

In [8]:
id3 = DecisionTreeClassifier(criterion='entropy')
id3_model = id3.fit(X_train, Y_train)
Y_id3 = id3_model.predict(X_test)

### Metrics

In [9]:
accuracy = accuracy_score(Y_test, Y_id3)
recall = recall_score(Y_test, Y_id3, average='micro')
precision = precision_score(Y_test, Y_id3, average='micro')
f1 = f1_score(Y_test, Y_id3, average='micro')
print('Accuracy: ', accuracy)
print('Error rate: ', 1.0 - accuracy)
print('Recall: ', recall)
print('Precision: ', precision)
print('F1 score: ', f1)

Accuracy:  0.9333333333333333
Error rate:  0.06666666666666665
Recall:  0.9333333333333333
Precision:  0.9333333333333333
F1 score:  0.9333333333333333


## CART

### Create classifiers and models and test them

In [10]:
cart = DecisionTreeClassifier(criterion='gini')
cart_model = cart.fit(X_train, Y_train)
Y_cart = cart_model.predict(X_test)

### Metrics

In [11]:
accuracy = accuracy_score(Y_test, Y_cart)
recall = recall_score(Y_test, Y_cart, average='micro')
precision = precision_score(Y_test, Y_cart, average='micro')
f1 = f1_score(Y_test, Y_cart, average='micro')
print('Accuracy: ', accuracy)
print('Error rate: ', 1.0 - accuracy)
print('Recall: ', recall)
print('Precision: ', precision)
print('F1 score: ', f1)

Accuracy:  0.9555555555555556
Error rate:  0.0444444444444444
Recall:  0.9555555555555556
Precision:  0.9555555555555556
F1 score:  0.9555555555555556
