In [1]:
import csv
import pandas as pd
from sklearn import tree
from sklearn.tree import export_graphviz
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,classification_report
import warnings
warnings.filterwarnings('ignore')

### Step1. [Create Dataset]
Create the following dataset in Excel and save it as a CSV file (`animal.csv`).

In [2]:
# Load the animal dataset from the CSV file; the bare `df` renders the table.
df = pd.read_csv("animal.csv")
df

Unnamed: 0,Toothed,Hair,Breathes,Legs,Species
0,True,True,True,True,Mammal
1,True,True,True,True,Mammal
2,True,False,True,False,Repite
3,False,True,True,True,Mammal
4,True,True,True,True,Mammal
5,True,True,True,True,Mammal
6,True,False,False,False,Repite
7,True,False,True,False,Repite
8,True,True,True,True,Mammal
9,False,False,True,True,Repite


### Step2. [Model building using ID3 (entropy criterion)]

In [3]:
# Encode the string class in "Species" as integers in a new "Label" column.
# The fitted encoder is kept so the integer codes can be mapped back to names.
label_encoder = LabelEncoder()
df["Label"] = label_encoder.fit_transform(df["Species"]) 
df

Unnamed: 0,Toothed,Hair,Breathes,Legs,Species,Label
0,True,True,True,True,Mammal,0
1,True,True,True,True,Mammal,0
2,True,False,True,False,Repite,1
3,False,True,True,True,Mammal,0
4,True,True,True,True,Mammal,0
5,True,True,True,True,Mammal,0
6,True,False,False,False,Repite,1
7,True,False,True,False,Repite,1
8,True,True,True,True,Mammal,0
9,False,False,True,True,Repite,1


In [4]:
# Class names in encoded order (position i holds the name for label i).
# Read from the fitted encoder so the list is not tied to a hard-coded
# number of classes.
categories = list(label_encoder.classes_)
categories

['Mammal', 'Repite']

In [5]:
# Feature matrix: every column except the encoded target and its string form.
X = df.drop(columns=['Label', 'Species'])

In [6]:
# Target vector: the integer-encoded species.
y = df["Label"]

In [7]:
# Hold out 33% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=0,
)

In [8]:
# Entropy-criterion decision tree, capped at depth 4, fitted on the training split.
clf = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=4,
    random_state=42,
)
clf.fit(X_train, y_train)

DecisionTreeClassifier(criterion='entropy', max_depth=4, random_state=42)

In [9]:
# Display the held-out feature rows that the model will be evaluated on.
X_test

Unnamed: 0,Toothed,Hair,Breathes,Legs
2,True,False,True,False
8,True,True,True,True
4,True,True,True,True
9,False,False,True,True


In [10]:
# Predict encoded labels for the held-out rows with the entropy tree.
y_pred = clf.predict(X_test)
y_pred

array([1, 0, 0, 0])

In [11]:
# Compare accuracy on the training split with accuracy on the held-out split.
print("Accuracy of train:",clf.score(X_train,y_train))
print("Accuracy of test:",clf.score(X_test,y_test))

Accuracy of train: 1.0
Accuracy of test: 0.75


In [12]:
# Per-class precision/recall/F1 for the held-out predictions (classes shown as encoded labels).
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.67      1.00      0.80         2
           1       1.00      0.50      0.67         2

    accuracy                           0.75         4
   macro avg       0.83      0.75      0.73         4
weighted avg       0.83      0.75      0.73         4



The exported Graphviz text files can be rendered by pasting their contents into http://webgraphviz.com/

In [13]:
# Write the entropy tree to tree1.txt in Graphviz DOT format.
# The export writes straight to the open handle, so its return value is not
# needed; rebinding `f` to it would shadow the file object inside the `with`.
with open("tree1.txt", 'w') as f:
    tree.export_graphviz(
        clf,
        out_file=f,
        max_depth=4,
        impurity=False,
        feature_names=list(X.columns),  # plain list of column names
        class_names=categories,
        filled=True,
    )

### Step3. [Create a Test Set]

In [14]:
# Column header for the hand-written test file.
fields = ['name', 'Toothed', 'Hair', 'Breathes', 'Legs', 'Species']

# One record per animal, in the same column order as `fields`.
rows = [
    ['Turtile', 'False', 'False', 'True', 'False', 'Reptile'],
    ['Blue Whales', 'False', 'True', 'True', 'True', 'Mammal'],
    ['Crocodile', 'True', 'False', 'True', 'True', 'Reptile'],
]

# Destination CSV file.
filename = "testing.csv"

In [15]:
# Write the header and the data rows to the test CSV file.
# newline='' hands line-ending control to the csv module; without it, platforms
# that translate "\n" on write end up with a blank line after every record.
with open(filename, 'w', newline='') as file:
    # creating a csv writer object
    writer = csv.writer(file)

    # writing headers (field names)
    writer.writerow(fields)

    # writing data rows
    writer.writerows(rows)

In [16]:
# Read the test set back from the CSV written above.
df_1 = pd.read_csv("testing.csv")
# Display-only view indexed by animal name; the result is not assigned, so
# df_1 itself keeps "name" as an ordinary column.
df_1.set_index('name')

Unnamed: 0_level_0,Toothed,Hair,Breathes,Legs,Species
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Turtile,False,False,True,False,Reptile
Blue Whales,False,True,True,True,Mammal
Crocodile,True,False,True,True,Reptile


In [17]:
# Encode the test-set species with a dedicated encoder, so the encoder fitted
# on the training data (`label_encoder`) is not overwritten by this cell.
# NOTE(review): the codes agree with the training labels only because both
# class lists sort in the same order; confirm the class spellings here match
# those in the training CSV.
df_1["Label"] = LabelEncoder().fit_transform(df_1["Species"])
df_1

Unnamed: 0,name,Toothed,Hair,Breathes,Legs,Species,Label
0,Turtile,False,False,True,False,Reptile,1
1,Blue Whales,False,True,True,True,Mammal,0
2,Crocodile,True,False,True,True,Reptile,1


### Step4. [Perform prediction]

In [18]:
# Keep only the feature columns of the test set, then predict with the entropy tree.
temp = df_1.drop(columns=['name', 'Species', 'Label'])
y_prd = clf.predict(temp)
y_prd

array([1, 0, 0])

In [19]:
# Fraction of test-set animals whose predicted label matches the encoded species.
accuracy_score(df_1.Label,y_prd)

0.6666666666666666

### Step5. [Build CART Decision Tree Model]

In [20]:
# Gini-criterion tree with the same depth cap, fitted on the full animal
# dataset (X, y) rather than on the training split.
clf_1 = DecisionTreeClassifier(
    criterion='gini',
    max_depth=4,
    random_state=42,
)
clf_1.fit(X, y)

DecisionTreeClassifier(max_depth=4, random_state=42)

In [21]:
# Predict the hand-written test animals with the Gini tree.
clf_1.predict(temp)

array([1, 0, 1])

In [22]:
# Write the Gini tree to tree2.txt in Graphviz DOT format.
# The export writes straight to the open handle, so its return value is not
# needed; rebinding `f` to it would shadow the file object inside the `with`.
with open("tree2.txt", 'w') as f:
    tree.export_graphviz(
        clf_1,
        out_file=f,
        max_depth=4,
        impurity=False,
        feature_names=list(X.columns),  # plain list of column names
        class_names=categories,
        filled=True,
    )

### Step6. [Build DT with Zoo dataset]

In [23]:
# Load the zoo dataset from its CSV file.
df_2 = pd.read_csv("zoo.csv")

In [24]:
# Preview the first rows to check the columns and value encoding.
df_2.head()

Unnamed: 0,name,hair,feathers,eggs,milk,airbone,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,dosmestic,catsize,type
0,aardvark,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
1,antelope,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,1,1
2,bass,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,0,4
3,bear,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
4,boar,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,1,1


In [25]:
# (rows, columns) of the zoo frame.
df_2.shape

(101, 18)

In [26]:
# Column dtypes and non-null counts, to check for missing values.
df_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101 entries, 0 to 100
Data columns (total 18 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   name       101 non-null    object
 1   hair       101 non-null    int64 
 2   feathers   101 non-null    int64 
 3   eggs       101 non-null    int64 
 4   milk       101 non-null    int64 
 5   airbone    101 non-null    int64 
 6   aquatic    101 non-null    int64 
 7   predator   101 non-null    int64 
 8   toothed    101 non-null    int64 
 9   backbone   101 non-null    int64 
 10  breathes   101 non-null    int64 
 11  venomous   101 non-null    int64 
 12  fins       101 non-null    int64 
 13  legs       101 non-null    int64 
 14  tail       101 non-null    int64 
 15  dosmestic  101 non-null    int64 
 16  catsize    101 non-null    int64 
 17  type       101 non-null    int64 
dtypes: int64(17), object(1)
memory usage: 14.3+ KB


In [27]:
# Features: every column except the animal name and the target class.
_X = df_2.drop(columns=['name', 'type'])
# Target: the integer class in the "type" column.
_y = df_2["type"]

In [28]:
# Hold out 33% of the zoo rows for testing, with a fixed seed.
X__train, X__test, y__train, y__test = train_test_split(
    _X,
    _y,
    test_size=0.33,
    random_state=0,
)

In [29]:
# Entropy-criterion tree for the zoo data, capped at depth 3.
clf_2 = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=3,
    random_state=52,
)
clf_2.fit(X__train, y__train)

DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=52)

In [30]:
# Predicted zoo classes for the held-out rows (entropy tree).
clf_2.predict(X__test)

array([4, 4, 4, 1, 1, 1, 2, 4, 1, 1, 7, 1, 2, 7, 4, 6, 1, 7, 2, 4, 2, 4,
       1, 2, 1, 1, 1, 2, 4, 4, 4, 4, 4, 1], dtype=int64)

In [31]:
# Gini-criterion tree for the zoo data, capped at depth 4.
clf_3 = DecisionTreeClassifier(
    criterion='gini',
    max_depth=4,
    random_state=42,
)
clf_3.fit(X__train, y__train)

DecisionTreeClassifier(max_depth=4, random_state=42)

In [32]:
# Predicted zoo classes for the held-out rows (Gini tree).
# NOTE: this reuses the name `y_pred`, replacing the animal-dataset predictions
# stored under the same name earlier in the notebook.
y_pred=clf_3.predict(X__test)
y_pred

array([7, 4, 4, 1, 1, 1, 2, 4, 1, 1, 7, 1, 2, 7, 4, 6, 1, 7, 2, 4, 2, 7,
       1, 2, 1, 1, 1, 2, 4, 7, 4, 7, 7, 1], dtype=int64)

In [33]:
# Standardize the zoo features: the scaler is fitted on the training split
# only and then applied to the test split.
# NOTE(review): the tree classifiers in this notebook are fitted on the
# unscaled frames, so `ss`/`ss1` are not in the same feature space as the data
# those models were trained on.
from sklearn.preprocessing import StandardScaler
scale = StandardScaler()
ss = scale.fit_transform(X__train)
ss1 = scale.transform(X__test)

In [34]:
# Report accuracy of the Gini tree on the zoo data.
# clf_3 was fitted on the unscaled X__train, so it must be scored on inputs in
# that same feature space; scoring it on standardized arrays would compare the
# learned split thresholds against differently scaled values.
print("model accuracy  ",accuracy_score(y__test,y_pred))
print("Train accuracy   ",clf_3.score(X__train,y__train))
print("Test accuracy   ",clf_3.score(X__test,y__test))

model accuracy   0.8235294117647058
Train accuracy    0.9253731343283582
Test accuracy    0.8235294117647058


In [35]:
# Class distribution of the target, counting missing values too.
df_2["type"].value_counts(dropna=False)

1    41
2    20
4    13
7    10
6     8
3     5
5     4
Name: type, dtype: int64

In [36]:
# Write the zoo Gini tree to tree3.txt in Graphviz DOT format.
# Class names come from the fitted model so they always line up with the
# classes the tree was trained on, in the model's own order. The export's
# return value is not needed, so the file handle `f` is not rebound.
with open("tree3.txt", 'w') as f:
    tree.export_graphviz(
        clf_3,
        out_file=f,
        max_depth=16,
        impurity=False,
        feature_names=list(_X.columns),  # plain list of column names
        class_names=[str(label) for label in clf_3.classes_],
        filled=True,
    )

In [37]:
# Per-class precision/recall/F1 of the Gini tree's predictions on the held-out zoo rows.
print(classification_report(y__test,y_pred))

              precision    recall  f1-score   support

           1       1.00      1.00      1.00        12
           2       1.00      1.00      1.00         6
           3       0.00      0.00      0.00         4
           4       1.00      1.00      1.00         7
           5       0.00      0.00      0.00         1
           6       1.00      0.50      0.67         2
           7       0.25      1.00      0.40         2

    accuracy                           0.82        34
   macro avg       0.61      0.64      0.58        34
weighted avg       0.81      0.82      0.80        34

