In [7]:
import numpy as np
import pandas as pd
from IPython.display import display, HTML
import colorama
from colorama import Fore, Style

# Load the zoo data set, show a preview, report dtype / missing-value status
# for every column, then drop the 'name' column (treated as an ID here).
zoo_data = pd.read_csv('W11_zoo.csv')
print("Data set Preview")
display(zoo_data.head())
cols = zoo_data.columns

print("Data Pre-processing : Checking for missing values")
print('ColumnName, DataType, MissingValues')
for column_name in cols:
    column = zoo_data[column_name]
    print(column_name, ',', column.dtype, ',', column.isnull().any())

print("Data Pre-processing : Dropping ID column")
# Select the axis by keyword: the positional form drop('name', 1) was
# deprecated in pandas 1.3 and raises TypeError from pandas 2.0 onwards.
zoo_data = zoo_data.drop(columns='name')
display(zoo_data.head())

# Label-encode the target and the two text-valued predictors so that every
# column handed to the estimators below is an integer.
from sklearn import preprocessing

le = preprocessing.LabelEncoder()

# Target column: fit and transform in one step, write the codes back into the
# frame, then re-read `y` so it refers to the encoded column.
y_encoded = le.fit_transform(zoo_data['type'])
zoo_data['type'] = y_encoded
y = zoo_data['type']

# Predictors stored as text. The same encoder object is refit for each column,
# so after this section `le` only holds the classes of 'origin'.
x_gender, x_origin = zoo_data['gender'], zoo_data['origin']
x1_encoded = le.fit_transform(x_gender)
x2_encoded = le.fit_transform(x_origin)
zoo_data['gender'], zoo_data['origin'] = x1_encoded, x2_encoded
 

print('Using N-fold Cross Validation')

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

import warnings

#---------------------------
# Model 1: baseline decision tree trained on every feature column, scored by
# the mean accuracy over 5 cross-validation folds.
print('---------------------------------------DECISION TREE--------------------------------------------')
print('------------------------------------------------------------------------------------------------')
print('------------------------------------------------------------------------------------------------')
print("\n")
print('Model 1 : Using all features given in data set : Full model')
print(Fore.GREEN+"\t \t All Features")
# Keyword form: drop('type', 1) (positional axis) raises TypeError on pandas >= 2.0.
x1 = zoo_data.drop(columns='type')
# Silences every warning for the rest of the session, not only those raised here.
warnings.simplefilter("ignore")

# NOTE(review): no random_state is set, so the reported accuracy can vary
# slightly between runs.
clf = DecisionTreeClassifier()
# 'raise-deprecating' was a transitional value that current scikit-learn
# rejects; 'raise' keeps the same effect (fit errors propagate).
Model1 = cross_val_score(clf, x1, y, cv=5, scoring='accuracy', error_score='raise').mean()
print(Fore.BLUE+"\t \t Tree Accuracy :", Model1)
print(Style.RESET_ALL)
# --------------------------------
# Feature selection by backward elimination (wrapper method):
# fit an OLS model of the encoded 'type' on the remaining candidate columns,
# drop the column with the largest p-value, and repeat until every remaining
# p-value is at most 0.05 (or no candidates are left).
import statsmodels.api as sm

cols = list(zoo_data.columns)
cols.remove('type')  # the target must not be one of its own predictors

y1 = list(zoo_data['type'])  # encoded class label used as the OLS response
while cols:
    X_1 = sm.add_constant(zoo_data[cols])
    model = sm.OLS(y1, X_1).fit()
    # Skip the first p-value: it belongs to the constant term prepended above.
    p = pd.Series(model.pvalues.values[1:], index=cols)
    # Series.max() skips NaN just like idxmax() does, so the value and the
    # label always refer to the same feature (the builtin max() does not).
    pmax = p.max()
    feature_with_p_max = p.idxmax()
    if pmax > 0.05:
        cols.remove(feature_with_p_max)
    else:
        break
selected_features_BE = cols
print("\n")
# --------------------------------
# Models 2-7: decision trees trained on subsets of the backward-elimination
# result, each scored by mean accuracy over 5 cross-validation folds.
#
# NOTE(review): "top N" below means the first N entries of
# selected_features_BE, which keeps the original column order; the features
# are not ranked by importance or p-value.


def _score_feature_subset(title, features, data, target):
    """Cross-validate a decision tree on ``features`` and print the report.

    Args:
        title: Heading printed before the results.
        features: List of column names of ``data`` to train on.
        data: DataFrame holding the candidate feature columns.
        target: Class labels aligned with the rows of ``data``.

    Returns:
        A ``(subset, accuracy)`` tuple: the DataFrame restricted to
        ``features`` and the mean 5-fold cross-validation accuracy.
    """
    subset = data[list(features)]
    print(Fore.BLACK + title)
    print(Fore.GREEN + '\t \t Selected Features : ', features)
    # 'raise-deprecating' is rejected by current scikit-learn; 'raise' keeps
    # the same effect (errors during fitting propagate).
    accuracy = cross_val_score(
        DecisionTreeClassifier(), subset, target,
        cv=5, scoring='accuracy', error_score='raise',
    ).mean()
    print(Fore.BLUE + "\t \t Tree Accuracy :", accuracy)
    return subset, accuracy


# Silences every warning for the rest of the session.
warnings.simplefilter("ignore")

x2, Model2 = _score_feature_subset(
    'Model 2 : Using all features from feature selection (Wrapper method)',
    selected_features_BE, zoo_data, y)
# --------------------------------
# Slicing (rather than indexing 0..N-1) avoids an IndexError when fewer than
# N features survived the elimination; the subset is then simply shorter.
print("\n")
top3_features_BE = selected_features_BE[:3]
x3, Model3 = _score_feature_subset(
    'Model 3 : Using top 3 features from feature selection (Wrapper method)',
    top3_features_BE, zoo_data, y)
# --------------------------------
print("\n")
top5_features_BE = selected_features_BE[:5]
x4, Model4 = _score_feature_subset(
    'Model 4 : Using top 5 features from feature selection (Wrapper method)',
    top5_features_BE, zoo_data, y)
# --------------------------------
print("\n")
top2_features_BE = selected_features_BE[:2]
x5, Model5 = _score_feature_subset(
    'Model 5 : Using top 2 features from feature selection (Wrapper method)',
    top2_features_BE, zoo_data, y)
# --------------------------------
print("\n")
top4_features_BE = selected_features_BE[:4]
x6, Model6 = _score_feature_subset(
    'Model 6 : Using top 4 features from feature selection (Wrapper method)',
    top4_features_BE, zoo_data, y)
# --------------------------------
print("\n")
top6_features_BE = selected_features_BE[:6]
x7, Model7 = _score_feature_subset(
    'Model 7 : Using top 6 features from feature selection (Wrapper method)',
    top6_features_BE, zoo_data, y)
# --------------------------------
print(Style.RESET_ALL)

print("\n")
print('------------------------------------------------------------------------------------------------')
print('------------------------------------------------------------------------------------------------')

Data set Preview


Unnamed: 0,name,gender,origin,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,catsize,type
0,aardvark,Male,Europe,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
1,antelope,Male,Asia,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,1,1
2,bass,Male,South America,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,0,4
3,bear,Male,North America,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
4,boar,Female,North America,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,1,1


Data Pre-processing : Checking for missing values
ColumnName, DataType, MissingValues
name , object , False
gender , object , False
origin , object , False
hair , int64 , False
feathers , int64 , False
eggs , int64 , False
milk , int64 , False
airborne , int64 , False
aquatic , int64 , False
predator , int64 , False
toothed , int64 , False
backbone , int64 , False
breathes , int64 , False
venomous , int64 , False
fins , int64 , False
legs , int64 , False
tail , int64 , False
domestic , int64 , False
catsize , int64 , False
type , int64 , False
Data Pre-processing : Dropping ID column


Unnamed: 0,gender,origin,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,catsize,type
0,Male,Europe,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
1,Male,Asia,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,1,1
2,Male,South America,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,0,4
3,Male,North America,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
4,Female,North America,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,1,1


Using N-fold Cross Validation
---------------------------------------DECISION TREE--------------------------------------------
------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------


Model 1 : Using all features given in data set : Full model
[32m	 	 All Features
[34m	 	 Tree Accuracy : 0.9513784461152881
[0m


[30mModel 2 : Using all features from feature selection (Wrapper method)
[32m	 	 Selected Features :  ['feathers', 'milk', 'airborne', 'aquatic', 'toothed', 'backbone', 'tail']
[34m	 	 Tree Accuracy : 0.9301739196476039


[30mModel 3 : Using top 3 features from feature selection (Wrapper method)
[32m	 	 Selected Features :  ['feathers', 'milk', 'airborne']
[34m	 	 Tree Accuracy : 0.7923900660742766


[30mModel 4 : Using top 5 features from feature selection (Wrapper method)
[32m	 	 Selected Features :  ['feathers', 'milk', 'airbor