In [3]:
import numpy as np
import pandas as pd
from IPython.display import display, HTML
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing
import colorama
from colorama import Fore, Style


# Load the zoo data set from a CSV expected in the working directory.
zoo_data = pd.read_csv('W11_zoo.csv')

print('Sample of Given data')
display(zoo_data.head())

# Report, one line per column: its name, its dtype, and whether it holds
# any missing value.
cols = zoo_data.columns
print('Data Preprocessing : Checking for missing values')
print('ColumnName, DataType, MissingValues')
for column in cols:
    print(column, ',', zoo_data[column].dtype, ',', zoo_data[column].isnull().any())

print('Data Preprocessing : Removing Name')
# Use the keyword form: passing `axis` positionally to DataFrame.drop was
# deprecated in pandas 1.3 and raises TypeError from pandas 2.0 onwards.
zoo_data = zoo_data.drop(columns='name')
display(zoo_data.head())

# Pre-processing ###
print('Column data types:\n',zoo_data.dtypes)

# Work on an independent copy so `zoo_data` keeps its original integer columns.
zoo_data_nb = zoo_data.copy()

print('Data Preprocessing : Converting Numerical to Categorical')

# Number of equal-width bins used to turn each integer column into an
# interval-valued categorical column (pd.cut with an integer bin count).
BIN_COUNTS = {
    'hair': 2,
    'feathers': 2,
    'eggs': 2,
    'milk': 2,
    'airborne': 2,
    'aquatic': 2,
    'predator': 2,
    'toothed': 2,
    'backbone': 2,
    'breathes': 2,
    'venomous': 2,
    'fins': 2,
    'legs': 6,
    'tail': 2,
    'domestic': 2,
    'catsize': 2,
    'type': 7,
}

# Each column is binned independently and written back in place, so the
# column order of the frame is unchanged.
for column_name, n_bins in BIN_COUNTS.items():
    zoo_data_nb[column_name] = pd.cut(zoo_data_nb[column_name], n_bins)

display('Data Example',HTML(zoo_data_nb.head(5).to_html()))

# Encode the binned class column as integer labels.
y = zoo_data_nb['type']
le = preprocessing.LabelEncoder()
y_encoded = le.fit_transform(y)

# One-hot encode every non-numeric feature column, then re-attach the encoded
# target under its original name.
# The dummy dtype is pinned to uint8: pandas >= 2.0 defaults to bool, and a
# frame mixing bool dummies with the float constant added later by
# sm.add_constant is converted to an object array, which statsmodels OLS
# rejects. uint8 is what older pandas produced by default.
zoo_data_nb = pd.get_dummies(zoo_data_nb.drop(columns='type'), dtype=np.uint8)
zoo_data_nb['type'] = y_encoded

# Full model
# Baseline: Gaussian Naive Bayes fitted on every one-hot feature column and
# scored by mean accuracy over 5 cross-validation folds.
import warnings

# Silence all warnings from here on (process-wide filter).
warnings.simplefilter("ignore")

y1 = zoo_data_nb['type']
x1 = zoo_data_nb.drop(columns='type')

clf = GaussianNB()
fold_accuracies = cross_val_score(clf, x1, y1, cv=5, scoring='accuracy')
Model1 = fold_accuracies.mean()

print(Fore.RED+"************************************* NAIVE BAYES *******************************************")
print("\n")
print(Fore.BLACK+'Model One : Full model by using all features given in data set')
print(Fore.BLUE+"Accuracy before feature selection:",Model1)


 # ------------------------------------------------------------------------------------------------ 
    
#Backward Elimination - Selected features by Wrapper model
# Repeatedly fit an OLS regression of the encoded target on the remaining
# feature columns and discard the feature with the largest p-value, until
# every remaining p-value is at or below the threshold.
# NOTE(review): OLS treats the label-encoded class in `type` as a numeric
# response - confirm this is the intended selection criterion.
import statsmodels.api as sm

# A feature is eliminated while its p-value exceeds this level.
P_VALUE_THRESHOLD = 0.05

cols = list(zoo_data_nb.columns)
cols.remove('type')  # the target is not a candidate feature
y2 = zoo_data_nb['type']

while cols:
    X_1 = sm.add_constant(zoo_data_nb[cols])
    model = sm.OLS(y2, X_1).fit()
    # Select the feature p-values by name instead of skipping the first entry
    # by position, so the intercept is excluded even if add_constant did not
    # add a column.
    p = model.pvalues.loc[cols]
    # Series.max ignores NaN, matching Series.idxmax; the builtin max() is
    # order-dependent when NaN is present.
    pmax = p.max()
    if not pmax > P_VALUE_THRESHOLD:
        # Also reached when every p-value is NaN (the comparison is False).
        break
    feature_with_p_max = p.idxmax()
    cols.remove(feature_with_p_max)

# Copy so later rebinding or mutation of `cols` cannot affect the selection.
selected_features_BE = list(cols)
 # ------------------------------------------------------------------------------------------------   
print("\n")
print(Fore.BLACK+'Model Two : Using all Features from feature selection by Wrapper method')
# Keep only the columns that survived backward elimination. Selecting them
# directly avoids DataFrame.drop('type', 1), whose positional `axis` argument
# raises TypeError on pandas >= 2.0.
x2 = zoo_data_nb[selected_features_BE]
y2 = zoo_data_nb['type']
print("Features",selected_features_BE)
warnings.simplefilter("ignore")
clf = GaussianNB()
# Mean accuracy over 5 cross-validation folds.
Model2 = cross_val_score(clf, x2, y2, cv=5, scoring='accuracy').mean()

print(Fore.BLUE+"Accuracy after feature selection:",Model2)


# ------------------------------------------------------------------------------------------------    

top = 3
# Take the first `top` surviving features. Slicing (rather than indexing
# 0..top-1) does not raise if fewer than `top` features were selected.
# NOTE(review): `selected_features_BE` keeps the original column order and is
# not ranked by p-value, so "top 3" means the first three surviving columns -
# confirm this is intended.
top3_features_BE = list(selected_features_BE[:top])

# Select the columns directly; DataFrame.drop('type', 1) passes `axis`
# positionally, which raises TypeError on pandas >= 2.0.
x3 = zoo_data_nb[top3_features_BE]
y3 = zoo_data_nb['type']

# Rebinds `cols` to the top-3 column index; nothing below in this cell reads
# it - presumably kept for inspection, verify before removing.
cols = x3.columns

warnings.simplefilter("ignore")
clf = GaussianNB()
# Mean accuracy over 5 cross-validation folds.
Model3 = cross_val_score(clf, x3, y3, cv=5, scoring='accuracy').mean()

print("\n")
print(Fore.BLACK+'Model Three : Using Top 3 Features from feature selection by Wrapper method')
print(Fore.BLUE+"Accuracy after feature selection:",Model3)

# ------------------------------------------------------------------------------------------------    
top = 5
# Take the first `top` surviving features. Slicing does not raise if fewer
# than `top` features were selected.
# NOTE(review): the list is in column order, not ranked by p-value - confirm
# that "top 5" is meant to be the first five surviving columns.
top5_features_BE = list(selected_features_BE[:top])

# Select the columns directly; DataFrame.drop('type', 1) passes `axis`
# positionally, which raises TypeError on pandas >= 2.0.
x4 = zoo_data_nb[top5_features_BE]
y4 = zoo_data_nb['type']

warnings.simplefilter("ignore")
clf = GaussianNB()
# Mean accuracy over 5 cross-validation folds.
Model4 = cross_val_score(clf, x4, y4, cv=5, scoring='accuracy').mean()

print("\n")
print(Fore.BLACK+'Model Four : Using Top 5 Features from feature selection by Wrapper method')
print(Fore.BLUE+"Accuracy after feature selection:",Model4)


# ------------------------------------------------------------------------------------------------    
top = 7
# Take the first `top` surviving features. Slicing does not raise if fewer
# than `top` features were selected.
# NOTE(review): the list is in column order, not ranked by p-value - confirm
# that "top 7" is meant to be the first seven surviving columns.
top7_features_BE = list(selected_features_BE[:top])

# Select the columns directly; DataFrame.drop('type', 1) passes `axis`
# positionally, which raises TypeError on pandas >= 2.0.
x5 = zoo_data_nb[top7_features_BE]
y5 = zoo_data_nb['type']

warnings.simplefilter("ignore")
clf = GaussianNB()
# Score the top-7 data (x5, y5). The original passed x4/y4 here, so the
# "top 7" figure silently repeated the top-5 result.
Model5 = cross_val_score(clf, x5, y5, cv=5, scoring='accuracy').mean()

print("\n")
print(Fore.BLACK+'Model Five : Using Top 7 Features from feature selection by Wrapper method')
print(Fore.BLUE+"Accuracy after feature selection:",Model5)

# ------------------------------------------------------------------------------------------------  
top = 10
# Take the first `top` surviving features. Slicing does not raise if fewer
# than `top` features were selected.
# NOTE(review): the list is in column order, not ranked by p-value - confirm
# that "top 10" is meant to be the first ten surviving columns.
top10_features_BE = list(selected_features_BE[:top])

# Select the columns directly; DataFrame.drop('type', 1) passes `axis`
# positionally, which raises TypeError on pandas >= 2.0.
x6 = zoo_data_nb[top10_features_BE]
y6 = zoo_data_nb['type']

warnings.simplefilter("ignore")
clf = GaussianNB()
# Mean accuracy over 5 cross-validation folds.
Model6 = cross_val_score(clf, x6, y6, cv=5, scoring='accuracy').mean()


print("\n")
print(Fore.BLACK+'Model Six : Using Top 10 Features from feature selection by Wrapper method')
print(Fore.BLUE+"Accuracy after feature selection:",Model6)
print(Fore.RED+"----------------------------------------------------------------------------------------------")
print(Fore.RED+"----------------------------------------------------------------------------------------------")

Sample of Given data


Unnamed: 0,name,gender,origin,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,catsize,type
0,aardvark,Male,Europe,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
1,antelope,Male,Asia,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,1,1
2,bass,Male,South America,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,0,4
3,bear,Male,North America,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
4,boar,Female,North America,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,1,1


Data Preprocessing : Checking for missing values
ColumnName, DataType, MissingValues
name , object , False
gender , object , False
origin , object , False
hair , int64 , False
feathers , int64 , False
eggs , int64 , False
milk , int64 , False
airborne , int64 , False
aquatic , int64 , False
predator , int64 , False
toothed , int64 , False
backbone , int64 , False
breathes , int64 , False
venomous , int64 , False
fins , int64 , False
legs , int64 , False
tail , int64 , False
domestic , int64 , False
catsize , int64 , False
type , int64 , False
Data Preprocessing : Removing Name


Unnamed: 0,gender,origin,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,catsize,type
0,Male,Europe,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
1,Male,Asia,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,1,1
2,Male,South America,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,0,4
3,Male,North America,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
4,Female,North America,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,1,1


Column data types:
 gender      object
origin      object
hair         int64
feathers     int64
eggs         int64
milk         int64
airborne     int64
aquatic      int64
predator     int64
toothed      int64
backbone     int64
breathes     int64
venomous     int64
fins         int64
legs         int64
tail         int64
domestic     int64
catsize      int64
type         int64
dtype: object
Data Preprocessing : Converting Numerical to Categorical


'Data Example'

Unnamed: 0,gender,origin,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,catsize,type
0,Male,Europe,"(0.5, 1.0]","(-0.001, 0.5]","(-0.001, 0.5]","(0.5, 1.0]","(-0.001, 0.5]","(-0.001, 0.5]","(0.5, 1.0]","(0.5, 1.0]","(0.5, 1.0]","(0.5, 1.0]","(-0.001, 0.5]","(-0.001, 0.5]","(2.667, 4.0]","(-0.001, 0.5]","(-0.001, 0.5]","(0.5, 1.0]","(0.994, 1.857]"
1,Male,Asia,"(0.5, 1.0]","(-0.001, 0.5]","(-0.001, 0.5]","(0.5, 1.0]","(-0.001, 0.5]","(-0.001, 0.5]","(-0.001, 0.5]","(0.5, 1.0]","(0.5, 1.0]","(0.5, 1.0]","(-0.001, 0.5]","(-0.001, 0.5]","(2.667, 4.0]","(0.5, 1.0]","(-0.001, 0.5]","(0.5, 1.0]","(0.994, 1.857]"
2,Male,South America,"(-0.001, 0.5]","(-0.001, 0.5]","(0.5, 1.0]","(-0.001, 0.5]","(-0.001, 0.5]","(0.5, 1.0]","(0.5, 1.0]","(0.5, 1.0]","(0.5, 1.0]","(-0.001, 0.5]","(-0.001, 0.5]","(0.5, 1.0]","(-0.008, 1.333]","(0.5, 1.0]","(-0.001, 0.5]","(-0.001, 0.5]","(3.571, 4.429]"
3,Male,North America,"(0.5, 1.0]","(-0.001, 0.5]","(-0.001, 0.5]","(0.5, 1.0]","(-0.001, 0.5]","(-0.001, 0.5]","(0.5, 1.0]","(0.5, 1.0]","(0.5, 1.0]","(0.5, 1.0]","(-0.001, 0.5]","(-0.001, 0.5]","(2.667, 4.0]","(-0.001, 0.5]","(-0.001, 0.5]","(0.5, 1.0]","(0.994, 1.857]"
4,Female,North America,"(0.5, 1.0]","(-0.001, 0.5]","(-0.001, 0.5]","(0.5, 1.0]","(-0.001, 0.5]","(-0.001, 0.5]","(0.5, 1.0]","(0.5, 1.0]","(0.5, 1.0]","(0.5, 1.0]","(-0.001, 0.5]","(-0.001, 0.5]","(2.667, 4.0]","(0.5, 1.0]","(-0.001, 0.5]","(0.5, 1.0]","(0.994, 1.857]"


[31m************************************* NAIVE BAYES *******************************************


[30mModel One : Full model by using all features given in data set
[34mAccuracy before feature selection: 0.9609022556390977


[30mModel Two : Using all Features from feature selection by Wrapper method
Features ['gender_Female', 'gender_Male', 'feathers_(-0.001, 0.5]', 'feathers_(0.5, 1.0]', 'eggs_(0.5, 1.0]', 'milk_(-0.001, 0.5]', 'milk_(0.5, 1.0]', 'aquatic_(0.5, 1.0]', 'predator_(-0.001, 0.5]', 'predator_(0.5, 1.0]', 'toothed_(-0.001, 0.5]', 'toothed_(0.5, 1.0]', 'backbone_(-0.001, 0.5]', 'backbone_(0.5, 1.0]', 'breathes_(-0.001, 0.5]', 'breathes_(0.5, 1.0]', 'legs_(1.333, 2.667]', 'legs_(2.667, 4.0]', 'legs_(5.333, 6.667]', 'domestic_(-0.001, 0.5]', 'domestic_(0.5, 1.0]', 'catsize_(-0.001, 0.5]', 'catsize_(0.5, 1.0]']
[34mAccuracy after feature selection: 0.9704260651629072


[30mModel Three : Using Top 3 Features from feature selection by Wrapper method
[34mAccuracy after fe