In [4]:
import numpy as np
import pandas as pd
from IPython.display import display, HTML

zoo_data=pd.read_csv('W11_zoo.csv')

print('Sample of Given data')
display(zoo_data.head())

cols=zoo_data.columns
print('Data Preprocessing : Checking for missing values')
print('ColumnName, DataType, MissingValues')
for i in cols:
    print(i, ',', zoo_data[i].dtype,',',zoo_data[i].isnull().any())

zoo_data=zoo_data.drop('name',1)
print('Data Preprocessing : Removing ID')   
display(zoo_data.head())


# preprocess label, since KNN requires label encoding
from sklearn import preprocessing

zoo_data_knn=zoo_data.copy(deep=True) 
zoo_data_dummies=pd.get_dummies(zoo_data_knn[['gender','origin']])
zoo_data_knn=zoo_data_knn.join(zoo_data_dummies)

zoo_data_knn=zoo_data_knn.drop('gender',1)
zoo_data_knn=zoo_data_knn.drop('origin',1)

print('Data Preprocessing : Converting Categorical to Numerical')   
display(HTML(zoo_data_knn.head(10).to_html()))
    
# Normalized all numerical features
# find numeric columns
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
cols_numeric = zoo_data_knn.select_dtypes(include=numerics).columns.tolist()
    
# min-max normalization to scale [0, 1]
for col in cols_numeric:
    zoo_data_knn[col]=(zoo_data_knn[col]-zoo_data_knn[col].min())/(zoo_data_knn[col].max()-zoo_data_knn[col].min())

print('Data Preprocessing : Normalizing Data')
display(HTML(zoo_data_knn.head(10).to_html()))


y = zoo_data_knn['type'] # define label as nominal values
le = preprocessing.LabelEncoder()
le.fit(y)
y_encoded = le.transform(y) # encode nominal labels to integers ########
zoo_data_knn['type'] = y_encoded
y = zoo_data_knn['type'] 

# Full model
# By N-fold cross evaluations ###
from sklearn.model_selection import cross_val_score


print('------- Full model using all the given features
      -------')
x1 = zoo_data_knn.drop('type',1)

# build and eval models
from sklearn import neighbors
import warnings

warnings.simplefilter("ignore")
    
import array as arr
Model1 = arr.array('d')

for k in range(1, 10, 2): 
    clf=neighbors.KNeighborsClassifier(k, weights='uniform')
    Model1.append(cross_val_score(clf, x1, y, cv=5, scoring='accuracy').mean())

 
print('------- Model One-------')   

cols=x1.columns
print('Selected features :')
for j in cols:
    print(j,'\t',end = '')

print('Accuracy')    
i=0
for k in range(1, 10, 2):     
    print('K =', k, '\t',Model1[i])
    i=i+1
    
    
 # ----------------------          Backward Elimination              ------------------------------------#

import statsmodels.api as sm
cols = list(zoo_data_knn.columns)
cols.remove('type') # drop the nominal variable

y2=list(zoo_data_knn['type']) # using Grade as y variable in linear regression
pmax = 1
while (len(cols)>0):
    p= []
    X_1 = zoo_data_knn[cols]
    X_1 = sm.add_constant(X_1)
    model = sm.OLS(y2,X_1).fit()
    p = pd.Series(model.pvalues.values[1:],index = cols)      
    pmax = max(p)
    feature_with_p_max = p.idxmax()
    if(pmax>0.05):
        cols.remove(feature_with_p_max)
    else:
        break
selected_features_BE = cols

print('-------  Features selected using Wrapper method  -------')
# define label as nominal values
print(' ------- Model TWO -------')
x2 = zoo_data_knn.drop('type',1)
x2 = pd.DataFrame(x2, columns = selected_features_BE)
   
cols=x2.columns
print('Selected features :')
for j in cols:
    print(j,'\t',end = '')

Model2 = arr.array('d')
warnings.simplefilter("ignore")

print('\n Accuracy')
for k in range(1, 10, 2): 
    clf=neighbors.KNeighborsClassifier(k, weights='uniform')
    Model2.append(cross_val_score(clf, x2, y, cv=5, scoring='accuracy').mean())

i=0
for k in range(1, 10, 2):     
    print('K =', k, '\t',Model2[i])
    i=i+1

# ------------------------------------------------------------------------------------------------#


top = 3
top3_features_BE = []
print(' ------- Model THREE ---- Top three Features selected using Wrapper method-------')   
x3 = zoo_data_knn.drop('type',1)
for i in range(top):
    top3_features_BE.append(selected_features_BE[i])
    
x3 = pd.DataFrame(x3, columns = top3_features_BE)
    
cols=x3.columns
print('Selected features :')
for j in cols:
    print(j,'\t',end = '')

Model3 = arr.array('d')
warnings.simplefilter("ignore")

print('\n Accuracy')
for k in range(1, 10, 2): 
    clf=neighbors.KNeighborsClassifier(k, weights='uniform')
    Model3.append(cross_val_score(clf, x3, y, cv=5, scoring='accuracy').mean())

i=0
for k in range(1, 10, 2):     
    print('K =', k, '\t',Model3[i])
    i=i+1  
    
 # --------------------------------------------------------------------------------------------#


top = 5
top5_features_BE = []
print(' ------- Model FOUR ------ Top five Features selected by Wrapper method-------')   
x4 = zoo_data_knn.drop('type',1)
for i in range(top):
    top5_features_BE.append(selected_features_BE[i])
    
x4 = pd.DataFrame(x4, columns = top5_features_BE)
    
cols=x4.columns
print('Selected features :')
for j in cols:
    print(j,'\t',end = '')

Model4 = arr.array('d')
warnings.simplefilter("ignore")

print('\n Accuracy')
for k in range(1, 10, 2): 
    clf=neighbors.KNeighborsClassifier(k, weights='uniform')
    Model4.append(cross_val_score(clf, x4, y, cv=5, scoring='accuracy').mean())

i=0
for k in range(1, 10, 2):     
    print('K =', k, '\t',Model4[i])
    i=i+1
    
    
# ----------------------------------------------------------------------------------------------#


top = 7
top7_features_BE = []
print(' ------- Model FIVE ----- Top seven Features selected by Wrapper method-------')   
x5 = zoo_data_knn.drop('type',1)
for i in range(top):
    top7_features_BE.append(selected_features_BE[i])
    
x5 = pd.DataFrame(x5, columns = top7_features_BE)
    
cols=x5.columns
print('Selected features :')
for j in cols:
    print(j,'\t',end = '')

Model5 = arr.array('d')
warnings.simplefilter("ignore")

print('\n Accuracy')
for k in range(1, 10, 2): 
    clf=neighbors.KNeighborsClassifier(k, weights='uniform')
    Model5.append(cross_val_score(clf, x5, y, cv=5, scoring='accuracy').mean())

i=0
for k in range(1, 10, 2):     
    print('K =', k, '\t',Model5[i])
    i=i+1     
    
# ----------------------------------------------------------------------------------------------#


top = 9
top9_features_BE = []
print(' ------- Model six  ----- Top nine Features selected by Wrapper method-------')   
x6 = zoo_data_knn.drop('type',1)
for i in range(top):
    top9_features_BE.append(selected_features_BE[i])
    
x6 = pd.DataFrame(x6, columns = top9_features_BE)
    
cols=x6.columns
print('Selected features :')
for j in cols:
    print(j,'\t',end = '')

Model6 = arr.array('d')
warnings.simplefilter("ignore")

print('\n Accuracy')
for k in range(1, 10, 2): 
    clf=neighbors.KNeighborsClassifier(k, weights='uniform')
    Model6.append(cross_val_score(clf, x6, y, cv=5, scoring='accuracy').mean())

i=0
for k in range(1, 10, 2):     
    print('K =', k, '\t',Model6[i])
    i=i+1     
# ----------------------------------------------------------------------------------------------#


print("----------------------------- K Nearest Neighbor -------------------------------")
print("\n")
print("\n")

print("Using N-fold Cross Validation Comparing Various models")
print('K\t', 'Model 1\t\t', 'Model 2\t\t','Model 3\t\t','Model 4\t\t','Model 5\t\t','Model 6\t\t')
i=0
for k in range(1, 10, 2):     
    print(k,'\t',Model1[i],'\t',Model2[i],'\t',Model3[i],'\t',Model4[i],'\t',Model5[i],'\t',Model6[i])
    i=i+1
print("\n")
print("\n")
print("------------------------------------Ends ---------------------------------------")

Sample of Given data


Unnamed: 0,name,gender,origin,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,catsize,type
0,aardvark,Male,Europe,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
1,antelope,Male,Asia,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,1,1
2,bass,Male,South America,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,0,4
3,bear,Male,North America,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
4,boar,Female,North America,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,1,1


Data Preprocessing : Checking for missing values
ColumnName, DataType, MissingValues
name , object , False
gender , object , False
origin , object , False
hair , int64 , False
feathers , int64 , False
eggs , int64 , False
milk , int64 , False
airborne , int64 , False
aquatic , int64 , False
predator , int64 , False
toothed , int64 , False
backbone , int64 , False
breathes , int64 , False
venomous , int64 , False
fins , int64 , False
legs , int64 , False
tail , int64 , False
domestic , int64 , False
catsize , int64 , False
type , int64 , False
Data Preprocessing : Removing ID


Unnamed: 0,gender,origin,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,catsize,type
0,Male,Europe,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
1,Male,Asia,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,1,1
2,Male,South America,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,0,4
3,Male,North America,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
4,Female,North America,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,1,1


Data Preprocessing : Converting Categorical to Numerical


Unnamed: 0,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,catsize,type,gender_Female,gender_Male,origin_Africa,origin_Asia,origin_Europe,origin_North America,origin_Oceania,origin_South America
0,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1,0,1,0,0,1,0,0,0
1,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,1,1,0,1,0,1,0,0,0,0
2,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,0,4,0,1,0,0,0,0,0,1
3,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1,0,1,0,0,0,1,0,0
4,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,1,1,1,0,0,0,0,1,0,0
5,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,1,1,1,0,0,0,0,1,0,0
6,1,0,0,1,0,0,0,1,1,1,0,0,4,1,1,1,1,1,0,1,0,0,0,0,0
7,0,0,1,0,0,1,0,1,1,0,0,1,0,1,1,0,4,0,1,0,0,0,0,1,0
8,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,0,4,1,0,1,0,0,0,0,0
9,1,0,0,1,0,0,0,1,1,1,0,0,4,0,1,0,1,0,1,0,0,0,0,0,1


Data Preprocessing : Normalizing Data


Unnamed: 0,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,catsize,type,gender_Female,gender_Male,origin_Africa,origin_Asia,origin_Europe,origin_North America,origin_Oceania,origin_South America
0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.5,0.0,0.0,1.0,0.0,0,1,0,0,1,0,0,0
1,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.5,1.0,0.0,1.0,0.0,0,1,0,1,0,0,0,0
2,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.5,0,1,0,0,0,0,0,1
3,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.5,0.0,0.0,1.0,0.0,0,1,0,0,0,1,0,0
4,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.5,1.0,0.0,1.0,0.0,1,0,0,0,0,1,0,0
5,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.5,1.0,0.0,1.0,0.0,1,0,0,0,0,1,0,0
6,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.5,1.0,1.0,1.0,0.0,1,0,1,0,0,0,0,0
7,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.5,0,1,0,0,0,0,1,0
8,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.5,1,0,1,0,0,0,0,0
9,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.5,0.0,1.0,0.0,0.0,0,1,0,0,0,0,0,1


------- Full model using all the given features  -------
------- Model One-------
Selected features :
hair 	feathers 	eggs 	milk 	airborne 	aquatic 	predator 	toothed 	backbone 	breathes 	venomous 	fins 	legs 	tail 	domestic 	catsize 	gender_Female 	gender_Male 	origin_Africa 	origin_Asia 	origin_Europe 	origin_North America 	origin_Oceania 	origin_South America 	Accuracy
K = 1 	 0.941285030758715
K = 3 	 0.950224044960887
K = 5 	 0.9487886382623225
K = 7 	 0.9497911445279866
K = 9 	 0.9297410192147033
-------  Features selected using Wrapper method  -------
 ------- Model TWO -------
Selected features :
feathers 	milk 	airborne 	aquatic 	toothed 	backbone 	fins 	gender_Female 	gender_Male 	origin_Africa 	origin_Asia 	origin_Europe 	origin_North America 	origin_Oceania 	origin_South America 	
 Accuracy
K = 1 	 0.8410116199589883
K = 3 	 0.8711551606288449
K = 5 	 0.8711551606288449
K = 7 	 0.8903546745652008
K = 9 	 0.859626338573707
 ------- Model THREE ---- Top three Features selecte