## Exploring Different Classification Methods on the Iris Dataset 

In [1]:
# Import libraries.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn import naive_bayes
from sklearn import tree
from error_metrics import *

# Read in and inspect the data.
data = pd.read_csv('./data/iris.csv')
data.head()

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [2]:
# Select x and y data.
features = list(data)
features.remove('Species')
data_x = data[features]
data_y = data['Species']

In [3]:
# Convert class lables (species column) to numbers using label encoding. 
le = preprocessing.LabelEncoder()
data_y = le.fit_transform(data_y)

# Split into training and test sets.
x_train, x_test, y_train, y_test = train_test_split(data_x, data_y, test_size=0.3, random_state=4)

### Method 1: Naive Bayes

In [4]:
# Build and evaluate the model.
gnb_mod = naive_bayes.GaussianNB()
gnb_mod.fit(x_train, y_train)
y_hat = gnb_mod.predict(x_test)
print_multiclass_classif_error_report(y_test, y_hat)

Accuracy: 0.9777777777777777
Avg. F1 (Micro): 0.9777777777777777
Avg. F1 (Macro): 0.9717813051146384
Avg. F1 (Weighted): 0.9778953556731335
Confusion Matrix: 
[[21  0  0]
 [ 0 10  0]
 [ 0  1 13]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        21
           1       1.00      0.91      0.95        11
           2       0.93      1.00      0.96        13

   micro avg       0.98      0.98      0.98        45
   macro avg       0.98      0.97      0.97        45
weighted avg       0.98      0.98      0.98        45



### Method 2: Decision Tree

In [5]:
# Approach 1: Gini Impurity 
dtree_gini = tree.DecisionTreeClassifier(criterion='gini')
dtree_gini.fit(x_train, y_train)
y_hat_gini = dtree_gini.predict(x_test)
print_multiclass_classif_error_report(y_test, y_hat_gini)

Accuracy: 0.9777777777777777
Avg. F1 (Micro): 0.9777777777777777
Avg. F1 (Macro): 0.9709618874773139
Avg. F1 (Weighted): 0.9775761242185925
Confusion Matrix: 
[[21  0  0]
 [ 0  9  1]
 [ 0  0 14]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        21
           1       0.90      1.00      0.95         9
           2       1.00      0.93      0.97        15

   micro avg       0.98      0.98      0.98        45
   macro avg       0.97      0.98      0.97        45
weighted avg       0.98      0.98      0.98        45



In [6]:
# Approach 2: Information Entropy 
dtree_ent = tree.DecisionTreeClassifier(criterion='entropy')
dtree_ent.fit(x_train, y_train)
y_hat_ent = dtree_gini.predict(x_test)
print_multiclass_classif_error_report(y_test, y_hat_ent)

Accuracy: 0.9777777777777777
Avg. F1 (Micro): 0.9777777777777777
Avg. F1 (Macro): 0.9709618874773139
Avg. F1 (Weighted): 0.9775761242185925
Confusion Matrix: 
[[21  0  0]
 [ 0  9  1]
 [ 0  0 14]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        21
           1       0.90      1.00      0.95         9
           2       1.00      0.93      0.97        15

   micro avg       0.98      0.98      0.98        45
   macro avg       0.97      0.98      0.97        45
weighted avg       0.98      0.98      0.98        45



### Illustrate decoding labels

In [7]:
# Illustrate recoding numeric classes back into original (text-based) labels.
y_test_labs = le.inverse_transform(y_test)
y_hat_labs = le.inverse_transform(y_hat)
print('(Actual, Predicted): \n'+str(list(zip(y_test_labs, y_hat_labs))))

(Actual, Predicted): 
[('virginica', 'virginica'), ('setosa', 'setosa'), ('virginica', 'virginica'), ('virginica', 'virginica'), ('virginica', 'virginica'), ('versicolor', 'versicolor'), ('versicolor', 'versicolor'), ('setosa', 'setosa'), ('setosa', 'setosa'), ('virginica', 'versicolor'), ('setosa', 'setosa'), ('setosa', 'setosa'), ('setosa', 'setosa'), ('versicolor', 'versicolor'), ('virginica', 'virginica'), ('setosa', 'setosa'), ('versicolor', 'versicolor'), ('setosa', 'setosa'), ('setosa', 'setosa'), ('virginica', 'virginica'), ('setosa', 'setosa'), ('virginica', 'virginica'), ('versicolor', 'versicolor'), ('setosa', 'setosa'), ('setosa', 'setosa'), ('setosa', 'setosa'), ('setosa', 'setosa'), ('setosa', 'setosa'), ('setosa', 'setosa'), ('virginica', 'virginica'), ('versicolor', 'versicolor'), ('setosa', 'setosa'), ('virginica', 'virginica'), ('setosa', 'setosa'), ('versicolor', 'versicolor'), ('virginica', 'virginica'), ('virginica', 'virginica'), ('versicolor', 'versicolor'), ('ve