In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

In [2]:
# read the iris measurements from the CSV under data/ (path is relative to the notebook) into a DataFrame
df = pd.read_csv(filepath_or_buffer='data/iris.csv')

In [3]:
# preview the first five rows to check the columns that were loaded
df.head(5)

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [4]:
# drop the Id column
# Reassign instead of using inplace=True, and tolerate an already-missing column,
# so that re-running this cell does not raise a KeyError.
df = df.drop('Id', axis=1, errors='ignore')
df.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [5]:
# rename columns to snake_case
# Only a columns mapping is passed: an index=str mapper would also convert every
# row label to a string, which nothing in this notebook needs.
# The result is reassigned (no inplace=True) so the cell stays safe to re-run.
df = df.rename(columns={'SepalLengthCm': 'sepal_length',
                        'SepalWidthCm': 'sepal_width',
                        'PetalLengthCm': 'petal_length',
                        'PetalWidthCm': 'petal_width',
                        'Species': 'species'})

df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [6]:
# list the distinct class labels present in the species column
df.loc[:, 'species'].unique()

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [7]:
# remove the 'Iris-' prefix from the species labels
# Assign the result back to the column. An inplace replace on `df.species` is a
# chained operation on an intermediate Series: recent pandas warns about it, and
# with Copy-on-Write enabled it leaves `df` unchanged.
df['species'] = df['species'].replace('Iris-', '', regex=True)
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [37]:
# target: the species label to predict
y = df['species']
# features: every remaining column
X = df.drop(labels=['species'], axis=1)

In [38]:
# from sklearn.preprocessing import label_binarize

In [39]:
# convert labels to numeric values
# y = label_binarize(y, classes=[0,1,2])
# n_classes = 3

In [40]:
# hold out 30% of the rows for testing; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101
)

In [41]:
# build a Gaussian Naive Bayes classifier with default settings
gnb = GaussianNB()

# train it on the training split
# NOTE(review): scikit-learn's fit() is documented to return the estimator itself, so
# `scores` presumably aliases `gnb` rather than holding metrics; consider renaming it
# once it is confirmed that no later cell relies on the name.
scores = gnb.fit(X_train, y_train)

In [42]:
# predict a species label for every row of the held-out test split
predictions = gnb.predict(X=X_test)

In [43]:
from sklearn.metrics import classification_report

In [44]:
# per-class precision / recall / f1 on the test split
# classification_report returns plain text, so print() is used to render its line breaks
report = classification_report(y_true=y_test, y_pred=predictions)
print(report)

             precision    recall  f1-score   support

     setosa       1.00      1.00      1.00        13
 versicolor       0.95      0.95      0.95        20
  virginica       0.92      0.92      0.92        12

avg / total       0.96      0.96      0.96        45



In [45]:
from sklearn.metrics import confusion_matrix

In [46]:
# confusion matrix on the test split (rows: true labels, columns: predicted labels)
confusion_matrix(y_true=y_test, y_pred=predictions)

array([[13,  0,  0],
       [ 0, 19,  1],
       [ 0,  1, 11]])

In [47]:
from sklearn.metrics import roc_curve, auc