In [1]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
# Importing a dataset 'iris.csv'
data = pd.read_csv('../datasets/iris.csv')
# data.columns

# Defining features and target variable
features = ['sepal.length', 'sepal.width', 'petal.length', 'petal.width']

# Features
X=data[features]

# Target Variable
y=data['variety'].replace({'Setosa':1,'Versicolor':2,'Virginica':3})

In [3]:
# Split dataset into training and testing data
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=1,shuffle=True)

In [4]:
# Creating a model
model = GaussianNB()

# Training the model
model.fit(X_train,y_train).predict(X_test)

# Forming a prediction
y_pred = model.predict(X_test)

y_pred = np.array(y_pred)
y_pred

array([1, 2, 2, 1, 3, 3, 3, 1, 1, 3, 2, 1, 3, 2, 2, 1, 2, 2, 1, 1, 2, 2,
       3, 1, 3, 2, 1, 1, 2, 3, 2, 3, 2, 3, 3, 1, 2, 1, 2, 3, 3, 1, 2, 3,
       2], dtype=int64)

In [5]:
# For the purpose of comparison
y_test = np.array(y_test) 
y_test

array([1, 2, 2, 1, 3, 2, 3, 1, 1, 3, 2, 1, 3, 2, 2, 1, 2, 2, 1, 1, 2, 2,
       2, 1, 3, 2, 1, 1, 2, 3, 2, 3, 2, 3, 3, 1, 2, 1, 2, 3, 3, 1, 3, 3,
       2], dtype=int64)

In [6]:
# Differences between predicted and actual values
diff = y_pred - y_test
# print(diff)

# Printing number of incorrect predictions were
print("The number of incorrect predictions were",len(y_pred)-list(diff).count(0))

The number of incorrect predictions were 3


In [7]:
# Printing accuracy score
print("The model was",(accuracy_score(y_pred,y_test)*100),"% accurate!")

The model was 93.33333333333333 % accurate!


In [8]:
# Printing a confusion matrix
print("The confusion matrix below further describes how accurate the model is:")
cm = confusion_matrix(y_test, y_pred)

print('\nConfusion matrix\n\n', cm)

print('\nTrue Positives(TP) = ', cm[0,0])

print('\nTrue Negatives(TN) = ', cm[1,1])

print('\nFalse Positives(FP) = ', cm[0,1])

print('\nFalse Negatives(FN) = ', cm[1,0])

The confusion matrix below further describes how accurate the model is:

Confusion matrix

 [[14  0  0]
 [ 0 16  2]
 [ 0  1 12]]

True Positives(TP) =  14

True Negatives(TN) =  16

False Positives(FP) =  0

False Negatives(FN) =  0


In [9]:
# Printing a classification report
print("Classification Report\n")
print(classification_report(y_test, y_pred))

Classification Report

              precision    recall  f1-score   support

           1       1.00      1.00      1.00        14
           2       0.94      0.89      0.91        18
           3       0.86      0.92      0.89        13

    accuracy                           0.93        45
   macro avg       0.93      0.94      0.93        45
weighted avg       0.94      0.93      0.93        45

