# Group Number 31
### 20EC30063 : Avi Amalanshu
### 20CS30064 : Anamitra Mukhopadhyay
### 22CS30R79 : Chavle Abhishek Shivanand
## Project Code : RVNB
## Project Title : Classification of Rice Varieties using Gaussian Naive Bayes Learning Model

In [1]:
# imports
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold
from sklearn import naive_bayes # sklearn Naive Bayes
from sklearn.naive_bayes import GaussianNB # sklearn Gaussian Naive Bayes
import operator
from math import log
from collections import Counter
from statistics import mean

In [2]:
# from google.colab import drive
# drive.mount('/content/drive')

In [3]:
# The model
class GNB_binary():
    """Gaussian Naive Bayes classifier for exactly two classes.

    Each attribute is modelled, per class, as an independent Gaussian. A
    sample is scored with the log-odds of the positive class versus the
    negative class::

        score(x) = const
                   + sum_i (x_i - negmean_i)^2 / (2 * negvar_i)
                   - sum_i (x_i - posmean_i)^2 / (2 * posvar_i)

        const    = log(n_pos / n_neg)
                   + sum_i (log(negvar_i) - log(posvar_i)) / 2

    and the positive label is predicted when ``score(x) > 0``.

    The "negative" class is the first distinct label encountered in the
    training target and the "positive" class is the second one.
    """

    def __init__(self):
        self.attrNo = 0     # number of attributes (feature columns) seen by fit
        self.posmeans = []  # per-attribute mean of the positive class
        self.negmeans = []  # per-attribute mean of the negative class
        self.posvars = []   # per-attribute population variance, positive class
        self.negvars = []   # per-attribute population variance, negative class
        self.const = 0      # sample-independent part of the log-odds
        self.classes = []   # [negative_label, positive_label]

    def fit(self, X_df, y_df):
        """Estimate the per-class Gaussian parameters from training data.

        Parameters
        ----------
        X_df : pandas.DataFrame
            Numeric attributes, one row per sample.
        y_df : pandas.Series
            Class label of each row of ``X_df`` (same order and length).
            The Series may have any name.

        Raises
        ------
        ValueError
            If ``y_df`` does not contain exactly two distinct labels, or if
            some attribute has zero variance within a class (the Gaussian
            density is undefined in that case).
        """
        labels = list(y_df.unique())
        if len(labels) != 2:
            raise ValueError(
                "GNB_binary supports exactly 2 classes, got {}".format(len(labels))
            )

        features = X_df.to_numpy(dtype=float)
        is_pos = (y_df == labels[1]).to_numpy()
        pos_rows = features[is_pos]
        neg_rows = features[~is_pos]

        pos_means = pos_rows.mean(axis=0)
        neg_means = neg_rows.mean(axis=0)
        # Population variance (ddof=0), i.e. E[x^2] - E[x]^2, computed in the
        # numerically stable centred form.
        pos_vars = pos_rows.var(axis=0)
        neg_vars = neg_rows.var(axis=0)

        if (pos_vars <= 0).any() or (neg_vars <= 0).any():
            raise ValueError(
                "every attribute must have non-zero variance within each class"
            )

        log_prior_ratio = log(pos_rows.shape[0] / neg_rows.shape[0])
        log_var_ratio = float(np.sum(np.log(neg_vars) - np.log(pos_vars))) / 2

        # Assign everything at the end so that (a) re-fitting replaces the old
        # parameters instead of appending to them and (b) a failed fit leaves
        # the previous state untouched.
        self.attrNo = features.shape[1]
        self.classes = labels
        self.posmeans = pos_means.tolist()
        self.negmeans = neg_means.tolist()
        self.posvars = pos_vars.tolist()
        self.negvars = neg_vars.tolist()
        self.const = log_prior_ratio + log_var_ratio

    def predict_single(self, attributes):
        """Return the predicted label for one sample.

        ``attributes`` is an indexable sequence holding the attribute values
        in the same order as the columns used in ``fit``; only the first
        ``attrNo`` entries are read.
        """
        score = self.const
        params = zip(self.negmeans, self.negvars, self.posmeans, self.posvars)
        for i, (neg_mu, neg_var, pos_mu, pos_var) in enumerate(params):
            x = attributes[i]
            score += (x - neg_mu)**2 / (2 * neg_var) - (x - pos_mu)**2 / (2 * pos_var)
        return (self.classes[1] if score > 0 else self.classes[0])

    def predict(self, X_test):
        """Return a list with the predicted label of every row of ``X_test``.

        ``X_test`` is a pandas.DataFrame whose leading ``attrNo`` columns are
        the attributes, in the same order as in ``fit``.
        """
        # Score all rows at once instead of building a Series per row.
        samples = X_test.to_numpy(dtype=float)[:, :self.attrNo]
        neg_mu = np.asarray(self.negmeans, dtype=float)
        pos_mu = np.asarray(self.posmeans, dtype=float)
        neg_var = np.asarray(self.negvars, dtype=float)
        pos_var = np.asarray(self.posvars, dtype=float)

        scores = self.const + (
            (samples - neg_mu)**2 / (2 * neg_var)
            - (samples - pos_mu)**2 / (2 * pos_var)
        ).sum(axis=1)
        return [self.classes[1] if s > 0 else self.classes[0] for s in scores]
        

In [4]:
def test(y_pred, y_test):
    """Print and return the classification metrics of a set of predictions.

    The metrics come from sklearn's ``classification_report`` in dictionary
    form. The table of per-class and averaged metrics (everything except the
    ``accuracy`` entry) is printed first, followed by the accuracy.

    Returns a tuple ``(metrics_table, accuracy)``.
    """
    report = classification_report(y_test, y_pred, output_dict=True)
    accuracy = report['accuracy']
    # 'accuracy' is a single scalar, so it is shown separately from the table.
    metrics_table = pd.DataFrame(report).drop(columns=['accuracy'])
    print(metrics_table)
    print("accuracy:", accuracy)
    return metrics_table, accuracy

In [5]:
def x_val_5fold(X, y):
    """Run 5-fold cross validation of ``GNB_binary`` on ``X`` / ``y``.

    For every fold a fresh model is fitted on the training part and evaluated
    on the held-out part with ``test``, which prints that fold's metrics.
    Afterwards the element-wise average of the five metric tables and the
    average accuracy are printed. Nothing is returned.
    """
    fold_reports = []
    fold_accuracies = []
    splitter = KFold(n_splits=5)
    for fold_no, (train_index, test_index) in enumerate(splitter.split(X), start=1):
        classifier = GNB_binary()
        print("Fold #{}\n============================================".format(fold_no))
        classifier.fit(X.iloc[train_index], y.iloc[train_index])
        fold_pred = classifier.predict(X.iloc[test_index])
        fold_report, fold_acc = test(fold_pred, y.iloc[test_index])
        fold_reports.append(fold_report)
        fold_accuracies.append(fold_acc)
        print("============================================")
    # Add the tables left to right, then divide by the number of folds.
    reports_avg = sum(fold_reports[1:], fold_reports[0]) / len(fold_reports)
    print("Average\n============================================\n{df}".format(df=reports_avg))
    print("============================================")
    print("Average Accuracy = {}".format(np.average(fold_accuracies)))

### Loading the dataset

In [6]:
#dir = '/content/drive/MyDrive/Tests-Assgns/ML Project1/'
# Read the rice dataset from a CSV file in the current working directory.
df = pd.read_csv('Rice_Cammeo_Osmancik.csv')
# Show the first rows as a quick check of the columns and values.
df.head()

Unnamed: 0,Area,Perimeter,Major_Axis_Length,Minor_Axis_Length,Eccentricity,Convex_Area,Extent,Class
0,15231,525.578979,229.749878,85.093788,0.928882,15617,0.572896,Cammeo
1,14656,494.311005,206.020065,91.730972,0.895405,15072,0.615436,Cammeo
2,14634,501.122009,214.106781,87.768288,0.912118,14954,0.693259,Cammeo
3,13176,458.342987,193.337387,87.448395,0.891861,13368,0.640669,Cammeo
4,14688,507.166992,211.743378,89.312454,0.906691,15262,0.646024,Cammeo


### Train-test split

In [7]:
# Attributes are every column except the last; the last column is the target.
# 30% of the rows are held out for testing; random_state is fixed so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(df.iloc[:, :-1], df.iloc[:, -1], test_size = 0.3, random_state = 42)

### 5-fold cross validation

In [8]:
# Cross-validate on the training portion only; the test split stays untouched.
x_val_5fold(X_train, y_train)

Fold #1
               Cammeo    Osmancik   macro avg  weighted avg
precision    0.920398    0.912913    0.916655      0.915913
recall       0.864486    0.950000    0.907243      0.915730
f1-score     0.891566    0.931087    0.911327      0.915249
support    214.000000  320.000000  534.000000    534.000000
accuracy: 0.9157303370786517
Fold #2
               Cammeo    Osmancik   macro avg  weighted avg
precision    0.919283    0.929260    0.924271      0.925019
recall       0.903084    0.941368    0.922226      0.925094
f1-score     0.911111    0.935275    0.923193      0.925003
support    227.000000  307.000000  534.000000    534.000000
accuracy: 0.9250936329588015
Fold #3
               Cammeo    Osmancik   macro avg  weighted avg
precision    0.892241    0.933555    0.912898      0.915960
recall       0.911894    0.918301    0.915097      0.915572
f1-score     0.901961    0.925865    0.913913      0.915684
support    227.000000  306.000000  533.000000    533.000000
accuracy: 0.915572

### Training on whole training set and corresponding predictions

In [9]:
# Fit our own classifier on the full training split ...
nb_classifier = GNB_binary()
nb_classifier.fit(X_train, y_train)
# ... and evaluate its predictions on the held-out test split.
y_pred_model = nb_classifier.predict(X_test)
test(y_pred_model, y_test)

               Cammeo    Osmancik    macro avg  weighted avg
precision    0.932406    0.923438     0.927922      0.927502
recall       0.905405    0.945600     0.925503      0.927384
f1-score     0.918707    0.934387     0.926547      0.927281
support    518.000000  625.000000  1143.000000   1143.000000
accuracy: 0.9273840769903762


(               Cammeo    Osmancik    macro avg  weighted avg
 precision    0.932406    0.923438     0.927922      0.927502
 recall       0.905405    0.945600     0.925503      0.927384
 f1-score     0.918707    0.934387     0.926547      0.927281
 support    518.000000  625.000000  1143.000000   1143.000000,
 0.9273840769903762)

### Comparing our model to the sklearn GaussianNB model

In [10]:
# Reference run: sklearn's GaussianNB on the same train/test split, scored
# with the same test() helper so the two reports are directly comparable.
nb_classifier_skl = GaussianNB()
y_pred_skl = nb_classifier_skl.fit(X_train,y_train).predict(X_test)
test(y_pred_skl, y_test)

               Cammeo    Osmancik    macro avg  weighted avg
precision    0.920160    0.911215     0.915687      0.915269
recall       0.889961    0.936000     0.912981      0.915136
f1-score     0.904809    0.923441     0.914125      0.914997
support    518.000000  625.000000  1143.000000   1143.000000
accuracy: 0.9151356080489939


(               Cammeo    Osmancik    macro avg  weighted avg
 precision    0.920160    0.911215     0.915687      0.915269
 recall       0.889961    0.936000     0.912981      0.915136
 f1-score     0.904809    0.923441     0.914125      0.914997
 support    518.000000  625.000000  1143.000000   1143.000000,
 0.9151356080489939)

### Saving our model's predictions on the test set

In [11]:
# Put the test attributes and true labels side by side, then add our model's
# predictions as an extra column.
save_df = pd.concat([X_test, y_test], axis="columns")
save_df['Predictions'] = y_pred_model
save_df.head()

Unnamed: 0,Area,Perimeter,Major_Axis_Length,Minor_Axis_Length,Eccentricity,Convex_Area,Extent,Class,Predictions
1011,12442,459.535004,187.50885,87.187302,0.885323,12941,0.58758,Cammeo,Osmancik
3185,12408,437.014008,179.741165,88.829605,0.869343,12598,0.636928,Osmancik,Osmancik
3698,12867,449.079987,181.700561,91.341064,0.86446,13152,0.649062,Osmancik,Osmancik
897,13090,472.945007,202.601578,83.230179,0.911722,13331,0.77529,Cammeo,Cammeo
3245,10359,409.510986,173.337967,76.875809,0.896273,10510,0.573588,Osmancik,Osmancik


In [12]:
# Write the table of test rows and predictions to the working directory.
save_df.to_csv('model_predictions.csv')