# Please download the dataset from the following link: 

# https://archive.ics.uci.edu/ml/datasets/Devanagari+Handwritten+Character+Dataset

# You can download the csv files of the training as well as cross validation dataset from the following links:

# https://drive.google.com/file/d/1LO7l5LqiGgRg_EUJhrwtrw6TRGeixahc/view?usp=sharing

# https://drive.google.com/file/d/16KYVxtexm1Jzw_d7sv9ql3S33571U_J8/view?usp=sharingv

In [1]:
import pandas as pd

import numpy as np 

import matplotlib.pyplot as plt

import os

In [2]:
import scipy.stats as s

In [3]:
from sklearn.metrics import classification_report,accuracy_score,precision_score,recall_score

In [96]:
class img_to_df:
    
    def __init__(self,path,train_cv_split):
        
        self.path = path
        
        self.unique_labels = os.listdir(path)
        
        self.train_cv_split = train_cv_split
        
        
        
    def list_of_images(self,folder):
        
        return os.listdir(os.path.join(self.path,folder))
        
        
    
    def read_image(self,folder,image):
        
        folder_path = os.path.join(self.path,folder)
        
        image_path = os.path.join(folder_path,image)
        
        image = plt.imread(image_path)
        
        return image.reshape(image.shape[0]*image.shape[1],)
    
    
    
    def stacking_row_vectors(self,folder):
        
        images_list = self.list_of_images(folder)
        
        images = list()
        
        for img in images_list:
            
            images.append(self.read_image(folder,img))
            
        train_len = int(self.train_cv_split[0]*len(images))
            
        return np.array(images[0:train_len]), np.array(images[train_len:])
    
    
    
    def generate_df(self):
        
        train_data = list()
        
        cv_data = list()
        
        for folder in self.unique_labels:
            
            train_folder_matrix, cv_folder_matrix = self.stacking_row_vectors(folder)
            
            train_data.append(train_folder_matrix)
            
            cv_data.append(cv_folder_matrix)
            
        train_data = np.concatenate(train_data,axis=0)
        
        cv_data = np.concatenate(cv_data,axis=0)
        
        train_labels = list()
        
        cv_labels = list()
        
        for folder_name in self.unique_labels:
            
            train_labels = train_labels + [folder_name]*train_folder_matrix.shape[0]
            
            cv_labels = cv_labels + [folder_name]*cv_folder_matrix.shape[0]
            
        train_data = pd.DataFrame(data=train_data)
        
        train_data['label'] = train_labels
        
        cv_data = pd.DataFrame(data=cv_data)
        
        cv_data['label'] = cv_labels
        
        return train_data,cv_data

In [5]:
obj = img_to_df("./DevanagariHandwrittenCharacterDataset/DevanagariHandwrittenCharacterDataset/Train",(0.8,0.2))

In [6]:
training_data, cv_data = obj.generate_df()

In [7]:
training_data.to_csv("./Devnagari_Handwritten_Character_Train.csv")

cv_data.to_csv("./Devnagari_Handwritten_Character_cv.csv")

In [4]:
training_data = pd.read_csv("Devnagari_Handwritten_Character_Train.csv")

In [5]:
training_data.drop([training_data.columns[0]],axis=1,inplace=True)

In [6]:
training_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1015,1016,1017,1018,1019,1020,1021,1022,1023,label
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,character_10_yna
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,character_10_yna
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,character_10_yna
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,character_10_yna
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,character_10_yna


In [7]:
cv_data = pd.read_csv("Devnagari_Handwritten_Character_cv.csv")

In [16]:
class GaussianNB:
    
    """Instantiate a Gaussian Naive Bayes Object with the following parameters: 
        
        features :               A dataframe consisting of continuous features, excluding labels
        labels :                 A series consisting of binary labels
        train_cv_test_split :    A tuple consisting of fraction for training, cross validation and testing data
        apply_pca :              Boolean value specifying whether to apply PCA or not
        n_principal_components : Number of Principal Components (Eigen vectors having non zero values to keep) 
    """
    
    def __init__(self,features,labels,train_cv_test_split,apply_pca,n_principal_components):
        
        self.unique_labels = list(labels.unique())
        
        self.labels = np.array(labels).reshape(labels.shape[0],1)
        
        self.train_cv_test_split = train_cv_test_split
        
        self.n_principal_components = n_principal_components
        
        if apply_pca == True:
            
            self.X_new = self.apply_dim_reduction(features,self.n_principal_components)
            
            
            
            
    def apply_dim_reduction(self,data,n_components):
        
        X = np.array(data)
        
        X_dash = X - np.mean(X,axis=0).reshape(-1,X.shape[1])
        
        sigma_hat = (1/data.shape[0])*np.matmul(X_dash.T,X_dash)
        
        sigma_hat_decompose = np.linalg.svd(sigma_hat)
        
        Q = sigma_hat_decompose[0]
        
        self.Q_tilda = Q[:,0:n_components]
        
        X_new = np.matmul(X_dash,self.Q_tilda)
        
        return X_new
    
    
    
    
    def fit(self,data,alpha,gamma):
        
        self.likelihood_params = dict()
        
        sigma_hats = 0
        
        for label in self.unique_labels:
        
            mu_hat = np.array(data[data['label'] == label].iloc[:,0:self.n_principal_components].mean())

            sigma_hat = np.array(data[data['label'] == label].iloc[:,0:self.n_principal_components].cov())
            
            sigma_hats = sigma_hats + 1359*sigma_hat
            
            self.likelihood_params[label] = [mu_hat,sigma_hat]
            
        self.sigma_hat = sigma_hats/(data.shape[0] - len(self.unique_labels))
        
        self.mean_variance = np.mean(np.diag(self.sigma_hat))
            
        for label in self.unique_labels:
            
            self.likelihood_params[label][1] = alpha*self.likelihood_params[label][1] + (1-alpha)*self.sigma_hat
            
            self.likelihood_params[label][1] = (1-gamma)*self.likelihood_params[label][1] + gamma*self.mean_variance*np.eye(self.n_principal_components,self.n_principal_components)
        
        
        
        
    def evaluate(self,data):
        
        inputs = np.array(data.iloc[:,0:self.n_principal_components])
        
        posterior = list()
        
        for label in self.unique_labels:
    
            posterior.append(s.multivariate_normal.pdf(inputs,self.likelihood_params[label][0],self.likelihood_params[label][1]).reshape(inputs.shape[0],1))
        
        posterior = np.concatenate(posterior,axis=1)
        
        predicted_category = pd.Series(np.argmax(posterior,axis=1))
    
        predicted_category.replace(to_replace=np.arange(0,len(self.unique_labels)),value=self.unique_labels,inplace=True)
    
        predicted_results = np.array(predicted_category)
        
        actual_results = np.array(data['label'])
        
        acc = accuracy_score(y_true=actual_results,y_pred=predicted_results)
        
        recall = recall_score(y_true=actual_results,y_pred=predicted_results,average='weighted')
        
        precision = precision_score(y_true=actual_results,y_pred=predicted_results,average='weighted')
        
        return {"acc":acc,"recall":recall,"precision":precision}

In [19]:
D = dict()

for n_components in [50,100,150,200,250,300,350,400,450,500,550,600,650,700]:
    
    obj = GaussianNB(features=training_data.iloc[:,0:1024],labels=training_data.iloc[:,1024],train_cv_test_split=(0.7,0.2,0.1),
                             apply_pca=True,n_principal_components=n_components)

    X_train = pd.DataFrame(obj.X_new)

    X_train['label'] = training_data['label']

    obj.fit(X_train,0.5,0.3)

    X_cv = np.matmul(np.array(cv_data.iloc[:,0:1024]),obj.Q_tilda)

    X_cv = pd.DataFrame(data=X_cv)

    X_cv['label'] = cv_data['label']

    D[n_components] = obj.evaluate(X_cv)

In [20]:
D

{50: {'acc': 0.6053708439897698,
  'recall': 0.6053708439897698,
  'precision': 0.7089919101735292},
 100: {'acc': 0.6829923273657289,
  'recall': 0.6829923273657289,
  'precision': 0.743782751047135},
 150: {'acc': 0.7063938618925831,
  'recall': 0.7063938618925831,
  'precision': 0.7499621435097328},
 200: {'acc': 0.718158567774936,
  'recall': 0.718158567774936,
  'precision': 0.7550919367418554},
 250: {'acc': 0.7253196930946292,
  'recall': 0.7253196930946292,
  'precision': 0.7578255417191467},
 300: {'acc': 0.7292838874680307,
  'recall': 0.7292838874680307,
  'precision': 0.7585437955792808},
 350: {'acc': 0.7330562659846548,
  'recall': 0.7330562659846548,
  'precision': 0.7597412924528109},
 400: {'acc': 0.7351023017902814,
  'recall': 0.7351023017902814,
  'precision': 0.7602376222470921},
 450: {'acc': 0.7365728900255755,
  'recall': 0.7365728900255755,
  'precision': 0.7602746863769322},
 500: {'acc': 0.7368286445012787,
  'recall': 0.7368286445012787,
  'precision': 0.759