# Please download the dataset from the following link: 

# https://archive.ics.uci.edu/ml/datasets/Devanagari+Handwritten+Character+Dataset

# You can download the CSV files for the training and cross-validation datasets from the following links:

# https://drive.google.com/file/d/1LO7l5LqiGgRg_EUJhrwtrw6TRGeixahc/view?usp=sharing

# https://drive.google.com/file/d/16KYVxtexm1Jzw_d7sv9ql3S33571U_J8/view?usp=sharing

In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import os

In [2]:
import scipy.stats as s

In [3]:
from sklearn.metrics import classification_report,accuracy_score,precision_score,recall_score

In [4]:
from concurrent.futures import ThreadPoolExecutor

In [5]:
from glob import iglob

In [6]:
class img_to_df:
    """Turn a directory of per-label PNG folders into train / CV DataFrames.

    ``path`` must contain one sub-directory per label, each holding ``*.png``
    files. Every image is flattened into a row vector; rows of all labels are
    stacked and a ``label`` column holding the sub-directory name is appended.
    """

    def __init__(self, path, train_cv_split, images_per_class=1700):
        """Store the configuration and discover the label folders.

        path :             root directory containing one folder per label
        train_cv_split :   (train_fraction, cv_fraction); each fraction is
                           multiplied by ``images_per_class`` to get the number
                           of images taken from every folder
        images_per_class : nominal number of images per label folder
                           (defaults to 1700, the value previously hard-coded)
        """
        self.path = path
        # Only sub-directories can hold images; stray files would otherwise
        # become empty "labels". Sorting makes the row order reproducible,
        # since os.listdir returns entries in arbitrary order.
        self.unique_labels = sorted(
            entry for entry in os.listdir(path)
            if os.path.isdir(os.path.join(path, entry))
        )
        self.train_cv_split = train_cv_split
        self.images_per_class = images_per_class

    def list_of_images(self, folder):
        """Return an iterator over the PNG paths of ``folder``, in sorted order."""
        # Sorted so that the train / CV split does not depend on directory order.
        return iter(sorted(iglob(os.path.join(self.path, folder) + "/*.png")))

    def read_image(self, folder_image):
        """Load one image and flatten it to a 1-D vector of height*width values.

        The reshape uses only the first two dimensions, so a 2-D (single
        channel) image is assumed; a multi-channel image raises ValueError.
        """
        image = plt.imread(folder_image)
        return image.reshape(image.shape[0]*image.shape[1],)

    def stacking_row_vectors(self, folder):
        """Read every image of ``folder`` on a thread pool; return a result iterator.

        ``map`` submits all reads up front and leaving the ``with`` block waits
        for them, so the returned iterator yields already-computed vectors.
        An exception raised while reading surfaces when that item is consumed.
        """
        images_list_gen = self.list_of_images(folder)

        with ThreadPoolExecutor(max_workers=16) as p:
            lazy_loop = p.map(self.read_image, images_list_gen)

        return lazy_loop

    @staticmethod
    def _to_frame(matrices, labels):
        """Stack per-folder matrices into one DataFrame and append ``label``."""
        if matrices:
            frame = pd.DataFrame(data=np.concatenate(matrices, axis=0))
        else:
            # No images at all: return an empty frame that still has 'label'.
            frame = pd.DataFrame()
        frame['label'] = labels
        return frame

    def generate_df(self):
        """Build the training and cross-validation DataFrames.

        From every label folder the first ``int(train_fraction*images_per_class)``
        images go to the training set and the next
        ``int(cv_fraction*images_per_class)`` to the CV set; folders with fewer
        images simply contribute what they have.

        Returns ``(train_data, cv_data)``: one row per image, one column per
        pixel plus a final ``label`` column.
        """
        from itertools import islice

        train_quota = int(self.train_cv_split[0]*self.images_per_class)
        cv_quota = int(self.train_cv_split[1]*self.images_per_class)

        train_blocks, cv_blocks = [], []
        train_labels, cv_labels = [], []

        for folder in self.unique_labels:
            rows = iter(self.stacking_row_vectors(folder))
            train_rows = list(islice(rows, train_quota))
            cv_rows = list(islice(rows, cv_quota))

            # Labels are recorded per folder from the rows actually read, so
            # folders of different sizes stay correctly labelled. Empty slices
            # are skipped because a (0,) array cannot be concatenated with
            # 2-D matrices.
            if train_rows:
                train_blocks.append(np.array(train_rows))
                train_labels.extend([folder]*len(train_rows))
            if cv_rows:
                cv_blocks.append(np.array(cv_rows))
                cv_labels.extend([folder]*len(cv_rows))

        train_data = self._to_frame(train_blocks, train_labels)
        cv_data = self._to_frame(cv_blocks, cv_labels)
        return train_data, cv_data

In [7]:
# Image loader over ./Train: fractions 0.8 (training) and 0.2 (cross-validation).
obj = img_to_df(path="./Train", train_cv_split=(0.8, 0.2))

In [8]:
# Read the images of every label folder and build the two DataFrames
# (one row per image, with a trailing 'label' column).
training_data, cv_data = obj.generate_df()

In [None]:
# Persist both splits so later cells can reload them without re-reading the images.
training_data.to_csv(path_or_buf="./Devnagari_Handwritten_Character_Train.csv")

cv_data.to_csv(path_or_buf="./Devnagari_Handwritten_Character_cv.csv")

In [None]:
# Reload the training split from disk.
training_data = pd.read_csv(filepath_or_buffer="Devnagari_Handwritten_Character_Train.csv")

In [None]:
# Remove the first column of the reloaded frame in place.
training_data.drop(columns=[training_data.columns[0]], inplace=True)

In [None]:
# Preview the first five rows of the training data.
training_data.head(n=5)

In [None]:
# Reload the cross-validation split. The CSV's first column is the saved row
# index (the training frame drops it after loading); reading it as the index
# keeps it out of the feature columns, so iloc[:, 0:1024] selects pixels only.
cv_data = pd.read_csv("Devnagari_Handwritten_Character_cv.csv", index_col=0)

In [None]:
class GaussianNB:

    """Gaussian class-conditional classifier with optional PCA and covariance shrinkage.

    Instantiate with the following parameters:

        features :               A dataframe consisting of continuous features, excluding labels
        labels :                 A series of class labels (any number of distinct classes)
        train_cv_test_split :    A tuple consisting of fraction for training, cross validation and testing data
                                 (stored on the instance; not used by the methods of this class)
        apply_pca :              Boolean value specifying whether to apply PCA or not
        n_principal_components : Number of leading principal components to keep; also the number of
                                 leading columns that ``fit`` / ``predict`` / ``evaluate`` read as features

    Each class is modelled by a multivariate normal with a full covariance
    matrix, regularised in ``fit``; all classes are scored with equal priors.
    """

    def __init__(self, features, labels, train_cv_test_split, apply_pca, n_principal_components):
        # Order of first appearance; predictions index into this list.
        self.unique_labels = list(labels.unique())
        self.labels = np.array(labels).reshape(labels.shape[0], 1)
        self.train_cv_test_split = train_cv_test_split
        self.n_principal_components = n_principal_components

        if apply_pca:
            self.X_new = self.apply_dim_reduction(features, self.n_principal_components)
        else:
            # No projection: expose the raw features under the same attribute
            # so callers reading X_new do not hit an AttributeError.
            self.feature_mean = None
            self.Q_tilda = None
            self.X_new = np.array(features)

    def apply_dim_reduction(self, data, n_components):
        """Fit PCA on ``data`` and return its projection on the leading components.

        Stores the column means in ``self.feature_mean`` and the projection
        matrix (one column per kept component) in ``self.Q_tilda`` so that
        other data can be projected the same way with ``transform``.
        """
        X = np.array(data)
        self.feature_mean = np.mean(X, axis=0).reshape(-1, X.shape[1])
        X_dash = X - self.feature_mean
        sigma_hat = (1/X.shape[0])*np.matmul(X_dash.T, X_dash)
        # The left singular vectors of the covariance matrix are the principal axes.
        Q = np.linalg.svd(sigma_hat)[0]
        self.Q_tilda = Q[:, 0:n_components]
        return np.matmul(X_dash, self.Q_tilda)

    def transform(self, data):
        """Project ``data`` with the PCA fitted at construction.

        The training mean is subtracted before projecting, matching what
        ``apply_dim_reduction`` did to the training features. When the object
        was built with ``apply_pca=False`` the data is returned as an array.
        """
        X = np.array(data)
        if self.Q_tilda is None:
            return X
        return np.matmul(X - self.feature_mean, self.Q_tilda)

    def fit(self, data, alpha, gamma):
        """Estimate a regularised Gaussian per class.

        data :  dataframe whose first ``n_principal_components`` columns are
                features and which has a ``label`` column
        alpha : weight of the class covariance against the pooled covariance
                (1 keeps the class covariance, 0 uses the pooled one)
        gamma : weight of the scaled identity ``mean_variance * I`` mixed into
                the result (0 disables it)

        Fills ``self.likelihood_params[label] = [mean, covariance]`` and sets
        ``self.sigma_hat`` (pooled covariance) and ``self.mean_variance``.
        """
        self.likelihood_params = dict()
        n_features = self.n_principal_components
        scatter_sum = 0

        for label in self.unique_labels:
            class_rows = data[data['label'] == label].iloc[:, 0:n_features]
            mu_hat = np.array(class_rows.mean())
            sigma_hat = np.array(class_rows.cov())
            # Weight by this class's own degrees of freedom (n_k - 1) so the
            # pooled estimate is right for any class size.
            scatter_sum = scatter_sum + (class_rows.shape[0] - 1)*sigma_hat
            self.likelihood_params[label] = [mu_hat, sigma_hat]

        self.sigma_hat = scatter_sum/(data.shape[0] - len(self.unique_labels))
        self.mean_variance = np.mean(np.diag(self.sigma_hat))
        shrink_target = self.mean_variance*np.eye(n_features, n_features)

        for label in self.unique_labels:
            blended = alpha*self.likelihood_params[label][1] + (1-alpha)*self.sigma_hat
            self.likelihood_params[label][1] = (1-gamma)*blended + gamma*shrink_target

    def predict(self, data):
        """Return the most likely label for each row of ``data``.

        Reads the first ``n_principal_components`` columns as features and
        returns an object array of labels taken from ``self.unique_labels``.
        """
        inputs = np.array(data.iloc[:, 0:self.n_principal_components])
        scores = list()

        for label in self.unique_labels:
            mu_hat, sigma_hat = self.likelihood_params[label]
            # Log-densities: the plain density underflows to 0 in high
            # dimension, which would make every class tie. The log is
            # monotonic, so the argmax is unchanged where the density is usable.
            log_density = s.multivariate_normal.logpdf(inputs, mu_hat, sigma_hat)
            scores.append(np.reshape(log_density, (inputs.shape[0], 1)))

        best = np.argmax(np.concatenate(scores, axis=1), axis=1)
        return np.asarray(self.unique_labels, dtype=object)[best]

    def evaluate(self, data):
        """Score predictions on ``data`` against its ``label`` column.

        Returns a dict with keys ``acc``, ``recall`` and ``precision``; recall
        and precision are weighted averages over the classes.
        """
        predicted_results = self.predict(data)
        actual_results = np.array(data['label'])
        acc = accuracy_score(y_true=actual_results, y_pred=predicted_results)
        recall = recall_score(y_true=actual_results, y_pred=predicted_results, average='weighted')
        precision = precision_score(y_true=actual_results, y_pred=predicted_results, average='weighted')
        return {"acc": acc, "recall": recall, "precision": precision}

In [None]:
# Sweep the number of retained principal components and record the
# cross-validation metrics obtained for each setting in D.
D = dict()

# These arrays do not depend on n_components, so build them once.
train_features = training_data.iloc[:,0:1024]
cv_features = np.array(cv_data.iloc[:,0:1024])
# The classifier projects mean-centred training data, so the CV data has to be
# shifted by the same training mean before it is projected.
train_mean = np.array(train_features).mean(axis=0)

for n_components in [50,100,150,200,250,300,350,400,450,500,550,600,650,700]:
    obj = GaussianNB(features=train_features,labels=training_data.iloc[:,1024],train_cv_test_split=(0.7,0.2,0.1),
                             apply_pca=True,n_principal_components=n_components)
    X_train = pd.DataFrame(obj.X_new)
    X_train['label'] = training_data['label']
    obj.fit(X_train,0.5,0.3)  # alpha=0.5, gamma=0.3
    X_cv = np.matmul(cv_features - train_mean,obj.Q_tilda)
    X_cv = pd.DataFrame(data=X_cv)
    X_cv['label'] = cv_data['label']
    D[n_components] = obj.evaluate(X_cv)