In [1]:
#unzip a file.
#from zipfile import ZipFile
#zip = ZipFile('devanagari+handwritten+character+dataset.zip')
#zip.extractall()

In [2]:
import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

import os

In [3]:
from sklearn.metrics import classification_report

In [4]:
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, roc_curve, average_precision_score, recall_score

In [5]:
train_path = "./DevanagariHandwrittenCharacterDataset/Train"

test_path = "./DevanagariHandwrittenCharacterDataset/Test"

In [6]:
unique_labels = os.listdir(train_path)

In [7]:
def list_of_images(folder):
    
    return os.listdir(os.path.join(train_path,folder))

In [8]:
def read_image(folder,image):
    
    folder_path = os.path.join(train_path,folder)
    
    image_path = os.path.join(folder_path,image)
    
    image = plt.imread(image_path)
    
    return image.reshape(image.shape[0]*image.shape[1],)

In [9]:
for i in unique_labels:
    m=list_of_images(i)    
print("train_data_len:",float(len(m)) * 0.8)
print("cv_data_len:",float(len(m)) * 0.2)

train_data_len: 1360.0
cv_data_len: 340.0


In [10]:
def stacking_row_vectors(folder):
    
    images_list = list_of_images(folder)
    
    images = []
    
    for img in images_list:
        
        images.append(read_image(folder,img))
        
    return np.array(images[0:1360]), np.array(images[1360:])

In [11]:
train_data = []

cv_data = []

for folder in unique_labels:
    
    train_folder_matrix, cv_folder_matrix = stacking_row_vectors(folder)
    
    train_data.append(train_folder_matrix)
    
    cv_data.append(cv_folder_matrix)
    
train_data = np.concatenate(train_data,axis=0)

cv_data = np.concatenate(cv_data,axis=0)

In [12]:
train_labels = []

cv_labels = []

for folder_name in unique_labels:
    
    train_labels = train_labels + [folder_name]*1360
    
    cv_labels = cv_labels + [folder_name]*340


In [13]:
train_data = pd.DataFrame(data=train_data)
train_data['label'] = train_labels
cv_data = pd.DataFrame(data=cv_data)
cv_data['label'] = cv_labels

In [14]:
cv_actual_results = np.array(cv_data['label'])

In [19]:
data = pd.concat([train_data,cv_data])

X = np.array(data.iloc[:,0:1024])
X 

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [16]:
from sklearn.naive_bayes import GaussianNB

from sklearn.decomposition import PCA

from sklearn.metrics import accuracy_score

In [61]:
def cross_validation(n_eig_vectors):
    pca_obj = PCA(n_components=n_eig_vectors)
    X_new = pca_obj.fit_transform(X)
    # Create a new DataFrame with a range index
    X_new_df = pd.DataFrame(X_new)
    X_new_df.reset_index(drop=True, inplace=True)
    data.reset_index(drop=True, inplace=True)
    X_new_df= pd.concat([ X_new_df, data['label']], axis=1)  
    X_train_new = X_new_df.iloc[:62560, :-1]  
    X_cv_new = X_new_df.iloc[62560:, :-1]
    y_train_new = X_new_df.iloc[:62560, -1]  
    y_cv_new = X_new_df.iloc[62560:, -1]  
    
    obj = GaussianNB()
    obj.fit(X_train_new, y_train_new)
    
    cv_predicted_category = obj.predict(X_cv_new)
    return accuracy_score(y_true=y_cv_new,y_pred=cv_predicted_category)

 

In [62]:
number_eig_vectors = [512,256,128,64,32,16,8,4,2,1]

D_performance = {}

for number in number_eig_vectors:
    
    D_performance[number] = cross_validation(number)
    

In [63]:
D_performance

{512: 0.45613810741687977,
 256: 0.5058823529411764,
 128: 0.566304347826087,
 64: 0.5492966751918159,
 32: 0.5319693094629157,
 16: 0.4650895140664962,
 8: 0.31681585677749363,
 4: 0.15933503836317137,
 2: 0.06604859335038363,
 1: 0.03618925831202046}