## Preparing the Dataset

Converting all the Images into 32X32 Grayscale Images and saving them into numpy format

In [29]:
import numpy as np
import cv2
import glob
import os
import codecs
import matplotlib.pyplot as plt

In [3]:
#This fetches all the folders in the lekha-ocr-database/train_images folder 
def returnFolders(path):
    
    folders = [ f for f in os.listdir(path) ]
        
    print "Found {} folders".format(len(folders))
    
    return folders

In [43]:
def preprocessImage(image):
    #Does Adaptive Gaussian Thresholding (See sudoku image in documentation for understanding)
    adaptiveGThreshold = cv2.adaptiveThreshold(image,255,cv2.ADAPTIVE_THRESH_GAUSSIAN_C,cv2.THRESH_BINARY,11,2)
    retVal,thresholded_img = cv2.threshold(adaptiveGThreshold,127,255,cv2.THRESH_BINARY)
    img = cv2.resize(thresholded_img,(32,32),interpolation = cv2.INTER_CUBIC)
    return img

In [19]:
def getImagesInFolder(imageFolder):
    
    image_paths = glob.glob(imageFolder+'*.png')
    image_label = imageFolder.split('/')[-2]
    len_images = len(image_paths)
    print "Found {} Images of Label {}".format(len_images,image_label)
    
    input_images = [ cv2.imread(img,0) for img in image_paths ]
    input_images = [ preprocessImage(img) for img in input_images ]
    
    labels =  [ image_label for img in input_images ]
    
    images,labels =  np.array(input_images),np.array(labels)
    
    return images,labels

In [20]:
def prepareDataset(path):
    trainX = None
    trainy = None
    
    folders =  returnFolders(path)
    for f in folders:
        imageFolder = path+str(f)+'/'
        images,labels = getImagesInFolder(imageFolder)
        
        if trainX is None:
            trainX = images
        else:
            trainX = np.append(trainX,images,axis = 0)
        
        if trainy is None:
            trainy = labels
        else:
            trainy = np.append(trainy,labels,axis=0)
            
    print "TrainX Shape: ",trainX.shape
    print "Trainy Shape: ",trainy.shape
    return trainX,trainy

In [21]:
def saveNumpyArrays(trainX,trainy):
    np.save('NP-Dataset/X.npy',trainX)
    np.save('NP-Dataset/y.npy',trainy)
    print "Saved Numpy Arrays to NP-Dataset/"
    return

In [7]:
trainX,trainy = prepareDataset("/home/amrith/Machine-Learning/MalayalamOCR/IN/")

Found 133 folders
Found 86 Images of Label ്വ
Found 349 Images of Label ധ
Found 141 Images of Label ഏ
Found 76 Images of Label ൺ
Found 157 Images of Label ങ്ക
Found 470 Images of Label ശ
Found 350 Images of Label ബ
Found 252 Images of Label ൗ
Found 333 Images of Label ീ
Found 683 Images of Label ക
Found 3 Images of Label ഹ്മ
Found 582 Images of Label മ
Found 271 Images of Label ഉ
Found 150 Images of Label '2
Found 490 Images of Label ഇ
Found 65 Images of Label സ്ഥ
Found 352 Images of Label ജ
Found 147 Images of Label 3
Found 6 Images of Label ഗ്ന
Found 132 Images of Label ഘ
Found 439 Images of Label ച
Found 238 Images of Label (
Found 387 Images of Label ഗ
Found 127 Images of Label ദ്ദ
Found 13 Images of Label +
Found 427 Images of Label ണ്ട
Found 537 Images of Label ർ
Found 226 Images of Label ഞ്ഞ
Found 543 Images of Label ു
Found 4 Images of Label ബ്ബ
Found 258 Images of Label ന്ത
Found 7 Images of Label വ്വ
Found 521 Images of Label ൽ
Found 299 Images of Label ള്ള
Found 590 Images o

In [9]:
saveNumpyArrays(trainX,trainy)

Saved Numpy Arrays to NP-Dataset/


In [15]:
def getShape():
    return trainX.shape,trainy.shape