### Import all necessary packages required to run the code

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

from sklearn.cluster import DBSCAN
from sklearn.preprocessing import MinMaxScaler

from PIL import Image
from tqdm import tqdm_notebook

### Below is a summary of the steps performed in the function clean_image

* Loaded image pixel data into a NumPy array using Image package
- To remove the noise of the different colors (color of page etc.) the pixel where the R-G-B values are less than 100 are made pure white pixels with value (255,255,255) 
- To increase the character strength the Blue color intensity is improved where the pixels are identified with color of the pen written (In the current scenario the color of the text is blue). This module would require slight modification with color of the text is different from blue. 
- Convert the image to monochrome and then to Black & White.
- The image is then inverted (black becomes white and vice versa)
- To improve the quality of the image, pixels with at most one set neighbouring pixel are removed. This is one of the techniques used to remove noise. A border of `skip_lines` pixels around the image is also blanked.

In [2]:
def clean_image(img,skip_lines=10):
    """Turn a colour scan into an inverted, denoised black & white image.

    The pixels of ``img`` are modified in place during the colour pass; the
    returned image is a new object produced by ``convert``.

    Parameters
    ----------
    img : image object exposing ``load()``, ``size`` and ``convert()``
        (the notebook passes the result of ``Image.open``).
    skip_lines : int
        Width of the frame around the image that is forced to 0.

    Returns
    -------
    The mode ``'1'`` image after inversion and neighbour-based cleaning.
    """
    width, height = img.size[0], img.size[1]

    ## Colour pass: dark pixels become white, remaining pixels with low
    ## red and green become pure blue
    rgb = img.load()
    for col in range(width):
        for row in range(height):
            pixel = rgb[col, row]
            if pixel[0] < 100 and pixel[1] < 100 and pixel[2] < 100:
                rgb[col, row] = (255, 255, 255)
            elif pixel[0] < 200 and pixel[1] < 200:
                rgb[col, row] = (0, 0, 255)

    ## Grey scale first, then 1-bit black & white
    img_bw = img.convert('L').convert('1')
    bw = img_bw.load()
    bw_width, bw_height = img_bw.size[0], img_bw.size[1]

    ## Invert the image so that the ink ends up as the non-zero pixels
    for col in range(bw_width):
        for row in range(bw_height):
            bw[col, row] = 255 - bw[col, row]

    ## Blank the frame and drop pixels whose eight neighbours sum to less
    ## than 256. The update is in place, so neighbours visited earlier in
    ## the scan are read in their already-cleaned state.
    neighbour_offsets = [(dc, dr) for dr in (-1, 0, 1) for dc in (-1, 0, 1)
                         if (dc, dr) != (0, 0)]
    last_col = bw_width - skip_lines
    last_row = bw_height - skip_lines
    for col in range(bw_width):
        for row in range(bw_height):
            inside_frame = (col >= skip_lines and row >= skip_lines
                            and col < last_col and row < last_row)
            if not inside_frame:
                bw[col, row] = 0
                continue
            neighbour_total = 0
            for dc, dr in neighbour_offsets:
                neighbour_total = neighbour_total + bw[col + dc, row + dr]
            if neighbour_total < 256:
                bw[col, row] = 0

    return img_bw

### finding_clusters function uses DBScan to find clusters of pixels in image
- Converting the image-array into data points in 2 dimensional canvas by taking the indices of the white color pixels of images as x,y co-ordinates.
- All the two-dimensional data points are processed through DBScan to identify the clusters
- For DBScan, eps is set to 2 and minimum samples to 3
- The x,y co-ordinates and the labels for each cluster are returned in data frame format

In [3]:
def find_clusters(img,eps=2,min_samples=3):
    """Cluster the non-zero pixels of an image with DBSCAN.

    Parameters
    ----------
    img : 2-D image (anything ``np.array`` turns into a 2-D array)
        Pixels greater than 0 are treated as data points.
    eps : number
        DBSCAN neighbourhood radius.
    min_samples : int
        DBSCAN minimum number of samples for a core point.

    Returns
    -------
    pandas.DataFrame
        One row per non-zero pixel with columns ``x`` (first array index),
        ``y`` (second array index) and ``labels`` (DBSCAN label, -1 = noise).
    """
    ## Get the image file into an array
    img_array = np.array(img)

    ## Indices of the non-zero pixels, in the same row-major order a nested
    ## loop over the array would visit them
    first_idx, second_idx = np.nonzero(img_array > 0)
    img_df = pd.DataFrame({'x': first_idx, 'y': second_idx})

    ## Initiate DBScan Model. min_samples is keyword-only in current
    ## scikit-learn releases, so both arguments are passed by name.
    img_dbscan = DBSCAN(eps=eps, min_samples=min_samples)
    img_dbscan.fit(img_df)

    ## Get the labels (clusters)
    img_df['labels'] = img_dbscan.labels_

    ## Return the dataframe with labels
    return img_df

### The function "remove_clusters" below deletes clusters which are not good for further processing
#### Following scenarios are considered to remove clusters

- **Scenario 01:** Cluster with label -1. These data points does not have minimum samples to make a cluster
- **Scenario 02:** Clusters with small length or width. The default threshold is 5: any cluster whose extent along x or along y is at most the threshold is removed, because such small clusters are considered noise. The default threshold can be changed with the argument small_cluster_threshold.

In [4]:
def remove_clusters(img_df, small_cluster_threshold=5):
    """Drop noise points and clusters whose bounding box is too small.

    Parameters
    ----------
    img_df : pandas.DataFrame
        Columns ``x``, ``y`` and ``labels``.
    small_cluster_threshold : number
        A cluster is removed when its extent (max - min) along ``x`` or
        along ``y`` is less than or equal to this value.

    Returns
    -------
    pandas.DataFrame
        The surviving rows, keeping their original index and order.
    """
    ## Rows labelled -1 (or lower) do not belong to any cluster
    clustered = img_df[img_df['labels'] > -1]

    ## Bounding-box extent of every cluster along both axes
    per_cluster = clustered.groupby('labels')[['x', 'y']]
    extents = per_cluster.max() - per_cluster.min()

    ## A cluster is noise as soon as one of its extents is within the threshold
    is_small = (extents['x'] <= small_cluster_threshold) \
               | (extents['y'] <= small_cluster_threshold)
    small_labels = is_small.index[is_small.values]

    return clustered[~clustered['labels'].isin(small_labels)]

### merge_clusters function is used to merge or split clusters to correct character of the language

##### This is the most complex function of all and in fact flawed in implementation. Needs revision in future to remove the bug of the code

##### Following are the key steps of function
- Clusters are merged where one cluster is completely overlaps another vertically. This is because in Telugu a character can split into multiple unconnected clusters like the character "i" in English.
- In second scenario, clusters with overlap of 20 pixels are considered to merge into one. This is because in Telugu, it is not necessary that entire cluster to be overlapped with other cluster, for example 'క్య'
- The merge process is repeated at decreasing overlap thresholds, starting from 20 (the intent was 20, 15, 10, 5, 0 and even -5; the code currently uses `range(20,-5,-11)`, i.e. 20, 9 and -2) to improve the quality of the characters. At each stage the number of resulting unique clusters is compared with the number of character labels available for the whole image to check whether it fits or not. However, this step should be avoided, as it cannot be performed on a completely new dataset.
- Further clusters were split into small clusters when their length is too wider than average size. In this scenario as well the number of available character labels is used to determine average width of each character. Obviously this code should also be re-written to remove the bug of using target in improving quality of predictors.

In [5]:
def merge_clusters(img_df, char_labels):
    """Merge and/or split clusters until there is one cluster per expected character.

    Only ``len(char_labels)`` is used: it is the target number of clusters.

    Steps, in order:
      1. Clusters whose ``y`` range contains (or is contained in) another
         cluster's ``y`` range are merged; the result is written back into
         ``img_df`` (the argument is modified in place).
      2. If the cluster count already matches the target, ``img_df`` is returned.
      3. If there are too many clusters, clusters overlapping along ``y`` by at
         least a threshold are merged, for thresholds 20, 9 and -2; when a
         merge overshoots (too few clusters) an equal-width split is attempted.
      4. If there are too few clusters, only the equal-width split is
         attempted, with a widening acceptance window.

    Parameters
    ----------
    img_df : pandas.DataFrame
        Cluster points. The positional lookups (``x[2]``, ``x[4]``, ...)
        below assume the columns are exactly ``x``, ``y``, ``labels`` in a
        layout where, after the min/max merge, the columns come out as
        labels, x_min, y_min, x_max, y_max -- TODO confirm against callers.
    char_labels : sized collection
        Expected characters of the word; only its length is read.

    Returns
    -------
    pandas.DataFrame
        A frame whose number of distinct labels equals ``len(char_labels)``,
        or an empty ``pd.DataFrame()`` when the input is empty or no stage
        reaches the target count.
    """
    
    if img_df.shape[0] == 0:
        return pd.DataFrame()
    
    ## Find min and max values for x & y co-ordinates for each cluster
    ## (computed once; not refreshed while clusters are merged in the loop below)
    cluster_min_points = img_df.groupby(['labels'],as_index=False).min()
    cluster_max_points = img_df.groupby(['labels'],as_index=False).max()
    
    ## Get labels
    labels = img_df.labels
    unq_labels = np.unique(labels)

    ## First merge based on full horizontal overlap
    ## The product of (min_i - min_j) and (max_i - max_j) is <= 0 exactly when
    ## one y-range lies inside the other (or they share an end point); cluster
    ## i is then relabelled as j and dropped from the list of valid labels.
    valid_label_list = [k for k in unq_labels]
    for i in unq_labels:
        for j in unq_labels:
            if i == j or i not in valid_label_list or j not in valid_label_list:
                continue
            elif ((cluster_min_points[cluster_min_points.labels==i]['y'].values[0] 
                       - cluster_min_points[cluster_min_points.labels==j]['y'].values[0]) * 
                      (cluster_max_points[cluster_max_points.labels==i]['y'].values[0] 
                       - cluster_max_points[cluster_max_points.labels==j]['y'].values[0]) <= 0):
                labels = [j if label == i else label for label in labels]
                valid_label_list.remove(i)
     
    ## Partial overlaps
    ## The merged labels are persisted into the caller's dataframe here.
    img_df.loc[:,'labels'] = labels
    unq_labels = valid_label_list
    ## Exit if the labels are equal to number of chars
    if len(unq_labels) == len(char_labels):
        #print('level01')
        return img_df
    elif len(unq_labels) > len(char_labels):
        ## Too many clusters: merge on partial overlap.
        ## NOTE(review): range(20,-5,-11) yields the thresholds 20, 9, -2.
        for threshold in range(20,-5,-11):
            ## Every threshold starts again from the labels stored in img_df;
            ## merges from a previous threshold are only kept when returned.
            labels = img_df.labels 
            valid_label_list = [l for l in np.sort(np.unique(labels))]
            unq_labels = valid_label_list
            cluster_min_points = img_df.groupby(['labels'],as_index=False).min().reset_index(drop=True)
            cluster_max_points = img_df.groupby(['labels'],as_index=False).max().reset_index(drop=True)

            for i in unq_labels:
                for j in unq_labels:                       
                    ## lc = the cluster starting first along y, rc = the other one
                    if cluster_min_points[cluster_min_points.labels==i]['y'].values[0] \
                                        <= cluster_min_points[cluster_min_points.labels==j]['y'].values[0]:
                        lc = i
                        rc = j
                    else:
                        lc = j
                        rc = i

                    ## Merge lc into rc when lc's end overlaps rc's start by
                    ## at least `threshold` along y
                    if (i != j) and (cluster_max_points[cluster_max_points.labels==lc]['y'].values[0] \
                          - cluster_min_points[cluster_min_points.labels==rc]['y'].values[0] >= threshold):
                        labels = [rc if label == lc else label for label in labels]
                        ## lc may already have been removed by an earlier pair;
                        ## the bare except silently ignores that (and anything else)
                        try:
                            valid_label_list.remove(lc)
                        except:
                            None
            if len(valid_label_list) == len(char_labels):
                img_df.loc[:,'labels'] = labels
                return img_df
            elif len(valid_label_list) < len(char_labels):
                ## The merge overshot: try to split wide clusters on a copy
                img_df_t = img_df.copy()
                img_df_t.loc[:,'labels'] = labels
                max_pixel_y = np.max(img_df_t.y)
                min_pixel_y = np.min(img_df_t.y)
                ## Average width per character, derived from the target count
                char_avg_size = (max_pixel_y - min_pixel_y) / len(char_labels)
                cluster_min_points = img_df_t.groupby(['labels'],as_index=False).min().reset_index(drop=True)
                cluster_max_points = img_df_t.groupby(['labels'],as_index=False).max().reset_index(drop=True)
                cluster_mm_points = cluster_min_points.merge(cluster_max_points,on='labels',suffixes=('_min', '_max'))
                cluster_lengths = cluster_mm_points.y_max - cluster_mm_points.y_min
                assumed_chars_in_each_cluster = np.round(cluster_lengths/char_avg_size,0)
                result_char_length_after_split = cluster_lengths / assumed_chars_in_each_cluster 
                ## Width of one piece after the split, as a percentage of the average width
                result_char_confidense_rate = np.round((result_char_length_after_split / char_avg_size)*100,0)
                cluster_mm_points['assumed_chars_in_each_cluster'] = assumed_chars_in_each_cluster
                cluster_mm_points['result_char_confidense_rate'] = result_char_confidense_rate
                ## A split is accepted when the rate is a whole number in 80..120
                cluster_mm_points['accept_split'] = ['Yes' if cr in range(80,121) else 'No' \
                                                     for cr in result_char_confidense_rate]
                ## Positional access: x[5] is assumed_chars_in_each_cluster and
                ## x[7] is accept_split, given the column layout noted in the docstring
                cluster_mm_points['chars_final'] = cluster_mm_points.apply(lambda x: x[5] \
                                                                           if x[7] == 'Yes' else 1,axis=1)
                
                ## New labels start above every existing label
                max_label = np.max(cluster_mm_points.labels) + 5
                flag = 1
                new_clusters = dict()
                ## x[2] = y_min, x[4] = y_max, x[8] = chars_final (same layout assumption)
                for i,j,k in cluster_mm_points.apply(lambda x:(x[2],x[4],x[8]), axis=1):
                    if int(k) == 1:
                        new_clusters[flag] = {'range_l':i,'range_u':j,'new_label':max_label + flag}
                        flag += 1
                    else:
                        ## f = width of each of the k equal y-bands
                        f = int((j - i) / k) 
                        for l in range(int(k)):
                            ## 1*(l>0*1) evaluates as 1*(l > 0): every band after
                            ## the first starts one pixel past the previous band's end
                            new_clusters[flag] = {'range_l':i+f*l+1*(l>0*1), \
                                                  'range_u':i+f*(l+1),'new_label':max_label + flag}
                            flag += 1

                ## Relabel by y range; the range test is applied to all points,
                ## not only to the points of the cluster the range came from
                for i in range(len(new_clusters)):
                    img_df_t.loc[img_df_t[(img_df_t.y >= new_clusters[i+1]['range_l']) \
                                   & (img_df_t.y <= new_clusters[i+1]['range_u'])].index,'labels'] \
                                                            = int(new_clusters[i+1]['new_label'])
                if(len(np.unique(img_df_t.labels)) == len(char_labels)):
                    #print('level03')
                    return img_df_t
    elif len(unq_labels) < len(char_labels):
        ## Too few clusters.
        ## NOTE(review): the merge below only updates the local `labels` and
        ## `valid_label_list`; neither is read again, because the split stage
        ## works on img_df.copy().
        labels = img_df.labels
        valid_label_list = [l for l in np.sort(np.unique(labels))]
        unq_labels = valid_label_list 
        cluster_min_points = img_df.groupby(['labels'],as_index=False).min().reset_index(drop=True)
        cluster_max_points = img_df.groupby(['labels'],as_index=False).max().reset_index(drop=True)

        for i in unq_labels:
            for j in unq_labels:                       
                if cluster_min_points[cluster_min_points.labels==i]['y'].values[0] \
                                    <= cluster_min_points[cluster_min_points.labels==j]['y'].values[0]:
                    lc = i
                    rc = j
                else:
                    lc = j
                    rc = i

                if (i != j) and (cluster_max_points[cluster_max_points.labels==lc]['y'].values[0] \
                      - cluster_min_points[cluster_min_points.labels==rc]['y'].values[0] >= 20):
                    labels = [rc if label == lc else label for label in labels]
                    try:
                        valid_label_list.remove(lc)
                    except:
                        None         
        ## Splitting immedietly after merging.
        ## The acceptance window around 100% widens on every pass: 5, 10, 15, 20
        for confidence_threshold in range(5,21,5):
            img_df_t = img_df.copy()
            ## NOTE(review): the result of reset_index is discarded, so this
            ## line leaves img_df_t unchanged
            img_df_t.reset_index(drop=True)
            max_pixel_y = np.max(img_df_t.y)
            min_pixel_y = np.min(img_df_t.y)
            char_avg_size = (max_pixel_y - min_pixel_y) / len(char_labels)
            cluster_min_points = img_df_t.groupby(['labels'],as_index=False).min().reset_index(drop=True)
            cluster_max_points = img_df_t.groupby(['labels'],as_index=False).max().reset_index(drop=True)
            cluster_mm_points = cluster_min_points.merge(cluster_max_points,on='labels',suffixes=('_min', '_max'))
            cluster_lengths = cluster_mm_points.y_max - cluster_mm_points.y_min
            assumed_chars_in_each_cluster = np.round(cluster_lengths/char_avg_size,0)
            result_char_length_after_split = cluster_lengths / assumed_chars_in_each_cluster 
            result_char_confidense_rate = np.round((result_char_length_after_split / char_avg_size)*100,0)
            cluster_mm_points['assumed_chars_in_each_cluster'] = assumed_chars_in_each_cluster
            cluster_mm_points['result_char_confidense_rate'] = result_char_confidense_rate
            cluster_mm_points['accept_split'] = ['Yes' if cr in range(100-confidence_threshold,100+\
                                                                      confidence_threshold+1) \
                                                 else 'No' for cr in result_char_confidense_rate]
            ## Same positional column access as in the branch above
            cluster_mm_points['chars_final'] = cluster_mm_points.apply(lambda x: x[5] if x[7] == 'Yes' else 1,axis=1)

            max_label = np.max(cluster_mm_points.labels) + 5
            flag = 1
            new_clusters = dict()
            for i,j,k in cluster_mm_points.apply(lambda x:(x[2],x[4],x[8]), axis=1):
                if int(k) == 1:
                    new_clusters[flag] = {'range_l':i,'range_u':j,'new_label':max_label + flag}
                    flag += 1
                else:
                    f = int((j - i) / k) 
                    for l in range(int(k)):
                        new_clusters[flag] = {'range_l':i+f*l+1*(l>0*1),'range_u':i+f*(l+1),\
                                              'new_label':max_label + flag}
                        ## Unlike the branch above, the last band is stretched to
                        ## the cluster's true end so rounding leaves no pixels out
                        if l == k-1:
                            new_clusters[flag]['range_u'] = j
                        flag += 1

            ## NOTE(review): both loops use the same variable `i`; the inner loop
            ## does the full relabelling and the outer loop merely repeats it.
            ## The row selection indexes img_df with a mask built from img_df_t;
            ## the two share the same index because img_df_t is a copy of img_df.
            for i in range(len(new_clusters)):
                for i in range(len(new_clusters)):
                    img_df_t.loc[img_df[(img_df_t.y >= new_clusters[i+1]['range_l']) \
                                       & (img_df_t.y <= new_clusters[i+1]['range_u'])].index,'labels'] \
                                                                = int(new_clusters[i+1]['new_label'])
            if(len(np.unique(img_df_t.labels)) == len(char_labels)):
                #print('level04')
                return img_df_t
    #print('level0F')
    ## No stage produced the expected number of clusters
    return pd.DataFrame()

### sort_clusters function sorts the character cluster in the order written in the image

- This function uses the start position of each cluster along the writing direction (the minimum of the `y` column, i.e. the second array index of the image) to sort the clusters

In [6]:
def sort_clusters(img_df):
    """Renumber cluster labels 0, 1, 2, ... by where each cluster starts along ``y``.

    The cluster with the smallest minimum ``y`` gets label 0, the next one
    label 1, and so on. The ``labels`` column of ``img_df`` is overwritten
    in place and the same dataframe object is returned.

    Parameters
    ----------
    img_df : pandas.DataFrame
        Columns ``x``, ``y`` and ``labels``.

    Returns
    -------
    pandas.DataFrame
        ``img_df`` with renumbered labels, or an empty ``pd.DataFrame()``
        when ``img_df`` has no rows.
    """
    if img_df.shape[0] == 0:
        return pd.DataFrame()

    ## Labels ordered by the first y position of their cluster
    ordered_labels = img_df.groupby(['labels'],as_index=False).min() \
                           .sort_values(by='y')['labels'].values

    ## Map every old label straight to its rank. A single lookup per row
    ## avoids rebuilding the whole label list once per cluster, and old and
    ## new label values can never be confused with each other.
    rank_of_label = {label: rank for rank, label in enumerate(ordered_labels)}

    ##update the labels after re-arrange process
    img_df['labels'] = img_df['labels'].map(rank_of_label)

    ## return the dataframe with final clusters
    return img_df

### To keep the length and width of each character consistent the character clusters are standardized

- The length and width of each resulting character block are scaled to the range 1-100; every block is stored in a 101X101 array whose row 0 and column 0 stay empty

In [7]:
def stadardize_clusters(img_char_clusters):
    """Rasterise every cluster onto its own fixed-size 101 x 101 grid.

    Each cluster's ``x`` and ``y`` values are min-max scaled to the range
    1..100, rounded, and the matching grid cells are set to 1.

    Parameters
    ----------
    img_char_clusters : pandas.DataFrame
        Columns ``x``, ``y`` and ``labels``.

    Returns
    -------
    numpy.ndarray
        Array of shape (number of clusters, 101, 101), with clusters in
        ascending label order; ``np.zeros(1)`` when the input has no rows.
    """
    if img_char_clusters.shape[0] == 0:
        return np.zeros(1)

    ## np.unique already returns the labels in ascending order
    ordered_labels = np.unique(img_char_clusters.labels)
    Std_Clusters = np.zeros([len(ordered_labels),101,101])

    for char_pos, label in enumerate(ordered_labels):
        belongs = img_char_clusters.labels == label
        points = img_char_clusters.loc[belongs, ['x','y']].reset_index(drop=True)

        ## Stretch the cluster so that both axes span 1..100
        scaled = np.round(MinMaxScaler((1,100)).fit_transform(points),0).astype(int)

        ## Switch on every grid cell hit by a scaled point
        Std_Clusters[char_pos, scaled[:,0], scaled[:,1]] = 1

    return Std_Clusters

### Splitting the label words into label characters.

- All the characters of the Telugu language are split into different groups
- Vowels and Consonants can exist on their own.
- If Vowel extensions are followed by a consonant, both the characters are combined to make a new character in UNICODE set
- Vowel extensions can only follow a consonant, while the characters in the group extentions_o can be preceded by both vowels and consonants
- half_extension character is very key one which is used widely that takes two characters on either side to make a brand new character
- Other half characters are also same as vowel extensions though less used in normal language.
- The characters in the group other_full_chars are similar to consonants though these rare usage characters
- Finally the characters in the group numbers are just the numbers 0-9 respectively

In [8]:
def get_char_labels(word_label):
    """Split a Telugu word into the written characters it is made of.

    Combining signs are glued to the character before them, and a
    consonant that directly follows a halant is glued to that character
    as well, so a conjunct ends up as a single label.

    Parameters
    ----------
    word_label : str
        The word to split. Characters that belong to none of the groups
        below (other than '-') are ignored.

    Returns
    -------
    list of str
        One entry per written character, in reading order.
    """
    vowels = {'అ','ఆ','ఇ','ఈ','ఉ','ఊ','ఋ','ౠ','ఌ','ౡ','ఎ','ఏ','ఐ','ఒ','ఓ','ఔ'}
    consonants = {'క','ఖ','గ','ఘ','ఙ','చ','ఛ','జ','ఝ','ఞ',
                  'ట','ఠ','డ','ఢ','ణ','త','థ','ద','ధ','న',
                  'ప','ఫ','బ','భ','మ','య','ర','ల','వ','స','ష','శ','హ','ళ','ఱ'}
    vowel_extensions = {'ా','ి','ీ','ు','ూ','ృ','ౄ','ె','ే','ై','ొ','ో','ౌ'}
    extentions_o = {'ం','ః'}
    half_extentions = {'్'}
    other_half_chars = {'ఁ','ౢ','ౣ','ౖ','ఀ'}
    other_full_chars = {'ౘ','ౙ','ఽ','ఴ','౸','౹','౺','౻','౼','౽','౾','౿'}
    numbers = {'౦','౧','౨','౩','౪','౫','౬','౭','౮','౯'}

    ## Characters that always start a label of their own
    standalone = vowels | extentions_o | other_full_chars | numbers | {'-'}
    ## Characters that attach to the label before them
    attaching = vowel_extensions | other_half_chars | half_extentions

    char_labels = []
    ## True only while the previous processed character was a halant that
    ## was attached to a label; the next consonant then joins that label
    awaiting_conjunct = False

    for c in word_label:
        if c in consonants:
            if awaiting_conjunct:
                char_labels[-1] += c
            else:
                char_labels.append(c)
            awaiting_conjunct = False
        elif c in attaching:
            if char_labels:
                char_labels[-1] += c
                awaiting_conjunct = c in half_extentions
            else:
                ## A combining sign with nothing before it is kept as its
                ## own label instead of failing on an empty list
                char_labels.append(c)
                awaiting_conjunct = False
        elif c in standalone:
            char_labels.append(c)
            ## Anything other than a consonant ends a pending conjunct, so
            ## a later consonant is not glued onto an unrelated character
            awaiting_conjunct = False

    return char_labels

In [19]:
## Source file which has all the names of the image dataset and the word labels
train_file_name = 'train.txt'
train_data = pd.read_csv(train_file_name,sep=' ',header=None)
train_data.columns=['image','label']

data_df = pd.DataFrame()
## Rows of the output chunk currently being filled. They are collected in a
## plain list and turned into a dataframe once per chunk, because growing a
## dataframe with one concat per row copies all earlier rows every time.
data_rows = []
## This list is used later to analyze the issues faced with files so that they can be analyzed
issue_files = []

cntr = 1
for i in range(train_data.shape[0]):
    ## Get the image object; the file handle is released as soon as the
    ## cleaned copy has been produced
    with Image.open(train_data.image[i]) as img:
        cleaned_img = clean_image(img,10)
    if np.array(cleaned_img).sum() == 0:
        issue_files.append(train_data.image[i] + ',' + train_data.label[i] + ',EmptyFile')
        continue
    ## Following if the code perform different steps of the EDA
    char_labels = get_char_labels(train_data.label[i])
    char_clusters_v1 = find_clusters(cleaned_img,eps=2,min_samples=3)
    char_clusters_v2 = remove_clusters(char_clusters_v1, small_cluster_threshold=10)
    char_clusters_v3 = merge_clusters(char_clusters_v2, char_labels)
    char_clusters_v4 = sort_clusters(char_clusters_v3)
    std_char_clusters = stadardize_clusters(char_clusters_v4)

    ## Check if the process is successful to get correct number of character cluster along with their labels
    if char_clusters_v3.shape[0] > 0:
        for r in range(len(char_labels)):
            ## image name, character position, 101*101 flattened pixels, character label
            row = [train_data.image[i],r] + std_char_clusters[r].reshape(1,10201).tolist()[0] + [char_labels[r]]
            data_rows.append(row)
    else:
        issue_files.append(train_data.image[i]+','+train_data.label[i])

    ## Write a chunk to disk once it holds more than 500 rows
    if len(data_rows)/500 > 1:
        file_name = 'data_df_' + str(cntr) + '.csv'
        data_df = pd.DataFrame(data_rows)
        data_df.to_csv(file_name,index=False)
        data_df = pd.DataFrame()
        data_rows = []
        cntr += 1

## Write whatever is left over after the last full chunk
file_name = 'data_df_' + str(cntr) + '.csv'
data_df = pd.DataFrame(data_rows)
data_df.to_csv(file_name,index=False)

pd.DataFrame({'Issue_File,Issue_Label':issue_files}).to_csv('Issues_Files.csv')