### Download the dataset

In [1]:
!wget -O "/home/ec2-user/word_level_ocr/pritom/datasets/handwriting/BN-HTRd.zip" \
      https://md-datasets-cache-zipfiles-prod.s3.eu-west-1.amazonaws.com/743k6dm543-1.zip

--2022-01-09 19:45:40--  https://md-datasets-cache-zipfiles-prod.s3.eu-west-1.amazonaws.com/743k6dm543-1.zip
Resolving md-datasets-cache-zipfiles-prod.s3.eu-west-1.amazonaws.com (md-datasets-cache-zipfiles-prod.s3.eu-west-1.amazonaws.com)... 52.218.97.155
Connecting to md-datasets-cache-zipfiles-prod.s3.eu-west-1.amazonaws.com (md-datasets-cache-zipfiles-prod.s3.eu-west-1.amazonaws.com)|52.218.97.155|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1485923301 (1.4G) [application/octet-stream]
Saving to: ‘/home/ec2-user/word_level_ocr/pritom/datasets/handwriting/BN-HTRd.zip’


2022-01-09 19:46:43 (22.9 MB/s) - ‘/home/ec2-user/word_level_ocr/pritom/datasets/handwriting/BN-HTRd.zip’ saved [1485923301/1485923301]



### Unzip the Dataset

In [2]:
!unzip -qq "/home/ec2-user/word_level_ocr/pritom/datasets/handwriting/BN-HTRd.zip" \
       -d "/home/ec2-user/word_level_ocr/pritom/datasets/handwriting/BN-HTRd/"
!unzip -qq "/home/ec2-user/word_level_ocr/pritom/datasets/handwriting/BN-HTRd/Dataset.zip" \
       -d "/home/ec2-user/word_level_ocr/pritom/datasets/handwriting/BN-HTRd/"

### Import Dependencies

In [61]:
import os
import glob
import re
from tqdm import tqdm
import pandas as pd
import pylightxl as xl
from PIL import Image

DATA_DIR = "/home/ec2-user/word_level_ocr/pritom/datasets/handwriting/BN-HTRd"

### Look for any label and image number mismatch

In [62]:
print("Folder | Images | Labels")

subdirs = os.listdir(os.path.join(DATA_DIR, "Dataset"))
                     
for direc in subdirs:
    img_files = glob.glob(os.path.join(DATA_DIR, "Dataset", direc, "Words/*/*.[jJ|pP][pP|nN][gG]"))
    annot_file = glob.glob(os.path.join(DATA_DIR, "Dataset", direc, "*.xlsx"))
    
    try:
        labels_df = pd.read_excel(annot_file[0])
    except ValueError:
        db = xl.readxl(annot_file[0])
        xl.writexl(db=db, fn="temp.xlsx")
        labels_df = pd.read_excel("temp.xlsx")
        os.remove("temp.xlsx")
    
    if len(img_files) != len(labels_df):
        print("Writer", direc, ":", len(img_files), len(labels_df))

Folder | Images | Labels
Writer 101 : 916 919


  warn("""Cannot parse header or footer so it will be ignored""")


Writer 114 : 387 386
Writer 17 : 793 0
Writer 30 : 1357 1355


### Create a combined label file that holds all the labels

In [91]:
label_files = glob.glob(os.path.join(DATA_DIR, "Dataset/**/*.xlsx"), recursive=True)
print("Number of files: {}".format(len(label_files)))

all_labels_df = pd.DataFrame()

for label_file in tqdm(label_files):
    # print(label_file)
    try:
        labels_df = pd.read_excel(label_file)
    except ValueError:
        """
        If the file is unreadable due to formatting issues, 
        clear formatting by reading and re-writing the file using pylightxl
        """
        db = xl.readxl(label_file)
        xl.writexl(db=db, fn="temp.xlsx")
        labels_df = pd.read_excel("temp.xlsx")
        os.remove("temp.xlsx")
            
    if labels_df.empty:
        # If the first sheet is empty, look for a sheet named "Sheet1"
        labels_df = pd.read_excel(label_file, sheet_name="Sheet1")
        
    #print(len(labels_df))
       
    headers = list(labels_df.columns.values)
    correct_headers = ["Path", "Word"] # rename Id to Path
    if headers != correct_headers:
        # key = old name
        # value = new name
        rename_dict = dict(zip(headers, correct_headers))
        labels_df.rename(columns=rename_dict, inplace=True)
        
    labels_df = labels_df[labels_df['Path'].notna()] # removes nan values
     
    """
    Manual dataset corrections:
    Add missing and edit duplicate labels
    """
    curr_dir = label_file.rsplit('/', 1)[-1].split('.')[0]
    if curr_dir=="30":
        # Add missing labels
        labels_df.loc[len(labels_df.index)] = ['30_6_2_3 (1)', 'নির্ধারকরা'] 
        labels_df.loc[len(labels_df.index)] = ['30_6_2_4 (1)', 'বোঝান'] 
        # Edit duplicate label
        labels_df.loc[905] = ['30_6_6_7', 'এশিয়ায়'] 
        
    elif curr_dir=="114":
        # Edit duplicate label
        labels_df.loc[905] = ['30_6_6_7', 'দেখা']
        
    for idx, row in labels_df.iterrows():
#         last_dir = row['Path'].rsplit('_', 2)[0]
#         labels_df.loc[idx, 'Path'] = last_dir.split('_', 1)[0] + "/Word/" + last_dir + "/" + row['Path']
#         l = re.split(r"\D_\D_\D_\D", '1_2_3_4f_1')
#         print(l)
#         labels_df.loc[idx, 'Path'] = row['Path'].rsplit('_', 2)[0]
        row['Path'] = row['Path'].strip()
    
    all_labels_df = all_labels_df.append(labels_df, ignore_index=True)

all_labels_df.index.name = 'id'

all_labels_df

Number of files: 150


  warn("""Cannot parse header or footer so it will be ignored""")
100%|█████████████████████████████████████████████████████████████| 150/150 [00:11<00:00, 12.89it/s]


Unnamed: 0_level_0,Path,Word
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1_1_1_1,কথা
1,1_1_1_2,প্রকাশ
2,1_1_2_1,বৈচিত্র্যময়
3,1_1_2_2,এই
4,1_1_2_3,পৃথিবীর
...,...,...
108142,99_15_10_2,বিবিসির
108143,99_15_10_3,দৃষ্টিভঙ্গি
108144,99_15_10_4,সম্বন্ধে
108145,99_15_10_5,পড়ুন


### Remove labels with missing images

In [92]:
image_paths = glob.glob(os.path.join(DATA_DIR, "Dataset", "*/Words/*/*.[jJ|pP][pP|nN][gG]"))
print(len(image_paths))
name_to_path_dict = {image_path.rsplit('/', 1)[-1].split('.', 1)[0].strip():image_path for image_path in image_paths}

print(len(name_to_path_dict))

missing_images = []

for idx, row in tqdm(all_labels_df.iterrows(), total=len(all_labels_df)):
    if row['Path'] not in name_to_path_dict.keys():
        all_labels_df.drop(idx, inplace=True)
        

all_labels_df

108147
108147


100%|████████████████████████████████████████████████████| 108147/108147 [00:04<00:00, 24112.13it/s]


Unnamed: 0_level_0,Path,Word
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1_1_1_1,কথা
1,1_1_1_2,প্রকাশ
2,1_1_2_1,বৈচিত্র্যময়
3,1_1_2_2,এই
4,1_1_2_3,পৃথিবীর
...,...,...
108142,99_15_10_2,বিবিসির
108143,99_15_10_3,দৃষ্টিভঙ্গি
108144,99_15_10_4,সম্বন্ধে
108145,99_15_10_5,পড়ুন


### Translate Numbers

In BN-HTRd bengali numbers are labeled in English. That's why this translation stage is done.

In [93]:
def translate_number (word):
    
    translated = []
    
    for i, c in enumerate(str(word)):
        if c == '0': translated.append('০')
        elif c == '1': translated.append('১') 
        elif c == '2': translated.append('২') 
        elif c == '3': translated.append('৩') 
        elif c == '4': translated.append('৪') 
        elif c == '5': translated.append('৫') 
        elif c == '6': translated.append('৬') 
        elif c == '7': translated.append('৭') 
        elif c == '8': translated.append('৮') 
        elif c == '9': translated.append('৯')
        else: translated.append(c)
    
    return ''.join(translated)


pattern = re.compile(r'[0-9]+')

for idx, row in all_labels_df.iterrows():
    # print(row['Word'])

    if pattern.match(str(row['Word'])) != None:
        
        # print("Word: {} Index: {}".format(row['Word'], idx))
        
        # Visualize images
        # img_path = name_to_path_dict[row['Path']]
        # image = Image.open(img_path)
        # display(image)
        
        row['Word'] = translate_number(row['Word'])

all_labels_df

Unnamed: 0_level_0,Path,Word
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1_1_1_1,কথা
1,1_1_1_2,প্রকাশ
2,1_1_2_1,বৈচিত্র্যময়
3,1_1_2_2,এই
4,1_1_2_3,পৃথিবীর
...,...,...
108142,99_15_10_2,বিবিসির
108143,99_15_10_3,দৃষ্টিভঙ্গি
108144,99_15_10_4,সম্বন্ধে
108145,99_15_10_5,পড়ুন


In [94]:
all_labels_df.to_csv(os.path.join(DATA_DIR, "BN-HTRd_all.csv"))

### Create Train, Valid and Test Split

In [95]:
all_labels = pd.read_csv(os.path.join(DATA_DIR, "BN-HTRd_all.csv"))
all_labels = all_labels.sample(frac=1)

In [96]:
train_split = int(0.80 * len(all_labels))
train_labels = all_labels.iloc[:train_split]
print(len(train_labels))

valid_split = train_split + int(0.10 * len(all_labels))
valid_labels = all_labels.iloc[train_split:valid_split]
print(len(valid_labels))

test_labels = all_labels.iloc[valid_split:]
print(len(test_labels))

86448
10806
10807


In [97]:
train_labels.to_csv(os.path.join(DATA_DIR, "BN-HTRd_train.csv"), index=False)
valid_labels.to_csv(os.path.join(DATA_DIR, "BN-HTRd_valid.csv"), index=False)
test_labels.to_csv(os.path.join(DATA_DIR, "BN-HTRd_test.csv"), index=False)