In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt

from pathlib import Path
from collections import Counter

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import pandas as pd
import cv2
from keras.preprocessing.sequence import pad_sequences
import keras.backend as K
import tensorflow_datasets as tfds
from keras.layers.advanced_activations import LeakyReLU
from sklearn.metrics import confusion_matrix

In [None]:
# Mount Google Drive at /content/drive so that the spreadsheet and image paths
# under /content/drive/MyDrive/ used by the following cells can be read.
from google.colab import drive
drive.mount('/content/drive')

In [2]:
# config values
# Every image is resized to img_width x img_height pixels with cv2.resize, and
# the models declare their "image" input as (img_height, img_width, 1).
img_width = 1280
img_height = 128
# Mini-batch size handed to model.fit().
batch_size = 32

In [None]:
# --- Spreadsheet locations on the mounted Drive ------------------------------
data_dir = "/content/drive/MyDrive/mongol_bichig/dict_data.xlsx"
lyrics_dir0 = "/content/drive/MyDrive/mongol_bichig/lyrics_data_0-4999.xlsx"
lyrics_dir1 = "/content/drive/MyDrive/mongol_bichig/lyrics_data_5000-19999.xlsx"
lyrics_dir2 = "/content/drive/MyDrive/mongol_bichig/lyrics_data_20000-29999.xlsx"
lyrics_dir3 = "/content/drive/MyDrive/mongol_bichig/lyrics_data_30000-39999.xlsx"
lyrics_dir4 = "/content/drive/MyDrive/mongol_bichig/lyrics_data_40000-49999.xlsx"

# Read every workbook without a header row, in the same order as listed above.
data, lyrics0, lyrics1, lyrics2, lyrics3, lyrics4 = (
    pd.read_excel(xlsx_path, header=None)
    for xlsx_path in (
        data_dir, lyrics_dir0, lyrics_dir1, lyrics_dir2, lyrics_dir3, lyrics_dir4,
    )
)

# Split sizes (number of samples). These small values are the ones in effect;
# the notebook also carries a larger 30720 / 10240 / 10240 split that these
# values override.
train_size = 800
val_size = 32
test_size = 32

# Column 1 of every sheet is used as the label, column 0 as the numeric sample
# id, concatenated in sheet order.
_sheets = (data, lyrics0, lyrics1, lyrics2, lyrics3, lyrics4)
all_labels = np.concatenate([sheet[1].to_numpy() for sheet in _sheets])
all_data = np.concatenate([sheet[0].to_numpy() for sheet in _sheets])
# Longest label; shorter labels are right-padded to this length later on.
max_length = max(len(label) for label in all_labels)


def _image_paths(sample_ids):
    """Return an array with the PNG path built from each sample id."""
    # NOTE(review): ids coming from the lyrics sheets are also mapped to
    # "dict-<id>.png" file names - confirm this matches the image folder.
    return np.array(
        [f'/content/drive/MyDrive/mongol_bichig/images/dict-{int(i)}.png' for i in sample_ids]
    )


# Consecutive, non-overlapping slices: [train | val | test].
_val_end = train_size + val_size
_test_end = train_size + val_size + test_size

np_label = all_labels[:train_size]
np_data = _image_paths(all_data[:train_size])

np_label_val = all_labels[train_size:_val_end]
np_data_val = _image_paths(all_data[train_size:_val_end])

np_label_test = all_labels[_val_end:_test_end]
np_data_test = _image_paths(all_data[_val_end:_test_end])

# Alphabet of the recogniser: the position of a character in this list is its
# class index. The index len(scripts) is reserved for the '/' padding symbol.
scripts = ['ᠰ', 'ᠡ', 'ᠭ', 'ᠥ', 'ᠳ', 'ᠯ', 'ᠵ', 'ᠬ', ' ', 'ᠠ', 'ᠤ', 'ᠴ', 'ᠪ', 'ᠷ', 'ᠦ', 'ᠢ', 'ᠮ', 'ᠨ', 'ᠱ', 'ᠣ', 'ᠲ', 'ᠶ', 'ᠺ', 'ᠹ', 'ᠧ', 'ᠫ', 'ᠩ', 'ᠼ', 'ᠸ', 'ᠻ', 'ᠾ', '᠍', 'ᠽ', '᠀', '᠁', '\u180e', '\u202f', '᠋', '᠌']
def encode_to_labels(txt):
    """Encode a label string as a list of integer class indices.

    Args:
        txt: iterable of single characters (normally a str), usually already
            right-padded with '/'.

    Returns:
        A list of ints. Characters found in ``scripts`` map to their position
        in that list, '/' maps to ``len(scripts)``, and any other character is
        skipped, so the result can be shorter than ``txt``.
    """
    # Build the lookup from the current contents of `scripts`; setdefault keeps
    # the first position of a repeated entry, exactly like list.index().
    index_of = {}
    for position, symbol in enumerate(scripts):
        index_of.setdefault(symbol, position)
    pad_index = len(scripts)

    dig_lst = []
    for char in txt:
        if char in index_of:
            dig_lst.append(index_of[char])
        elif char == '/':
            dig_lst.append(pad_index)
        # Characters outside the alphabet are intentionally dropped.

    return dig_lst


def _load_split(labels, paths, split_name):
    """Load the images of one split and encode the matching labels.

    Each image is read as grayscale, transposed, and resized to
    (img_width, img_height). The transpose is applied to every split so that
    validation and test images have the same orientation as the training
    images. The label is right-padded with '/' to max_length and encoded with
    encode_to_labels().

    A sample whose image cannot be read or processed, or whose label is not a
    string, is skipped as a whole, so the two returned lists always stay
    aligned. The number of skipped samples is reported instead of being
    silently ignored.

    Args:
        labels: sequence of label strings.
        paths: sequence of image file paths, parallel to `labels`.
        split_name: short name used in the skip report.

    Returns:
        (images, encoded_labels): two lists of equal length.
    """
    images = []
    encoded_labels = []
    skipped = 0
    for text, path in zip(labels, paths):
        if not isinstance(text, str):
            skipped += 1
            continue
        img = cv2.imread(str(path), 0)  # flag 0: load as single-channel grayscale
        if img is None:  # cv2.imread returns None for a missing/unreadable file
            skipped += 1
            continue
        try:
            img = cv2.transpose(img)
            resized = cv2.resize(img, (img_width, img_height))
        except cv2.error:
            skipped += 1
            continue
        # Append image and label together, only once both are available.
        images.append(resized)
        encoded_labels.append(encode_to_labels(text.ljust(max_length, '/')))
    if skipped:
        print(f"[{split_name}] skipped {skipped} of {len(labels)} samples "
              "(unreadable image or non-text label)")
    return images, encoded_labels


ocr_train_data, ocr_train_label = _load_split(np_label, np_data, "train")
ocr_train_data_val, ocr_train_label_val = _load_split(np_label_val, np_data_val, "val")
ocr_train_data_test, ocr_train_label_test = _load_split(np_label_test, np_data_test, "test")

In [None]:
# convert the data and labels into numpy arrays

def _to_model_arrays(images, labels):
    """Turn the per-sample lists of one split into dense numpy arrays.

    Args:
        images: list of equally sized 2-D uint8 image arrays.
        labels: list of equally long lists of class indices.

    Returns:
        (image_array, label_array): the images as float32 scaled by 1/255,
        and the labels as a float array.
    """
    # float32 halves the memory of the default float64 result of `/ 255`.
    image_array = np.asarray(images, dtype=np.float32) / 255.0
    label_array = np.asarray(labels, dtype=float)
    return image_array, label_array


# NOTE: each line overwrites its inputs, so this cell must be run exactly once
# after the loading cell; running it again would rescale the images a second
# time.
ocr_train_data, ocr_train_label = _to_model_arrays(ocr_train_data, ocr_train_label)
ocr_train_data_val, ocr_train_label_val = _to_model_arrays(ocr_train_data_val, ocr_train_label_val)
ocr_train_data_test, ocr_train_label_test = _to_model_arrays(ocr_train_data_test, ocr_train_label_test)

In [None]:
# visualize the data

# Take the first training sample, undo the 1/255 scaling to get 8-bit gray
# values again, and swap its two axes before drawing it.
first_sample = ocr_train_data[0]
img = (first_sample * 255).astype(np.uint8).T

_, ax = plt.subplots(figsize=(15, 5))
ax.imshow(img, cmap="gray")
ax.axis("off")
plt.show()

# Custom Accuracy Metric

In [5]:
# Custom accuracy metric: a position-wise, character-level accuracy computed on
# the greedy CTC decoding of the prediction (instead of an exact-match accuracy
# over whole label sequences).

# update_state() decodes a batch and accumulates the two counters
# result() returns the accuracy score as a fraction/decimal
# reset_state() resets the counters between epochs.
class SequenceAccuracy(keras.metrics.Metric):
  """Position-wise character accuracy of greedily decoded CTC predictions.

  Both sequences are brought to a common width, using ``len(scripts)`` (the
  padding / CTC blank index) as filler. A position is scored unless it is
  filler in both the label and the prediction; it counts as correct when the
  two values are equal.
  """

  def __init__(self, name='sequence_accuracy', **kwargs):
    super().__init__(name=name, **kwargs)
    # Number of scored positions, and how many of them matched.
    self.total = self.add_weight(name='total', initializer='zeros')
    self.count = self.add_weight(name='count', initializer='zeros')

  def update_state(self, y_true, y_pred, sample_weight=None):
    """Accumulate the counters for one batch.

    Args:
      y_true: (batch, label_width) class indices, padded with len(scripts).
      y_pred: per-step class scores; expected as (batch, time steps, classes),
        which is the layout the transpose below converts to time-major.
      sample_weight: accepted for API compatibility and ignored.
    """
    pad_value = tf.constant(len(scripts), dtype=tf.float32)

    y_true = tf.cast(y_true, tf.float32)
    true_shape = tf.shape(y_true)
    pred_shape = tf.shape(y_pred)
    # Take the batch size from the tensor itself so that a smaller final batch
    # (or a different evaluation batch size) works.
    n_samples = pred_shape[0]
    # Axis 1 is the time axis; axis 2 is the number of classes and must not be
    # used as the sequence length.
    n_steps = pred_shape[1]
    max_width = tf.math.maximum(true_shape[1], n_steps)

    decoded, _ = tf.nn.ctc_greedy_decoder(
        inputs=tf.transpose(y_pred, perm=[1, 0, 2]),
        sequence_length=tf.fill(tf.stack([n_samples]), n_steps))

    # Widen y_true with explicit filler columns. A dense -> sparse -> dense
    # round trip is not usable here: tf.sparse.from_dense drops zeros, and
    # class index 0 is a real character.
    filler = tf.fill(tf.stack([true_shape[0], max_width - true_shape[1]]), pad_value)
    y_true = tf.concat([y_true, filler], axis=1)

    # Densify the decoded sequences to the same width, filling with the pad index.
    dense_shape = tf.cast(tf.stack([n_samples, max_width]), tf.int64)
    decoded_sparse = tf.sparse.reset_shape(decoded[0], dense_shape)
    y_pred = tf.sparse.to_dense(decoded_sparse, default_value=len(scripts))
    y_pred = tf.cast(y_pred, tf.float32)

    # Vectorised scoring (replaces a Python loop over every element).
    both_filler = tf.logical_and(tf.equal(y_true, pad_value), tf.equal(y_pred, pad_value))
    scored = tf.logical_not(both_filler)
    matched = tf.logical_and(scored, tf.equal(y_true, y_pred))

    self.total.assign_add(tf.reduce_sum(tf.cast(scored, tf.float32)))
    self.count.assign_add(tf.reduce_sum(tf.cast(matched, tf.float32)))

  def result(self):
    """Return matched / scored positions (0 when nothing has been scored)."""
    return tf.math.divide_no_nan(self.count, self.total)

  def reset_state(self):
    """Zero both counters."""
    self.count.assign(0.0)
    self.total.assign(0.0)

# CTC Loss Function

In [15]:
class CTCLayer(layers.Layer):
    """Pass-through layer that attaches the CTC loss to the model.

    The layer is called with the labels and the per-step class probabilities,
    registers ``keras.backend.ctc_batch_cost`` through ``add_loss`` and
    returns the probabilities unchanged.
    """

    def __init__(self, name=None, **kwargs):
        # Forward the remaining standard Layer arguments (trainable, dtype, ...)
        # so the layer can be rebuilt from its config.
        super().__init__(name=name, **kwargs)
        self.loss_fn = keras.backend.ctc_batch_cost

    def call(self, y_true, y_pred):
        """Add the CTC loss for this batch and return `y_pred`.

        Args:
            y_true: label tensor; its axis 1 is used as the label length.
            y_pred: prediction tensor; its axis 1 is used as the input length.
        """
        # Compute the training-time loss value and add it
        # to the layer using `self.add_loss()`.
        batch_len = tf.cast(tf.shape(y_true)[0], dtype="int64")
        input_length = tf.cast(tf.shape(y_pred)[1], dtype="int64")
        label_length = tf.cast(tf.shape(y_true)[1], dtype="int64")

        # ctc_batch_cost takes one length per sample, shaped (batch, 1); every
        # sample is given the full width of its tensor.
        input_length = input_length * tf.ones(shape=(batch_len, 1), dtype="int64")
        label_length = label_length * tf.ones(shape=(batch_len, 1), dtype="int64")

        loss = self.loss_fn(y_true, y_pred, input_length, label_length)
        self.add_loss(loss)

        # just return the computed predictions
        return y_pred

# Model 1

In [None]:
def build_model():
    """Build and compile OCR model v1: 2 conv blocks, dense, 2 BiLSTMs, CTC.

    The model takes two inputs, "image" (img_height, img_width, 1) and "label",
    and outputs the per-step softmax over len(scripts) + 1 classes. No loss is
    given to compile(); the CTC loss is added by the CTCLayer.

    Returns:
        The compiled keras Model named "ocr_model_v1".
    """
    # Inputs to the model
    input_img = layers.Input(
        shape=(img_height, img_width, 1), name="image",
    )
    labels = layers.Input(name="label", shape=(None,), dtype="float32")

    # First conv block
    x = layers.Conv2D(
        64,
        (2, 2),
        activation="relu",
        kernel_initializer="he_normal",
        padding="same",
        name="Conv1",
    )(input_img)
    x = layers.MaxPooling2D((2, 2), strides=2, name="pool1")(x)

    # Second conv block
    x = layers.Conv2D(
        64,
        (2, 2),
        activation="relu",
        kernel_initializer="he_normal",
        padding="same",
        name="Conv2",
    )(x)
    x = layers.MaxPooling2D((2, 2), strides=2, padding="same", name="pool2")(x)

    # The two poolings leave a (img_height // 4, img_width // 4, 64) map.
    # Reshape does not reorder data, so the width axis is moved to the front
    # first; each RNN time step then holds one image column (height x channels)
    # instead of a fragment of an image row.
    x = layers.Permute((2, 1, 3), name="permute")(x)
    new_shape = ((img_width // 4), (img_height // 4) * 64)
    x = layers.Reshape(target_shape=new_shape, name="reshape")(x)
    x = layers.Dense(64, activation="relu", name="dense1")(x)
    x = layers.Dropout(0.2)(x)

    # RNNs
    x = layers.Bidirectional(layers.LSTM(128, return_sequences=True, dropout=0.25))(x)
    x = layers.Bidirectional(layers.LSTM(64, return_sequences=True, dropout=0.25))(x)

    # Output layer: one class per character plus one extra class.
    x = layers.Dense(
        len(scripts) + 1, activation="softmax", name="dense2"
    )(x)

    # Add CTC layer for calculating CTC loss at each step
    output = CTCLayer(name="ctc_loss")(labels, x)

    # Define the model
    model = keras.models.Model(
        inputs=[input_img, labels], outputs=output, name="ocr_model_v1"
    )

    # Optimizer
    opt = keras.optimizers.Adam()
    # Compile the model and return
    model.compile(optimizer=opt, metrics=[SequenceAccuracy()])
    return model

# Get the model
model = build_model()
model.summary()

# Model 2

In [None]:
def build_model():
    """Build and compile OCR model v2: 5 conv layers, 2 BiLSTMs, CTC.

    The model takes two inputs, "image" (img_height, img_width, 1) and "label",
    and outputs the per-step softmax over len(scripts) + 1 classes. No loss is
    given to compile(); the CTC loss is added by the CTCLayer.

    Returns:
        The compiled keras Model named "ocr_model_v2".
    """
    # Inputs to the model
    input_img = layers.Input(
        shape=(img_height, img_width, 1), name="image",
    )
    labels = layers.Input(name="label", shape=(None,), dtype="float32")

    x = layers.Conv2D(64, (3,3), activation = 'relu', padding='same', name="Conv1")(input_img)
    x = layers.MaxPool2D(pool_size=(2, 2), name="pool1")(x)

    x = layers.Conv2D(128, (3,3), activation = 'relu', padding='same', name="Conv2")(x)
    x = layers.MaxPool2D(pool_size=(2, 2), name="pool2")(x)

    x = layers.Conv2D(256, (3,3), activation = 'relu', padding='same', name="Conv3")(x)

    x = layers.Conv2D(256, (3,3), activation = 'relu', padding='same', name="Conv4")(x)
    # Halves the height only.
    x = layers.MaxPool2D(pool_size=(2, 1), name="pool3")(x)

    x = layers.Conv2D(512, (3,3), activation = 'relu', padding='same', name="Conv5")(x)
    x = layers.BatchNormalization(name="norm1")(x)

    # The feature map is (img_height // 8, img_width // 4, 512). Reshape does
    # not reorder data, so the width axis is moved to the front first; each RNN
    # time step then holds one image column (height x channels). The resulting
    # sequence shape is the same as before: (img_width // 4, 8192).
    x = layers.Permute((2, 1, 3), name="permute")(x)
    new_shape = ((img_width // 4), (img_height // 8) * 512)
    x = layers.Reshape(target_shape=new_shape)(x)

    x = layers.Bidirectional(layers.LSTM(128, return_sequences=True, dropout=0.2), name="bidirec1")(x)
    x = layers.Bidirectional(layers.LSTM(64, return_sequences=True, dropout=0.2), name="bidirec2")(x)

    # Output layer: one class per character plus one extra class.
    x = layers.Dense(
        len(scripts) + 1, activation="softmax", name="dense2"
    )(x)

    # Add CTC layer for calculating CTC loss at each step
    output = CTCLayer(name="ctc_loss")(labels, x)

    # Define the model
    model = keras.models.Model(
        inputs=[input_img, labels], outputs=output, name="ocr_model_v2"
    )
    # Optimizer
    opt = keras.optimizers.Adam()
    # Compile the model and return
    model.compile(optimizer=opt, metrics=[SequenceAccuracy()])
    return model


# Get the model
model = build_model()
model.summary()

# Model 3

In [None]:
from sklearn import metrics
def build_model():
    """Build and compile OCR model v3: 7 conv layers, squeeze, 2 BiLSTMs, CTC.

    The model takes two inputs, "image" (img_height, img_width, 1) and "label",
    and outputs the per-step softmax over len(scripts) + 1 classes. No loss is
    given to compile(); the CTC loss is added by the CTCLayer.

    Returns:
        The compiled keras Model named "ocr_model_v3".
    """
    def conv3x3(n_filters, layer_name):
        # 3x3 ReLU convolution that keeps the spatial size.
        return layers.Conv2D(n_filters, (3,3), activation="relu", padding='same', name=layer_name)

    image_in = layers.Input(
        shape=(img_height, img_width, 1), name="image",
    )
    label_in = layers.Input(name="label", shape=(None,), dtype="float32")

    # Convolutional stack. With the configured 128 x 1280 input the poolings
    # shrink the height 128 -> 64 -> 21 -> 7 -> 2.
    x = conv3x3(32, "conv1")(image_in)
    x = layers.MaxPool2D(pool_size=(2, 1), name="pool1")(x)

    x = conv3x3(64, "conv2")(x)
    x = layers.MaxPool2D(pool_size=(3, 2), name="pool2")(x)

    x = conv3x3(128, "conv3")(x)
    x = layers.MaxPool2D(pool_size=(3, 2), name="pool3")(x)

    x = layers.Dropout(0.2, name="drop1")(x)
    x = conv3x3(128, "conv4")(x)
    x = layers.MaxPool2D(pool_size=(3, 2), name="pool4")(x)

    x = conv3x3(256, "conv5")(x)
    x = layers.BatchNormalization(name="batch1")(x)
    x = conv3x3(256, "conv6")(x)
    x = layers.BatchNormalization(name="batch2")(x)

    # Unpadded 2x2 convolution: brings the remaining height of 2 down to 1 ...
    x = layers.Conv2D(64, (2,2), activation="relu", name="conv7")(x)
    # ... so that axis 1 can be dropped, leaving a (width, channels) sequence.
    x = layers.Lambda(lambda t: K.squeeze(t, 1), name="sqeeze")(x)

    # Two stacked bidirectional LSTMs over the sequence.
    for lstm_name in ("lstm1", "lstm2"):
        x = layers.Bidirectional(
            layers.LSTM(128, return_sequences=True, dropout=0.2), name=lstm_name
        )(x)

    char_probs = layers.Dense(len(scripts) + 1, activation='softmax', name="dense")(x)
    output = CTCLayer(name="ctc_loss")(label_in, char_probs)

    # Assemble, compile and hand back the model.
    model = keras.models.Model(inputs=[image_in, label_in], outputs=output, name="ocr_model_v3")
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=0.001),
        metrics=[SequenceAccuracy()],
    )
    return model

model = build_model()
model.summary()

In [None]:
epochs = 3

# Train the model.
# The model has two inputs (image and label), so `x` is the pair
# [images, labels]; the labels are passed again as `y` for the metric.
# validation_data must follow the same (x, y) layout: a bare
# [images, labels] list would be unpacked as x=images, y=labels and leave the
# model with a single input during validation.
history = model.fit(
    [ocr_train_data, ocr_train_label],
    ocr_train_label,
    validation_data=(
        [ocr_train_data_val, ocr_train_label_val],
        ocr_train_label_val,
    ),
    batch_size=batch_size,
    epochs=epochs,
)

In [None]:
# Evaluate on the held-out test split with the same batch size as training
# (instead of relying on evaluate()'s own default).
# test_scores[0] is the loss, test_scores[1] the compiled accuracy metric.
test_scores = model.evaluate(
    [ocr_train_data_test, ocr_train_label_test],
    ocr_train_label_test,
    batch_size=batch_size,
    verbose=1,
)
print("Test loss:", test_scores[0])
print("Test accuracy:", test_scores[1])