In [20]:
import os, random, cv2
import numpy as np
from collections import defaultdict, Counter

import tensorflow as tf
from tensorflow.keras import layers

In [21]:
class DataLoader:
    
    def get_file_path(self, file_name):
        path = '/root/Data/IAM/Words'
        destination = file_name[0]
        path = os.path.join(path, destination)
        for i in range(1, len(file_name)):
            destination = destination + "-" + file_name[i]
            path = os.path.join(path, destination)

        path = path + '.png'
        return (path)


    def get_image_paths_labels(self):
        img_paths = []
        img_labels = []
        data_path = '/root/Data/IAM'

        words = os.path.join(data_path, 'words.txt')

        with open(words) as labels_file:
            for line in labels_file:
                if line[0] == "#":
                    continue
                else:
                    label = line.split(' ')[-1].strip('\n')
                    file_name = line.split(' ')[0].split('-')
                    file_name[2] = file_name[2] + '-' + file_name[3]
                    file_name.pop()

                    path = self.get_file_path(file_name)
                    img_paths.append(path)
                    img_labels.append(label)

        return (img_paths, img_labels)


    def make_train_test(self):

        img_paths, img_labels = self.get_image_paths_labels()

        print ('Total number of images {}'.format(len(img_paths)))
        word_count = defaultdict(int)

        for word in img_labels:
            word_count[word] += 1

        common_words_counts = Counter(word_count).most_common(20)

        common_words = list(list(zip(*common_words_counts))[0])

    #print (common_words)

        not_common_paths_labels = []

        for i, label in enumerate(img_labels):
            if label not in common_words:
                not_common_paths_labels.append((img_paths[i], label))

    #print (not_common_paths_labels[1:10])

        random.shuffle(not_common_paths_labels)

        train_len = int(0.7 * len(not_common_paths_labels))
        test_len = len(not_common_paths_labels) - train_len

        train_paths_labels = not_common_paths_labels[0:train_len]
        test_paths_labels = not_common_paths_labels[train_len:]

        print ('Length of training data is {} test data is {}'.format(len(train_paths_labels), len(test_paths_labels)))
        train_imgs_ts = list()
        train_imgs_bs = list()
        train_labels = list()
        train_n = list()

        count = 0
        width, height = 128, 32

        for path_label in train_paths_labels:
            count += 1
            if count == 100:
                break

            n = len(path_label[1])
            img = cv2.imread(path_label[0])
            if (img is not None) and (not np.isnan(img).any()):   #It is a valid image and the input does not contain empty values
                img = (cv2.cvtColor(img,cv2.COLOR_BGR2GRAY)) / 255.0
                img_ts = cv2.resize(img, (width, height)).astype(np.float32).reshape(1, 32, 128, 1) 
                train_imgs_ts.append(img_ts)

                img_bs = cv2.resize(img, (16*n, height)).astype(np.float32)
                shape = img_bs.shape
                img_bs = img_bs.reshape((1, shape[0], shape[1], 1))
                train_imgs_bs.append(img_bs)

                train_labels.append(path_label[1])
                train_n.append(n)


        train = {'imgs_ts':train_imgs_ts, 'imgs_bs':train_imgs_bs, 'labels':train_labels, 'n':train_n}

        test_imgs_ts = list()
        test_imgs_bs = list()
        test_labels = list()
        test_n = list()

        count = 0
        width, height = 128, 32

        for path_label in test_paths_labels:
            count += 1
            if count == 10:
                break

            n = len(path_label[1])
            img = cv2.imread(path_label[0])
            if img is not None:
                img = (cv2.cvtColor(img,cv2.COLOR_BGR2GRAY))/255.0

                img_ts = cv2.resize(img, (width, height)).astype(np.float32).reshape(1, 32, 128, 1)
                test_imgs_ts.append(img_ts)

                img_bs = cv2.resize(img, (16*n, height)).astype(np.float32)
                shape = img_bs.shape
                img_bs = img_bs.reshape((1, shape[0], shape[1], 1))
                test_imgs_bs.append(img_bs)

                test_labels.append(path_label[1])
                test_n.append(n)


        test = {'imgs_ts':test_imgs_ts, 'imgs_bs':test_imgs_bs, 'labels':test_labels, 'n':test_n}

        return (train, test)

In [22]:
class Models:
    
    def get_top_stream(self):
        input_shape = (32, 128, 1)
        model = tf.keras.Sequential()
        model.add(layers.Conv2D(filters=64, kernel_size=(3, 3), input_shape=input_shape, padding='same'))
        model.add(layers.MaxPooling2D(pool_size=(2, 2), padding='same'))
        model.add(layers.Conv2D(filters=128, kernel_size=(3, 3), padding='same'))
        model.add(layers.MaxPooling2D(pool_size=(2, 2), padding='same'))
        model.add(layers.Conv2D(filters=256, kernel_size=(3, 3), padding='same'))
        model.add(layers.MaxPooling2D(pool_size=(2, 2), padding='same'))
        model.add(layers.Flatten())
        model.add(layers.Dense(1024))
        return (model)


    def get_bottom_stream(self):

        model = None
        model = tf.keras.Sequential()
        input_shape = (32, None, 1)
        model.add(layers.Conv2D(filters=128, kernel_size=(3, 3), input_shape=input_shape, padding='same'))
        model.add(layers.Conv2D(filters=128, kernel_size=(3, 3),  padding='same'))
        model.add(layers.Conv2D(filters=128, kernel_size=(3, 3),  padding='same'))
        model.add(layers.MaxPooling2D(pool_size=(2, 2), padding='same'))

        model.add(layers.Conv2D(filters=256, kernel_size=(3, 3),  padding='same'))
        model.add(layers.Conv2D(filters=256, kernel_size=(3, 3),  padding='same'))
        model.add(layers.MaxPooling2D(pool_size=(2, 2), padding='same'))

        model.add(layers.Conv2D(filters=512, kernel_size=(3, 3),  padding='same'))
        model.add(layers.Conv2D(filters=512, kernel_size=(3, 3),  padding='same'))
        model.add(layers.MaxPooling2D(pool_size=(2, 2), padding='same'))
        padding = [[0, 0], [0, 0], [2, 2], [0, 0]]   
        model.add(layers.Conv2D(filters=1024, kernel_size=(4, 4), padding=padding))

        return (model)

    def get_middle_stream(self, Ns):
        model = None
        model = tf.keras.Sequential()
        model.add(layers.Dense(Ns, input_shape = (None, None, 1024), activation=tf.nn.relu))
        return (model)

    def get_character_error_rate(self, word1, word2):
        rows = len(word1) + 1
        cols = len(word2) + 1
        error_matrix = [[0 for i in range(cols)] for j in range(rows)]

        for i in range(cols):
            error_matrix[0][i] = i

        for i in range(rows):
            error_matrix[i][0] = i

        for i in range(1, rows):
            for j in range(1, cols):
                a = error_matrix[i-1][j] + 1
                b = error_matrix[i][j-1] + 1
                if word1[i-1] == word2[j-1]:
                    c = error_matrix[i-1][j-1]
                else:
                    c = error_matrix[i-1][j-1] + 2
                error_matrix[i][j] = min(a, b, c)

        return (error_matrix[rows-1][cols-1])

    def norm_loss(self, out, target):
        return (tf.norm(target - out))


    def get_optimizer(self):
        optimizer = tf.keras.optimizers.RMSprop(learning_rate=0.001)
        return (optimizer)


In [23]:
def train(ts_model, ms_model, bs_model, train_data, optimizer):
    
    save_path = '/root/arun/Handwriting-Recognition/models'

    _, labels_pos_map, pos_labels_map = find_unique_characters()

    epochs = 100
    train_imgs_ts, train_imgs_bs, train_labels, train_n  =\
         train_data['imgs_ts'], train_data['imgs_bs'], train_data['labels'], train_data['n']

    for epoch in range(epochs):
        epoch_loss = 0
        cer = 0
        count = 0
        indexes = np.arange(0, len(train_imgs_ts), 1)
        np.random.shuffle(indexes)
        for idx in indexes:
            img_ts = tf.convert_to_tensor(train_imgs_ts[idx])  #top stream input
            img_bs = tf.convert_to_tensor(train_imgs_bs[idx])  # bottom stream input
            actual_word = train_labels[idx]

            with tf.GradientTape(persistent=True) as tape:
            # ms_model - middle stream model, ts - top stream model, bs_model - bottom stream model
                out = ms_model(ts_model(img_ts) + bs_model(img_bs))
                out = tf.reshape(out, (out.shape[2], out.shape[3]))
                out = out / tf.norm(out, axis=1, keepdims=True)
                predicted_word = list()
                for i in range(1, len(out), 2):
                    predicted_word.append(pos_labels_map[np.argmax(out[i])])
                predicted_word = ''.join(predicted_word)

            # loss = get_character_error_rate(actual_word, predicted_word)    ->  Not able to construct gradients using this method
                target = np.zeros(out.shape)
                for i in range(1, out.shape[0], 2):
                    char = actual_word[(i - 1)//2]  # Shape of out is 2n
                    pos = labels_pos_map[char]
                    target[i][pos] = 1.0
                target = tf.convert_to_tensor(target, dtype=tf.float32)

                loss = models.norm_loss(target, out)

            grads_ms = tape.gradient(loss, ms_model.trainable_weights)
            grads_ts = tape.gradient(loss, ts_model.trainable_weights)
            grads_bs = tape.gradient(loss, bs_model.trainable_weights)

            optimizer.apply_gradients(zip(grads_ms, ms_model.trainable_weights))
            optimizer.apply_gradients(zip(grads_bs, bs_model.trainable_weights))
            optimizer.apply_gradients(zip(grads_ts, ts_model.trainable_weights))
            epoch_loss += loss
            cer += models.get_character_error_rate(actual_word, predicted_word)
            count += 1
            if(count < 6):
                print (actual_word, predicted_word, sep=' ', end='    ')

        try:        
            epoch_loss = int(epoch_loss)
            print ('Epoch {} Loss {}  CER {}\n'.format(epoch, epoch_loss, cer))
        except:
            print ('Error in conversion {} {}'.format(epoch, epoch_loss))
            print (loss)
            return (ts_model, ms_model, bs_model)

        if epoch % 10 == 0:  #Create checkpoint of model weights every 10 epochs
            file_name = 'ts_model epoch {}.ckpt'.format(epoch)
            ts_model.save_weights(os.path.join(save_path, file_name))

            file_name = 'ms_model epoch {}.ckpt'.format(epoch)
            ms_model.save_weights(os.path.join(save_path, file_name))

            file_name = 'bs_model epoch {}.ckpt'.format(epoch)
            bs_model.save_weights(os.path.join(save_path, file_name))

    ts_model.save(os.path.join(save_path, 'ts_model.h5'))
    ms_model.save(os.path.join(save_path, 'ms_model.h5'))
    bs_model.save(os.path.join(save_path, 'bs_model.h5'))

    return (ts_model, ms_model, bs_model)



In [24]:
def test(ts_model, ms_model, bs_model, test_data):
    _, labels_pos_map, pos_labels_map = find_unique_characters()

    test_imgs_ts, test_imgs_bs, test_labels, test_n =\
         test_data['imgs_ts'], test_data['imgs_bs'], test_data['labels'], test_data['n']

    total_characters = 0
    correct_prediction = 0

    for idx in range(len(test_imgs_ts)):
        img_ts = tf.convert_to_tensor(test_imgs_ts[idx])  #top stream input
        img_bs = tf.convert_to_tensor(test_imgs_bs[idx])  # bottom stream input
        actual_word = test_labels[idx]

        out = ms_model(ts_model(img_ts) + bs_model(img_bs))
        out = tf.reshape(out, (out.shape[2], out.shape[3]))
        out = out / tf.norm(out, axis=1, keepdims=True)
        predicted_word = list()
        for i in range(1, len(out), 2):
            predicted_word.append(pos_labels_map[np.argmax(out[i])])
        predicted_word = ''.join(predicted_word)

        total_characters += len(actual_word)
        for i in range(len(actual_word)):
            if actual_word[i] == predicted_word[i]:
                correct_prediction += 1

        if (idx < 5):
            print (actual_word, predicted_word, sep=' ', end='   ')

    print ('Total characters {} correct prediction {}'.format(total_characters, correct_prediction))
    print ("Character wise accuracy {}".format(correct_prediction*100 / total_characters))
    return (0)

def find_unique_characters():
#Finding Ns and creating a map of labels and index
    _, labels = dataloader.get_image_paths_labels()
    unique_characters = list()
    for i, label in enumerate(labels):
        for char in label:
            if char not in unique_characters:
                unique_characters.append(char)

    unique_characters.sort()
    Ns = len(unique_characters)
    
    pos_labels_map = {}
    labels_pos_map = {}
    for i, char in enumerate(unique_characters):
        labels_pos_map[char] = i
        pos_labels_map[i] = char

    return (Ns, labels_pos_map, pos_labels_map)



In [25]:
def adjust_gradient(grads, eta=0.03, gamma=0.55):
    #Adding gradient noise and clipping the gradients
    var = eta / ((1 + epoch) ** gamma)
    for i, grad in enumerate(grads):
        grads[i] = grad + np.random.normal(0, var, grad.shape)
    return (grads)

In [26]:
def main():
    
    Ns, _, __ = find_unique_characters()
    train_data, test_data = dataloader.make_train_test()

    train_count = len(train_data['n'])
    test_count = len(test_data['n'])
    print ('Total samples in training data is {} and in test data is {}'.format(train_count, test_count))

    ts_model = models.get_top_stream()
    bs_model = models.get_bottom_stream()
    ms_model = models.get_middle_stream(Ns)

    optimizer = models.get_optimizer()

#    ts_model, ms_model, bs_model = train(ts_model, ms_model, bs_model, train_data, optimizer)
#    test(ts_model, ms_model, bs_model, test_data)
#Training model


if __name__ == '__main__':
    models = Models()
    dataloader = DataLoader()
    main()


Total number of images 115320
Length of training data is 52850 test data is 22650
Total samples in training data is 99 and in test data is 9


ResourceExhaustedError: OOM when allocating tensor with shape[16384,1024] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc [Op:Mul]

In [27]:
######MAIN CODE #############

Ns, _, __ = find_unique_characters()
train_data, test_data = dataloader.make_train_test()

train_count = len(train_data['n'])
test_count = len(test_data['n'])
print ('Total samples in training data is {} and in test data is {}'.format(train_count, test_count))

ts_model = models.get_top_stream()
bs_model = models.get_bottom_stream()
ms_model = models.get_middle_stream(Ns)

tsp = models.get_top_stream()
bsp = models.get_bottom_stream()
msp = models.get_middle_stream(Ns)

#optimizer = models.get_optimizer()
optimizer = tf.keras.optimizers.Adagrad(learning_rate=0.01)

######TRAIN CODE #############
save_path = '/root/arun/Handwriting-Recognition/models'

_, labels_pos_map, pos_labels_map = find_unique_characters()

epochs = 100
train_imgs_ts, train_imgs_bs, train_labels, train_n  =\
     train_data['imgs_ts'], train_data['imgs_bs'], train_data['labels'], train_data['n']

for epoch in range(epochs):
    epoch_loss = 0
    cer = 0
    count = 0
    indexes = np.arange(0, len(train_imgs_ts), 1)
    np.random.shuffle(indexes)
    
    for idx in indexes:
        tsp.set_weights(ts_model.get_weights())
        bsp.set_weights(bs_model.get_weights())
        msp.set_weights(ms_model.get_weights())
        
        img_ts = tf.convert_to_tensor(train_imgs_ts[idx])  #top stream input
        img_bs = tf.convert_to_tensor(train_imgs_bs[idx])  # bottom stream input
        actual_word = train_labels[idx]

        with tf.GradientTape(persistent=True) as tape:
        # ms_model - middle stream model, ts - top stream model, bs_model - bottom stream model
            out = ms_model(ts_model(img_ts) + bs_model(img_bs))
            out = tf.reshape(out, (out.shape[2], out.shape[3]))
            out = out / tf.norm(out, axis=1, keepdims=True)
            predicted_word = list()
            for i in range(1, len(out), 2):
                predicted_word.append(pos_labels_map[np.argmax(out[i])])
            predicted_word = ''.join(predicted_word)

        # loss = get_character_error_rate(actual_word, predicted_word)    ->  Not able to construct gradients using this method
            target = np.zeros(out.shape)
            for i in range(1, out.shape[0], 2):
                char = actual_word[(i - 1)//2]  # Shape of out is 2n
                pos = labels_pos_map[char]
                target[i][pos] = 1.0
            target = tf.convert_to_tensor(target, dtype=tf.float32)
            break

            loss = models.norm_loss(target, out)

        grads_ms = tape.gradient(loss, ms_model.trainable_weights)
        grads_ts = tape.gradient(loss, ts_model.trainable_weights)
        grads_bs = tape.gradient(loss, bs_model.trainable_weights)
        
        grads_ms, _ = tf.clip_by_global_norm(grads_ms, 1.0)
        grads_ts, _ = tf.clip_by_global_norm(grads_ts, 1.0)
        grads_bs, _ = tf.clip_by_global_norm(grads_bs, 1.0)

        grads_ms = adjust_gradient(grads_ms)
        grads_ts = adjust_gradient(grads_ts)
        grads_bs = adjust_gradient(grads_bs)
        
        optimizer.apply_gradients(zip(grads_ms, ms_model.trainable_weights))
        optimizer.apply_gradients(zip(grads_bs, bs_model.trainable_weights))
        optimizer.apply_gradients(zip(grads_ts, ts_model.trainable_weights))
        try:
            loss = int(loss)
        except:
            print ('Error in conversion of loss {}'.fornat(loss))
        epoch_loss += loss
        
        cer += models.get_character_error_rate(actual_word, predicted_word)
        count += 1
        if(count < 6):
            print (actual_word, predicted_word, sep=' ', end='    ')

    try:        
        epoch_loss = int(epoch_loss)
        print ('Epoch {} Loss {}  CER {}\n'.format(epoch, epoch_loss, cer))
    except:
        print ('Error in conversion {} {}'.format(epoch, epoch_loss))
        print (loss)

    if epoch % 10 == 0:  #Create checkpoint of model weights every 10 epochs
        file_name = 'ts_model epoch {}.ckpt'.format(epoch)
        ts_model.save_weights(os.path.join(save_path, file_name))

        file_name = 'ms_model epoch {}.ckpt'.format(epoch)
        ms_model.save_weights(os.path.join(save_path, file_name))

        file_name = 'bs_model epoch {}.ckpt'.format(epoch)
        bs_model.save_weights(os.path.join(save_path, file_name))
    break
ts_model.save(os.path.join(save_path, 'ts_model.h5'))
ms_model.save(os.path.join(save_path, 'ms_model.h5'))
bs_model.save(os.path.join(save_path, 'bs_model.h5'))

Total number of images 115320
Length of training data is 52850 test data is 22650
Total samples in training data is 99 and in test data is 9


ResourceExhaustedError: OOM when allocating tensor with shape[3,3,128,256] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc [Op:Mul]

In [31]:
target = np.zeros((2, 3))
target

array([[0., 0., 0.],
       [0., 0., 0.]])

In [33]:
out = np.ones((2, 3))
out

array([[1., 1., 1.],
       [1., 1., 1.]])

In [38]:
help (tf.norm)

Help on function norm_v2 in module tensorflow.python.ops.linalg_ops:

norm_v2(tensor, ord='euclidean', axis=None, keepdims=None, name=None)
    Computes the norm of vectors, matrices, and tensors.
    
    This function can compute several different vector norms (the 1-norm, the
    Euclidean or 2-norm, the inf-norm, and in general the p-norm for p > 0) and
    matrix norms (Frobenius, 1-norm, 2-norm and inf-norm).
    
    Args:
      tensor: `Tensor` of types `float32`, `float64`, `complex64`, `complex128`
      ord: Order of the norm. Supported values are `'fro'`, `'euclidean'`,
        `1`, `2`, `np.inf` and any positive real number yielding the corresponding
        p-norm. Default is `'euclidean'` which is equivalent to Frobenius norm if
        `tensor` is a matrix and equivalent to 2-norm for vectors.
        Some restrictions apply:
          a) The Frobenius norm `'fro'` is not defined for vectors,
          b) If axis is a 2-tuple (matrix norm), only `'euclidean'`, '`fro'`, 

In [40]:
tf.norm(target - out, np.inf)

<tf.Tensor: shape=(), dtype=float64, numpy=1.0>

**SGD**
    lr = 0.01 24 epochs, CER = . Test accuracy: 8%.
    lr = 0.0001 Problem: Very slow. Got stock at loss=876. After that, there was no reduction in loss value.
    lr = 0.001 Problem: Very slow. Epoch: 30 loss 326 CER 856. Updates are slow.
    
**Adagrad**
    lr = 0.01

In [None]:
tsp.weights

In [None]:
ts_model.weights

In [None]:
grads_ts

<tf.Tensor: shape=(), dtype=float32, numpy=nan>

In [21]:
test(tsp, msp, bsp, test_data)

have hrem   smashed Aiiient   Committee fieeieziw   or ll   In hh   Total characters 40 correct prediction 5
Character wise accuracy 12.5


0

In [None]:
tsp.weights

**Problem - Model weights becoming nan**. Reasong: gradients becoming nan.

**problem - gradients being nan** Reason: ----

When will nan's be generated? nan - not a number - division by 0.

One possible reason is that in the gradient computation, if both dy and dx are 0 or inf, then it might return nan.

In [14]:
ts_model.set_weights(tsp.get_weights())
bs_model.set_weights(bsp.get_weights())
ms_model.set_weights(msp.get_weights())

In [15]:
idx = 26
with tf.GradientTape(persistent=True) as tape:
    out = ms_model(ts_model(train_imgs_ts[idx]) + bs_model(train_imgs_bs[idx]))
    actual_word = train_labels[idx]

    out = tf.reshape(out, (out.shape[2], out.shape[3]))
    out = out / tf.norm(out, axis=1, keepdims=True)
    predicted_word = list()
    for i in range(1, len(out), 2):
        predicted_word.append(pos_labels_map[np.argmax(out[i])])
    predicted_word = ''.join(predicted_word)

    target = np.zeros(out.shape)
    for i in range(1, out.shape[0], 2):
        char = actual_word[(i - 1)//2]  # Shape of out is 2n
        pos = labels_pos_map[char]
        target[i][pos] = 1.0
    target = tf.convert_to_tensor(target, dtype=tf.float32)

    loss = models.norm_loss(target, out)

In [16]:
loss

<tf.Tensor: shape=(), dtype=float32, numpy=1.8473893>

In [None]:
ms_model.weights

In [18]:
grads_ms = adjust_gradient(tape.gradient(loss, ms_model.trainable_weights))
grads_ts = adjust_gradient(tape.gradient(loss, ts_model.trainable_weights))
grads_bs = adjust_gradient(tape.gradient(loss, bs_model.trainable_weights))


In [None]:
grads_ms

In [None]:
ms_model.weights

In [21]:
optimizer.apply_gradients(zip(grads_ms, ms_model.trainable_weights))
optimizer.apply_gradients(zip(grads_bs, bs_model.trainable_weights))
optimizer.apply_gradients(zip(grads_ts, ts_model.trainable_weights))


<tf.Variable 'UnreadVariable' shape=() dtype=int64, numpy=5073>

In [None]:
ms_model.weights

In [23]:
loss

<tf.Tensor: shape=(), dtype=float32, numpy=1.8473893>

In [None]:
grads_ms

In [None]:
ms_model.weights

In [46]:
t = np.ones((2, 2))
tf.norm(t, 2)

<tf.Tensor: shape=(), dtype=float64, numpy=2.0>

Try changing gradients
Try annealing gradients.
A new loss function

https://cs231n.github.io/neural-networks-3/