In [33]:
'''
Author       : Aditya Jain
Date Started : This notebook was created on 2nd December, 2020
About        : Implementing CNN+RNN+CTC
'''

from tensorflow import keras
from tensorflow.keras.layers import Dropout, Dense, Input, Reshape, TimeDistributed, Lambda, LSTM, Bidirectional, Conv2D, MaxPooling2D, Flatten
import tensorflow.keras.backend as K


from tensorflow.keras.models import Model 
from tensorflow.keras.utils import to_categorical
import tensorflow as tf

import numpy as np

#### Importing Data

In [48]:
import h5py
import matplotlib.pyplot as plt
from helper_func.misc import slide_window

# NOTE: machine-specific absolute path; point this at your local copy of the dataset.
dataset_path = "/home/aditya/Dropbox/LearningfromDemons/ctc_data/iam_lines.h5"
no_classes   = 80      # vocabulary size used for one-hot encoding
n_samples    = 1000    # keep only the first n_samples of each split for quick experiments
n_label_keep = 4       # label columns kept per sample; must match max_label_len in the model cell

# Read only the rows we need from each split instead of loading the full arrays.
with h5py.File(dataset_path, "r") as f:
    x_train = f['x_train'][:n_samples]
    y_train = f['y_train'][:n_samples]
    x_test  = f['x_test'][:n_samples]
    y_test  = f['y_test'][:n_samples]

# Training labels are deliberately NOT one-hot encoded: they are passed to the
# CTC loss as integer class ids.
# Fix: the test labels were previously built from y_train by mistake.
y_test  = to_categorical(y_test, no_classes)

# Keep only the first n_label_keep label positions of every training sample.
y_train = y_train[:, :n_label_keep]

print(x_train.shape)
print(y_train.shape)

print(x_test.shape)

(1000, 28, 952)
(1000, 4)
(1000, 28, 952)


#### Model Architecture

In [40]:
#### CNN feature extractor -> Dense bottleneck -> BiLSTM -> per-step softmax

image_shape = x_train.shape[1:]        # per-image shape taken from the training array
no_channels = 1                        # no of channels in the image, 3 in case of RGB
print(image_shape)

no_classes    = 80
max_label_len = 4

# Two 2x2 max-pool layers shrink each spatial axis by a factor of 4 (with flooring),
# so integer floor division gives the exact post-pooling sizes.
pooled_rows = image_shape[0] // 4
pooled_cols = image_shape[1] // 4
n_filters   = 64                       # filters of the last conv layer

# architecture is defined below

inputs     = Input(shape=image_shape)
# add an explicit single-channel axis so Conv2D can consume the 2-D images
reshape1   = Reshape((image_shape[0], image_shape[1], 1))(inputs)
conv_1     = Conv2D(32, (3,3), activation = 'relu', padding='same')(reshape1)
max_pool1  = MaxPooling2D(pool_size=(2, 2))(conv_1)
conv_2     = Conv2D(n_filters, (3,3), activation = 'relu', padding='same')(max_pool1)
max_pool2  = MaxPooling2D(pool_size=(2, 2))(conv_2)
# collapse (cols, filters) into one feature vector per row -> (pooled_rows, pooled_cols * n_filters)
# NOTE(review): this makes the first image axis the sequence axis seen by the LSTM;
# confirm that this is the intended reading direction for the line images.
reshape    = Reshape(target_shape=(pooled_rows, pooled_cols * n_filters))(max_pool2)
dense1     = Dense(64)(reshape)                                                  # this dense helps reduce no of params
# Fix: feed the LSTM from dense1 (it was wired to `reshape`, leaving dense1 unused).
blstm1     = Bidirectional(LSTM(64, return_sequences=True))(dense1)
# +1 output unit for the CTC blank symbol
outputs    = Dense(no_classes+1, activation="softmax")(blstm1)


model_arch = Model(inputs, outputs)           # for viz the model architecture
model_arch.summary()

(28, 952)
<class 'int'>
Model: "functional_38"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_21 (InputLayer)        [(None, 28, 952)]         0         
_________________________________________________________________
reshape_16 (Reshape)         (None, 28, 952, 1)        0         
_________________________________________________________________
conv2d_40 (Conv2D)           (None, 28, 952, 32)       320       
_________________________________________________________________
max_pooling2d_40 (MaxPooling (None, 14, 476, 32)       0         
_________________________________________________________________
conv2d_41 (Conv2D)           (None, 14, 476, 64)       18496     
_________________________________________________________________
max_pooling2d_41 (MaxPooling (None, 7, 238, 64)        0         
_________________________________________________________________
reshape_17 (Reshape)         

#### Loss Function

In [41]:
# Auxiliary inputs that are consumed only by the CTC loss layer:
# the target label ids plus one length value per sample for predictions and labels.
labels       = Input(shape=(max_label_len,), dtype='float32', name='the_labels')
input_length = Input(shape=(1,), dtype='int64', name='input_length')
label_length = Input(shape=(1,), dtype='int64', name='label_length')

def ctc_lambda_func(args):
    """Compute the CTC batch cost inside a Keras ``Lambda`` layer.

    Args:
        args: a 4-element sequence ordered as
            (y_pred, labels, input_length, label_length).

    Returns:
        The tensor returned by ``K.ctc_batch_cost`` for these inputs.
    """
    softmax_out, target_labels, pred_lengths, target_lengths = args

    # ctc_batch_cost expects the ground truth first, then the predictions.
    return K.ctc_batch_cost(
        y_true=target_labels,
        y_pred=softmax_out,
        input_length=pred_lengths,
        label_length=target_lengths,
    )
 

# The CTC cost is computed inside the graph by a Lambda layer named 'ctc',
# so the training model takes the labels and both length vectors as extra inputs.
ctc_layer = Lambda(ctc_lambda_func, output_shape=(1,), name='ctc')
loss_out  = ctc_layer([outputs, labels, input_length, label_length])
model     = Model(
    inputs=[inputs, labels, input_length, label_length],
    outputs=loss_out,
)

# The model output already IS the loss, so the Keras loss just passes y_pred through.
model.compile(optimizer='adam', loss={'ctc': lambda y_true, y_pred: y_pred})

In [42]:
# Per-sample lengths fed to the CTC loss, derived from the data and the model
# instead of hard-coded so they stay in sync when either changes.
# NOTE: these arrays rebind the names that held the Keras Input tensors in the
# loss cell; the names are kept because the training cell reads them.
n_train_samples = len(x_train)
ctc_time_steps  = model_arch.output_shape[1]     # sequence length of the softmax output
input_length = np.full(n_train_samples, ctc_time_steps, dtype='int64')
label_length = np.full(n_train_samples, y_train.shape[1], dtype='int64')

#### Training

In [44]:
# The target is a dummy array: the loss is produced by the 'ctc' layer itself,
# so y only needs one entry per training sample (sized from the data, not hard-coded).
model.fit(
    x=[x_train, y_train, input_length, label_length],
    y=np.zeros(len(x_train)),
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fa76b572c50>

#### Inference

In [60]:
# Optional: persist the trained weights / restore them into the prediction model.
# model.save_weights('first_run.hdf5')
# model_arch.load_weights('first_run.hdf5')

# Run the softmax-only model on the first two test images.
test_img   = x_test[:2]
prediction = model_arch.predict(test_img)

# Greedy CTC decoding: every sample is decoded over all of its time steps.
n_images, n_steps = prediction.shape[0], prediction.shape[1]
steps_per_image   = np.ones(n_images) * n_steps
decode_result     = K.ctc_decode(prediction, input_length=steps_per_image, greedy=True)

# decode_result[0] is the list of decoded sequences; take its first (only) entry.
out = K.get_value(decode_result[0][0])

print(out)


[[55 -1 -1 -1 -1 -1 -1]
 [41 55 -1 -1 -1 -1 -1]]
