In [11]:
'''
Author       : Aditya Jain
Date Started : This notebook was created on 2nd December, 2020
About        : Implementing CNN+RNN+CTC
'''

from tensorflow import keras
from tensorflow.keras.layers import Dropout, Dense, Input, Reshape, TimeDistributed, Lambda, LSTM, Bidirectional, Conv2D, MaxPooling2D, Flatten
import tensorflow.keras.backend as K
from tensorflow.keras.models import Model 
from tensorflow.keras.utils import to_categorical
import tensorflow as tf

import numpy as np
import pickle
from sklearn.model_selection import train_test_split

from keras.preprocessing.sequence import pad_sequences

#### Importing MIME Data

In [39]:
# Load the pickled MIME subset, pad the label sequences to a common length and
# split everything into train / test sets.

DATA_PATH = "MIME_small.pickle"

# NOTE(review): pickle.load can execute arbitrary code -- only load files from a trusted source.
# The context manager guarantees the file handle is closed once the data is read.
with open(DATA_PATH, "rb") as data_file:
    data_read = pickle.load(data_file)

image_data = data_read['data_image']
labels     = data_read['data_label']
prim_map   = data_read['primitive_map']
label_map  = data_read['label_map']

# Right-pad every label sequence with 0 so that all labels have equal length.
labels  = pad_sequences(labels, padding='post', value = 0)

print(image_data.shape)
print(labels.shape)
print(prim_map)
print(label_map)

# A fixed random_state reproduces the exact same split on every run.
x_train, x_test, y_train, y_test = train_test_split(image_data, labels, test_size=0.2, random_state=43)

print("Training Data: ", x_train.shape, y_train.shape)
print("Testing Data: ", x_test.shape, y_test.shape)

# One class per primitive plus one more; presumably the extra slot is index 0,
# which pad_sequences uses as the fill value above -- TODO confirm.
no_classes    = len(prim_map)+1
max_label_len = labels.shape[-1]       # padded label length (last axis of the label array)

training_pts  = int(x_train.shape[0])  # number of training samples
test_pts      = int(x_test.shape[0])   # number of test samples

print("Total classes of primitives: ", no_classes)
print("Max label length: ", max_label_len)

(8, 30, 800)
(8, 7)
{1: 'Reach', 2: 'Tilt', 3: 'Retract', 4: 'Grasp', 5: 'Release'}
{'Push': [1, 1, 3], 'Pour': [1, 4, 1, 2, 1, 5, 3], 'Pick': [1, 4, 1, 1, 5, 3], 'Stack': [1, 4, 1, 5, 3]}
Training Data:  (6, 30, 800) (6, 7)
Testing Data:  (2, 30, 800) (2, 7)
Total classes of primitives:  6
Max label length:  7


In [40]:
# Sanity check: number of samples in the train and test splits.
split_sizes = (training_pts, test_pts)
print(*split_sizes)

6 2


#### Model Architecture

In [41]:
# CNN feature extractor -> sequence over the pooled first image axis -> BiLSTM
# -> per-timestep softmax that is later fed to the CTC loss.

image_shape = x_train.shape[1:]        # shape of a single input sample (no channel axis)
no_channels = 1                        # no of channels in the image, 3 in case of RGB
print(image_shape)

POOL_FACTOR  = 2 * 2                   # two 2x2 max-pools: each spatial dim is floor-divided by 4
CONV_FILTERS = 64                      # number of feature maps produced by the last conv layer

# Sizes after both pooling layers. Floor division mirrors what MaxPooling2D
# does, so the Reshape below stays valid when a dimension is not a multiple of 4.
seq_len     = image_shape[0] // POOL_FACTOR
feature_dim = (image_shape[1] // POOL_FACTOR) * CONV_FILTERS

# architecture is defined below

inputs     = Input(shape=image_shape)
reshape1   = Reshape((image_shape[0], image_shape[1], no_channels))(inputs)       # add the channel axis Conv2D expects
conv_1     = Conv2D(32, (3,3), activation = 'relu', padding='same')(reshape1)
max_pool1  = MaxPooling2D(pool_size=(2, 2))(conv_1)
conv_2     = Conv2D(CONV_FILTERS, (3,3), activation = 'relu', padding='same')(max_pool1)
max_pool2  = MaxPooling2D(pool_size=(2, 2))(conv_2)
reshape    = Reshape(target_shape=(seq_len, feature_dim))(max_pool2)              # (timesteps, features) for the RNN
dense1     = Dense(64)(reshape)                                                  # this dense helps reduce no of params
# Feed the projected features (not the raw reshape) into the recurrent layer;
# otherwise dense1 is dead code and the LSTM sees feature_dim-sized inputs.
blstm1     = Bidirectional(LSTM(64, return_sequences=True))(dense1)
# One extra output unit on top of no_classes; presumably the CTC blank symbol -- TODO confirm.
outputs    = Dense(no_classes+1, activation="softmax")(blstm1)


model_arch = Model(inputs, outputs)           # for viz the model architecture
model_arch.summary()

(30, 800)
<class 'int'>
Model: "functional_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, 30, 800)]         0         
_________________________________________________________________
reshape_4 (Reshape)          (None, 30, 800, 1)        0         
_________________________________________________________________
conv2d_4 (Conv2D)            (None, 30, 800, 32)       320       
_________________________________________________________________
max_pooling2d_4 (MaxPooling2 (None, 15, 400, 32)       0         
_________________________________________________________________
conv2d_5 (Conv2D)            (None, 15, 400, 64)       18496     
_________________________________________________________________
max_pooling2d_5 (MaxPooling2 (None, 7, 200, 64)        0         
_________________________________________________________________
reshape_5 (Reshape)          (

#### Loss Function

In [42]:
# Extra symbolic inputs that are consumed only by the CTC loss layer.
# They carry an `_in` suffix so they do not shadow the padded `labels` array
# from the data-loading cell, nor get clobbered by the NumPy `input_length` /
# `label_length` arrays that are fed to model.fit later.
labels_in       = Input(name='the_labels', shape=[max_label_len], dtype='float32')
input_length_in = Input(name='input_length', shape=[1], dtype='int64')
label_length_in = Input(name='label_length', shape=[1], dtype='int64')

def ctc_lambda_func(args):
    """Compute the per-sample CTC loss inside a Lambda layer.

    Args:
        args: sequence ``(y_pred, labels, input_length, label_length)`` --
            the network's softmax output, the label tensor, and the two
            length tensors, in that order.

    Returns:
        The tensor returned by ``K.ctc_batch_cost`` for those arguments.
    """
    y_pred, y_true, pred_length, true_length = args

    # Note the argument order: ctc_batch_cost takes the labels first.
    return K.ctc_batch_cost(y_true, y_pred, pred_length, true_length)


loss_out = Lambda(ctc_lambda_func, output_shape=(1,), name='ctc')(
    [outputs, labels_in, input_length_in, label_length_in]
)
model = Model(inputs=[inputs, labels_in, input_length_in, label_length_in], outputs=loss_out)

# The 'ctc' layer already outputs the loss value, so the compiled loss simply
# passes y_pred through and ignores y_true.
model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer = 'adam')

In [43]:
# Per-sample sequence lengths for the CTC loss, shaped (samples, 1) to match
# the Input(shape=[1]) tensors of the training model.

# the number of timesteps that go as input to LSTM layer -- read from the
# architecture model's output instead of hard-coding it, so it follows any
# change to the image size or pooling.
n_timesteps  = model_arch.output_shape[1]
input_length = np.full((training_pts, 1), n_timesteps, dtype='int64')

# True (unpadded) length of every label sequence. y_train was right-padded
# with 0, so counting non-zero entries recovers the original length; using
# max_label_len for every sample would make the loss treat padding as labels.
# Assumes 0 never occurs as a real label id -- TODO confirm against prim_map.
label_length = np.count_nonzero(y_train, axis=1).reshape(-1, 1).astype('int64')

#### Training

In [44]:
# The 'ctc' layer's output is used directly as the loss (see model.compile),
# so the targets passed to fit are just placeholder zeros.
dummy_targets = np.zeros(training_pts)
train_inputs  = [x_train, y_train, input_length, label_length]

model.fit(x=train_inputs, y=dummy_targets, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fc0f57f9450>

#### Inference

In [60]:
# Optional weight persistence (currently disabled):
# model.save_weights('first_run.hdf5')
# model_arch.load_weights('first_run.hdf5')

# Run the architecture-only model (no CTC loss head) on the first two test images.
test_img   = x_test[:2]
prediction = model_arch.predict(test_img)

# Greedy CTC decoding; every sample uses the full number of output timesteps.
n_samples, n_steps = prediction.shape[0], prediction.shape[1]
full_lengths  = np.ones(n_samples) * n_steps
decode_result = K.ctc_decode(prediction, input_length=full_lengths, greedy=True)

# decode_result[0] is the list of decoded sequences; take its first entry.
decoded_paths = decode_result[0]
out = K.get_value(decoded_paths[0])

print(out)


[[55 -1 -1 -1 -1 -1 -1]
 [41 55 -1 -1 -1 -1 -1]]
