#### In this notebook, I build and train a dynamic model based on the pretrained static one. All weights are initialized in such a way that, at the beginning of training, the prediction of the dynamic model is the same as the prediction of the static model. So the precision of the static model is a baseline for the dynamic one.

In [2]:
import tensorflow as tf
from keras.preprocessing.image import load_img
import pandas as pd
import numpy as np
import os
from random import shuffle
import matplotlib.pyplot as plt
from tqdm import tqdm
import keras as keras
from keras.layers import DepthwiseConv2D
from keras.applications import MobileNetV2
from keras.models import Model,load_model
from keras.layers import *
import keras.backend as K
%matplotlib inline

Using TensorFlow backend.


##### To use the tf.data API, a TensorFlow session should be defined explicitly in order to run the initializer of the data iterator

In [3]:
# Explicit TF1 session; it is used later to run the tf.data iterator initializer and to fetch batches.
sess=tf.Session()

In [4]:
# Register the session with the Keras backend so that Keras works in this same session.
K.set_session(sess)

### First the generator is created to feed data from tfrecords files into the model during training

##### This cell contains the parsing function that decodes serialized samples read from tfrecords files. It returns a tuple of tf tensors: 1) a sequence of frames in the format (frame, height, width, color_channel), 2) the label - an array of two numbers, and 3) the weights used in the loss function to account for the unbalanced dataset. The labels and the weights are stored, respectively, in the first two and the last two columns of the feature 'train/labels_weights'.

In [5]:
def _parse_function(example_proto,seq_num):
    """Decode one serialized tfrecords example into a random clip of consecutive frames.

    Args:
        example_proto: scalar string tensor holding one serialized example.
        seq_num: number of consecutive frames to extract (Python int). The
            example must contain at least ``seq_num`` frames.

    Returns:
        Tuple ``(seq, labels, weights)``:
          * ``seq``     - float32 tensor with ``seq_num`` frames, each cropped to at
                          most 448 rows x 704 columns and scaled by 1/255;
          * ``labels``  - the first two values of 'train/labels_weights';
          * ``weights`` - the last two values of 'train/labels_weights'
                          (used to weight the loss).
    """
    feature = {'train/seq': tf.VarLenFeature(tf.string),
               'train/seq_len': tf.FixedLenFeature((1,),tf.int64),
               'train/labels_weights': tf.FixedLenFeature((4,), tf.float32)}
    parsed_features = tf.parse_single_example(example_proto, features=feature)

    encoded_frames = parsed_features['train/seq'].values
    seq_len = parsed_features['train/seq_len'][0]

    # random_offset is the index of the first frame of the sampled clip.
    # `maxval` is exclusive, so +1 is required to make the last valid start
    # (seq_len - seq_num) reachable and to allow seq_len == seq_num.
    random_offset = tf.random_uniform(
        shape=(), minval=0,
        maxval=seq_len - seq_num + 1, dtype=tf.int64)

    # indices of the seq_num consecutive frames that form the clip
    offsets = tf.range(random_offset,random_offset + seq_num,dtype=tf.int64)

    # Decode every selected JPEG frame and crop it. The explicit output dtype lets
    # map_fn return the uint8 pixels directly (its default output dtype would be
    # the int64 dtype of `offsets`).
    seq = tf.map_fn(lambda i: tf.image.decode_jpeg(encoded_frames[i])[:448,:704,:],
                    offsets, dtype=tf.uint8)

    # normalize pixel values by 1/255 and convert to float32
    seq = tf.cast(seq,tf.float32)/255.

    # labels and loss weights are shared by all frames of the video
    labels_weights = parsed_features['train/labels_weights']
    labels = labels_weights[0:2]
    weights = labels_weights[2:]

    return seq, labels, weights

#a wrapper function to pass seq_num parameter into parser
def _parse_function_param(seq_num):
    def parse(example_proto):
        return _parse_function(example_proto,seq_num)
    return parse

##### Defining the batch size and the number of frames in a time series

In [26]:
seq_num = 4 # the number of frames per sample sequence; it should be less than 16
batch_size=3 # the number of frame sequences per batch

##### Creating the list of tfrecords files to build the dataset from

In [27]:
# the directory with data in tfrecord format
working_directory = '../Data_samples/train data/tfrecords_seq/'

# List of the tfrecords files with data. os.listdir returns entries in arbitrary
# order, so the list is sorted to make the dataset order reproducible.
filenames = sorted(os.path.join(working_directory, file)
                   for file in os.listdir(working_directory)
                   if file.endswith('tfrecords'))

# tf.data does not complain about an empty file list, so fail early here instead.
if not filenames:
    raise IOError('No tfrecords files found in {!r}'.format(working_directory))

##### Important! Check that the list of tfrecords files is good. If the list is empty, or the paths are not readable for some reason, the tf.data API does not produce an error.

In [28]:
# Show the first few paths to confirm that the file list is not empty.
filenames[:10]

['../Data_samples/train data/tfrecords_seq/train0.tfrecords']

In [29]:
# Input pipeline: read serialized examples, parse each one into
# (frame sequence, labels, weights), repeat the data 1000 times, then batch.
dataset = (
    tf.data.TFRecordDataset(filenames)
    .map(_parse_function_param(seq_num))
    .repeat(1000)
    .batch(batch_size)
)

In [30]:
# Initializable iterator: its initializer has to be run in the session before batches can be fetched.
iterator = dataset.make_initializable_iterator()
# Symbolic tensors for the next batch of the dataset.
next_element = iterator.get_next()


##### Now we have an iterator that produces a sample frame sequence and the corresponding labels. But we need an iterator that produces the features obtained by passing each frame through the convolutional base + attention. For that, we next build a TimeDistributed feature extractor. The output of this feature extractor can be fed directly into any kind of dynamic layer, such as a time convolution or an RNN.

##### Unfortunately, the model loaded from the h5 file does not work with the TimeDistributed wrapper. To solve this problem, a new identical static model is created and the weights are then taken from the pretrained model.

In [31]:
# this is the convolutional base: MobileNetV2 created without its top layers
conv_base=MobileNetV2(include_top=False)#,input_tensor=next_element[0]



In [32]:
# image input; height, width and channel count are all left unspecified
inp=Input((None,None,None))
x = inp

# feature map produced by the convolutional base
x=conv_base(x)

# next 5 layers are the attention mechanism

# One sigmoid score per spatial position. With a zero kernel and a bias of one,
# every position initially gets the same score, sigmoid(1).
weights=Conv2D(1,(3,3),activation='sigmoid',padding='SAME',kernel_initializer=keras.initializers.Zeros(),
                                               bias_initializer=keras.initializers.Ones())(x)

#next two lines make the weights sum up to 1 over the spatial axes: weights->weights/sum(weights)
norm = Lambda(lambda t: 1/K.sum(t,axis=[1,2]))(weights)

weights = merge.multiply([norm,weights])

#next two lines calculate the weighted average of the feature map over the spatial axes
x = merge.multiply([x,weights])

x = Lambda(lambda t: K.sum(t,axis=[1,2]))(x)

In [33]:
# Static feature extractor: maps an image to its attention-weighted feature vector.
model=Model(inputs=[inp],outputs=[x])

##### Load the pretrained static model

In [34]:
# Load the saved static model without compiling it; 'relu6' and 'tf' are supplied as custom objects for deserialization.
mobinet1=load_model('../Static_models/mobile_mini_top_attention_3.h5',custom_objects={'relu6':ReLU(6.),'tf':tf},compile=False)

##### Redefining the loaded model to output prediction values instead of the loss. In this case we don't need to feed the true values and loss weights for the loss function into the model. Defined in this way, the architecture is identical to the model built above, and the trained weights can be assigned (unfortunately, the loaded model does not work with the TimeDistributed wrapper because of a bug).

In [35]:
# Sub-model of the loaded network: from the input of its first layer to the output of layers[6].
# NOTE(review): layers[6] is presumably the attention-weighted feature output - confirm with mobinet1.summary().
mobinet2=Model(inputs=[mobinet1.layers[0].input],outputs=[mobinet1.layers[6].output])

##### Assign the weights

In [36]:
# Copy the pretrained weights; both models must hold their weights in the same order and with the same shapes.
model.set_weights(mobinet2.get_weights())

In [37]:
# Print the layers of the rebuilt static model for inspection.
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_8 (InputLayer)            (None, None, None, N 0                                            
__________________________________________________________________________________________________
mobilenetv2_1.00_224 (Model)    (None, None, None, 1 2257984     input_8[0][0]                    
__________________________________________________________________________________________________
conv2d_2 (Conv2D)               (None, None, None, 1 11521       mobilenetv2_1.00_224[1][0]       
__________________________________________________________________________________________________
lambda_3 (Lambda)               (None, 1)            0           conv2d_2[0][0]                   
__________________________________________________________________________________________________
multiply_3

##### The next step is to wrap the model in the TimeDistributed wrapper, so that it can take a whole sequence of frames at once. At the moment we use the static model without the top layers (conv_base + attention) to extract features.

In [38]:
# Apply the static feature extractor to every frame of a sequence independently.
mob_time=TimeDistributed(model)

In [39]:
# sequence input with one more axis than the single-image input; all dimensions are left unspecified
inp=Input((None,None,None,None))

x=inp
# per-frame features for the whole sequence
x=mob_time(x)

In [40]:
# Model that maps a sequence of frames to the sequence of their feature vectors.
time_mob=Model(inputs=[inp], outputs=[x])

##### Creating a generator that produces sequence of features from the sequence of frames

In [41]:
def batch_generator(dataset):
    """Return an endless generator of training batches built from ``dataset``.

    Each item is ``([features, labels, weights], dummy_targets)``:
      * ``features`` - output of ``time_mob`` for one batch of frame sequences;
      * ``labels`` / ``weights`` - passed through unchanged from the dataset;
      * ``dummy_targets`` - zeros with one entry per sample of the batch. They are
        placeholders only: the real labels are fed to the model as inputs.

    Args:
        dataset: a batched tf.data dataset yielding ``(frames, labels, weights)``.

    Raises:
        ValueError: (on iteration) if a full pass over ``dataset`` yields no batch.
    """
    # Build the iterator ops right away, from the dataset that was passed in.
    batch_iterator = dataset.make_initializable_iterator()
    next_batch = batch_iterator.get_next()

    def _generate():
        while True:
            # (re)start a pass over the dataset
            sess.run(batch_iterator.initializer)
            produced_any = False
            while True:
                try:
                    inputs, labels, weights = sess.run(next_batch)
                except tf.errors.OutOfRangeError:
                    # dataset exhausted: start over so the generator never ends
                    break
                produced_any = True
                features = time_mob.predict_on_batch(inputs)
                # Size the dummy targets from the actual batch, so that a final
                # partial batch does not mismatch the inputs.
                yield ([features, labels, weights], np.zeros((features.shape[0],)))
            if not produced_any:
                raise ValueError('The dataset produced no batches; check the list of tfrecords files.')

    return _generate()

gen=batch_generator(dataset)


##### Testing the generator

In [42]:
# Pull one batch to check that the whole pipeline works.
next(gen)

([array([[[9.24750030e-01, 8.42631832e-02, 8.35173484e-03, ...,
           1.24393380e-03, 5.84371507e-01, 1.12202477e+00],
          [7.71530271e-01, 1.88849926e-01, 1.69622526e-02, ...,
           1.67293489e-04, 5.13913989e-01, 1.38477838e+00],
          [6.21242702e-01, 1.37949333e-01, 1.78094618e-02, ...,
           2.85055983e-04, 4.94020581e-01, 1.27388215e+00],
          [5.50869703e-01, 1.48080498e-01, 3.70995104e-02, ...,
           4.37595008e-05, 3.76173437e-01, 1.17835510e+00]],
  
         [[1.02637482e+00, 1.69418883e+00, 1.08623005e-01, ...,
           1.36557028e-01, 4.97790635e-01, 8.70172977e-01],
          [1.04836440e+00, 1.67417526e+00, 2.32100829e-01, ...,
           1.31708965e-01, 2.62945414e-01, 8.26941609e-01],
          [7.79519737e-01, 1.76763773e+00, 2.51860738e-01, ...,
           1.50727287e-01, 1.83511093e-01, 4.90075111e-01],
          [7.64925539e-01, 1.73200524e+00, 2.58311003e-01, ...,
           2.37498417e-01, 9.03775021e-02, 5.37468433e-01]],
  


##### Now the final top layers are constructed to deal with the dynamics. This final model contains two branches that are connected residually to produce the final prediction. One branch is essentially an average of the static model predictions along the time axis. The other branch is one of: 1) a 1D convolution along the time axis - in this case the initial values of the weights and biases are set to zero, so the model starts with the static prediction averaged over time; 2) a layer of GRU cells; 3) a layer of LSTM cells. The function that builds the model accepts 3 possible values for the 'type_top_layer' parameter: 'conv', 'GRU' and 'LSTM'.

In [43]:
dense1=mobinet1.layers[7] # the final layer of the static model; reused below to predict from each frame's features

In [52]:
def Build_temp_model(type_top_layer='conv'):
    """Build the temporal model that outputs its own weighted squared-error loss.

    The model takes three inputs: a sequence of ``seq_num`` feature vectors of
    size 1280, the labels (2 values) and the loss weights (2 values). The
    prediction is the sum of two branches: the time average of the per-frame
    static predictions, and a dynamic branch selected by ``type_top_layer``.

    Args:
        type_top_layer: 'conv' (zero-initialized 1D convolution over time followed
            by time averaging), 'GRU' or 'LSTM'.

    Returns:
        A Keras ``Model`` whose single output is the mean weighted squared error.

    Raises:
        NameError: if ``type_top_layer`` is not one of the supported values.
    """
    features_in = Input(shape=(seq_num,1280))
    labels_in = Input(shape=(2,))
    loss_weights_in = Input(shape=(2,))

    # static prediction for every frame, and its average over time
    static_pred = TimeDistributed(dense1)(features_in)
    static_avg = GlobalAveragePooling1D()(static_pred)

    def conv_branch(frames):
        # time convolution starting from all-zero kernel and bias, then averaged along the time axis
        conv_out = Conv1D(2, 3,
                          kernel_initializer=keras.initializers.Zeros(),
                          bias_initializer=keras.initializers.Zeros())(frames)
        return GlobalAveragePooling1D()(conv_out)

    def gru_branch(frames):
        return GRU(2)(frames)

    def lstm_branch(frames):
        return LSTM(2)(frames)

    branch_builders = (('conv', conv_branch), ('GRU', gru_branch), ('LSTM', lstm_branch))
    build_branch = next((builder for key, builder in branch_builders
                         if type_top_layer == key), None)
    if build_branch is None:
        raise NameError('Unknown type of model. Possible types are: "conv","GRU" or "LSTM"')
    dynamic_pred = build_branch(static_pred)

    # the final prediction is the sum of the two branches
    prediction = Add()([static_avg, dynamic_pred])

    # weighted squared error, averaged into the loss that the model outputs
    residual = subtract([prediction, labels_in])
    squared = Lambda(K.square)(residual)
    weighted = multiply([squared, loss_weights_in])
    mean_loss = Lambda(K.mean)(weighted)

    return Model(inputs=[features_in, labels_in, loss_weights_in], outputs=[mean_loss])

In [55]:
# Temporal model with the time-convolution branch.
model1=Build_temp_model(type_top_layer='conv')

##### If the model has been trained before, load the pretrained one for further training by uncommenting the next two lines.

In [56]:
#model_trained=load_model('mobile_dynamics1.h5',custom_objects={'relu6':ReLU(6.),'tf':tf},compile=False)

In [57]:
#model1.set_weights(model_trained.get_weights())

##### As usual, a dummy loss function for compilation

In [37]:
def loss_compile(y_true,y):
    """Dummy loss for compilation: return the model output ``y`` unchanged.

    ``y_true`` is ignored; the model output is returned as the loss.
    """
    return y

##### Compile the model for training

In [38]:
# Adam optimizer with a learning rate of 1e-4.
adam = keras.optimizers.Adam(lr=.0001)

In [39]:
# The model output is passed through as the loss by loss_compile.
model1.compile(optimizer=adam,loss=loss_compile)

##### Training loop

In [None]:
# Train in 10 rounds of 100 generator steps each, saving a numbered checkpoint after every round.
for round_idx in range(10):
    model1.fit_generator(gen, steps_per_epoch=100, epochs=1)
    checkpoint_path = "mobile_dynamics_{}.h5".format(round_idx)
    model1.save(checkpoint_path)

##### Stop the virtual machine after training if on the cloud, to avoid paying for idle GPU time

In [None]:
!sudo poweroff