In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
%%capture
import numpy as np
import math
import random
import os
import cv2
import glob
from tqdm import tqdm

import tensorflow as tf
from tensorflow.keras.callbacks import *
from tensorflow.keras.optimizers import *
from tensorflow.keras.models import Model
from tensorflow.keras.layers import *
from tensorflow.keras import backend as K

import tensorflow_addons as tfa

from knn_data_generator import KNNDataGenerator


import sys
sys.path.insert(1, '../')
from constants import *

# Create DD-Net model

In [None]:
## utils.
def get_JCD(frames_batched):
    """Build per-frame joint-distance features.

    Every joint is compared with every joint of the same frame via
    `batch_frames_cdist`, and the resulting square matrix (diagonal and both
    triangles included) is flattened into a single feature axis.

    Args:
        frames_batched: rank-4 tensor; the callers in this notebook pass
            (batch, frames, joints, dims) inputs.

    Returns:
        Tensor of shape (batch, frames, joints * joints).
    """
    pairwise = batch_frames_cdist(frames_batched, frames_batched)
    shape = pairwise.shape
    # Merge the two joint axes; -1 keeps the batch dimension dynamic.
    return tf.reshape(pairwise, (-1, shape[1], shape[2] * shape[3]))


# input shape [batch, 32, 13, 2]
def batch_frames_cdist(a, b):
    """Euclidean distance between every point of `a` and every point of `b`.

    Both arguments are rank-4 tensors whose last axis holds the coordinates
    and whose third axis enumerates the points of one frame. The result has
    one extra "pair" axis in place of the coordinate axis:
    out[n, t, i, j] = || a[n, t, j] - b[n, t, i] ||.
    """
    a_cols = tf.expand_dims(a, axis=2)   # (..., 1, points_a, dims)
    b_rows = tf.expand_dims(b, axis=3)   # (..., points_b, 1, dims)
    squared = tf.square(a_cols - b_rows)  # broadcasts to all point pairs
    return tf.sqrt(tf.reduce_sum(squared, axis=-1))


# input shape [batch, 32, 13, 2]
def pose_motion(raw_poses):
    """Compute frame-to-frame motion features at two temporal rates.

    Args:
        raw_poses: rank-4 tensor, (batch, frames, joints, dims) in this notebook.

    Returns:
        (diff_slow, diff_fast): `diff_slow` holds the differences between
        consecutive frames, `diff_fast` the differences between every second
        frame. In both, the joint and coordinate axes are merged into one.
    """
    def _merge_joint_axes(diff):
        # (batch, frames, joints, dims) -> (batch, frames, joints * dims)
        shape = diff.shape
        return tf.reshape(diff, (-1, shape[1], shape[2] * shape[3]))

    diff_slow = _merge_joint_axes(poses_diff(raw_poses))

    # Keep every second frame before differencing for the fast stream.
    every_other_frame = raw_poses[:, ::2, :, :]
    diff_fast = _merge_joint_axes(poses_diff(every_other_frame))

    return diff_slow, diff_fast


def poses_diff(x):
    """Difference between consecutive frames, padded back to the input length.

    The difference along axis 1 is one frame shorter than the input, so the
    first difference is repeated at the front to restore the original length.
    """
    step = x[:, 1:, :, :] - x[:, :-1, :, :]          # frame t minus frame t-1
    lead = tf.expand_dims(step[:, 0, :, :], axis=1)  # copy of the first difference
    return tf.concat([lead, step], axis=1)


In [None]:
## neural blocks.
def c1D(x, filters, kernel):
    """Apply a bias-free Conv1D ('same' padding), batch norm, then PReLU.

    The PReLU slope is shared across axis 1 of the activation.
    """
    conv = Conv1D(filters, kernel_size=kernel, padding='same', use_bias=False)
    norm = BatchNormalization()
    activation = PReLU(shared_axes=[1])
    return activation(norm(conv(x)))


def block(x, filters):
    """Stack two `c1D` units with kernel size 3 and the same filter count."""
    for _ in range(2):
        x = c1D(x, filters, 3)
    return x


def d1D(x, filters):
    """Apply a bias-free Dense layer, batch norm, then PReLU.

    The PReLU slope is shared across axis 1 of the activation.
    """
    dense = Dense(filters, use_bias=False)
    norm = BatchNormalization()
    activation = PReLU(shared_axes=[1])
    return activation(norm(dense(x)))


#pose encoder
def encode_jcds(x, filters, drop_out=0.4):
    """Encode a sequence of flattened joint-distance features.

    Layer stack: c1D(2*filters, k=1) -> c1D(filters, k=3) -> c1D(filters, k=1),
    with SpatialDropout1D(0.1) after the first two, then MaxPooling1D(2),
    another SpatialDropout1D(0.1) and finally Dropout(drop_out).

    Args:
        x: input tensor accepted by Conv1D.
        filters: base number of convolution filters.
        drop_out: rate of the final Dropout layer.
    """
    # Kernel-1 expansion followed by a kernel-3 convolution, each with light dropout.
    for width, kernel in ((filters*2, 1), (filters, 3)):
        x = c1D(x, width, kernel)
        x = SpatialDropout1D(0.1)(x)

    x = c1D(x, filters, 1)
    x = MaxPooling1D(2)(x)
    x = SpatialDropout1D(0.1)(x)
    return Dropout(drop_out)(x)


# hands encoder
def joints_encoder(filters, drop_out=0.4, input_dim=441):
    """Build a reusable encoder model for flattened joint-distance sequences.

    The layer stack is exactly the one `encode_jcds` applies, so it is built
    through that helper instead of being duplicated here. Wrapping it in a
    `Model` lets the caller apply the same weights to several inputs
    (build_backbone uses one instance for both the left and the right hand).

    Args:
        filters: base number of convolution filters.
        drop_out: rate of the final Dropout layer.
        input_dim: size of the per-frame feature axis. The default 441 is the
            width that was previously hard-coded; build_DD_Net feeds this
            encoder `get_JCD` output of width NUM_HAND_JOINTS * NUM_HAND_JOINTS,
            so the default only fits when that product equals 441.

    Returns:
        A Keras `Model` mapping (NUM_FRAME_SAMPLES, input_dim) sequences to
        their encoded representation.
    """
    encoder_input = Input(shape=(NUM_FRAME_SAMPLES, input_dim))
    encoded = encode_jcds(encoder_input, filters, drop_out=drop_out)
    return Model(inputs=encoder_input, outputs=encoded)


def encode_diff_slow(diff_slow, filters):
    """Encode the consecutive-frame motion stream.

    Same conv/dropout stack as `encode_diff_fast`, plus a MaxPool1D(2) before
    the last dropout.
    """
    encoded = diff_slow
    for width, kernel in ((filters*2, 1), (filters, 3)):
        encoded = c1D(encoded, width, kernel)
        encoded = SpatialDropout1D(0.1)(encoded)

    encoded = c1D(encoded, filters, 1)
    encoded = MaxPool1D(2)(encoded)
    return SpatialDropout1D(0.1)(encoded)

def encode_diff_fast(diff_fast, filters):
    """Encode the every-second-frame motion stream.

    Three c1D units (2*filters with kernel 1, filters with kernel 3, filters
    with kernel 1), each followed by SpatialDropout1D(0.1). There is no pooling
    here; `pose_motion` already subsamples the frames for this stream.
    """
    encoded = diff_fast
    for width, kernel in ((filters*2, 1), (filters, 3), (filters, 1)):
        encoded = c1D(encoded, width, kernel)
        encoded = SpatialDropout1D(0.1)(encoded)
    return encoded

In [None]:
## backbone
def build_backbone(
                    pose_frames,
                    diff_slow, diff_fast,
                    face_frames,
                    left_hand_frames, right_hand_frames,
                    filters):
    """Encode the pose, face and hand streams and concatenate their embeddings.

    Each stream ends in the same head: two c1D/MaxPooling1D stages, Flatten,
    a Dense projection and L2 normalisation. The three normalised vectors
    (256 for pose, 64 for face, 512 for hands) are concatenated, so the
    returned tensor has 832 features per sample.

    Args:
        pose_frames: flattened joint-distance sequence for the body pose.
        diff_slow, diff_fast: motion features as returned by `pose_motion`.
        face_frames: flattened joint-distance sequence for the face.
        left_hand_frames, right_hand_frames: flattened joint-distance
            sequences for each hand; both go through one shared encoder.
        filters: base filter count, scaled per stream below.
    """
    def _embed(features, first_filters, second_filters, embedding_dim):
        # Shared head: conv -> pool(4) -> conv -> pool(2) -> flatten -> dense -> unit norm.
        features = c1D(features, first_filters, 3)
        features = MaxPooling1D(4)(features)
        features = c1D(features, second_filters, 3)
        features = MaxPooling1D(2)(features)
        features = Flatten()(features)
        features = Dense(embedding_dim)(features)
        return tf.math.l2_normalize(features, axis=-1)

    # pose
    pose_encoded = _embed(encode_jcds(pose_frames, filters//2, drop_out=0.4),
                          256, 256, 256)

    # face
    face_encoded = _embed(encode_jcds(face_frames, filters//2, drop_out=0.3),
                          128, 128, 64)

    # hands: motion streams plus per-hand distance features from one shared encoder
    diff_slow_encoded = encode_diff_slow(diff_slow, filters)
    diff_fast_encoded = encode_diff_fast(diff_fast, filters)
    hand_encoder = joints_encoder(int(filters*4), drop_out=0.4)
    left_hands_encoded = hand_encoder(left_hand_frames)
    right_hands_encoded = hand_encoder(right_hand_frames)
    hand_features = concatenate([diff_slow_encoded, diff_fast_encoded,
                                 left_hands_encoded, right_hands_encoded])
    hands = _embed(hand_features, 256, 512, 512)

    # all feats
    return concatenate([pose_encoded, face_encoded, hands])


In [None]:
## build.
def build_DD_Net():
    """Assemble the embedding network from raw keypoint sequences.

    Four inputs are declared (pose, face, left hand, right hand), each of
    shape (NUM_FRAME_SAMPLES, joints, dims). Joint-distance features are
    derived for every input, motion features are derived from the two hands
    stacked along the joint axis, and everything is passed to `build_backbone`.

    Returns:
        A Keras `Model` taking [pose, face, left hand, right hand] inputs and
        producing the concatenated embedding.
    """
    frame_inputs = [
        Input(shape=(NUM_FRAME_SAMPLES, NUM_SELECTED_POSENET_JOINTS, POSENET_JOINT_DIMS),
              name='pose_frames_input'),
        Input(shape=(NUM_FRAME_SAMPLES, NUM_SELECTED_FACE_JOINTS, FACE_JOINT_DIMS),
              name='face_frames_input'),
        Input(shape=(NUM_FRAME_SAMPLES, NUM_HAND_JOINTS, HAND_JOINT_DIMS),
              name='left_hand_frames_input'),
        Input(shape=(NUM_FRAME_SAMPLES, NUM_HAND_JOINTS, HAND_JOINT_DIMS),
              name='right_hand_frames_input'),
    ]
    pose_in, face_in, left_hand_in, right_hand_in = frame_inputs

    # pose distances, then motion of both hands joined along the joint axis
    pose_jcd = get_JCD(pose_in)
    both_hands = concatenate([left_hand_in, right_hand_in], axis=-2)
    hands_slow, hands_fast = pose_motion(both_hands)

    # face and per-hand distances
    face_jcd = get_JCD(face_in)
    left_hand_jcd = get_JCD(left_hand_in)
    right_hand_jcd = get_JCD(right_hand_in)

    # embed and backbone.
    embedding = build_backbone(pose_jcd,
                               hands_slow, hands_fast,
                               face_jcd,
                               left_hand_jcd, right_hand_jcd,
                               filters=NUM_START_FILTERS)

    return Model(inputs=frame_inputs, outputs=embedding)

In [None]:
## input
batch_size = None  # leave the batch dimension unspecified


def _frames_input(name, num_joints, joint_dims):
    """Create one named keypoint-sequence input with an explicit batch shape."""
    return Input(batch_shape=(batch_size, NUM_FRAME_SAMPLES, num_joints, joint_dims),
                 name=name)


pose_frames_input = _frames_input('pose_frames_input',
                                  NUM_SELECTED_POSENET_JOINTS, POSENET_JOINT_DIMS)
face_frames_input = _frames_input('face_frames_input',
                                  NUM_SELECTED_FACE_JOINTS, FACE_JOINT_DIMS)
left_hand_frames_input = _frames_input('left_hand_frames_input',
                                       NUM_HAND_JOINTS, HAND_JOINT_DIMS)
right_hand_frames_input = _frames_input('right_hand_frames_input',
                                        NUM_HAND_JOINTS, HAND_JOINT_DIMS)
model_inputs = [pose_frames_input, face_frames_input,
                left_hand_frames_input, right_hand_frames_input]

embedder_model = build_DD_Net()

# embed: the trainable model is the embedder applied to the inputs declared above.
feats_out = embedder_model(model_inputs)

model = Model(inputs=model_inputs, outputs=feats_out)




In [None]:
# Print the layer-by-layer summary of the wrapper model built above.
model.summary()

# Data Generator

In [None]:
batch_size = 32
TRAIN_KPS_DIR = 'D:/jobs/datasets/video/sign_language/kps/train'

train_generator = KNNDataGenerator(TRAIN_KPS_DIR,
                                     batch_size=batch_size, use_augment=True)

# Validation batches are left un-augmented so the validation loss is measured
# on the stored keypoints rather than on randomly perturbed copies.
# NOTE(review): this generator still reads the *training* directory, so the
# validation loss is not a held-out estimate. Point it at a separate
# validation split once one exists.
val_generator = KNNDataGenerator(TRAIN_KPS_DIR,
                                     batch_size=batch_size, use_augment=False)

## Train

In [None]:
# Train the embedding with the hard-mining triplet loss from TensorFlow Addons.
model.compile(
    optimizer="Adam",
    loss=tfa.losses.triplet_hard_loss,
)

In [None]:
# Create the checkpoint directory up front so the first save cannot fail on a
# missing folder.
os.makedirs("checkpoints", exist_ok=True)

# One file per epoch, named after the epoch number and the training loss.
filepath = ("checkpoints/{epoch:02d}-{loss:.4f}.h5")
checkpoint = ModelCheckpoint(filepath, monitor='loss', save_best_only=False,
                             mode='auto', save_weights_only=False)


# `Model.fit` accepts generators directly; `Model.fit_generator` is deprecated.
# Validation runs every third epoch (validation_freq=3).
history = model.fit(train_generator,
                    steps_per_epoch=200,
                    epochs=200,
                    initial_epoch=0,
                    callbacks=[checkpoint],
                    validation_data=val_generator,
                    validation_steps=500,
                    validation_freq=3,
                    workers=3, use_multiprocessing=False
                    )