In [220]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [1]:
!pip install keras-tuner
!pip install mtcnn
!pip install pillow

In [None]:
#import math
#import os
import numpy as np
import tensorflow as tf
import keras
#import pandas as pd
#import glob
#import cv2
#import os
#import seaborn as sns
#import matplotlib.pyplot as plt
from google.colab.patches import cv2_imshow
#import tensorflow as tf
#from mtcnn import MTCNN
#from keras.applications.resnet import ResNet50
#import mxnet as mx
#from mxnet.gluon.model_zoo.vision import resnet34_v1,resnet34_v2
from keras.applications.vgg19 import VGG19
from keras.models import Sequential
from keras.layers import LSTM, Dense, MultiHeadAttention, Lambda, Conv1D, InputLayer
from keras.utils import to_categorical
#from sklearn.metrics import accuracy_score
from keras import metrics
#from keras.applications.vgg16 import VGG16
from keras.models import Model
from sklearn.model_selection import train_test_split
from keras.layers import Input, Dense, LSTM, Attention, Flatten, GlobalAveragePooling1D, LayerNormalization, Dropout, Concatenate, Layer, BatchNormalization
from keras.models import Model
#from tensorflow.keras.preprocessing.image import array_to_img
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping
#from keras.backend import variable
#from tensorflow.keras.applications import vit
#!pip install tensorflow-addons
#import tensorflow_addons as tfa
#from vit_keras import VisionTransformer
from tensorflow.keras.optimizers import SGD
import pickle
from tensorflow.keras.preprocessing.sequence import pad_sequences
#import kerastuner as kt
from mtcnn import MTCNN
from PIL import Image

###Transformer functions

In [None]:
def transformer_block(inputs, key_dim , num_heads, ff_dim, dropout, name):
  attention_out = MultiHeadAttention(num_heads = num_heads, key_dim=key_dim, name=name+'_multi_head')(inputs,inputs)
  #attention_out = Dense(inputs.shape[-1], name=name+'_attn_dense')(attention_out)
  dropout1 = Dropout(dropout)(attention_out)
  out1 = LayerNormalization()(inputs+dropout1)
  ffn = Sequential([
      Dense(ff_dim,activation='relu'),
      Dense(inputs.shape[2])], name=name+"_ffn")
  ffn_out = ffn(out1)
  dropout1 = Dropout(dropout)(ffn_out)
  final_out = LayerNormalization()(out1+dropout1)
  return final_out

def transformer_model(input, num_transformers,key_dim, num_heads, ff_dim, dropout, pos_encod_type):
  if pos_encod_type=='traditional':
    pos_enc = Traditional_Positional_Encoding()(input)
  elif pos_encod_type=='learned':
    pos_enc = Learned_Positional_Encoding()(input)
  else:
    pos_enc = 0
  
  pos_enc = tf.cast(pos_enc, dtype=tf.float32)
  x = pos_enc + input
  for i in range(num_transformers):
    x += transformer_block(x,key_dim=key_dim, num_heads=num_heads,ff_dim =ff_dim, dropout=dropout,name="Transformer_"+str(i+1))
  x = GlobalAveragePooling1D()(x)
  out = Dense(10,activation='softmax')(x)
  model = Model(inputs=input,outputs=out)
  return model


class Traditional_Positional_Encoding(Layer):
    def __init__(self, **kwargs):
        super(Traditional_Positional_Encoding, self).__init__(**kwargs)
        self.seq_len = 22

    def call(self, inputs,**kwargs):
      input_dim = inputs.shape[-1]
      seq_len = self.seq_len
      max_len=25.0
      k = np.arange(0,input_dim,dtype=float)[None,:]
      pos = np.arange(0,seq_len, dtype=float)[:,None]
        
      i = k//2
      pe = pos/(max_len**(2*i/input_dim))
      pe[:,0::2] = np.sin(pe[:,0::2])
      pe[:,1::2] = np.cos(pe[:,1::2])
      pe = tf.convert_to_tensor(pe)
      pe = tf.expand_dims(pe, axis=0)
      return pe


class Learned_Positional_Encoding(Layer):
    def __init__(self, **kwargs):
        super(Learned_Positional_Encoding, self).__init__(**kwargs)
        self.conv1= Conv1D(filters=128, kernel_size=5, padding='same', activation='relu')
        self.conv2 = Conv1D(filters=128, kernel_size=1, activation='linear')
        self.dense1 = Dense(units=1, activation=None)

    def call(self, inputs,**kwargs):
      batch_size, seq_len, input_dim = inputs.shape
      pe = inputs * tf.math.sqrt(tf.cast(input_dim, tf.float32))
      pe = Lambda(lambda i: tf.expand_dims(i, axis=-1))(pe)
      pe = self.conv1(pe)
      pe = self.conv2(pe)
      pe = self.dense1(pe)
      pe = tf.reshape(pe,tf.shape(inputs))
      return pe

###Feature vectors from RESNET for words

In [None]:
with open('/content/gdrive/MyDrive/HW_CS7150_DL/pkl/miracl_feature_maps_resnet_words.pkl', 'rb') as f:
    data = pickle.load(f)

X = data['x']
y = data['y']-1

max_seq_len = max(len(seq) for seq in X)
X_padded = pad_sequences(X, maxlen=max_seq_len, padding='pre')
print(X_padded.shape)

(1500, 22, 2048)


In [None]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(X_padded, np.array(y), test_size=0.2, random_state=42)

print("Training data shape:", X_train2.shape)
print("Testing data shape:", X_test2.shape)
print("Training label shape:", y_train2.shape)
print("Testing label shape:", y_test2.shape)

Training data shape: (1200, 22, 2048)
Testing data shape: (300, 22, 2048)
Training label shape: (1200,)
Testing label shape: (300,)


###RESNET + LSTM for words

In [None]:
# Define model architecture
model_resnet_words = Sequential()
model_resnet_words.add(LSTM(units=128, input_shape=(22,2048)))
model_resnet_words.add(Dense(units=10, activation='softmax'))

# Compile model
model_resnet_words.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Split data into train and test sets

train_y2 = to_categorical(y_train2, num_classes=10)
test_y2 = to_categorical(y_test2, num_classes=10)

# Train model with input data of shape (1500, 22, 2048) and validate on test set
history_resnet_words = model_resnet_words.fit(X_train2, train_y2 , epochs=50, validation_data=(X_test2, test_y2), batch_size=128)

# Predict using the trained model
#predictions = model.predict(X_test2)
#print(predictions.shape)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


### RESNET + LSTM with Attention for words

In [None]:
# Set the input shape
input_shape = (22, 2048)

def feat_res(input_shape):
  
# Define the attention-based LSTM model
  inputs = Input(shape=input_shape)
  lstm_out = LSTM(128, return_sequences=True)(inputs)
  attention_out = Attention()([lstm_out, lstm_out],return_attention_scores=True)
  flatten_out = Flatten()(attention_out[0])
  dense_out = Dense(10, activation='softmax')(flatten_out)
  #heatmap_out = AttentionHeatmap()([dense_out, attention_out[1]])
  model = Model(inputs=inputs, outputs=dense_out)
  return model, attention_out[1],attention_out

# Compile model
model_words_res_att, attention_weights, attention_out = feat_res(input_shape)
model_words_res_att.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Split data into train and test sets
train_y2 = to_categorical(y_train2, num_classes=10)
test_y2 = to_categorical(y_test2, num_classes=10)

# Train model with input data of shape (1500, 22, 2048) and validate on test set
hist_words_res_att = model_words_res_att.fit(X_train2, train_y2 , epochs=50, validation_data=(X_test2, test_y2), batch_size=128)

# Predict using the trained model
#predictions_resnet_attention = model_words_res_att.predict(X_test2)
#print(predictions_resnet_attention.shape)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


###RESNET + Transformer(traditional positional encoding) for words

In [None]:
#with gradient clipping

input_shape = (22, 2048)
input = tf.keras.layers.Input(shape=input_shape)
model_words_res_trans = transformer_model(input, key_dim = 64, num_transformers=2, num_heads=16, ff_dim=768, dropout=0.1, pos_encod_type='traditional')

optimizer = SGD(learning_rate=0.001, momentum=0.9, clipvalue=0.5) 
model_words_res_trans.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

train_y2 = to_categorical(y_train2, num_classes=10)
test_y2 = to_categorical(y_test2, num_classes=10)

# Train model with input data of shape (1500,22,2048) and validate on test set
hist_words_res_trans = model_words_res_trans.fit(X_train2, train_y2 , epochs=50, validation_data=(X_test2, test_y2), batch_size=64)#, callbacks=[EarlyStopping(patience=3)])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


###Feature vectors from VGGnet for words

In [None]:
with open('/content/gdrive/MyDrive/HW_CS7150_DL/pkl/miracl_feature_maps_vgg_words.pkl', 'rb') as f:
    data = pickle.load(f)

X = data['x']
y = data['y']-1


max_seq_len = max(len(seq) for seq in X)
X_padded = pad_sequences(X, maxlen=max_seq_len, padding='pre')
print(X_padded.shape)


(1500, 22, 512)


In [None]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(X_padded, np.array(y), test_size=0.2, random_state=42)

print("Training data shape:", X_train2.shape)
print("Testing data shape:", X_test2.shape)
print("Training label shape:", y_train2.shape)
print("Testing label shape:", y_test2.shape)

Training data shape: (1200, 22, 512)
Testing data shape: (300, 22, 512)
Training label shape: (1200,)
Testing label shape: (300,)


###Vggnet + LSTM for words

In [None]:
# Define model architecture
model_vgg_words = Sequential()
model_vgg_words.add(LSTM(units=128, input_shape=(22,512)))
model_vgg_words.add(Dense(units=10, activation='softmax'))

# Compile model
model_vgg_words.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Split data into train and test sets

train_y2 = to_categorical(y_train2, num_classes=10)
test_y2 = to_categorical(y_test2, num_classes=10)

# Train model with input data of shape (1500, 22, 512) and validate on test set
history_vgg_words = model_vgg_words.fit(X_train2, train_y2 , epochs=50, validation_data=(X_test2, test_y2), batch_size=128)

# Predict using the trained model
#predictions = model.predict(X_test2)
#print(predictions.shape)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


###VGGnet + Attention LSTM for words

In [None]:
# Set the input shape
input_shape = (22, 512)

# Define the attention-based LSTM model
inputs = Input(shape=input_shape)
lstm_out = LSTM(128, return_sequences=True)(inputs)
attention_out = Attention()([lstm_out, lstm_out])
flatten_out = Flatten()(attention_out)
dense_out = Dense(10, activation='softmax')(flatten_out)
model_words_vgg_att = Model(inputs=inputs, outputs=dense_out)

# Compile model
model_words_vgg_att.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Split data into train and test sets
train_y2 = to_categorical(y_train2, num_classes=10)
test_y2 = to_categorical(y_test2, num_classes=10)

# Train model with input data of shape (1500, 22, 512) and validate on test set
hist_words_vgg_att = model_words_vgg_att.fit(X_train2, train_y2 , epochs=50, validation_data=(X_test2, test_y2), batch_size=128)

# Predict using the trained model
#predictions_resnet_attention = model_words_res_att.predict(X_test2)
#print(predictions_resnet_attention.shape)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


###Video Preprocessing to add the captions

In [None]:
import cv2
import os

# Open the video file
video = cv2.VideoCapture('/content/gdrive/MyDrive/HW_CS7150_DL/Web.mp4')

# Create a directory to save the frames
if not os.path.exists('/content/gdrive/MyDrive/HW_CS7150_DL/vid_frames5'):
    os.makedirs('/content/gdrive/MyDrive/HW_CS7150_DL/vid_frames5')

# Initialize frame count and flag for reading the video
frame_count = 0
success = True

# Loop through the frames and save them to the frames directory
while success:
    # Read the frame
    success, image = video.read()

    if success:
        # Save the frame as an image
        cv2.imwrite(f'/content/gdrive/MyDrive/HW_CS7150_DL/vid_frames5/frame_{frame_count:04d}.jpg', image)

        # Increment the frame count
        frame_count += 1

# Release the video file and print the number of frames extracted
video.release()
print(f'{frame_count} frames extracted.')


49 frames extracted.


In [None]:
import glob
file_paths = glob.glob('/content/gdrive/MyDrive/HW_CS7150_DL/vid_frames5' + '/*.jpg')
print(file_paths[1][-6], file_paths[13][-6])
imgs = []
for i in file_paths:
  if int(i[-6])==0:
    if int(i[-5])%3==0:
      imgs.append(i)
  elif int(i[-6:-4]) == 32 or int(i[-6:-4])==28 or int(i[-6:-4]) == 14 or int(i[-6:-4])== 23 or int(i[-6:-4])== 37:
    imgs.append(i)
  else:
    if int(i[-6:-4])%3==0:
      imgs.append(i)

len(imgs)

0 1


22

In [None]:
# Load the MTCNN model
detector = MTCNN(min_face_size=75)
save_path = '/content/gdrive/MyDrive/HW_CS7150_DL/lip_test4/'
for k in range(len(imgs)):
   i = imgs[k]
   # Load the image
   img = cv2.imread(imgs[k])
   # Detect faces in the image
   faces = detector.detect_faces(img)
   # Crop and save each detected face
   cnt=0
   for j, face in enumerate(faces):
     x, y, w, h = face["box"]
     cropped_img = img[y:y+h, x:x+w]

    # Crop the bounding box area and save it as a new image
     box_size=65
     x1 = x + w // 2 - 46  # calculate top-left corner x-coordinate
     y1 = y + h // 2 + 50    #box_size // 25  # calculate top-left corner y-coordinate
     x2 = x1 + 110            # calculate bottom-right corner x-coordinate
     y2 = y1 + 55    #box_size//2             # calculate bottom-right corner y-coordinate
     cropped_box = img[y1:y2, x1:x2]
     cv2.imwrite(os.path.join(save_path, f"cropped_{int(i[-6:-4])}.jpg"), cropped_box)



In [None]:
cropped_imgs = glob.glob('/content/gdrive/MyDrive/HW_CS7150_DL/lip_test4' + '/*.jpg')
for filename in cropped_imgs:
    img = Image.open(filename)
    new_img = img.resize((58,25))
    new_img.save(os.path.join('/content/gdrive/MyDrive/HW_CS7150_DL/resized_lip2/', filename[-13:]))

In [None]:
cropped_imgs = glob.glob('/content/gdrive/MyDrive/HW_CS7150_DL/resized_lip2' + '/*.jpg')
sor = {}
for i in cropped_imgs:
  if i[-6]=="_":
    sor[int(i[-5])] = i
  else:
    sor[int(i[-6:-4])] = i
sor

{0: '/content/gdrive/MyDrive/HW_CS7150_DL/resized_lip2/cropped_0.jpg',
 3: '/content/gdrive/MyDrive/HW_CS7150_DL/resized_lip2/cropped_3.jpg',
 6: '/content/gdrive/MyDrive/HW_CS7150_DL/resized_lip2/cropped_6.jpg',
 9: '/content/gdrive/MyDrive/HW_CS7150_DL/resized_lip2/cropped_9.jpg',
 12: '/content/gdrive/MyDrive/HW_CS7150_DL/resized_lip2/ropped_12.jpg',
 14: '/content/gdrive/MyDrive/HW_CS7150_DL/resized_lip2/ropped_14.jpg',
 15: '/content/gdrive/MyDrive/HW_CS7150_DL/resized_lip2/ropped_15.jpg',
 18: '/content/gdrive/MyDrive/HW_CS7150_DL/resized_lip2/ropped_18.jpg',
 21: '/content/gdrive/MyDrive/HW_CS7150_DL/resized_lip2/ropped_21.jpg',
 23: '/content/gdrive/MyDrive/HW_CS7150_DL/resized_lip2/ropped_23.jpg',
 24: '/content/gdrive/MyDrive/HW_CS7150_DL/resized_lip2/ropped_24.jpg',
 27: '/content/gdrive/MyDrive/HW_CS7150_DL/resized_lip2/ropped_27.jpg',
 28: '/content/gdrive/MyDrive/HW_CS7150_DL/resized_lip2/ropped_28.jpg',
 30: '/content/gdrive/MyDrive/HW_CS7150_DL/resized_lip2/ropped_30.jp

In [None]:
img_seq = []
for i in range(len(sor)):
  img_p = cv2.imread(list(sor.values())[i])
  img_array = np.array(img_p).reshape(1,25,58,3)
  img_seq.append(img_array)

img_seq=np.array(img_seq).reshape(22,25,58,3)
img_seq.shape

(22, 25, 58, 3)

In [None]:
x_norm = img_seq/255.

# Load the pre-trained VGG16 model
vgg_model = VGG19(weights='imagenet', include_top=False)

# Remove the last layer of the VGG16 model
features_model = Model(inputs=vgg_model.input, outputs=vgg_model.get_layer('block5_conv4').output)

# Reshape input data
input_shape = (25, 58, 3)
X = tf.keras.Input(shape=input_shape)
Y = tf.keras.layers.Reshape((25, 58, 3))(X)

# Pass input through VGG16 model
features = features_model(Y)

# Create model with inputs and outputs
model = tf.keras.Model(inputs=X, outputs=features)

# Compile model
model.compile(optimizer='adam', loss='categorical_crossentropy')

# Generate fake data for testing
x = x_norm

features_vgg16 = model.predict(x)
print(features_vgg16.shape)


(22, 1, 3, 512)


In [221]:
xt = np.mean(features_vgg16,axis=2).reshape(1,22,512)
print(xt.shape)
pred = model_words_vgg_att.predict(xt)
pred = np.argmax(pred[0])

words = {1:"Begin", 2:"Choose", 3:"Connection", 4:"Navigation", 5:"Next",
         6:"Previous", 7:"Start", 8:"Stop", 9:"Hello", 10:"Web"}


(1, 22, 512)


###Adding captions to the video from the predicted class

In [235]:
import cv2
import numpy as np

# Load the video file
cap = cv2.VideoCapture('/content/gdrive/MyDrive/HW_CS7150_DL/Web.mp4')

# Define the codec and create VideoWriter object
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter('/content/gdrive/MyDrive/HW_CS7150_DL/output_web5.mp4', fourcc, cap.get(cv2.CAP_PROP_FPS), (int(cap.get(3)), int(cap.get(4))))

# Set the text caption and font
text = words[pred+1]
font = cv2.FONT_HERSHEY_SIMPLEX
font_scale = 1
font_thickness = 2

# Calculate the desired time to display the text
total_time = cap.get(cv2.CAP_PROP_FRAME_COUNT) / cap.get(cv2.CAP_PROP_FPS)
text_time = total_time - 0.65  # Display the text 2 seconds before the end of the video

# Process each frame in the video
while True:
    # Capture frame-by-frame
    ret, frame = cap.read()
    if not ret:
        break

    # Get the current time in seconds
    current_time = cap.get(cv2.CAP_PROP_POS_MSEC) / 1000

    # Write text on the frame if the time is almost at the end
    if current_time >= text_time:
        text_size = cv2.getTextSize(text, font, font_scale, font_thickness)[0]
        text_x = int(frame.shape[1] / 2 - text_size[0] / 2)
        text_y = frame.shape[0] - text_size[1] - 10
        cv2.putText(frame, text, (text_x, text_y), font, font_scale, (255, 255, 255), font_thickness)

    # Write the frame to the output video file
    out.write(frame)

    # Display the resulting frame
    #cv2.imshow('frame', frame)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

# Release everything if job is finished
cap.release()
out.release()
#cv2.destroyAllWindows()


##Hyerparameter Tuning for Tranformer

In [None]:
# Define the function to create the model
def create_model(hp):
    input_shape = (22, 2048)
    input_layer = tf.keras.layers.Input(shape=input_shape)
    num_transformers = hp.Int('num_transformers', min_value=2, max_value=4, step=1)
    num_heads = hp.Int('num_heads', min_value=4, max_value=16, step=4)
    ff_dim = hp.Int('ff_dim', min_value=256, max_value=1024, step=256)
    key_dim = hp.Int('key_dim', min_value=32, max_value=128, step=32)
    #dropout = hp.Float('dropout', min_value=0.1, max_value=0.1, step=0.1)
    #pos_encod_type = hp.Choice('pos_encod_type', values=['traditional'])

    model = transformer_model(input_layer, num_transformers=num_transformers, num_heads=num_heads, ff_dim=ff_dim, key_dim=key_dim, dropout=0.1, pos_encod_type='traditional')
    #out = Dense(10, activation='softmax')(x)
    
    learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 0.005])
    optimizer = hp.Choice('optimizer', values=['adam', 'sgd'])
    
    if optimizer == 'adam':
        opt = keras.optimizers.Adam(learning_rate=learning_rate)
    else:
        opt = keras.optimizers.SGD(learning_rate=learning_rate, momentum=0.9)
    
    #model = Model(inputs=input_layer, outputs=out)
    model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# Define the search space for hyperparameters
tuner = kt.Hyperband(
    create_model,
    objective='val_accuracy',
    max_epochs=30,
    factor=3
    #directory='my_dir',
    #project_name='my_project'
)

tuner.search_space_summary()
# Prepare the data
train_y2 = tf.keras.utils.to_categorical(y_train2, num_classes=10)
test_y2 = tf.keras.utils.to_categorical(y_test2, num_classes=10)

# Run the hyperparameter search
tuner.search(X_train2, train_y2, epochs=30, validation_data=(X_test2, test_y2), batch_size=64)#, callbacks=[EarlyStopping(patience=5)])

# Get the best hyperparameters
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
print(best_hps.values)


Trial 90 Complete [00h 03m 28s]
val_accuracy: 0.7566666603088379

Best val_accuracy So Far: 0.8600000143051147
Total elapsed time: 00h 53m 48s
{'num_transformers': 2, 'num_heads': 16, 'ff_dim': 768, 'key_dim': 64, 'learning_rate': 0.005, 'optimizer': 'sgd', 'tuner/epochs': 50, 'tuner/initial_epoch': 17, 'tuner/bracket': 2, 'tuner/round': 2, 'tuner/trial_id': '0068'}


###VGGnet + Transformer for words (traditional positional encoding)

In [None]:
#with gradient clipping

input_shape = (22, 2048)
input = tf.keras.layers.Input(shape=input_shape)
model = transformer_model(input, key_dim = 128, num_transformers=2, num_heads=16, ff_dim=1024, dropout=0.1, pos_encod_type='traditional')

optimizer = SGD(learning_rate=0.005, momentum=0.9, clipvalue=0.5) # Define optimizer with learning rate and momentum
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

train_y2 = to_categorical(y_train2, num_classes=10)
test_y2 = to_categorical(y_test2, num_classes=10)

# Train model with input data of shape (1500, 22, 2048) and validate on test set
model.fit(X_train2, train_y2 , epochs=50, validation_data=(X_test2, test_y2), batch_size=64)#, callbacks=[EarlyStopping(patience=3)])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f0bbce86460>

###VGGnet + Transformer for words (learned positional encoding)

In [None]:
#with gradient clipping

input_shape = (22, 2048)
input = tf.keras.layers.Input(shape=input_shape)
model = transformer_model(input, key_dim = 128, num_transformers=2, num_heads=16, ff_dim=1024, dropout=0.1, pos_encod_type='learned')

optimizer = SGD(learning_rate=0.005, momentum=0.9, clipvalue=0.5) # Define optimizer with learning rate and momentum
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

train_y2 = to_categorical(y_train2, num_classes=10)
test_y2 = to_categorical(y_test2, num_classes=10)

# Train model with input data of shape (1500,22,2048) and validate on test set
model.fit(X_train2, train_y2 , epochs=50, validation_data=(X_test2, test_y2), batch_size=128)#, callbacks=[EarlyStopping(patience=3)])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f0bb8c2dc40>

In [None]:
#without gradient clipping

input_shape = (22, 2048)
input = tf.keras.layers.Input(shape=input_shape)
model = transformer_model(input, key_dim = 128, num_transformers=2, num_heads=16, ff_dim=1024, dropout=0.1, pos_encod_type='learned')

optimizer = SGD(learning_rate=0.005, momentum=0.9)#, clipvalue=0.5) # Define optimizer with learning rate and momentum
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

train_y2 = to_categorical(y_train2, num_classes=10)
test_y2 = to_categorical(y_test2, num_classes=10)

# Train model with input data of shape (1500,22,2048) and validate on test set
model.fit(X_train2, train_y2 , epochs=50, validation_data=(X_test2, test_y2), batch_size=128)#, callbacks=[EarlyStopping(patience=3)])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f0b6c6c14c0>