## Create encoder
I will use the pretrained VGG16 for the encoding part


In [1]:
import keras
from keras.applications.vgg16 import preprocess_input
from keras.applications.vgg16 import VGG16
import numpy as np
# import required module
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from keras.utils import to_categorical, pad_sequences
from keras.layers import (LSTM, Embedding, 
    TimeDistributed, Dense, RepeatVector, 
    Activation, Flatten, Reshape, concatenate,  
    Dropout, BatchNormalization)

# assign directory

In [2]:

import os
import string
import glob
from keras.applications import MobileNet
import keras.applications.mobilenet  

from keras.applications.inception_v3 import InceptionV3
import keras.applications.inception_v3


from tqdm import tqdm
import keras.preprocessing.image
import pickle
from time import time
import numpy as np
from PIL import Image
from keras.models import Sequential
from keras.layers import (LSTM, Embedding, 
    TimeDistributed, Dense, RepeatVector, 
    Activation, Flatten, Reshape, concatenate,  
    Dropout, BatchNormalization)
from keras.optimizers import Adam, RMSprop
from keras import Input, layers
from keras import optimizers

from keras.models import Model

from keras.layers import add
from keras.utils import to_categorical
import matplotlib.pyplot as plt


## Translate the groundTruth.txt to english
I decided to go with english given my expreience with existing arabic `word2vec` models. <br>
For the embeddings I will use spaCy by which yields 300 words embedding

In [3]:
captionlist = {
    1: "god name",
    2: "thank god",
    3: "all deaf hearing arabs",
    4: "peace be upon you",
    5: "today i present to you another programme",
    6: "the subject of the study of arabic sign language",
    7: "today words are sparse in religion",
    8: "also ordinary words",
    9: "no partner of god",
    10: "allah is the greatest"

}

captions = pd.DataFrame.from_dict(captionlist, orient='index')
# change the index to be in range 1 - 10
captions.index = [i+1 for i in range(10)]
captions

Unnamed: 0,0
1,god name
2,thank god
3,all deaf hearing arabs
4,peace be upon you
5,today i present to you another programme
6,the subject of the study of arabic sign language
7,today words are sparse in religion
8,also ordinary words
9,no partner of god
10,allah is the greatest


Set variables

In [4]:
MAX_SEQ_LENGTH = len(max(captions[0], key=len).split(" "))
VOCAB = set(" ".join(captions[0]).split(" "))
VOCAB_SIZE = len(VOCAB)
IMG_SIZE = 224
START = "startseq"
STOP = "endseq"
EMBEDDING_SHAPE = 300
OUTPUT_DIM = 2048


add the start and end tokens for each caption <br>
`startseq`  {Caption}   `endseq `

In [5]:
for i in range(10):
    captions[0][i+1] = START + " " + captions[0][i+1] + " " + STOP

captions

Unnamed: 0,0
1,startseq god name endseq
2,startseq thank god endseq
3,startseq all deaf hearing arabs endseq
4,startseq peace be upon you endseq
5,startseq today i present to you another progra...
6,startseq the subject of the study of arabic si...
7,startseq today words are sparse in religion en...
8,startseq also ordinary words endseq
9,startseq no partner of god endseq
10,startseq allah is the greatest endseq


## Load data
Only dirs are loaded. This is done to avoid the heavy load on memeory, the data will be dynamically loaded through some utilites 

In [6]:


directory = '../data/train'

videos = []
images_dir = []
labels = []
df = pd.DataFrame()
# iterate over files in
# that directory
for label in os.listdir(directory):
    f1 = os.path.join(directory, label)
    for video in os.listdir(f1):
        f2 = os.path.join(f1, video)
        for frame in os.listdir(f2):
            videos.append(f2)
            images_dir.append(os.path.join(f2, frame))
            labels.append(int(label))

df['video_name'] = videos
df['image_dir'] = images_dir
df['caption'] = labels
df['caption'] = df['caption'].apply(lambda x: captions[0][x])

df.head()


Unnamed: 0,video_name,image_dir,caption
0,../data/train/0003/01_0003_(10_03_21_21_04_26)_c,../data/train/0003/01_0003_(10_03_21_21_04_26)...,startseq all deaf hearing arabs endseq
1,../data/train/0003/01_0003_(10_03_21_21_04_26)_c,../data/train/0003/01_0003_(10_03_21_21_04_26)...,startseq all deaf hearing arabs endseq
2,../data/train/0003/01_0003_(10_03_21_21_04_26)_c,../data/train/0003/01_0003_(10_03_21_21_04_26)...,startseq all deaf hearing arabs endseq
3,../data/train/0003/01_0003_(10_03_21_21_04_26)_c,../data/train/0003/01_0003_(10_03_21_21_04_26)...,startseq all deaf hearing arabs endseq
4,../data/train/0003/01_0003_(10_03_21_21_04_26)_c,../data/train/0003/01_0003_(10_03_21_21_04_26)...,startseq all deaf hearing arabs endseq


In [8]:
def build_feature_extractor():
    InceptionV3_model = keras.applications.InceptionV3(
        weights="imagenet",
        include_top=False,
        pooling="avg",
        input_shape=(IMG_SIZE, IMG_SIZE, 3),
    )

    inputs = keras.Input((IMG_SIZE, IMG_SIZE, 3))

    # preprocess the input
    preprocess_input = keras.applications.inception_v3.preprocess_input(inputs)

    # extract the features from the preprocessed input
    outputs = InceptionV3_model(preprocess_input)

    # form the final model 
    myModel = keras.Model(inputs, outputs, name="feature_extractor")
    
    return myModel


feature_extractor = build_feature_extractor()

## Define utilites for data reading and encoding

In [9]:

def image_reader(img_paths):
    """
    Takes and array of paths and read and resize the them.
    This is for dynamic reading
    """
    if not isinstance(img_paths, list):
        img_paths = [img_paths]
    images = []
    for path in img_paths:
        image = plt.imread(path)
        image = resize_image(image)
        images.append(image)
    return np.array(images)


def resize_image(image):
    """
    Resize images to a desired shape, does not account for aspect ratio 
    """
    _,_,depth = image.shape
    return np.resize(image, (IMG_SIZE, IMG_SIZE, depth))

def gather_video_frames(video_path, df):
    """
    Returns all dirs of frames that belong to a video
    """
    frames = df[df['video_name'] == video_path]['image_dir'].values.tolist()
    return frames
    
# def prepare_all_videos(df):
#     video_paths = np.unique(df['video_name']).tolist()
#     captions = [df[df['video_name'] == i]['caption'] for i in video_paths]

#     allVideosFeatures = []
#     for idx, path in enumerate(video_paths):
#         frames = image_reader(gather_video_frames(path, df))
#         videoFeatures = feature_extractor.predict(frames, verbose=0)
#         allVideosFeatures.append(videoFeatures.squeeze())

#     return np.array(allVideosFeatures), captions
    
def prepare_single_video(df, path):
    """
    Takes a video and encode all its frames and return the encoded frames and the captions
    The backbone of the generator as it allows dynamic reading of the data
    """
    caption = df[df['video_name'] == path]['caption'].values.tolist()[0]
    videoFeatures = []
    frames = image_reader(gather_video_frames(path,df))
    features = feature_extractor.predict(frames, verbose=0)
    videoFeatures.append(features.squeeze())

    return np.squeeze(np.array(videoFeatures)), caption

## Prepare word dictionaries and embeddings
Two dictionaries are defined:
* `idxtoword`: assists in the prediction 
* `wordtoidx`: to set up the labels correctly for the model

In [10]:
import spacy

nlp = spacy.load('en_core_web_lg')
embedding_matrix = np.zeros((VOCAB_SIZE, EMBEDDING_SHAPE))

def get_word_embedding(word):
    return nlp(word).vector

idxtoword = {}
wordtoidx = {}

for i,w in enumerate(VOCAB):
    print(i,w)
    wordtoidx[w] = i
    idxtoword[i] = w
    embedding_matrix[i] = get_word_embedding(w)


embedding_matrix.shape

0 name
1 the
2 partner
3 another
4 upon
5 also
6 allah
7 be
8 programme
9 is
10 thank
11 all
12 hearing
13 of
14 are
15 to
16 religion
17 no
18 words
19 present
20 sparse
21 ordinary
22 sign
23 peace
24 you
25 today
26 subject
27 study
28 god
29 language
30 deaf
31 i
32 greatest
33 arabic
34 in
35 arabs


(36, 300)

In [11]:
def data_generator(df, wordtoidx, \
                    batch_size):
  # x1 - Training data for df
  # x2 - The caption that goes with each photo
  # y - The predicted rest of the caption
  x1, x2, y = [], [], []
  n=0

  videos = np.unique(df['video_name']).tolist()

  while True:
    for video in videos:
      n+=1
      hidden_state, caption = prepare_single_video(df, video)
      # Convert each word into a list of sequences.
      seq = [wordtoidx[word] for word in caption.split(' ') \
               if word in wordtoidx]
      # Generate a training case for every possible sequence and outcome
      for i in range(1, len(seq)):
          in_seq, out_seq = seq[:i], seq[i]
          in_seq = pad_sequences([in_seq], maxlen=MAX_SEQ_LENGTH)[0]
          out_seq = to_categorical([out_seq], num_classes=VOCAB_SIZE)[0]
          x1.append(hidden_state)
          x2.append(in_seq)
          y.append(out_seq)
      if n==batch_size:
        yield ([np.array(x1)[0], np.array(x2)], np.array(y))
        x1, x2,y = [], [], []
        n=0


In [12]:
from keras.models import Model

inputs1 = keras.Input(shape=(80, OUTPUT_DIM))
fe1 = Dropout(0.5)(inputs1)
fe2 = LSTM(256, activation='relu')(fe1)
inputs2 = keras.Input(shape=(MAX_SEQ_LENGTH,))
se1 = Embedding(VOCAB_SIZE, EMBEDDING_SHAPE, mask_zero=True)(inputs2)
se2 = Dropout(0.5)(se1)
se3 = LSTM(256)(se2)
decoder1 = add([fe2, se3])
decoder2 = Dense(256, activation='relu')(decoder1)
outputs = Dense(VOCAB_SIZE, activation='softmax')(decoder2)
caption_model = Model(inputs=[inputs1, inputs2], outputs=outputs)

caption_model.layers[2].set_weights([embedding_matrix])
caption_model.layers[2].trainable = False
adamOptimizer = keras.optimizers.Adam(learning_rate = 1e-4)
caption_model.compile(loss='categorical_crossentropy', optimizer=adamOptimizer, metrics=['accuracy'])


In [13]:
caption_model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_4 (InputLayer)           [(None, 9)]          0           []                               
                                                                                                  
 input_3 (InputLayer)           [(None, 80, 2048)]   0           []                               
                                                                                                  
 embedding (Embedding)          (None, 9, 300)       10800       ['input_4[0][0]']                
                                                                                                  
 dropout (Dropout)              (None, 80, 2048)     0           ['input_3[0][0]']                
                                                                                              

In [14]:
checkpoint = keras.callbacks.ModelCheckpoint(
        '../model', save_weights_only=True, save_best_only=True, verbose=1 )
earlyStopper = keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)     

generator = data_generator(df, wordtoidx, 80)
caption_model.fit(generator, epochs=10,
                  verbose=1, callbacks=[checkpoint, earlyStopper])

2022-12-05 14:15:02.422046: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 1/10


ValueError: in user code:

    File "/Users/waleedalasad/Documents/GitHub/ICS471_Assignments/venv/lib/python3.9/site-packages/keras/engine/training.py", line 1160, in train_function  *
        return step_function(self, iterator)
    File "/Users/waleedalasad/Documents/GitHub/ICS471_Assignments/venv/lib/python3.9/site-packages/keras/engine/training.py", line 1146, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/Users/waleedalasad/Documents/GitHub/ICS471_Assignments/venv/lib/python3.9/site-packages/keras/engine/training.py", line 1135, in run_step  **
        outputs = model.train_step(data)
    File "/Users/waleedalasad/Documents/GitHub/ICS471_Assignments/venv/lib/python3.9/site-packages/keras/engine/training.py", line 993, in train_step
        y_pred = self(x, training=True)
    File "/Users/waleedalasad/Documents/GitHub/ICS471_Assignments/venv/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/Users/waleedalasad/Documents/GitHub/ICS471_Assignments/venv/lib/python3.9/site-packages/keras/engine/input_spec.py", line 232, in assert_input_compatibility
        raise ValueError(

    ValueError: Exception encountered when calling layer "model" "                 f"(type Functional).
    
    Input 0 of layer "lstm" is incompatible with the layer: expected ndim=3, found ndim=2. Full shape received: (None, None)
    
    Call arguments received by layer "model" "                 f"(type Functional):
      • inputs=('tf.Tensor(shape=(None, None), dtype=float32)', 'tf.Tensor(shape=(None, 9), dtype=int32)')
      • training=True
      • mask=None


In [31]:
def is_prime(n):
  for i in range(2,n):
    if (n%i) == 0:
      return False
  return True


i = 1
a = []
for j in range(10000000):
    a.append(1/(j+1) * i)
    # i = i * -1

sum(a),np.log(10000000)

(16.695311365857272, 16.11809565095832)