In [1]:
# importing the required libraries
import pandas as pd
import numpy as np
import cv2
import os
import glob
import json

In [2]:
# glob pattern for the feature files: the trailing *.npy matches every
# .npy file inside the training feature folder
path = (
    r'D:\Video_captioning_project\MLDS_hw2_data'
    r'\training_data\feat'
    r'\*.npy'
)

In [3]:
# creating a dictionary that maps each file name (without the .npy extension)
# to the feature array stored in that file
feature_dict = {}
for file_path in glob.glob(path):
    # os.path takes care of the platform's path separator, so the key is the
    # bare file name on every OS and not only when the path contains '\'
    filename = os.path.splitext(os.path.basename(file_path))[0]
    feature_dict[filename] = np.load(file_path)

In [4]:
# number of feature files loaded (one dictionary entry per file name)
len(feature_dict)

1450

In [5]:
# loading the labels: the JSON file is parsed into captions_dict_list, whose
# entries each carry an "id" and a "caption" list.
# The encoding is passed explicitly so decoding does not depend on the OS
# locale, and the `with` block closes the file itself, so no f.close() is needed.
with open(r'D:\Video_captioning_project\MLDS_hw2_data\training_label.json', 'r', encoding='utf-8') as f:
    captions_dict_list = json.load(f)

In [6]:
# re-keying the labels by file name, mirroring feature_dict:
# captions_dict[file name] -> list of captions for that file
captions_dict = {entry["id"]: entry["caption"] for entry in captions_dict_list}

# the raw list has served its purpose, so it is dropped
del captions_dict_list

# Text Preprocessing

In [7]:
# normalising every caption: lower-case it, remove leading/trailing periods
# and wrap it in the <sos> / <eos> marker tokens
# (re-running this cell would wrap the already wrapped captions again)
for name in captions_dict:
    captions_dict[name] = [
        '<sos> ' + caption.lower().strip('.') + ' <eos>'
        for caption in captions_dict[name]
    ]

In [8]:
# making a dictionary for glove embeddings:
# word_to_vec_map maps each word to its corresponding glove vector (float64 array)
word_to_vec_map = {}
with open('glove.6B.50d.txt', 'r', encoding = 'utf-8') as f:
    for raw_line in f:
        fields = raw_line.strip().split()
        # skip blank lines (e.g. a trailing empty line) instead of failing on fields[0]
        if not fields:
            continue
        # first field is the word, the remaining fields are the vector components
        word = fields[0]
        word_to_vec_map[word] = np.array(fields[1:], dtype = np.float64)
# the `with` block has already closed the file, so no explicit f.close() is needed

In [9]:
# tokenizing: every distinct word gets its own integer id ('token'),
# numbered from 1 in order of first appearance
tokens = {}
counter = 1
for name, caption_list in captions_dict.items():
    for caption in caption_list:
        for word in caption.split():
            if word in tokens:
                # already has a token
                continue
            tokens[word] = counter
            counter += 1

In [10]:
def text_to_seq(string):
    '''
        arguments -->
            string = the whitespace-separated text to encode
        returns -->
            seq = list holding the token of every word in string, in order
        note -->
            words are looked up in the module-level `tokens` dict, so a word
            that is not in it raises a KeyError
    '''
    return [tokens[word] for word in string.split()]

In [11]:
# encoding every caption as a token sequence and tracking the longest one (MAX_LEN)
# (this cell expects captions_dict to still hold caption strings)
MAX_LEN = 0
for name in captions_dict:
    # token sequences for all captions of this file
    seq_list = [text_to_seq(caption) for caption in captions_dict[name]]

    # keep the largest sequence length seen so far
    MAX_LEN = max([MAX_LEN] + [len(seq) for seq in seq_list])

    # the file name now maps to token sequences instead of caption strings
    captions_dict[name] = seq_list

In [12]:
# length (in tokens, <sos> and <eos> included) of the longest caption
print(MAX_LEN)

42


In [13]:
# importing additional libraries to create the input
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Creating input and output

In [14]:
# generating input/output pairs in batches because building all of them at
# once throws a memory error
def batch_generator(feature_dict, captions_dict, batch_size, start_index):
    '''
        arguments -->
            feature_dict = a dictionary mapping file names to feature vectors
            captions_dict = a dictionary mapping file names to a list of token
                    sequences (one sequence per caption)
            batch_size = number of 'file names' we take
            start_index = the index (in captions_dict order) from which to
                    start the batch
        returns -->
            X_batch = a numpy array containing the feature vectors, one per
                    generated sample
            y_in_batch = a numpy array containing the padded input seq of
                    every sample
            y_out_batch = a numpy array containing the one-hot encoded output
                    token of every sample
            next_index = the index where the next batch should start from; it
                    is capped at len(captions_dict), so a value equal to
                    len(captions_dict) means every file has been consumed
        note -->
            the module-level MAX_LEN (padding length) and tokens (vocabulary)
            are read by this function
    '''
    # initializing the input and output lists
    X_batch = []
    y_in_batch = []
    y_out_batch = []

    # build the (file name, sequences) pairs once instead of rebuilding the
    # key and value lists of the whole dictionary for every sample
    items = list(captions_dict.items())

    # never run past the last file, so a final, shorter batch does not raise
    stop_index = min(start_index + batch_size, len(items))

    # len(tokens)+1 classes because '0' is used for padding and hence cannot
    # represent a word
    num_classes = len(tokens) + 1

    for i in range(start_index, stop_index):
        file_name, seq_list = items[i]
        for seq in seq_list:
            for k in range(1, len(seq)):
                X_batch.append(feature_dict[file_name])

                # the first k tokens are the input, padded with 0 at the end
                # (or cut at the end) to exactly MAX_LEN entries
                # NOTE(review): in_seq / out_seq are built from one-element
                # lists, so each sample presumably keeps an extra leading axis
                # of size 1 -- confirm the model code expects that
                in_seq = pad_sequences([seq[:k]], maxlen = MAX_LEN, padding = 'post', truncating='post')

                # the (k+1)-th token is the output, as a categorical variable
                out_seq = to_categorical([seq[k]], num_classes=num_classes)

                # appending our input and output to the lists
                y_in_batch.append(in_seq)
                y_out_batch.append(out_seq)

    # the next batch starts where this one stopped
    next_index = stop_index

    return np.array(X_batch),  np.array(y_in_batch), np.array(y_out_batch) , next_index

# Creating the embedding matrix

In [15]:
def create_embedding_matrix(vocabulary_dict, word_to_vec_map):
    '''
        arguments -->
            vocabulary_dict = a dict mapping words to tokens
            word_to_vec_map = a dict mapping words to glove embeddings
        returns -->
            embedding_matrix = a numpy array of shape
                    (len(vocabulary_dict)+1, embedding dimensions) that can be
                    used as an embedding matrix to the model; row `token`
                    holds the embedding of the word with that token, and rows
                    of words without an embedding (as well as row 0) stay zero
        raises -->
            ValueError if word_to_vec_map is empty, because the embedding
            dimension cannot be determined then
    '''
    if not word_to_vec_map:
        raise ValueError('word_to_vec_map is empty; cannot infer the embedding dimension')

    vocab_size = len(vocabulary_dict)
    # peek at a single vector instead of copying every vector into a list
    dimensions = len(next(iter(word_to_vec_map.values())))

    # initializing it to all zeros; one extra row because the tokens are used
    # directly as row indices
    embedding_matrix = np.zeros(shape = (vocab_size+1, dimensions))

    for word, i in vocabulary_dict.items():
        embedding_vector = word_to_vec_map.get(word)
        # words unknown to word_to_vec_map keep their all-zero row
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    # already a freshly created numpy array, so it is returned without another copy
    return embedding_matrix

In [16]:
# building the embedding matrix for our vocabulary and keeping it as embed
embed = create_embedding_matrix(vocabulary_dict=tokens, word_to_vec_map=word_to_vec_map)
# word_to_vec_map is not needed for training anymore, so it is dropped
del word_to_vec_map

In [22]:
# shape of the embedding matrix: (len(tokens) + 1 rows, embedding dimensions)
embed.shape

(6451, 50)

In [None]:
# From here, the basic idea is to train a model that takes its inputs in batches, and then
# to use the learned weights to decode the testing set.