In [1]:
# importing the required libraries
import pandas as pd
import numpy as np
import cv2
import os
import glob
import json

In [2]:
# the path from which to extract the features
# *.npy is used as it helps in extracting all the .npy files
path = r'D:\Video_captioning_project\MLDS_hw2_data\training_data\feat\*.npy'

In [3]:
# creating a dictionary with all the features and file names
feature_dict = {}
for file_path in glob.glob(path):
    filename = file_path.split('\\')[-1][:-4]
    feature_dict[filename] = np.load(file_path)

In [4]:
len(feature_dict)

1450

In [5]:
# loading the labels
with open(r'D:\Video_captioning_project\MLDS_hw2_data\training_label.json', 'r') as f:
    captions_dict_list = json.load(f)
f.close()

In [6]:
# creating a captions dict, like the feature dict
captions_dict = {}
for i in range(len(captions_dict_list)):
    file_name = captions_dict_list[i]["id"]
    captions_dict[file_name] = captions_dict_list[i]["caption"]
    
del captions_dict_list

# Text Preprocessing

In [7]:
# lowering the sentences
for name, caption_list in captions_dict.items():
    new_list = []
    for caption in caption_list:
        new_list.append('<sos> '+str(caption.lower().strip('.'))+ ' <eos>')
    captions_dict[name] = new_list

In [8]:
# making a dictionary for glove embeddings
word_to_vec_map = {}
with open('glove.6B.50d.txt', 'r', encoding = 'utf-8') as f: 
    for lines in f:
        line = lines.strip().split()
        word = line[0]
        word_to_vec_map[word] = np.array(line[1:], dtype = np.float64)
f.close()

In [9]:
# tokenizing
tokens = {}
counter = 0
for name, caption_list in captions_dict.items():
    for caption in caption_list:
        for word in caption.split():
            if(word not in tokens):
                tokens[word] = counter
                counter+=1

In [10]:
def text_to_seq(string):
    seq = []
    word_list = string.split()
    for i in range(len(word_list)):
        seq.append(tokens[word_list[i]])
    return seq

In [11]:
# converting to seq and finding max length
MAX_LEN = 0
for name, caption_list in captions_dict.items():
    seq_list = []
    for caption in caption_list:
        seq = text_to_seq(caption)
        if(len(seq)>MAX_LEN):
            MAX_LEN = len(seq)
        seq_list.append(seq)
    captions_dict[name] = seq_list

In [12]:
print(MAX_LEN)

42


In [13]:
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [19]:
# generating input_output pairs in batches because it throws a memory error without batches
def batch_generator(feature_dict, captions_dict, batch_size, start_index):
    '''
        parameters -->
            feature_dict = a dictionary mapping file names to feature vectors
            captions_dict = a dictionary mapping file names to caption list
            batch_size = number of 'file names' we take 
            start_index = the index from which to start the batch
        returns -->
            X_batch = a numpy array containing all the feature vetors 
                    corresponding to input
            y_in_batch = a numpy array containing the input seq
            y_out_batch = a numpy array containing the output seq
    '''
    X_batch = []
    y_in_batch = []
    y_out_batch = []
    
    for i in range(start_index, start_index+batch_size):
        for seq in list(captions_dict.values())[i]:
            for k in range(len(seq)):
                X_batch.append(feature_dict[list(captions_dict.keys())[i]])
                
                in_seq = [seq[:k]]
                out_seq = [seq[k]]
                
                in_seq = pad_sequences(in_seq, maxlen = MAX_LEN, padding = 'post', truncating='post')
                out_seq = to_categorical(out_seq, num_classes=len(tokens))
                
                y_in_batch.append(in_seq)
                y_out_batch.append(out_seq)
                
    next_index = start_index+batch_size
    
    return np.array(X_batch),  np.array(y_in_batch), np.array(y_out_batch) , next_index

In [15]:
def create_embedding_matrix(vocabulary_dict, word_to_vec_map):
    vocab_size = len(vocabulary_dict)
    dimensions = len(list(word_to_vec_map.values())[0])
    
    embedding_matrix = np.zeros(shape = (vocab_size, dimensions))
    
    for word, i in vocabulary_dict.items():
        embedding_vector = word_to_vec_map.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
            
    return np.array(embedding_matrix)   

In [24]:
embed = create_embedding_matrix(tokens, word_to_vec_map)
del  word_to_vec_map

(6450, 50)