In [None]:
"""
Train a model on the MMS Dataset.
"""
import os
import torch
import random
import pickle
import torchvision
import numpy as np
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import torch.nn.functional as F
import torch.optim.lr_scheduler as sched
import torchvision.transforms as transforms

from collections import OrderedDict
from PIL import Image
from json import dumps
# from models import MMBiDAF
from tensorboardX import SummaryWriter
from tqdm import tqdm
from ujson import load as json_load
from multimodal_bidaf.datasets import ImageDataset

def main(embedding_path, audio_path, image_dir, hidden_size=100, drop_prob=0.2):
    """Load the text, audio and image inputs for one sample and build the model.

    Args:
        embedding_path: File readable by ``torch.load`` that holds a mapping
            from key to sentence-embedding vector (generated with gensim,
            per the original notes).
        audio_path: Pickle file holding the extracted MFCC feature array.
        image_dir: Directory handed to ``ImageDataset``.
        hidden_size: Forwarded to ``MMBiDAF``. Replaces the undefined
            ``args.hidden_size``; the default mirrors the value this notebook
            uses when it exercises ``TextEmbedding``.
        drop_prob: Forwarded to ``MMBiDAF``. Replaces the undefined
            ``args.drop_prob``; the default mirrors the same experiment.

    Returns:
        The constructed ``MMBiDAF`` model.
    """
    # --- Text: sentence embeddings ---------------------------------------
    print('Loading embeddings...')
    embedding_dict = torch.load(embedding_path)                  #TODO : The absolute path needs to be changed

    # Take the embedding width from the data instead of assuming 300; the old
    # constant is kept only as the fallback for an empty mapping.
    embedding_dim = 300
    first_key = next(iter(embedding_dict), None)
    if first_key is not None:
        embedding_dim = torch.as_tensor(embedding_dict[first_key]).size(-1)

    word_vectors = torch.zeros(len(embedding_dict), embedding_dim)
    for count, embedding in enumerate(embedding_dict):
        word_vectors[count] = embedding_dict[embedding]

    print(word_vectors)
    print('The shape is : {}'.format(word_vectors.size()))

    # --- Audio: extracted MFCC features ----------------------------------
    # NOTE: pickle.load can execute arbitrary code; only load trusted dumps.
    with open(audio_path, 'rb') as fp:
        audio_vectors = pickle.load(fp)

    # Swap the array's axes before wrapping it as a tensor.
    audio_vectors = np.transpose(audio_vectors)
    audio_vectors = torch.from_numpy(audio_vectors)

    print(audio_vectors)
    print('The shape is : {}'.format(audio_vectors.size()))

    # --- Images: preprocessing pipeline and loader -----------------------
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    transform = transforms.Compose([
        transforms.RandomResizedCrop(256),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize,
    ])

    # NOTE(review): the loader is built but not consumed anywhere below.
    train_loader = torch.utils.data.DataLoader(ImageDataset(image_dir, transform), batch_size = 1, shuffle = True, num_workers = 2)

    # --- Model ------------------------------------------------------------
    # `log` was never defined, so the original `log.info(...)` raised
    # NameError; use print like the rest of this function.
    print('Building model')
    # NOTE(review): two names are still unresolved and need the author's input
    # before this call can run:
    #   * `MMBiDAF`: its import (`from models import MMBiDAF`) is commented
    #     out at the top of the cell;
    #   * `image_vectors`: never assigned in this function; presumably meant
    #     to be derived from `train_loader` - confirm.
    model = MMBiDAF(word_vector = word_vectors,
                    audio_vectors = audio_vectors,
                    image_vectors = image_vectors,
                    hidden_size = hidden_size,
                    drop_prob = drop_prob)
    return model
    
if __name__ == '__main__':
    # Sample #1 of the dataset: sentence features, audio features, key frames.
    # The three names stay module-level so later cells can still read them.
    embedding_path, audio_path, image_dir = (
        '/home/anish17281/NLP_Dataset/dataset/1/sentence_features/1.pt',
        '/home/anish17281/NLP_Dataset/dataset/1/audio-features/1.pkl',
        '/home/anish17281/NLP_Dataset/dataset/1/video_key_frames/1/',
    )
    main(embedding_path, audio_path, image_dir)

In [None]:
# Smoke-test cell: prints a fixed string; nothing else in the notebook reads from it.
print('hello')

### The Image Encoder code

In [2]:
# Imports for the image-encoder experiments, grouped: stdlib, third-party, project.
import os

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
from PIL import Image

from datasets import ImageDataset
from layers import ImageEncoder

# Root directory passed to ImageDataset (absolute, machine-specific path).
image_dir = '/home/anish17281/NLP_Dataset/dataset/'

# Preprocessing pipeline: random 256px crop, random horizontal flip, tensor
# conversion, then per-channel normalisation with the mean/std given here.
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
transform = transforms.Compose([
    transforms.RandomResizedCrop(256),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    normalize,
])

In [3]:
# Build the image dataset rooted at `image_dir`; `transform` is the
# preprocessing pipeline defined in the setup cell.
image_dataset = ImageDataset(image_dir, transform)

In [4]:
len(image_dataset)

937

In [8]:
image_dataset.image_paths[0]

['/home/anish17281/NLP_Dataset/dataset/1/video_key_frames/1/1_i_frame_1.jpg',
 '/home/anish17281/NLP_Dataset/dataset/1/video_key_frames/1/1_i_frame_2.jpg',
 '/home/anish17281/NLP_Dataset/dataset/1/video_key_frames/1/1_i_frame_3.jpg',
 '/home/anish17281/NLP_Dataset/dataset/1/video_key_frames/1/1_i_frame_4.jpg',
 '/home/anish17281/NLP_Dataset/dataset/1/video_key_frames/1/1_i_frame_5.jpg',
 '/home/anish17281/NLP_Dataset/dataset/1/video_key_frames/1/1_i_frame_6.jpg',
 '/home/anish17281/NLP_Dataset/dataset/1/video_key_frames/1/1_i_frame_7.jpg',
 '/home/anish17281/NLP_Dataset/dataset/1/video_key_frames/1/1_i_frame_8.jpg',
 '/home/anish17281/NLP_Dataset/dataset/1/video_key_frames/1/1_i_frame_9.jpg',
 '/home/anish17281/NLP_Dataset/dataset/1/video_key_frames/1/1_i_frame_10.jpg',
 '/home/anish17281/NLP_Dataset/dataset/1/video_key_frames/1/1_i_frame_11.jpg',
 '/home/anish17281/NLP_Dataset/dataset/1/video_key_frames/1/1_i_frame_12.jpg',
 '/home/anish17281/NLP_Dataset/dataset/1/video_key_frames/1/1

In [9]:
# One dataset item per batch, served in dataset order (shuffle=False) by two
# worker processes. A fresh ImageDataset is constructed here rather than
# reusing `image_dataset`.
train_loader = torch.utils.data.DataLoader(ImageDataset(image_dir, transform), batch_size = 1, shuffle = False, num_workers = 2)


In [10]:
print(train_loader)

<torch.utils.data.dataloader.DataLoader object at 0x7f18a1eda390>


In [11]:
image_encoder = ImageEncoder()

In [12]:
# Encode only the first batch: that is enough to check the encoder's output
# shape and to size the projection layer below.
batch_images = next(iter(train_loader))

# Merge the first two dimensions so each image becomes one encoder input
# (the loader's batches are 5-D; presumably batch x frames x C x H x W - confirm).
batch_images = batch_images.flatten(0, 1)
batch_encodings = image_encoder(batch_images)

# Flatten everything after the first dimension into one feature vector per image.
batch_encodings = batch_encodings.reshape(batch_encodings.size(0), -1)
print(batch_encodings.size())

# Project the flattened encodings down to 300 features.
m_linear = nn.Linear(batch_encodings.size(-1), 300)
output = m_linear(batch_encodings)
print(output.size())

torch.Size([81, 401408])
torch.Size([81, 300])


In [None]:
# Peek at the first batch only to print its size; `break` stops after one
# iteration. NOTE: `i` holds a batch here, not an index.
for i in train_loader:
    print(i.size())
    break

In [None]:
print(len(train_loader))
print(type(train_loader))

In [None]:
image_dataset = ImageDataset(image_dir, transform)

In [None]:
# NOTE(review): the executed cell earlier reads `image_dataset.image_paths`;
# nothing in this notebook shows that an `.images` attribute exists - confirm
# the attribute name before relying on this cell.
print(len(image_dataset.images))
# Prints the whole attribute, which may produce very long output.
print(image_dataset.images)


In [None]:
import re
# Entries directly under `image_dir`, ordered by their integer value.
# `key=int` raises ValueError for any entry whose name is not an integer.
sorted_image_dir = sorted(os.listdir(image_dir), key = int)
# Filled by the loop below with one list of key-frame paths per entry.
images = []

def get_num(str):
    """Return the first underscore-prefixed integer in ``str`` as an int.

    For a key-frame name such as ``'1_i_frame_12.jpg'`` the result is ``12``:
    the leading ``1`` is skipped because no underscore precedes it.

    The parameter name shadows the builtin ``str``; it is kept so existing
    keyword callers keep working. A later cell in this notebook redefines
    ``get_num`` with different behaviour (first digit run, no underscore
    required).

    Raises:
        AttributeError: if ``str`` contains no ``_<digits>`` sequence
            (``re.search`` returns ``None``).
    """
    # A single pattern with a capture group replaces the two nested searches.
    return int(re.search(r'_(\d+)', str).group(1))

# For every entry of `sorted_image_dir`, gather the regular files inside it as
# paths relative to `image_dir`, order them with `get_num`, and store the
# ordered list as one element of `images`.
for video_path in sorted_image_dir:
    keyframes = sorted(
        (
            os.path.join(video_path, img)
            for img in os.listdir(os.path.join(image_dir, video_path))
            if os.path.isfile(os.path.join(image_dir, video_path, img))
        ),
        key=get_num,
    )
    images.append(keyframes)

print(images)

In [None]:
# print(images[0][0])
# print(os.path.join(image_dir, images[0][0]))
print(type(transform))

### Code for testing TextEmbedding in layers.py

In [None]:
from multimodal_bidaf.layers import TextEmbedding
from multimodal_bidaf.layers import HighwayEncoder

In [None]:
from multimodal_bidaf.datasets import TextDataset, ImageDataset, AudioDataset
import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
# Directory of sentence-feature files for sample 1 (absolute, machine-specific path).
text_embedding_dir = '/home/anish17281/NLP_Dataset/dataset/1/sentence_features/'
# batch_size=1 with shuffle=False: items are served one at a time in dataset order.
train_text_loader = torch.utils.data.DataLoader(TextDataset(text_embedding_dir), batch_size = 1, shuffle = False, num_workers = 2)

In [None]:
# Show the first text batch and its size; `break` stops after one iteration,
# so `count` is only ever 0.
for count, batch_text in enumerate(train_text_loader):
    print(batch_text)
    print(batch_text.size())
    break

In [None]:
import torch
embedding_dict = torch.load('/home/anish17281/NLP_Dataset/dataset/1/sentence_features/1.pt')
# Allocate the destination matrix first: `word_vectors` was previously only
# ever created inside main(), so this cell raised NameError on a fresh kernel.
# The width of 300 matches the allocation used in main().
word_vectors = torch.zeros(len(embedding_dict), 300)
# Copy each embedding into its row, in the mapping's iteration order.
for count, embedding in enumerate(embedding_dict):
    word_vectors[count] = embedding_dict[embedding]
    
print(word_vectors)

In [None]:
# Wrap the loaded vectors in the project's TextEmbedding layer.
# Depends on `word_vectors` from the previous cell.
emb = TextEmbedding(word_vectors=word_vectors, hidden_size=100, drop_prob=0.2)
print(type(emb))


In [None]:
import torch
import torchvision
import torch.nn as nn
# Lookup table initialised from `word_vectors`.
embed = nn.Embedding.from_pretrained(word_vectors)
# 2x2 batch of row indices into the table. NOTE: this name shadows the builtin
# `input`, but a later cell (`emb(input)`) reads it, so it cannot be renamed
# in this cell alone.
input = torch.LongTensor([[0, 1], [2,3]])
print(embed(input).size())

In [None]:
from multimodal_bidaf.layers import RNNEncoder

# Encoder later applied to the text embeddings; input_size=100 equals the
# hidden_size passed to TextEmbedding in the earlier cell.
enc = RNNEncoder(input_size=100, hidden_size=100, num_layers=1, drop_prob=0.2)

In [None]:
# Run the 2x2 index batch `input` (defined in an earlier cell) through the
# TextEmbedding layer and show the size of the result.
text_emb = emb(input)
print(text_emb.size())

In [None]:
# Presumably the second argument is the per-sequence length (2 for each of the
# two rows of `input`) - confirm against RNNEncoder.forward.
# NOTE(review): torch.Tensor([...]) creates a float tensor; lengths are
# normally integers - confirm the encoder accepts this.
text_enc = enc(text_emb, torch.Tensor([2, 2]))

In [None]:
# Small 2x2 int64 tensor with no zero entries, used by the mask experiments below.
temp = torch.tensor([[2, 1], [3, 1]], dtype=torch.long)
print(temp)

In [None]:
# Element-wise mask: True wherever `temp` is non-zero.
temp_bool = temp.ne(0)

In [None]:
print(temp_bool.size())

In [None]:
# Count the True entries along the last dimension of the mask.
temp_sum = torch.sum(temp_bool, dim=-1)

In [None]:
temp_sum

In [None]:
print(text_enc.size())

### Code for testing Audio Encoding

In [None]:
import pickle
import numpy as np
# A single pickled audio-feature file (absolute, machine-specific path).
audio_path = '/home/anish17281/NLP_Dataset/dataset/1/audio-features/7.pkl'
# Open the file named above. The original opened `audio_dir`, which is only
# defined in a later cell and names a directory, not a .pkl file.
# NOTE: pickle.load can execute arbitrary code; only load trusted dumps.
with open(audio_path, 'rb') as fp:
    audio_vectors = pickle.load(fp)
    
print(audio_vectors)
print(audio_vectors.shape)
# Swap the array's axes, then wrap it as a torch tensor.
audio_vectors = np.transpose(audio_vectors)
audio_vectors = torch.from_numpy(audio_vectors)
print(audio_vectors.size())
print(type(audio_vectors))

In [None]:
import re
def get_num(str):
    """Return the first run of decimal digits found in ``str`` as an int.

    Raises AttributeError when ``str`` contains no digit, because the search
    result is then ``None``.
    """
    first_digits = re.search(r'\d+', str)
    return int(first_digits.group())

In [None]:
import os
# NOTE: an earlier cell bound `audio_path` to a single .pkl file; here the
# same name is rebound to the directory that contains the feature files.
audio_path = '/home/anish17281/NLP_Dataset/dataset/1/audio-features/'
# Directory entries ordered by the integer that `get_num` extracts from each name.
audio_list = sorted(os.listdir(audio_path), key = get_num)

In [None]:
audio_list[0]

In [None]:
# Load the first file of the sorted listing and report the shape of its contents.
# NOTE: pickle.load can execute arbitrary code; only load trusted dumps.
with open(os.path.join(audio_path, audio_list[0]), 'rb') as fp:
    a_v = pickle.load(fp)
print(a_v.shape)

In [None]:
len(audio_list)

In [None]:
from multimodal_bidaf.datasets import AudioDataset
import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
# Directory of audio-feature files for sample 1 (absolute, machine-specific path).
audio_dir = '/home/anish17281/NLP_Dataset/dataset/1/audio-features/'
# batch_size=1 with shuffle=False: items are served one at a time in dataset order.
train_loader_1 = torch.utils.data.DataLoader(AudioDataset(audio_dir), batch_size = 1, shuffle = False, num_workers = 2)

In [None]:
audio_dataset = AudioDataset(audio_dir)

In [None]:
print(type(audio_dataset))

In [None]:
print(train_loader_1)

In [None]:
# Show the first audio batch and its size; `break` stops after one iteration.
# `batch_audio` stays bound afterwards and is read by the next cell.
for count, batch_audio in enumerate(train_loader_1):
    print(batch_audio)
    print(batch_audio.size())
    break

In [None]:
# Size of dimension 1 of the batch left over from the previous cell.
print(batch_audio.size(1))
# Random stand-in input of shape (1, 10, 20); its last dimension equals the
# input_size=20 of the encoder built in a later cell.
temp_audio = torch.randn(1, 10, 20)
print(temp_audio.size())

In [None]:
from multimodal_bidaf.layers import RNNEncoder

In [None]:
# Three-layer encoder taking 20 input features and using a hidden size of 300.
audio_encoder = RNNEncoder(input_size=20, hidden_size=300, num_layers=3, drop_prob=0.2)
# Calls the encoder's inner `.rnn` module directly rather than the encoder
# itself, so no lengths argument is passed; the second return value is discarded.
x, _ = audio_encoder.rnn(temp_audio)

In [None]:
print(type(x))
print(x.size())

### Testing the Bidirectional Attention layer

In [None]:
import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
from multimodal_bidaf.layers import TextImageBiDAFAttention

In [None]:
# Random stand-ins for the three modalities, drawn in the order image, text,
# audio. Each has shape (1, length, 200); 200 equals the hidden_size given to
# the attention layer built in the next cell.
image_tensor, text_tensor, audio_tensor = (
    torch.randn(1, seq_len, 200) for seq_len in (81, 48, 32453)
)

In [None]:
text_image_att = TextImageBiDAFAttention(hidden_size=200, drop_prob=0.2)

In [None]:
# Random stand-ins for the text and image index tensors (drawn in that order).
cw_idxs, img_idxs = torch.randn(1, 48), torch.randn(1, 81)

# Masks are True wherever the corresponding entry is non-zero.
c_mask = cw_idxs.ne(0)
img_mask = img_idxs.ne(0)

In [None]:
text_image_attention = text_image_att(text_tensor, image_tensor, c_mask, img_mask)

In [None]:
from multimodal_bidaf.util import masked_softmax