In [115]:
import os
from pathlib import Path

import pandas as pd
import numpy as np
import cv2
import matplotlib.pyplot as plt

from tqdm import tqdm
import torch
from torch.utils.data import Dataset, DataLoader,random_split
import torch.nn as nn
import torch.optim as optim
import torchvision.models as models

In [3]:
# path configuration
LABELS_PATH = '../data/train_val_annotation/train_val_videodatainfo.json'
DATA_PATH = '../data/train_val_features/'

In [30]:
import json
from pickle import dump, load

In [5]:
# JSON file
f = open (LABELS_PATH, "r")
  # Reading from file
data = json.loads(f.read())

In [96]:
data['videos'][0]

{'category': 9,
 'url': 'https://www.youtube.com/watch?v=9lZi22qLlEo',
 'video_id': 'video0',
 'start time': 137.72,
 'end time': 149.44,
 'split': 'train',
 'id': 0}

In [7]:
## randomly select sentence
#label_df = pd.DataFrame(data['sentences'])
#label_final_df = label_df.groupby('video_id')['sen_id'].unique().apply(lambda x: x[np.random.randint(0,20)]).to_frame().reset_index()
#label_final_df['video_id'].nunique()
#label_final_df = label_final_df.join(label_df[['sen_id', 'caption']].set_index('sen_id'), on='sen_id')
#label_final_df.to_csv('../data/label_final.csv', index=False)

## DataSet

In [8]:
label_final_df = pd.read_csv('../data/label_final.csv')

### 1. Create embedding matrix from google news word2vec

In [9]:
import spacy
from collections import Counter
nlp = spacy.load('en_core_web_sm', disable = ['ner', 'parser'])

In [35]:
# embedding
import gensim.downloader

print(list(gensim.downloader.info()['models'].keys()))
glove_emb = gensim.downloader.load('word2vec-google-news-300')

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


In [11]:
all_sent = label_final_df['caption'].tolist()#.astype('unicode')

In [26]:
wc = Counter()
try:
    for doc in nlp.pipe(all_sent):
        for word in doc:
            #print(word)
            wc[str(word)] += 1
except Exception as e:
    print(e)
    print(doc,'\nword:', word)

In [51]:
# create embedding matrix
# initialization
EMBEDDING_SIZE = 300
embedding = np.zeros((len(wc)+4, 300)) # +4 for start, end, unk, padding
word2idx = {}
idx2word = {}

word2idx['<PAD>'] = 0
idx2word[0] = '<PAD>'
embedding[0] = np.random.rand(300)*2 - 1

word2idx['<START>'] = 1
idx2word[1] = '<START>'
embedding[1] = np.random.rand(300)*2 - 1

word2idx['<END>'] = 2
idx2word[2] = '<END>'
embedding[2] = np.random.rand(300)*2 - 1

word2idx['<UNK>'] = 3
idx2word[3] = '<UNK>'
embedding[3] = np.random.rand(300)*2 - 1

count = 0
for word, _ in wc.most_common():
    wid = len(word2idx)
    word2idx[word] = wid
    idx2word[wid] = word
    if word in glove_emb:
        embedding[wid] = glove_emb.get_vector(word)
    else:
        embedding[wid] = np.random.rand(300)*2 - 1 # random initialisation (-1, 1)
        count += 1

In [53]:
print(f'{count} words are not in google news werd2vec')

401 words are not in google news werd2vec


### 2.Dataset Class for pytorch

In [90]:
class MSRVTT(Dataset):
    def __init__(self, df, word2idx, DATA_PATH):
        super(MSRVTT, self).__init__()
        self.df = df
        self.path = DATA_PATH
        self.word2idx = word2idx
    
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        vid = row['video_id']
        filename = self.path + f'{vid}-feature.pt5'
        x = torch.load(filename)
        y = [self.word2idx.get(word, self.word2idx['<UNK>']) for word in row['caption'].split(' ')]
        #true_sentence = row['caption']
        return x, torch.tensor(y).long()#, true_sentence

In [98]:
ds = MSRVTT(label_final_df, word2idx, DATA_PATH)

In [99]:
x, y = next(iter(train_ds))

In [100]:
x

tensor([[0.1810, 0.3634, 0.6611,  ..., 0.3846, 0.6012, 0.1157],
        [0.0812, 0.3774, 0.7720,  ..., 0.6932, 0.7245, 0.2265],
        [0.0763, 0.4366, 0.8833,  ..., 0.4071, 1.0229, 0.1807],
        ...,
        [0.1818, 0.6062, 0.4046,  ..., 0.4644, 0.5192, 0.4749],
        [0.3772, 0.6626, 0.1883,  ..., 0.4252, 0.4500, 0.3894],
        [0.4353, 0.6842, 0.9035,  ..., 0.5593, 1.5382, 0.2510]])

In [101]:
y

tensor([  4,   8,   5, 106,  63,   4, 124])

### 3.Dataloaders

In [113]:
# split train/test
train_proportion = 0.93
train_size = int(train_proportion * len(ds))
validation_size = len(ds) - train_size
print(train_size, validation_size)

6519 491


In [116]:
train_ds, validation_ds = random_split(ds, [train_size, validation_size])
# dataloaders
train_dl = DataLoader(train_ds, batch_size=64, shuffle=True)
validation_dl = DataLoader(validation_ds, batch_size=64, shuffle=False)