## Check which videos are missing transcripts but still have images/audio, so that we can build a dataset in which every video has image, audio, and text features along with targets

In [23]:
import os

# Root folder of the dataset; each sub-directory is treated as one course.
courses_dir = '/home/anish17281/NLP_Dataset/dataset/'

# Names of all course directories (plain files are skipped).
# Order is whatever os.listdir returns; sorting is left to the consumer.
dirlist = [
    entry
    for entry in os.listdir(courses_dir)
    if os.path.isdir(os.path.join(courses_dir, entry))
]

In [66]:
def _collect_ids(course_num, subdir, suffix):
    """Return the '<course>/<stem>' IDs of files in one course sub-directory.

    Only files whose name ends with ``suffix`` and contains no underscore are
    kept. The suffix is matched at the end of the name (not as a substring),
    and exactly that suffix is stripped to form the stem, so names such as
    '3.mp4.part' no longer produce a bogus ID.

    NOTE(review): names containing '_' are skipped, as in the original code;
    presumably these are derived/auxiliary files -- confirm.
    """
    folder = os.path.join(courses_dir, course_num, subdir)
    return {
        '{}/{}'.format(course_num, fname[:-len(suffix)])
        for fname in os.listdir(folder)
        if fname.endswith(suffix) and '_' not in fname
    }


# One set of '<course>/<stem>' IDs per modality.
vids = set()
trs = set()
gts = set()
audios = set()

# Course directory names are numeric strings, so sort them numerically.
for course_num in sorted(dirlist, key=int):
    print("Processing course " + str(course_num))
    vids |= _collect_ids(course_num, 'videos', '.mp4')
    audios |= _collect_ids(course_num, 'audio-features', '.pkl')
    trs |= _collect_ids(course_num, 'transcripts', '.txt')
    gts |= _collect_ids(course_num, 'ground-truth', '.txt')

Processing course 1
Processing course 2
Processing course 3
Processing course 4
Processing course 5
Processing course 6
Processing course 7
Processing course 8
Processing course 9
Processing course 10
Processing course 11
Processing course 12
Processing course 13
Processing course 14
Processing course 15
Processing course 16
Processing course 17
Processing course 18
Processing course 19
Processing course 20
Processing course 21
Processing course 22
Processing course 23
Processing course 24


In [67]:
# Number of IDs found per modality (videos, audio features, transcripts,
# ground truth); a smaller count means some IDs are missing for that modality.
len(vids), len(audios), len(trs), len(gts)

(965, 965, 961, 961)

In [68]:
# Keep only the IDs that are present in all four modalities.
inter = vids.intersection(audios, trs, gts)
len(inter)

961

In [71]:
# Use the public pickle module (it picks the C accelerator automatically)
# rather than importing the private _pickle implementation module.
import pickle

# Persist the set of IDs that have every modality available.
with open('dataset_inter.pkl', 'wb') as f:
    pickle.dump(inter, f)

In [41]:
# Use the public pickle module (it picks the C accelerator automatically)
# rather than importing the private _pickle implementation module.
import pickle

# NOTE(review): this reads 'dataset_inter2.pkl', while the earlier cell writes
# 'dataset_inter.pkl' -- confirm where the '2' file originally comes from.
with open('dataset_inter2.pkl', 'rb') as f:
    dataset_inter = pickle.load(f)

# Drop these three course-22 entries when present.
# NOTE(review): the reason for excluding them is not recorded here.
for excluded_id in ('22/2', '22/4', '22/7'):
    if excluded_id in dataset_inter:
        dataset_inter.remove(excluded_id)

len(dataset_inter)

958

In [42]:
# Use the public pickle module (it picks the C accelerator automatically)
# rather than importing the private _pickle implementation module.
import pickle

# Write the filtered ID collection back to the file it was loaded from.
with open('dataset_inter2.pkl', 'wb') as f:
    pickle.dump(dataset_inter, f)

In [1]:
from datasets import TextDataset, ImageDataset, AudioDataset, TargetDataset

import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F

In [2]:
# Dataset root handed to TextDataset.
text_embedding_dir = '/home/anish17281/NLP_Dataset/dataset/'

# One sample per batch, in dataset order, loaded by two worker processes.
# NOTE(review): the meaning of the second TextDataset argument (405) is
# defined in the datasets module and is not visible here -- confirm.
train_text_loader = torch.utils.data.DataLoader(
    TextDataset(text_embedding_dir, 405),
    batch_size=1,
    shuffle=False,
    num_workers=2,
)

In [3]:
from PIL import Image
import torchvision.transforms as transforms
# Per-channel normalization applied after conversion to a tensor.
# NOTE(review): these look like the usual ImageNet statistics -- confirm.
normalize = transforms.Normalize(
    mean=[0.485, 0.456, 0.406],
    std=[0.229, 0.224, 0.225],
)

# Image pipeline: random 256-px resized crop, random horizontal flip,
# tensor conversion, then normalization.
transform = transforms.Compose([
    transforms.RandomResizedCrop(256),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    normalize,
])

# Dataset root handed to ImageDataset.
image_dir = '/home/anish17281/NLP_Dataset/dataset/'

# One sample per batch, in dataset order, loaded by two worker processes.
train_image_loader = torch.utils.data.DataLoader(
    ImageDataset(image_dir, transform),
    batch_size=1,
    shuffle=False,
    num_workers=2,
)

In [4]:
# Dataset root handed to AudioDataset.
audio_dir = '/home/anish17281/NLP_Dataset/dataset/'

# One sample per batch, in dataset order, loaded by two worker processes.
train_audio_loader = torch.utils.data.DataLoader(
    AudioDataset(audio_dir),
    batch_size=1,
    shuffle=False,
    num_workers=2,
)

In [5]:
# Dataset root handed to TargetDataset.
courses_dir = '/home/anish17281/NLP_Dataset/dataset/'

# One sample per batch, in dataset order, loaded by two worker processes.
train_target_loader = torch.utils.data.DataLoader(
    TargetDataset(courses_dir),
    batch_size=1,
    shuffle=False,
    num_workers=2,
)

In [6]:
# Number of batches in the text loader; the loader is built with
# batch_size = 1, so this is also the number of dataset items.
len(train_text_loader)

961

In [7]:
# Number of batches in the image loader; the loader is built with
# batch_size = 1, so this is also the number of dataset items.
len(train_image_loader)

961

In [8]:
# Number of batches in the audio loader; the loader is built with
# batch_size = 1, so this is also the number of dataset items.
len(train_audio_loader)

961

In [9]:
# Number of batches in the target loader; the loader is built with
# batch_size = 1, so this is also the number of dataset items.
len(train_target_loader)

961

In [10]:
# Sizes of the target dataset's target/source path collections, shown side by
# side so a mismatch is easy to spot.
# NOTE(review): presumably lists of file paths -- confirm in the datasets module.
len(train_target_loader.dataset.target_sentences_path), len(train_target_loader.dataset.source_sentences_path)

(961, 961)

# Generate train, test indices

In [30]:
from datasets import AudioDataset
import numpy as np
import pickle

# The audio dataset is used only to obtain the total number of samples.
courses_dir = '/home/anish17281/NLP_Dataset/dataset/'
dataset = AudioDataset(courses_dir)

# Indices that must be left out of both splits.
# NOTE(review): presumably samples that failed to load -- confirm.
with open('none_idxs.pkl', 'rb') as f:
    none_idxs = pickle.load(f)

test_split = 0.1         # fraction of the usable samples held out for testing
shuffle_dataset = True
dataset_size = len(dataset)

# Every dataset index except the excluded ones, in ascending order.
indices = list(filter(lambda i: i not in none_idxs, range(dataset_size)))

# Size of the test split, rounded down.
split = int(np.floor(len(indices) * test_split))

if shuffle_dataset:
    # Fixed seed so the split is reproducible across runs.
    np.random.seed(42)
    np.random.shuffle(indices)

# The first `split` entries form the test set; the remainder is the train set.
test_indices = indices[:split]
train_indices = indices[split:]

In [31]:
# Convert the list of test indices to a set (this is the object pickled below).
test_indices = set(test_indices)

In [32]:
# Size of the held-out test split.
len(test_indices)

93

In [33]:
# Display the selected test indices for inspection.
test_indices

{25,
 33,
 42,
 49,
 70,
 75,
 78,
 80,
 94,
 104,
 128,
 144,
 145,
 173,
 183,
 201,
 207,
 217,
 218,
 219,
 224,
 227,
 248,
 253,
 256,
 259,
 263,
 276,
 290,
 300,
 308,
 314,
 328,
 331,
 333,
 335,
 339,
 345,
 364,
 367,
 374,
 437,
 452,
 457,
 470,
 480,
 481,
 496,
 498,
 508,
 509,
 541,
 543,
 546,
 547,
 563,
 574,
 590,
 599,
 602,
 604,
 648,
 658,
 666,
 671,
 681,
 684,
 695,
 698,
 701,
 704,
 730,
 732,
 739,
 752,
 759,
 768,
 774,
 789,
 790,
 825,
 827,
 839,
 848,
 851,
 878,
 890,
 894,
 895,
 909,
 910,
 922,
 959}

In [34]:
import pickle

# Persist the test split so other scripts can reuse exactly the same indices.
with open('test_indices.pkl', 'wb') as out_file:
    pickle.dump(test_indices, out_file)