## Check which videos are missing transcripts but have images/audio, in order to build a dataset where every video has image, audio, and text features along with targets

In [23]:
import os

courses_dir = '/home/anish17281/NLP_Dataset/dataset/'

# Names of every course folder directly under courses_dir (plain files are
# skipped). The list keeps os.listdir order; callers sort it numerically.
dirlist = [
    fname
    for fname in os.listdir(courses_dir)
    if os.path.isdir(os.path.join(courses_dir, fname))
]

In [66]:
vids, trs, gts, audios = set(), set(), set(), set()

# One row per modality, in the order they are scanned for each course:
# (sub-directory, substring the file name must contain, set to fill).
scan_plan = (
    ('videos', 'mp4', vids),
    ('audio-features', 'pkl', audios),
    ('transcripts', 'txt', trs),
    ('ground-truth', 'txt', gts),
)

for course_num in sorted(dirlist, key=int):
    print("Processing course " + str(course_num))
    for subdir, marker, bucket in scan_plan:
        for entry in os.listdir(os.path.join(courses_dir, course_num, subdir)):
            # Keep only files of the expected kind whose name has no '_'.
            if marker in entry and '_' not in entry:
                # Drop the last four characters (the extension) so the same
                # "<course>/<name>" key is produced for every modality.
                bucket.add('{}/{}'.format(course_num, entry[:-4]))

Processing course 1
Processing course 2
Processing course 3
Processing course 4
Processing course 5
Processing course 6
Processing course 7
Processing course 8
Processing course 9
Processing course 10
Processing course 11
Processing course 12
Processing course 13
Processing course 14
Processing course 15
Processing course 16
Processing course 17
Processing course 18
Processing course 19
Processing course 20
Processing course 21
Processing course 22
Processing course 23
Processing course 24


In [67]:
# Number of collected keys per modality: videos, audio features, transcripts, ground truths.
len(vids), len(audios), len(trs), len(gts)

(965, 965, 961, 961)

In [68]:
# Keys that are present in all four modality sets.
inter = vids & audios & trs & gts
len(inter)

961

In [71]:
# Use the public pickle module; it already picks up the C accelerator
# (_pickle) automatically, so importing the private module is unnecessary.
import pickle

# Persist the set of keys common to all modalities for later reuse.
with open('dataset_inter.pkl', 'wb') as f:
    pickle.dump(inter, f)

In [26]:
def get_num(str):
    """Return the first run of decimal digits in a name as an int.

    Used as a sort key so that names order numerically ('2.txt' before
    '10.txt') instead of lexicographically.

    Args:
        str: Name to parse. The parameter keeps its original name for
            backward compatibility; it shadows the builtin only inside
            this function.

    Returns:
        The integer value of the first digit run found in the name.

    Raises:
        ValueError: If the name contains no digits.
    """
    # Imported locally because no top-level ``import re`` is visible in
    # this notebook.
    import re

    match = re.search(r'\d+', str)
    if match is None:
        raise ValueError('no digits found in {!r}'.format(str))
    return int(match.group())

def load_sentence_embeddings_path():
    """Collect sentence-feature file paths, grouped by course.

    Scans every sub-directory of the module-level ``courses_dir`` in
    ascending numeric order (directory names must parse as integers) and
    lists the contents of its ``sentence_features`` folder, ordered by
    ``get_num`` of the file name.

    Returns:
        A list with one entry per course; each entry is the list of path
        strings for that course. The result is not flattened.
    """
    # Course folders only; plain files under courses_dir are ignored.
    course_names = [
        name for name in os.listdir(courses_dir)
        if os.path.isdir(os.path.join(courses_dir, name))
    ]

    transcript_embeddings = []
    for course_number in sorted(course_names, key=int):
        course_transcript_path = os.path.join(courses_dir, course_number, 'sentence_features/')
        file_names = sorted(os.listdir(course_transcript_path), key=get_num)
        # Join onto the directory path instead of concatenating strings so
        # the result is correct even if courses_dir has no trailing slash.
        transcript_embeddings.append(
            [os.path.join(course_transcript_path, file_name) for file_name in file_names]
        )

    return transcript_embeddings

def load_image_paths():
    """Collect key-frame image paths, grouped by video.

    For every course folder under the module-level ``courses_dir``
    (ascending numeric order) and every entry of its ``video_key_frames``
    folder (also ascending numeric order; names must parse as integers),
    gathers the regular files inside and orders them by ``get_num`` of the
    file name.

    Returns:
        A list with one entry per video; each entry is the list of frame
        path strings for that video.
    """
    # Course folders only; plain files under courses_dir are ignored.
    course_names = [
        name for name in os.listdir(courses_dir)
        if os.path.isdir(os.path.join(courses_dir, name))
    ]

    images = []
    for course_dir in sorted(course_names, key=int):
        keyframes_dir_path = os.path.join(courses_dir, course_dir, 'video_key_frames/')
        for video_dir in sorted(os.listdir(keyframes_dir_path), key=int):
            video_dir_path = os.path.join(keyframes_dir_path, video_dir)
            frame_names = [
                img for img in os.listdir(video_dir_path)
                if os.path.isfile(os.path.join(video_dir_path, img))
            ]
            # Sort on the file name, not the full path: the first digit run
            # of a full path comes from the directory prefix, which is the
            # same for every frame and would leave them in listdir order.
            frame_names.sort(key=get_num)
            images.append([os.path.join(video_dir_path, img) for img in frame_names])

    return images

def load_audio_path():
    """Collect audio-feature file paths for all courses as one flat list.

    Scans every sub-directory of the module-level ``courses_dir`` in
    ascending numeric order (directory names must parse as integers) and
    lists the contents of its ``audio-features`` folder, ordered by
    ``get_num`` of the file name.

    Returns:
        A flat list of path strings, course by course.
    """
    # Course folders only; plain files under courses_dir are ignored.
    course_names = [
        name for name in os.listdir(courses_dir)
        if os.path.isdir(os.path.join(courses_dir, name))
    ]

    audio_embeddings = []
    for course_number in sorted(course_names, key=int):
        course_audio_path = os.path.join(courses_dir, course_number, 'audio-features/')
        file_names = sorted(os.listdir(course_audio_path), key=get_num)
        # Join onto the directory path instead of concatenating strings so
        # the result is correct even if courses_dir has no trailing slash.
        audio_embeddings.extend(
            os.path.join(course_audio_path, file_name) for file_name in file_names
        )

    return audio_embeddings

def load_target_sentences_path():
    """Collect ground-truth text file paths for all courses as one flat list.

    Course folders under the module-level ``courses_dir`` are visited in
    ascending numeric order. Within each ``ground-truth`` folder only
    regular files whose name contains '.txt' and no '_' are kept, ordered
    by ``get_num`` of the file name.

    Returns:
        A flat list of path strings, course by course.
    """
    course_names = [
        entry for entry in os.listdir(courses_dir)
        if os.path.isdir(os.path.join(courses_dir, entry))
    ]

    flat_paths = []
    for course_number in sorted(course_names, key=int):
        target_path = os.path.join(courses_dir, course_number, 'ground-truth/')

        kept = []
        for item in os.listdir(target_path):
            if not os.path.isfile(os.path.join(target_path, item)):
                continue
            if '.txt' not in item or '_' in item:
                continue
            kept.append(item)
        kept.sort(key=get_num)

        # target_path already ends with '/', so plain concatenation is safe.
        flat_paths.extend(target_path + item for item in kept)

    return flat_paths

def load_source_sentences_path():
    """Collect transcript text file paths for all courses as one flat list.

    Course folders under the module-level ``courses_dir`` are visited in
    ascending numeric order. Within each ``transcripts`` folder only
    regular files whose name contains '.txt' are kept, ordered by
    ``get_num`` of the file name. Names containing '_' are not excluded.

    Returns:
        A flat list of path strings, course by course.
    """
    course_names = [
        entry for entry in os.listdir(courses_dir)
        if os.path.isdir(os.path.join(courses_dir, entry))
    ]

    flat_paths = []
    for course_number in sorted(course_names, key=int):
        source_path = os.path.join(courses_dir, course_number, 'transcripts/')

        kept = []
        for item in os.listdir(source_path):
            if os.path.isfile(os.path.join(source_path, item)) and '.txt' in item:
                kept.append(item)
        kept.sort(key=get_num)

        # source_path already ends with '/', so plain concatenation is safe.
        flat_paths.extend(source_path + item for item in kept)

    return flat_paths

In [1]:
from datasets import TextDataset, ImageDataset, AudioDataset, TargetDataset

import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F

In [2]:
text_embedding_dir = '/home/anish17281/NLP_Dataset/dataset/'

# One sample per batch, in dataset order.
# NOTE(review): the meaning of the second TextDataset argument (405) is not
# visible in this notebook; confirm against datasets.TextDataset.
train_text_loader = torch.utils.data.DataLoader(
    TextDataset(text_embedding_dir, 405),
    batch_size=1,
    shuffle=False,
    num_workers=2,
)

In [3]:
from PIL import Image
import torchvision.transforms as transforms
# Per-channel mean/std used to normalise image tensors.
# NOTE(review): these look like the usual ImageNet statistics; confirm.
normalize = transforms.Normalize(
    mean=[0.485, 0.456, 0.406],
    std=[0.229, 0.224, 0.225],
)

# Random crop resized to 256, random horizontal flip, tensor conversion,
# then normalisation, applied in that order.
transform = transforms.Compose([
    transforms.RandomResizedCrop(256),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    normalize,
])

image_dir = '/home/anish17281/NLP_Dataset/dataset/'

# One sample per batch, in dataset order.
train_image_loader = torch.utils.data.DataLoader(
    ImageDataset(image_dir, transform),
    batch_size=1,
    shuffle=False,
    num_workers=2,
)

In [4]:
audio_dir = '/home/anish17281/NLP_Dataset/dataset/'

# One sample per batch, in dataset order.
train_audio_loader = torch.utils.data.DataLoader(
    AudioDataset(audio_dir),
    batch_size=1,
    shuffle=False,
    num_workers=2,
)

In [5]:
courses_dir = '/home/anish17281/NLP_Dataset/dataset/'

# One sample per batch, in dataset order.
train_target_loader = torch.utils.data.DataLoader(
    TargetDataset(courses_dir),
    batch_size=1,
    shuffle=False,
    num_workers=2,
)

In [6]:
# Number of batches in the text loader (the loader is built with batch_size = 1).
len(train_text_loader)

961

In [7]:
# Number of batches in the image loader (the loader is built with batch_size = 1).
len(train_image_loader)

961

In [8]:
# Number of batches in the audio loader (the loader is built with batch_size = 1).
len(train_audio_loader)

961

In [9]:
# Number of batches in the target loader (the loader is built with batch_size = 1).
len(train_target_loader)

961

In [10]:
# Sizes of the target and source path lists held by the target dataset; the two are shown side by side for comparison.
len(train_target_loader.dataset.target_sentences_path), len(train_target_loader.dataset.source_sentences_path)

(961, 961)