In [58]:
import pickle
import numpy as np
from pathlib import Path
from torch.utils.data import Dataset
from moviepy.editor import *
from typing import Tuple, Dict

In [59]:
# Root data folder; the pickles read and written by this notebook live directly in it.
DATA_PATH = Path('../data/')
# Per-sample media folders, derived from the root so the location is set in one place.
VIDEO_PATH = DATA_PATH / 'urfunny2_video'
AUDIO_PATH = DATA_PATH / 'urfunny2_audio'

In [None]:
# One-off preprocessing: export the audio track of every labelled clip as MP3.
# NOTE: `load_pickle` is defined in a later cell of this notebook; on a fresh
# kernel that cell has to be executed before this one.
target_dict = load_pickle(DATA_PATH / "humor_label_sdk.pkl")

# Make sure the output folder exists before anything is written into it.
AUDIO_PATH.mkdir(parents=True, exist_ok=True)

for idx in target_dict.keys():
    # The context manager closes the clip (and its readers) even when the
    # export fails, so resources are not left open for every processed video.
    with VideoFileClip(str(VIDEO_PATH / f'{idx}.mp4')) as video:
        if video.audio is None:
            # A clip without an audio track would otherwise crash on `.write_audiofile`.
            print(f'Skipping {idx}: video has no audio track')
            continue
        video.audio.write_audiofile(str(AUDIO_PATH / f'{idx}.mp3'))


In [60]:
def load_pickle(pickle_file) -> Dict:
    """Unpickle ``pickle_file`` and return the stored object.

    The file is first read with pickle's default settings. If that raises a
    ``UnicodeDecodeError`` it is read again with ``encoding='latin1'``.
    Any other failure of the first attempt is printed and re-raised.

    Note: unpickling can execute arbitrary code, so only use this on trusted files.

    Args:
        pickle_file: path of the pickle file to read.

    Returns:
        The unpickled object.
    """
    def _read(**pickle_kwargs):
        # Open the file anew on every attempt so each read starts at byte 0.
        with open(pickle_file, 'rb') as handle:
            return pickle.load(handle, **pickle_kwargs)

    try:
        return _read()
    except UnicodeDecodeError:
        # Retry with a byte-preserving text encoding.
        return _read(encoding='latin1')
    except Exception as err:
        print('Unable to load data ', pickle_file, ':', err)
        raise

In [128]:
class HumorDataset(Dataset):
    """Per-sample index of the humor data: id, joined text, media paths and label.

    Reads ``language_sdk.pkl`` and ``humor_label_sdk.pkl`` from ``DATA_PATH``.
    Every per-sample list is built in the key order of the label pickle, so a
    given index refers to the same sample in all of them.
    """

    def __init__(self):
        """Load both pickles and build the aligned per-sample lists.

        Raises:
            KeyError: if a labelled sample id has no entry in ``language_sdk.pkl``.
        """
        self.origin_text = load_pickle(DATA_PATH / "language_sdk.pkl")
        target_dict = load_pickle(DATA_PATH / "humor_label_sdk.pkl")

        self.files_idx = list(target_dict.keys())
        self.target = list(target_dict.values())

        missing = [idx for idx in self.files_idx if idx not in self.origin_text]
        if missing:
            raise KeyError(
                f'{len(missing)} labelled sample(s) have no text entry, e.g. {missing[:5]}')

        # Text is looked up by sample id rather than taken in the language
        # pickle's own key order; otherwise text[i] and target[i] could belong
        # to different samples whenever the two pickles are ordered differently.
        self.preprocessed_text = [
            self._join_sentences(self.origin_text[idx]) for idx in self.files_idx]
        self.video = [VIDEO_PATH / f'{idx}.mp4' for idx in self.files_idx]
        self.audio = [AUDIO_PATH / f'{idx}.mp3' for idx in self.files_idx]

    @staticmethod
    def _join_sentences(entry) -> str:
        """Join an entry's context sentences and punchline into one string."""
        sentences = entry['context_sentences'] + [entry['punchline_sentence']]
        # Every sentence gets a leading space and sentences are separated by
        # '.', giving ' first. second. punchline' (leading space included).
        return '.'.join(' ' + sentence for sentence in sentences)

    def save(self):
        """Pickle the text, video-path and audio-path lists into ``DATA_PATH``."""
        # NOTE(review): the path lists hold pathlib objects; when created on
        # Windows they are WindowsPath instances, which presumably cannot be
        # unpickled on other platforms -- confirm with the consumers of these files.
        outputs = (
            ('text_sdk.pkl', self.preprocessed_text),
            ('video_sdk.pkl', self.video),
            ('audio_sdk.pkl', self.audio),
        )
        for file_name, payload in outputs:
            with open(DATA_PATH / file_name, 'wb') as out_file:
                pickle.dump(payload, out_file)

    def __len__(self):
        """Return the number of labelled samples."""
        return len(self.target)

    def __getitem__(self, index: int) -> Tuple[int, str, Path, Path, int]:
        """Return ``(file id, text, video path, audio path, label)`` for ``index``."""
        return (
            self.files_idx[index],
            self.preprocessed_text[index],
            self.video[index],
            self.audio[index],
            self.target[index],
        )

In [129]:
# Build the dataset index from the pickles under DATA_PATH.
dataset = HumorDataset()

In [130]:
# Spot-check one sample: (file id, text, video path, audio path, label).
dataset[1]

(3,
 " but physics does tell us what particles can be produced. these particles must have just as much mass and energy as is carried in by the proton and. any particles more massive than this energy limit aren't produced and remain invisible to us. this is why this new particle accelerator is so exciting. it's going to push this energy limit seven times beyond what's ever been done before so we're going to get to see some new particles very soon. but before talking about what we might see let me describe the particles we already know of. there's a whole zoo of subatomic particles. most of us are familiar with electrons. a lot of people in this room make a good living pushing them around",
 WindowsPath('../data/urfunny2_video/3.mp4'),
 WindowsPath('../data/urfunny2_audio/3.mp3'),
 1)

In [131]:
# Write text_sdk.pkl, video_sdk.pkl and audio_sdk.pkl into DATA_PATH.
dataset.save()