In [1]:
import av
from video import Video
path = '../videos/SOX5yA1l24A.mp4'

In [2]:
test_object = Video(path)

List of available streams: (id, stream_type, stream)
	 0, video, <av.VideoStream #0 h264, yuv420p 340x256 at 0x7ff41f98b8a0> 
	 1, audio, <av.AudioStream #1 aac at 48000Hz, mono, fltp at 0x7ff41f98ba60> 
Metadata set
../videos/SOX5yA1l24A.mp4 
 	 default stream:  <av.VideoStream #0 h264, yuv420p 340x256 at 0x7ff41f98b8a0>




In [3]:
test_object.metadata

{<av.VideoStream #0 h264, yuv420p 340x256 at 0x7ff41f98b8a0>: {'fps': Fraction(30000, 1001),
  'duration': 332332},
 <av.AudioStream #1 aac at 48000Hz, mono, fltp at 0x7ff41f98ba60>: {'fps': 48000,
  'duration': 530880}}

# Implementing rudimentary read video function

### That supports most of the functionality from the current TV

In [1]:
import torch
import torchvision
torchvision.set_video_backend("pyav")
def get_tv(path):
    vframes, aframes, _ = torchvision.io.read_video(path, pts_unit="sec")
    return vframes.size(), aframes.size()

sa, sb = get_tv(path)
sizes = f"Expected sizes: {sa}, {sb}"

NameError: name 'path' is not defined

In [None]:
import torch
from torchvision import transforms as t


def read_video(vid, start=0, end=None, height=-1, width=-1, read_video=True, read_audio=True):
    if not isinstance(vid, Video):
        vid = Video(path)
    
    # safety checks - basic stuff
    if end is None:
        end = float("inf")
    if end < start:
        raise ValueError(
            "end_pts should be larger than start_pts, got "
            "start_pts={} and end_pts={}".format(start_pts, end_pts)
        )
    
    # safety checks, streams
    stream_types = [x['type'] for x in vid.available_streams]
    if read_video:
        assert "video" in stream_types
    if read_audio:
        assert "audio" in stream_types
    
    # get video_transform to apply per frame
    # should save on memory
    transforms = [t.ToTensor()]
    if width > 0 and height>0:
            transforms.insert(0, t.Resize((height, width), interpolation=2))
            transforms.insert(0, t.ToPILImage())     
    frame_transform = t.Compose(transforms)
    
    current_pts = start
    if read_video:
        video_frames = [] # video frame buffer 
    if read_audio:
        audio_frames = [] # audio frame buffer
    
    # this should get us close to the actual starting point we want
    if read_video:
        vid.seek(start, stream="video")
    
    while current_pts < end:
        if read_video:
            frame, current_pts, stream_t = vid.next("video")
            if current_pts >= start and current_pts < end:
                video_frames.append(frame_transform(frame))
        if read_audio:
            frame, current_pts, stream_t = vid.next("audio")
            assert stream_t == "audio"
            if current_pts >= start and current_pts < end:
                audio_frames.append(torch.tensor(frame))
    
    output = {'video': torch.stack(video_frames, 0) if read_video else torch.empty(0),
              'audio': torch.stack(audio_frames, 0) if read_audio else torch.empty(0)}
    
    return output

In [2]:
from video import Video
path = '../videos/SOX5yA1l24A.mp4'
test_object = Video(path)

test = read_video(test_object)

print(sizes)
print(test['video'].size(), test['audio'].size())

List of available streams: (id, stream_type, stream)
	 0, video, <av.VideoStream #0 h264, yuv420p 340x256 at 0x7f054062af30> 
	 1, audio, <av.AudioStream #1 aac at 48000Hz, mono, fltp at 0x7f054062ac90> 
Metadata set
../videos/SOX5yA1l24A.mp4 
 	 default stream:  <av.VideoStream #0 h264, yuv420p 340x256 at 0x7f054062af30>




NameError: name 'read_video' is not defined

# Dataset using the new API

a dumb approach mimiking the MAP-styple dataset

In [1]:
import os
import random

import torch
from torchvision.datasets.folder import make_dataset
from torchvision import transforms as t

from video import Video


def _find_classes(dir):
    classes = [d.name for d in os.scandir(dir) if d.is_dir()]
    classes.sort()
    class_to_idx = {cls_name: i for i, cls_name in enumerate(classes)}
    return classes, class_to_idx

def read_video_frames(vid, start=0, nframes=1, height=-1, width=-1, read_video=True, read_audio=False, from_keyframes=True):
    if not isinstance(vid, Video):
        vid = Video(path)    
    # safety checks, streams
    stream_types = [x['type'] for x in vid.available_streams]
    if read_video:
        assert "video" in stream_types
    if read_audio:
        assert "audio" in stream_types
    
    # get video_transform to apply per frame
    # should save on memory
    transforms = [t.ToTensor()]
    if width > 0 and height>0:
        transforms.insert(0, t.Resize((height, width), interpolation=2))
        transforms.insert(0, t.ToPILImage())     
    frame_transform = t.Compose(transforms)
    
    current_pts = start
    if read_video:
        video_frames = [] # video frame buffer 
    if read_audio:
        audio_frames = [] # audio frame buffer
            
    # this should get us close to the actual starting point we want
    vid.seek(start, stream="video")
    while len(video_frames) < nframes:
        frame, current_pts, stream_t = vid.next("video")
        if from_keyframes:
            video_frames.append(frame_transform(frame))
        else:
            if current_pts >= start:
                video_frames.append(frame_transform(frame))
    
    output = {'video': torch.stack(video_frames, 0) if read_video else torch.empty(0)}
    return output

class VideoDataset(torch.utils.data.IterableDataset):
    def __init__(self, root, clip_len=16, shuffle=True, bs_multiplier=5, sampling='random', alpha=0.2, height=-1, width=-1):
        super(VideoDataset).__init__()
        # safety checks
        assert isinstance(bs_multiplier, int) and bs_multiplier >= 1
        assert sampling in ["random", "uniform"]
        
        self.root = root
        self.clip_len = clip_len
        self.height=height
        self.width=width
        self.alpha = alpha  #  hack to ensure readin is correct
        self._build_dataset(bs_multiplier, sampling)  
    
    def _build_dataset(self, bs_multiplier, sampling):
        _, ctidx = _find_classes(self.root)
        samples = make_dataset(self.root, ctidx, extensions=(".mp4", ".avi"))
        self.samples = []
        for sample in samples:
            path, target = sample
            vid = Video(path, debug=False)
            max_seek = vid.metadata[vid.current_stream]['duration'] - (self.clip_len / vid.metadata[vid.current_stream]['fps'] + self.alpha)
            if sampling == "random":
                tss = sorted([random.uniform(0., max_seek) for _ in range(bs_multiplier)])
            else:
                step = max(length // self.max_clips_per_video, 1)
                tss = [i.item() for i in list(torch.linspace(0, max_seek, steps=bs_multiplier))]
            
            for ts in tss:
                self.samples.append((path, target, ts))
    
    def _get_sample(self, sample):
        path, target, ts = sample
        vid = Video(path, debug=False)
        sample = read_video_frames(vid, start=ts, nframes=self.clip_len, height=self.height, width=self.width)
        sample['target'] = target
        return sample
        
    
    def __iter__(self):
        return iter([self._get_sample(sample) for sample in self.samples])


In [2]:
ds = VideoDataset("../dataset_files/", height=112, width=112)



In [3]:
from torch.utils.data import DataLoader
from itertools import islice
loader = DataLoader(ds, batch_size=2)

In [4]:
for b in loader:
    print(b)

{'video': tensor([[[[[0.0824, 0.0824, 0.0824,  ..., 0.5490, 0.5451, 0.5412],
           [0.0824, 0.0824, 0.0824,  ..., 0.5490, 0.5529, 0.5569],
           [0.0824, 0.0824, 0.0824,  ..., 0.5490, 0.5569, 0.5569],
           ...,
           [0.8627, 0.9020, 0.9255,  ..., 0.9137, 0.9020, 0.8902],
           [0.8157, 0.8118, 0.8118,  ..., 0.8510, 0.8667, 0.8627],
           [0.8980, 0.8902, 0.8706,  ..., 0.9725, 0.9333, 0.8863]],

          [[0.0549, 0.0549, 0.0549,  ..., 0.3373, 0.3333, 0.3294],
           [0.0549, 0.0549, 0.0549,  ..., 0.3373, 0.3412, 0.3451],
           [0.0549, 0.0549, 0.0549,  ..., 0.3373, 0.3412, 0.3451],
           ...,
           [0.7529, 0.7922, 0.8157,  ..., 0.8275, 0.8157, 0.8196],
           [0.6980, 0.6941, 0.6941,  ..., 0.7647, 0.7804, 0.7922],
           [0.7804, 0.7725, 0.7529,  ..., 0.8863, 0.8510, 0.8157]],

          [[0.0941, 0.0941, 0.0941,  ..., 0.4275, 0.4196, 0.4118],
           [0.0941, 0.0941, 0.0941,  ..., 0.4275, 0.4275, 0.4275],
           [0.09

## Attempt 2 - refining things

Ok, so the first one was just a mockup - a copy of the way we do things using map. Now we create a more "iterator style" based one.

In [9]:
import os
import random

import torch
from torchvision.datasets.folder import make_dataset
from torchvision import transforms as t

from video import Video

class VideoDataset(torch.utils.data.IterableDataset):
    def __init__(self, root, epoch_size=None, frame_transform=None, video_transform=None, clip_len=16, shuffle=True, from_keyframes=True, alpha=0.2):
        super(VideoDataset).__init__()
        _, class_to_idx = VideoDataset._find_classes(root)
        self.samples = make_dataset(root, class_to_idx, extensions=(".mp4", ".avi"))
        
        # allow for temporal jittering and stuff
        if epoch_size is None:
            epoch_size = len(self.samples)
        self.epoch_size = epoch_size
        # length of a clip
        self.clip_len = clip_len
        # transform for every frame
        self.frame_transform = frame_transform if frame_transform is not None else t.ToTensor()
        # transform on batch of frames (video sequence)
        self.video_transform = video_transform
        # tollerance for seeking
        self.alpha = alpha
        # only look at keyframes
        self.from_keyframes = from_keyframes
        
    
    @staticmethod
    def _find_classes(dir):
        classes = [d.name for d in os.scandir(dir) if d.is_dir()]
        classes.sort()
        class_to_idx = {cls_name: i for i, cls_name in enumerate(classes)}
        return classes, class_to_idx

    def __iter__(self):
        for i in range(self.epoch_size):
            # get random sample
            path, target = random.choice(self.samples)
            # get video object
            vid = Video(path, debug=False)
            video_frames = [] # video frame buffer 
            # seek and return frames
            max_seek = vid.metadata[vid.current_stream]['duration'] - (self.clip_len / vid.metadata[vid.current_stream]['fps'] + self.alpha)
            start = random.uniform(0., max_seek)
            vid.seek(start, stream="video")
            while len(video_frames) < self.clip_len:
                frame, current_pts, stream_t = vid.next("video")
                if self.from_keyframes:
                    video_frames.append(self.frame_transform(frame))
                else:
                    if current_pts >= start:
                        video_frames.append(self.frame_transform(frame))
            video = torch.stack(video_frames, 0)
            if self.video_transform:
                video = self.video_transform(video)
            output = {
                'path': path,
                'video': video,
                'target': target,
                'start': start,
                'end': current_pts}
            yield output

In [10]:
transforms = [t.ToPILImage(), t.Resize((112, 112), interpolation=2), t.ToTensor()]
frame_transform = t.Compose(transforms)
ds = VideoDataset("../dataset_files/", epoch_size=20, frame_transform=frame_transform, from_keyframes=False)

In [11]:
from torch.utils.data import DataLoader
from itertools import islice
loader = DataLoader(ds, batch_size=12)

In [12]:
for b in loader:
    print(b["start"], b['end'])

tensor([5.5447, 3.8373, 8.6429, 4.8835, 4.7216, 1.2900, 1.8597, 6.3515, 1.2064,
        3.6049, 6.0683, 0.6378], dtype=torch.float64) tensor([6.0727, 4.3710, 9.1667, 5.4054, 5.2386, 1.8018, 2.3690, 6.8667, 1.7000,
        4.1375, 6.5732, 1.1333], dtype=torch.float64)
tensor([0.5249, 4.1899, 4.2973, 0.7974, 5.5179, 3.0670, 8.3097, 1.2130],
       dtype=torch.float64) tensor([1.0000, 4.7047, 4.8048, 1.2667, 6.0394, 3.5702, 8.8422, 1.7351],
       dtype=torch.float64)
