In [None]:
import av
from video import Video
path = '../videos/SOX5yA1l24A.mp4'

In [7]:
test_object = Video(path)

List of available streams: (id, stream_type, stream)
	 0, video, <av.VideoStream #0 h264, yuv420p 340x256 at 0x7f222b3a00c0> 
	 1, audio, <av.AudioStream #1 aac at 48000Hz, mono, fltp at 0x7f222b3a09f0> 
../videos/SOX5yA1l24A.mp4 
 	 default stream:  <av.VideoStream #0 h264, yuv420p 340x256 at 0x7f222b3a00c0>


In [8]:
test_object.list_streams()

List of available streams: (id, stream_type, stream)
	 0, video, <av.VideoStream #0 h264, yuv420p 340x256 at 0x7f222b3a00c0> 
	 1, audio, <av.AudioStream #1 aac at 48000Hz, mono, fltp at 0x7f222b3a09f0> 


[{'type': 'video',
  'stream': <av.VideoStream #0 h264, yuv420p 340x256 at 0x7f222b3a00c0>},
 {'type': 'audio',
  'stream': <av.AudioStream #1 aac at 48000Hz, mono, fltp at 0x7f222b3a09f0>}]

# Implementing rudimentary read video function

### That supports most of the functionality from the current TV

In [9]:
import torch
import torchvision
torchvision.set_video_backend("pyav")
def get_tv(path):
    vframes, aframes, _ = torchvision.io.read_video(path, pts_unit="sec")
    return vframes.size(), aframes.size()

sa, sb = get_tv(path)
sizes = f"Expected sizes: {sa}, {sb}"

In [10]:
import torch
from torchvision import transforms as t


def read_video(vid, start=0, end=None, height=-1, width=-1, read_video=True, read_audio=True):
    if not isinstance(vid, Video):
        vid = Video(path)
    
    # safety checks - basic stuff
    if end is None:
        end = float("inf")
    if end < start:
        raise ValueError(
            "end_pts should be larger than start_pts, got "
            "start_pts={} and end_pts={}".format(start_pts, end_pts)
        )
    
    # safety checks, streams
    stream_types = [x['type'] for x in vid.available_streams]
    if read_video:
        assert "video" in stream_types
    if read_audio:
        assert "audio" in stream_types
    
    # get video_transform to apply per frame
    # should save on memory
    transforms = [t.ToTensor()]
    if width > 0 and height>0:
            transforms.insert(0, t.Resize((height, width), interpolation=2))
            transforms.insert(0, t.ToPILImage())     
    frame_transform = t.Compose(transforms)
    
    current_pts = start
    if read_video:
        video_frames = [] # video frame buffer 
    if read_audio:
        audio_frames = [] # audio frame buffer
    
    # this should get us close to the actual starting point we want
    if read_video:
        vid.seek(start, stream="video")
    
    while current_pts < end:
        if read_video:
            frame, current_pts, stream_t = vid.next("video")
            if current_pts >= start and current_pts < end:
                video_frames.append(frame_transform(frame))
        if read_audio:
            frame, current_pts, stream_t = vid.next("audio")
            assert stream_t == "audio"
            if current_pts >= start and current_pts < end:
                audio_frames.append(torch.tensor(frame))
    
    output = {'video': torch.stack(video_frames, 0) if read_video else torch.empty(0),
              'audio': torch.stack(audio_frames, 0) if read_audio else torch.empty(0)}
    
    return output

In [12]:
from video import Video
path = '../videos/SOX5yA1l24A.mp4'
test_object = Video(path)

test = read_video(test_object)

print(sizes)
print(test['video'].size(), test['audio'].size())

List of available streams: (id, stream_type, stream)
	 0, video, <av.VideoStream #0 h264, yuv420p 340x256 at 0x7f222b43c360> 
	 1, audio, <av.AudioStream #1 aac at 48000Hz, mono, fltp at 0x7f2226318c90> 
../videos/SOX5yA1l24A.mp4 
 	 default stream:  <av.VideoStream #0 h264, yuv420p 340x256 at 0x7f222b43c360>
Expected sizes: torch.Size([332, 256, 340, 3]), torch.Size([1, 530432])
torch.Size([331, 3, 256, 340]) torch.Size([330, 1, 1024])


In [9]:
container

NameError: name 'container' is not defined

# Dataset using the new API

In [40]:
import os
import random

import torch
from torchvision.datasets.folder import make_dataset
from torchvision import transforms as t

from video import Video


def _find_classes(dir):
    classes = [d.name for d in os.scandir(dir) if d.is_dir()]
    classes.sort()
    class_to_idx = {cls_name: i for i, cls_name in enumerate(classes)}
    return classes, class_to_idx

def read_video_frames(vid, start=0, nframes=1, height=-1, width=-1, read_video=True, read_audio=False, from_keyframes=True):
    if not isinstance(vid, Video):
        vid = Video(path)    
    # safety checks, streams
    stream_types = [x['type'] for x in vid.available_streams]
    if read_video:
        assert "video" in stream_types
    if read_audio:
        assert "audio" in stream_types
    
    # get video_transform to apply per frame
    # should save on memory
    transforms = [t.ToTensor()]
    if width > 0 and height>0:
        transforms.insert(0, t.Resize((height, width), interpolation=2))
        transforms.insert(0, t.ToPILImage())     
    frame_transform = t.Compose(transforms)
    
    current_pts = start
    if read_video:
        video_frames = [] # video frame buffer 
    if read_audio:
        audio_frames = [] # audio frame buffer
            
    # this should get us close to the actual starting point we want
    vid.seek(start, stream="video")
    while len(video_frames) < nframes:
        frame, current_pts, stream_t = vid.next("video")
        if from_keyframes:
            video_frames.append(frame_transform(frame))
        else:
            frame, current_pts, stream_t = vid.next("video")
            if current_pts >= start:
                video_frames.append(frame_transform(frame))
    
    output = {'video': torch.stack(video_frames, 0) if read_video else torch.empty(0)}
    return output

class VideoDataset(torch.utils.data.IterableDataset):
    def __init__(self, root, clip_len=16, shuffle=True, bs_multiplier=5, sampling='random', alpha=0.2, height=-1, width=-1):
        super(VideoDataset).__init__()
        # safety checks
        assert isinstance(bs_multiplier, int) and bs_multiplier >= 1
        assert sampling in ["random", "uniform"]
        
        self.root = root
        self.clip_len = clip_len
        self.height=height
        self.width=width
        self.alpha = alpha  #  hack to ensure readin is correct
        self._build_dataset(bs_multiplier, sampling)  
    
    def _build_dataset(self, bs_multiplier, sampling):
        _, ctidx = _find_classes(self.root)
        samples = make_dataset(self.root, ctidx, extensions=(".mp4", ".avi"))
        self.samples = []
        for sample in samples:
            path, target = sample
            vid = Video(path, debug=False)
            max_seek = vid.duration - (self.clip_len / vid.fps + self.alpha)
            if sampling == "random":
                tss = sorted([random.uniform(0., max_seek) for _ in range(bs_multiplier)])
            else:
                step = max(length // self.max_clips_per_video, 1)
                tss = [i.item() for i in list(torch.linspace(0, max_seek, steps=bs_multiplier))]
            
            for ts in tss:
                self.samples.append((path, target, ts))
    
    def _get_sample(self, sample):
        path, target, ts = sample
        vid = Video(path, debug=False)
        sample = read_video_frames(vid, start=ts, nframes=self.clip_len, height=self.height, width=self.width)
        sample['target'] = target
        return sample
        
    
    def __iter__(self):
        return iter([self._get_sample(sample) for sample in self.samples])


In [41]:
ds = VideoDataset("../dataset_files/", height=112, width=112)

In [42]:
from torch.utils.data import DataLoader
from itertools import islice
loader = DataLoader(ds, batch_size=2)

In [43]:
for b in loader:
    print(b)

Compose(
    ToPILImage()
    Resize(size=(112, 112), interpolation=PIL.Image.BILINEAR)
    ToTensor()
)
Compose(
    ToPILImage()
    Resize(size=(112, 112), interpolation=PIL.Image.BILINEAR)
    ToTensor()
)
Compose(
    ToPILImage()
    Resize(size=(112, 112), interpolation=PIL.Image.BILINEAR)
    ToTensor()
)
Compose(
    ToPILImage()
    Resize(size=(112, 112), interpolation=PIL.Image.BILINEAR)
    ToTensor()
)
Compose(
    ToPILImage()
    Resize(size=(112, 112), interpolation=PIL.Image.BILINEAR)
    ToTensor()
)
Compose(
    ToPILImage()
    Resize(size=(112, 112), interpolation=PIL.Image.BILINEAR)
    ToTensor()
)
Compose(
    ToPILImage()
    Resize(size=(112, 112), interpolation=PIL.Image.BILINEAR)
    ToTensor()
)
Compose(
    ToPILImage()
    Resize(size=(112, 112), interpolation=PIL.Image.BILINEAR)
    ToTensor()
)
Compose(
    ToPILImage()
    Resize(size=(112, 112), interpolation=PIL.Image.BILINEAR)
    ToTensor()
)
Compose(
    ToPILImage()
    Resize(size=(112, 112), i