In [1]:
import av
from video import Video
# Clip used by the cells below; the location is relative (note the leading '../').
path = '../videos/SOX5yA1l24A.mp4'

In [2]:
# Open the clip; constructing the Video prints its streams and default-stream settings (see the output below).
test_object = Video(path)

List of available streams: (id, stream_type, stream)
	 0, video, <av.VideoStream #0 h264, yuv420p 340x256 at 0x7f336c8ae750> 
	 1, audio, <av.AudioStream #1 aac at 48000Hz, mono, fltp at 0x7f336c8ae6e0> 
../videos/SOX5yA1l24A.mp4 
 	 default stream:  <av.VideoStream #0 h264, yuv420p 340x256 at 0x7f336c8ae750> 
 	 keyframes only:  False




In [12]:
# Prints the available streams and returns them as a list of {'type', 'stream'} dicts (see the output below).
test_object.list_streams()

List of available streams: (id, stream_type, stream)
	 0, video, <av.VideoStream #0 h264, yuv420p 340x256 at 0x7f66b307ea60> 
	 1, audio, <av.AudioStream #1 aac at 48000Hz, mono, fltp at 0x7f66b307e980> 


[{'type': 'video',
  'stream': <av.VideoStream #0 h264, yuv420p 340x256 at 0x7f66b307ea60>},
 {'type': 'audio',
  'stream': <av.AudioStream #1 aac at 48000Hz, mono, fltp at 0x7f66b307e980>}]

# Implementing a rudimentary `read_video` function

### It supports most of the functionality of the current torchvision (TV) `read_video`

In [6]:
import torch
import torchvision
# Select "pyav" as the torchvision video backend before the reference read below.
torchvision.set_video_backend("pyav")
def get_tv(path, start_pts=8, end_pts=10):
    """Return the tensor sizes torchvision's own reader yields for a clip.

    Used as the reference that the custom ``read_video`` implementation is
    compared against.

    Args:
        path: location of the video file handed to ``torchvision.io.read_video``.
        start_pts: start of the window to read, in seconds. Defaults to 8,
            the value that used to be hard-coded.
        end_pts: end of the window to read, in seconds. Defaults to 10,
            the value that used to be hard-coded.

    Returns:
        A ``(video_size, audio_size)`` tuple: the ``.size()`` of the video
        frames and of the audio frames returned by torchvision.
    """
    # The third return value (stream metadata) is not needed for a size check.
    vframes, aframes, _ = torchvision.io.read_video(path, start_pts, end_pts, pts_unit="sec")
    return vframes.size(), aframes.size()

# Reference (video, audio) sizes from torchvision, formatted for printing in a later cell.
sa, sb = get_tv(path)
sizes = f"Expected sizes: {sa}, {sb}"

In [7]:
import torch
from torchvision import transforms as t

def read_video(vid, start=0, end=None, height=-1, width=-1, read_video=True, read_audio=True):
    """Read the frames of ``vid`` whose timestamps fall inside ``[start, end]``.

    Args:
        vid: a ``Video`` instance, or any other value, which is passed to
            ``Video(...)`` to open it (e.g. a file path).
        start: inclusive lower bound on the pts of the frames that are kept.
            Units are whatever ``Video.next`` reports -- presumably seconds,
            TODO confirm.
        end: inclusive upper bound on the pts; ``None`` means no upper bound.
        height: target frame height; resizing only happens when both
            ``height`` and ``width`` are positive.
        width: target frame width; see ``height``.
        read_video: whether to read frames from the "video" stream.
        read_audio: whether to read frames from the "audio" stream.

    Returns:
        dict with keys ``'video'`` and ``'audio'``. Each value is the kept
        frames stacked along a new first dimension, or ``torch.empty(0)``
        when that stream was not requested or no frame fell in the window.

    Raises:
        ValueError: if ``end`` is smaller than ``start``.
        AssertionError: if a requested stream type is missing from
            ``vid.available_streams``, or ``vid.next`` yields a frame of an
            unexpected stream type.
    """
    if not isinstance(vid, Video):
        # Open the source that was actually passed in, not a notebook-global path.
        vid = Video(vid)

    # safety checks - basic stuff
    if end is None:
        end = float("inf")
    if end < start:
        raise ValueError(
            "end_pts should be larger than start_pts, got "
            "start_pts={} and end_pts={}".format(start, end)
        )

    # safety checks, streams
    stream_types = [x['type'] for x in vid.available_streams]
    if read_video:
        assert "video" in stream_types
    if read_audio:
        assert "audio" in stream_types

    # Nothing requested: return right away, because the read loop below only
    # advances current_pts when at least one stream is being read.
    if not (read_video or read_audio):
        return {'video': torch.empty(0), 'audio': torch.empty(0)}

    # Build the per-frame transform once; it is applied frame by frame so the
    # untransformed frames never have to be kept around together.
    transforms = [t.ToTensor()]
    if width > 0 and height > 0:
        # Resulting order: ToPILImage -> Resize -> ToTensor.
        transforms.insert(0, t.Resize((height, width), interpolation=2))
        transforms.insert(0, t.ToPILImage())
    frame_transform = t.Compose(transforms)

    current_pts = start
    video_frames = []  # video frame buffer
    audio_frames = []  # audio frame buffer

    # this should get us close to the actual starting point we want
    if read_video:
        vid.seek(start, stream="video")
    if read_audio:
        vid.seek(start, stream="audio")

    # NOTE(review): both streams share current_pts, so when audio is read the
    # loop condition is driven by the audio pts and one audio frame is pulled
    # per video frame -- confirm this pairing is intended.
    # NOTE(review): with end=None the loop only stops when vid.next raises;
    # what Video.next does at end of stream is not visible here.
    while current_pts <= end:
        if read_video:
            frame, current_pts, stream_t = vid.next("video")
            assert stream_t == "video"
            if start <= current_pts <= end:
                video_frames.append(frame_transform(frame))
        if read_audio:
            frame, current_pts, stream_t = vid.next("audio")
            assert stream_t == "audio"
            if start <= current_pts <= end:
                audio_frames.append(torch.tensor(frame))

    def _stack(frames):
        # torch.stack rejects an empty list; fall back to an empty tensor.
        return torch.stack(frames, 0) if frames else torch.empty(0)

    return {'video': _stack(video_frames), 'audio': _stack(audio_frames)}

In [8]:
# `Video` and `path` repeat the definitions from the first cell; a fresh Video is opened here.
from video import Video
path = '../videos/SOX5yA1l24A.mp4'
test_object = Video(path)

# Read with start=0, end=2 using the custom reader defined above.
# NOTE(review): `sizes` comes from get_tv, which reads 8..10 with pts_unit="sec", so the two
# printed lines may not describe the same window -- confirm this is intended.
test = read_video(test_object, 0, 2)

print(sizes)
print(test['video'].size(), test['audio'].size())

List of available streams: (id, stream_type, stream)
	 0, video, <av.VideoStream #0 h264, yuv420p 340x256 at 0x7f65057df210> 
	 1, audio, <av.AudioStream #1 aac at 48000Hz, mono, fltp at 0x7f65057df3d0> 
../videos/SOX5yA1l24A.mp4 
 	 default stream:  <av.VideoStream #0 h264, yuv420p 340x256 at 0x7f65057df210> 
 	 keyframes only:  False
Expected sizes: torch.Size([61, 256, 340, 3]), torch.Size([1, 0])
torch.Size([60, 3, 256, 340]) torch.Size([60, 1, 1024])


torch.Size([60, 1, 384, 340]) torch.Size([60, 1, 1024])
