In [1]:
import av
import math
from video_b import Video
# Relative location of the sample clip; later cells pass it to Video() and get_tv().
path = '../videos/SOX5yA1l24A.mp4'

In [2]:
# Open the sample clip through the project's Video wrapper (imported from video_b).
bla = Video(path)

In [3]:
# Jump to position 8. The next() call in the following cell reports frames at
# ~7.3, so seek evidently lands a little before the requested point.
bla.seek(8)

In [4]:
# Fetch the next batch of decoded frames. As the output shows, it is a list of
# (data array, pts, stream type) tuples, here one for video and one for audio.
print(bla.next())

<av.VideoStream #0 h264, yuv420p 340x256 at 0x7fe979377de0> 7.3073
<av.AudioStream #1 aac at 48000Hz, mono, fltp at 0x7fe979377d00> 7.317333333333333
[(array([[ 26, 101, 123, ..., 216, 216, 216],
       [ 29, 104, 126, ..., 219, 219, 219],
       [ 31, 106, 128, ..., 226, 226, 226],
       ...,
       [132, 132, 132, ..., 132, 132, 132],
       [132, 132, 132, ..., 132, 132, 132],
       [132, 132, 132, ..., 132, 132, 132]], dtype=uint8), 7.3073, 'video'), (array([[ 0.        ,  0.        ,  0.        , ..., -0.00657651,
        -0.00641972, -0.00634111]], dtype=float32), 7.317333333333333, 'audio')]


# Implementing a rudimentary `read_video` function

In [5]:
import torch
from torchvision import transforms as t


# read video
def _read_video(vo, per_frame_transform, start=0, end=None ):
    if not isinstance(vo, Video):
        vo = Video(path)
    
    if end is None:
        end = float("inf")
    
    if end < start:
        raise ValueError(
            "end_pts should be larger than start_pts, got "
            "start_pts={} and end_pts={}".format(start_pts, end_pts)
        )
    
    
    current_pts = start
    samples = {}

       
    # this should get us close to the actual starting point we want
    vo.seek(start)
    while current_pts <= end:
        data = vo.next()
        for frame in data:
            if frame[2] not in samples:
                samples[frame[2]] = []
                samples[f"{frame[2]}_pts"] = []
            if frame[2] == "video":
                samples[frame[2]].append(per_frame_transform(frame[0]))
                current_pts = frame[1]
            else:
                samples[frame[2]].append(torch.from_numpy(frame[0]))
            samples[f"{frame[2]}_pts"].append(frame[1])
        
    
    return samples
    

In [6]:
def read_video(vo, start=0, end=None, width=-1, height=-1):
    """Read frames via ``_read_video`` with an optional per-frame resize.

    Every video frame is converted with ``t.ToTensor()``. When both
    ``width`` and ``height`` are positive, the frame is first turned into a
    PIL image and resized to ``(height, width)``; otherwise no resize step
    is added.

    Args:
        vo: Forwarded unchanged to ``_read_video``.
        start: Forwarded unchanged to ``_read_video``.
        end: Forwarded unchanged to ``_read_video``.
        width: Target width; resizing only happens if this and ``height``
            are both greater than zero.
        height: Target height; see ``width``.

    Returns:
        Whatever ``_read_video`` returns for the composed transform.
    """
    steps = []
    wants_resize = width > 0 and height > 0
    if wants_resize:
        # Resize operates on a PIL image here, so convert first.
        steps.append(t.ToPILImage())
        steps.append(t.Resize((height, width), interpolation=2))
    steps.append(t.ToTensor())

    return _read_video(vo, t.Compose(steps), start, end)


In [7]:
# Read from position 0 up to 1 with no resizing, then dump the returned dict
# of per-stream frames and timestamps.
test = read_video(bla, 0, 1)
print(test)

<av.VideoStream #0 h264, yuv420p 340x256 at 0x7fe979377de0> 0.0
<av.AudioStream #1 aac at 48000Hz, mono, fltp at 0x7fe979377d00> 0.0
<av.VideoStream #0 h264, yuv420p 340x256 at 0x7fe979377de0> 0.03336666666666667
<av.AudioStream #1 aac at 48000Hz, mono, fltp at 0x7fe979377d00> 0.042666666666666665
<av.VideoStream #0 h264, yuv420p 340x256 at 0x7fe979377de0> 0.06673333333333334
<av.AudioStream #1 aac at 48000Hz, mono, fltp at 0x7fe979377d00> 0.08533333333333333
<av.VideoStream #0 h264, yuv420p 340x256 at 0x7fe979377de0> 0.1001
<av.AudioStream #1 aac at 48000Hz, mono, fltp at 0x7fe979377d00> 0.10666666666666667
<av.VideoStream #0 h264, yuv420p 340x256 at 0x7fe979377de0> 0.13346666666666668
<av.AudioStream #1 aac at 48000Hz, mono, fltp at 0x7fe979377d00> 0.14933333333333335
<av.VideoStream #0 h264, yuv420p 340x256 at 0x7fe979377de0> 0.16683333333333333
<av.AudioStream #1 aac at 48000Hz, mono, fltp at 0x7fe979377d00> 0.17066666666666666
<av.VideoStream #0 h264, yuv420p 340x256 at 0x7fe97937

In [41]:
# read_video returns a dict of per-stream lists, not a tensor, so calling
# .size() on it directly fails. Stack the transformed video frames into one
# tensor first and report that tensor's shape.
test = read_video(bla, 8, 10, 376, 224)
print(torch.stack(test["video"]).size())

torch.Size([60, 1, 224, 376])


In [42]:
import torch
import torchvision
# Select torchvision's "pyav" video backend for the read_video call below.
torchvision.set_video_backend("pyav")
def get_tv(path, start=8, end=10):
    """Print how many video frames torchvision decodes in a time window.

    Args:
        path: Video file handed to ``torchvision.io.read_video``.
        start: Window start in seconds. Defaults to 8, the value that was
            previously hard-coded.
        end: Window end in seconds. Defaults to 10, likewise.
    """
    # Only the first returned item (the video frames) is used; the other two
    # return values are discarded.
    vframes, _, _ = torchvision.io.read_video(
        path, start_pts=start, end_pts=end, pts_unit="sec"
    )
    print("TVAV", len(vframes))

# Count the frames torchvision decodes for the same clip and window, for
# comparison with the read_video result above.
get_tv(path)

TVAV 61
