In [1]:
import av
import numpy as np
import torch 
from typing import List, Tuple
from numpy.random import randint

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
import os

# Switch the working directory to the repository root so that the `src.*`
# imports below resolve.
# The move is guarded: an unconditional `os.chdir('../')` climbs one level
# higher every time this cell is re-executed, which breaks the imports on the
# second run. Assumes the repository root is the directory that contains the
# `src` package (and that the notebook's own folder does not) -- TODO confirm.
if not os.path.isdir("src"):
    os.chdir('../')

from src.opts.opts import parser

from src.dataset.prepare_split_list import get_video_name_list
from src.dataset.video_dataset import prepare_clips_data
from src.dataset.frame_loader import load_av_frames_from_video
from src.dataset.temporal_sampling import temporal_sampling

from src.hands.loader import read_hand_pose_txt, load_hands_coords, make_paths_to_hands
from src.hands.temporal_sampling import hands_temporal_sampling
from src.hands.vis import vis_hands_skeleton
from src.hands.graph import joints

In [3]:
# Parse with an empty argument list so that only the parser's defaults are
# used (the notebook kernel's own command line is ignored).
args = parser.parse_args(args=[])

# Show every option and its value as a plain dict.
vars(args)

{'holoassist_dir': '/Users/artemmerinov/data/holoassist',
 'raw_annotation_file': '/Users/artemmerinov/data/holoassist/data-annotation-trainval-v1_1.json',
 'split_dir': '/Users/artemmerinov/data/holoassist/data-splits-v1',
 'fine_grained_actions_map_file': '/Users/artemmerinov/data/holoassist/fine_grained_actions_map.txt',
 'base_model': 'InceptionV3',
 'fusion_mode': None,
 'num_segments': 8,
 'num_classes': 1887,
 'resume': None,
 'start_epoch': 0,
 'num_epochs': 10,
 'lr': 0.01,
 'momentum': 0.9,
 'weight_decay': 0.0005,
 'clip_gradient': None,
 'batch_size': 16,
 'num_workers': 4,
 'prefetch_factor': 2,
 'debug': False}

In [4]:
# Collect the video names of the "train" split, using the split and dataset
# directories taken from the parsed options.
video_name_list = get_video_name_list(holoassist_dir=args.holoassist_dir, split_dir=args.split_dir, mode="train")

# Bare last expression so the notebook renders the list.
video_name_list

There are 7 videos in the list There are 13 videos as video files There are 0 videos that present in the list but are missing as videos.


['R0027-12-GoPro',
 'R029-12July-DSLR',
 'R034-12July-Switch',
 'R061-15July-Belt',
 'z183-sep-08-22-marius_disassemble',
 'z090-july-08-22-printer_small',
 'z206-sep-26-22-gopro']

In [5]:
# Display the hand-pose file paths that the loader resolves for one video.
make_paths_to_hands(video_name="R029-12July-DSLR", holoassist_dir=args.holoassist_dir)

('/Users/artemmerinov/data/holoassist/hands/R029-12July-DSLR/Export_py/Hands/Left_sync.txt',
 '/Users/artemmerinov/data/holoassist/hands/R029-12July-DSLR/Export_py/Hands/Right_sync.txt')

In [6]:
# Load the hand coordinates of one video for the window between `start_secs`
# and `end_secs`; the loader returns one object per hand (left, right).
left_hand_data, right_hand_data = load_hands_coords(video_name="R029-12July-DSLR", holoassist_dir=args.holoassist_dir, start_secs=60, end_secs=70)

In [7]:
# Dimensions of the loaded left-hand tensor (`.size()` is equivalent to `.shape`).
left_hand_data.size()

torch.Size([299, 26, 3])

In [8]:
# Inspect the loaded left-hand coordinates; the tensor repr abbreviates long
# dimensions with "...".
left_hand_data

tensor([[[ 0.4920,  2.5695, -0.5036],
         [ 0.4409,  2.5598, -0.5026],
         [ 0.4574,  2.5716, -0.4807],
         ...,
         [ 0.5445,  2.5388, -0.5236],
         [ 0.5592,  2.5271, -0.5200],
         [ 0.5692,  2.5187, -0.5127]],

        [[ 0.4917,  2.5697, -0.5037],
         [ 0.4406,  2.5600, -0.5026],
         [ 0.4571,  2.5719, -0.4808],
         ...,
         [ 0.5441,  2.5390, -0.5237],
         [ 0.5589,  2.5273, -0.5200],
         [ 0.5690,  2.5190, -0.5128]],

        [[ 0.4915,  2.5697, -0.5040],
         [ 0.4404,  2.5599, -0.5030],
         [ 0.4569,  2.5718, -0.4811],
         ...,
         [ 0.5441,  2.5391, -0.5237],
         [ 0.5588,  2.5275, -0.5200],
         [ 0.5688,  2.5192, -0.5127]],

        ...,

        [[ 0.5238,  2.5529, -0.5209],
         [ 0.4721,  2.5481, -0.5235],
         [ 0.4866,  2.5576, -0.5043],
         ...,
         [ 0.5787,  2.5208, -0.5370],
         [ 0.5947,  2.5102, -0.5342],
         [ 0.6057,  2.5027, -0.5275]],

        [[

In [9]:
# Extract frames from video using start and end time (in seconds).
# The path is built from the configured dataset root instead of a hard-coded
# absolute user path, so the cell also works when `args.holoassist_dir`
# points somewhere else. With the configuration printed above it resolves to
# exactly the same file as before.
video_path = os.path.join(
    args.holoassist_dir,
    "video_pitch_shifted",
    "R0027-12-GoPro",
    "Export_py",
    "Video_pitchshift.mp4",
)
clip_frames = load_av_frames_from_video(
    path_to_video=video_path,
    start_secs=378.588,
    end_secs=380.211,
)

# Perform temporal sampling.
# The raw frames keep their own name (`clip_frames`), so `frames` only ever
# refers to the sampled subset. The segment count comes from the parsed
# options (8 in the configuration shown above) rather than a literal.
# NOTE(review): mode="train" looks stochastic (the selection differs between
# runs) -- seed the RNG it uses if reproducible output is required.
sampling_portions, frames = temporal_sampling(
    frames=clip_frames,
    num_segments=args.num_segments,
    mode="train"
)

# Display both results: the sampling portions and the selected frames.
sampling_portions, frames

(array([0.02439024, 0.12195122, 0.34146341, 0.46341463, 0.48780488,
        0.6097561 , 0.7804878 , 0.87804878]),
 [<av.VideoFrame #79, pts=281499966 yuv420p 896x504 at 0x7fcc883eb990>,
  <av.VideoFrame #83, pts=281621538 yuv420p 896x504 at 0x7fcc883ebb50>,
  <av.VideoFrame #92, pts=281895075 yuv420p 896x504 at 0x7fcc883ebf40>,
  <av.VideoFrame #97, pts=282047040 yuv420p 896x504 at 0x7fcc884b8200>,
  <av.VideoFrame #98, pts=282077433 yuv420p 896x504 at 0x7fcc884b8270>,
  <av.VideoFrame #103, pts=282229398 yuv420p 896x504 at 0x7fcc884b84a0>,
  <av.VideoFrame #110, pts=282442149 yuv420p 896x504 at 0x7fcc884b87b0>,
  <av.VideoFrame #114, pts=282563721 yuv420p 896x504 at 0x7fcc884b8970>])

In [10]:
# Apply the sampling portions obtained for the video frames to both hand
# sequences, so each hand keeps one entry per sampled frame.
# NOTE: the raw coordinates are overwritten here; re-run the cell that calls
# `load_hands_coords` before executing this cell a second time.
# NOTE(review): `sampling_portions` was computed from the R0027-12-GoPro clip,
# while the hand data was loaded from R029-12July-DSLR -- confirm that mixing
# the two clips is intended for this demo.
left_hand_data, right_hand_data = (
    hands_temporal_sampling(data=hand_data, sampling_portions=sampling_portions)
    for hand_data in (left_hand_data, right_hand_data)
)

# Sizes of both sampled tensors, left hand first.
left_hand_data.size(), right_hand_data.size()

(torch.Size([8, 26, 3]), torch.Size([8, 26, 3]))