In [1]:
import cv2
import clip
import torch
import numpy as np
from PIL import Image


In [2]:

# Function to extract frames from a video file
def extract_frames(video_path, interval=1):
    # Read the video
    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS)  # Frame Per Second
    frames = []
    
    # Check if video opened successfully
    if not cap.isOpened():
        print("Error opening video file")
        return frames

    frame_index = 0
    while cap.isOpened():
        ret, frame = cap.read()
        if ret:
            # Convert frame to RGB (OpenCV uses BGR)
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            
            # Save frame every 'interval' seconds
            if frame_index % int(fps * interval) == 0:
                frames.append(frame_rgb)
            frame_index += 1
        else:
            break

    # When everything done, release the video capture object
    cap.release()
    return frames

In [3]:
# Function to extract embeddings from a frame using CLIP ViT-L/14
def extract_clip_embeddings(image):
    # Load the CLIP model
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model, preprocess = clip.load("ViT-L/14", device=device)

    # Preprocess the image and compute the features
    image_preprocessed = preprocess(Image.fromarray(image)).unsqueeze(0).to(device)
    with torch.no_grad():
        image_features = model.encode_image(image_preprocessed)
    
    # Convert to numpy array
    return image_features.cpu().numpy()

In [4]:
# Main function to extract embeddings from all frames of a video
def extract_video_embeddings(video_path, interval=1):
    frames = extract_frames(video_path, interval=interval)
    embeddings = []
    for frame in frames:
        embedding = extract_clip_embeddings(frame)
        embeddings.append(embedding)
    return np.vstack(embeddings)  # Stack embeddings of all frames


In [6]:
# Usage example
video_path = 'Interaction_T1_2297.mp4'  # Replace with your video path
embeddings = extract_video_embeddings(video_path, interval=0.5)
print("Embeddings shape:", embeddings.shape)




Embeddings shape: (11, 768)


In [19]:
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
import sys, os
from tqdm import tqdm
from torch.utils.tensorboard import SummaryWriter
from datetime import datetime
import argparse, time
from dataset.star import VideoQADataset, VideoQACollator, repeat_tensor_rows, trans_results
from model.transformer_gf import build_transformer
from model.glance_focus import GF, SetCriterion_UNS
import pdb

In [None]:
# --device_id 0 --test_only 1 --qa_dataset star --base_data_dir dataset/STAR --reload_model_path expm/star/gf_logs/ckpts_2024-01-17T10-30-46/model_3000.tar

In [31]:
import argparse
from argparse import Namespace
args = Namespace(basedir='expm/star', name='gf_logs', device_id=0, batch_size=64, nepoch=10, lr=5e-06, i_val=300, i_test=300, i_print=100, i_weight=1000, test_only=1, reload_model_path='expm/star/gf_logs/ckpts_2024-01-17T10-30-46/model_3000.tar', hidden_dim=512, num_layers=2, num_queries=10, event_pred_dim=50, max_feats=80, qa_dataset='star', task_type='star', num_options=4, output_dim=1, base_data_dir='dataset/STAR', train_data_file_path='{}/txt_db/train.jsonl', test_data_file_path='{}/txt_db/test.jsonl', val_data_file_path='{}/txt_db/val.jsonl', event_anno_path='{}/txt_db/events.json', action_mapping_path='{}/txt_db/action_mapping.txt', app_feat_path='{}/vis_db/s3d.pth', feature_dim=1024, str2num_file='{}/vis_db/strID2numID.json', losses_type=['qa', 'cls', 'giou', 'cert'], qa_loss_coef=1, cls_loss_coef=0.5, giou_loss_coef=0.5, cert_loss_coef=1)

In [32]:
transformer = build_transformer(args)
device = torch.device(f"cuda:{0}" if torch.cuda.is_available() else "cpu")

model = GF(
        transformer,
        num_queries=args.num_queries,
        feature_dim=args.feature_dim,
        output_dim=args.output_dim,
        event_pred_dim=args.event_pred_dim,
        qa_dataset=args.qa_dataset
    ).to(device)

# model = GF(
#         transformer,
#         num_queries= 10, # args.num_queries,
#         feature_dim= 1024, # args.feature_dim,
#         output_dim= 1, # args.output_dim,
#         event_pred_dim= 50, # args.event_pred_dim,
#         qa_dataset= 'star' # args.qa_dataset
#     ).to(device)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
{       'vid': tensor([[0.3739, 0.5834, 0.1591,  ..., 0.2401, 0.0580, 0.5093],
        [0.4297, 0.7157, 0.2008,  ..., 0.2973, 0.0579, 0.5230],
        [0.5246, 0.6812, 0.2543,  ..., 0.2987, 0.0293, 0.4852],
        ...,
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]]), 
        'examples': [{'q_str': 'Which object was put down by the person?', 'question_id': 'Interaction_T1_0', 'label': None, 'options_str_list': ['The food.', 'The laptop.', 'The book.', 'The pillow.']}], 
        'n_examples': 1, 
        'span': tensor([[0.7243, 1.0775],
        [0.7782, 1.0775]]), 
        'hoi': tensor([61, 63])}

In [33]:
18.0/16.705726819301727

1.0774748201438846

In [None]:
"Interaction_T1_0": {"duration": 16.705726819301727, "actions": [[61, 12.1, 18.0], [63, 13.0, 18.0]]}

In [None]:
dict(
    vid=torch.Tensor(appearance_feat),
    examples=examples,
    n_examples=len(examples),  # used to create image feature copies.
    span=span,
    hoi=hoi
)

In [39]:
batch = dict(
    vid = torch.Tensor(embeddings),
    examples = [{'q_str': 'Which object was thrown by the person?', 'question_id': 'Interaction_T1_2297', 'label': None, 'options_str_list': ["The clothes.", "The bag.", "The blanket.", "The pillow."]}],
    n_examples = 1,
    span = torch.Tensor([[0.2620, 0.4627], [0.2620, 0.4422]]),
    hoi = torch.Tensor([18, 16])
)

frame_features = collated_batch
# {"question_id": "Interaction_T1_2297", "question": "Which object was thrown by the person?", "video_id": "MIV2M", "options": ["The clothes.", "The bag.", "The blanket.", "The pillow."], "answer": 1}

In [49]:
frame_features = collated_batch['visual_inputs']
frame_features = torch.stack(frame_features)



In [None]:
# "Interaction_T1_2297": {"duration": 24.4244, "actions": [[18, 6.4, 11.3], [16, 6.4, 10.8]]}

In [50]:
visual_attention_mask = torch.ones(frame_features.shape[:-1], dtype=torch.float).to(device)

memory_cache = model(frame_features, visual_attention_mask, None, encode_and_save=True, glance=True)
outputs_event = model(frame_features, visual_attention_mask, None, encode_and_save=False, glance=True,memory_cache=memory_cache, query_type='event')

RuntimeError: mat1 and mat2 shapes cannot be multiplied (11x768 and 1024x512)

In [38]:
import torch
from torch.utils.data.dataloader import default_collate

# Assuming the MULTI_CHOICE_QA contains 'star', adjust based on your case
MULTI_CHOICE_QA = ['star', 'nextqa_mc']  # Add other types as necessary

# Function to flatten a list of lists
def flat_list_of_lists(lists):
    return [item for sublist in lists for item in sublist]


class VideoQACollator(object):
    def __init__(self, task_type='star', n_options=4):
        self.task_type = task_type
        self.n_options = n_options
        if self.task_type == 'nextqa_mc':
            self.n_options = 5

    def collate_batch(self, batch):
        visual_inputs = [d["vid"] for d in batch]  # <list> (B, dict)
        text_examples = flat_list_of_lists([d["examples"] for d in batch])
        n_examples_list = [d["n_examples"] for d in batch]  # (B, )
        if self.task_type in MULTI_CHOICE_QA:
            text_str_list = flat_list_of_lists(
                [[d["q_str"] + " " + d["options_str_list"][i] for i in range(self.n_options)]
                 for d in text_examples]
            )
        else:
            text_str_list = [d["q_str"] for d in text_examples]
        labels = default_collate([int(d["label"]) for d in text_examples]) if text_examples[0]["label"] is not None else None
        question_ids = [d["question_id"] for d in text_examples]
        span_lst = [d["span"] for d in batch]
        hoi_lst = [d["hoi"] for d in batch]
        return dict(
            visual_inputs=visual_inputs,
            text_str_list=text_str_list,
            question_ids=question_ids,
            labels=labels,
            n_examples_list=n_examples_list,
            span_lst=span_lst,
            hoi_lst=hoi_lst
        )


collator = VideoQACollator(task_type='star', n_options=4)

batch_data = [
    {
        'vid': torch.Tensor(embeddings),  # Embeddings should be defined elsewhere in your code
        'examples': [
            {
                'q_str': 'Which object was thrown by the person?',
                'question_id': 'Interaction_T1_2297',
                'label': None,  # Replace with actual label if available
                'options_str_list': ["The clothes.", "The bag.", "The blanket.", "The pillow."]
            }
        ],
        'n_examples': 1,
        'span': torch.Tensor([[0.2620, 0.4627], [0.2620, 0.4422]]),
        'hoi': torch.Tensor([18, 16])
    }
]

collated_batch = collator.collate_batch(batch_data)
print(collated_batch)


{'visual_inputs': [tensor([[-0.3643,  0.6107,  0.2408,  ..., -0.0514, -0.0760, -0.1956],
        [-0.3247,  0.6130,  0.2897,  ...,  0.0068, -0.1226, -0.1965],
        [-0.2733,  0.5206,  0.3079,  ..., -0.0090, -0.0889, -0.1642],
        ...,
        [ 0.0631, -0.2323,  0.0952,  ...,  0.1765, -0.5188, -0.2757],
        [-0.1976, -0.2839,  0.3241,  ...,  0.1687, -0.2975, -0.2472],
        [ 0.2986, -0.1343,  0.3143,  ...,  0.0101, -0.0746, -0.5556]])], 'text_str_list': ['Which object was thrown by the person? The clothes.', 'Which object was thrown by the person? The bag.', 'Which object was thrown by the person? The blanket.', 'Which object was thrown by the person? The pillow.'], 'question_ids': ['Interaction_T1_2297'], 'labels': None, 'n_examples_list': [1], 'span_lst': [tensor([[0.2620, 0.4627],
        [0.2620, 0.4422]])], 'hoi_lst': [tensor([18., 16.])]}
