In [1]:
import sys
import os

In [2]:
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

In [3]:
import numpy as np
import os
import io
import cv2

import torch

from config import (Config,
                    eval_dict_leaf)

from demo_utils import (
    retrieve_text,
    _frame_from_video,
    setup_internvideo2,
)

  from .autonotebook import tqdm as notebook_tqdm


[2024-06-09 16:46:57,849] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [4]:
video = cv2.VideoCapture('example1.mp4')
frames = [x for x in _frame_from_video(video)]

In [5]:
text_candidates = ["A playful dog and its owner wrestle in the snowy yard, chasing each other with joyous abandon.",
                   "A man in a gray coat walks through the snowy landscape, pulling a sleigh loaded with toys.",
                   "A person dressed in a blue jacket shovels the snow-covered pavement outside their house.",
                   "A pet dog excitedly runs through the snowy yard, chasing a toy thrown by its owner.",
                   "A person stands on the snowy floor, pushing a sled loaded with blankets, preparing for a fun-filled ride.",
                   "A man in a gray hat and coat walks through the snowy yard, carefully navigating around the trees.",
                   "A playful dog slides down a snowy hill, wagging its tail with delight.",
                   "A person in a blue jacket walks their pet on a leash, enjoying a peaceful winter walk among the trees.",
                   "A man in a gray sweater plays fetch with his dog in the snowy yard, throwing a toy and watching it run.",
                   "A person bundled up in a blanket walks through the snowy landscape, enjoying the serene winter scenery."]

In [6]:
config = Config.from_file('internvideo2_stage2_config.py')
config = eval_dict_leaf(config)

In [7]:
# model_pth = '/work/piyush/pretrained_checkpoints/LargeModels/InternVideo/1B_clip.pth'
# config['pretrained_path'] = model_pth

In [8]:
ckpt_root = "/work/piyush/pretrained_checkpoints/LargeModels/InternVideo/"
model_pth = os.path.join(ckpt_root, "InternVideo2-stage2_1b-224p-f4.pt")
assert os.path.exists(model_pth)
config["model"]["vision_encoder"]["pretrained"] = model_pth

In [9]:
repo_path = os.path.dirname(os.path.abspath("."))

In [10]:
text_encoder_config_path = os.path.join(repo_path, "configs/config_bert_large.json")
assert os.path.exists(text_encoder_config_path)
config['TextEncoders']['bert_large']['config'] = text_encoder_config_path

In [11]:
def num_params(model):
    n = np.sum([p.numel() for p in model.parameters()]) / 1e6
    print(f"Number of parameters in {type(model).__name__}: {np.round(n, 3)}M")

In [12]:
intern_model, tokenizer = setup_internvideo2(config)



Loading pretrained weights from /work/piyush/pretrained_checkpoints/LargeModels/InternVideo/InternVideo2-stage2_1b-224p-f4.pt


In [13]:
num_params(intern_model)

Number of parameters in InternVideo2_Stage2: 1410.144M


In [18]:
model_keys = list(intern_model.vision_encoder.state_dict().keys())
len(model_keys)

569

In [32]:
clip_ppt_ckpt = '/work/piyush/pretrained_checkpoints/LargeModels/InternVideo/1B_clip.pth'
clip_ppt_ckpt = torch.load(clip_ppt_ckpt)
# clip_ppt_ckpt = {k.replace("vision_encoder."): v for k, v in clip_ppt_ckpt.items()}
ckpt_keys = list(clip_ppt_ckpt.keys())
len(ckpt_keys)

47

In [33]:
ckpt_keys[:10]

['temp',
 'vision_encoder.clip_projector.norm1_q.weight',
 'vision_encoder.clip_projector.norm1_q.bias',
 'vision_encoder.clip_projector.norm1_k.weight',
 'vision_encoder.clip_projector.norm1_k.bias',
 'vision_encoder.clip_projector.norm1_v.weight',
 'vision_encoder.clip_projector.norm1_v.bias',
 'vision_encoder.clip_projector.cross_attn.q_bias',
 'vision_encoder.clip_projector.cross_attn.k_bias',
 'vision_encoder.clip_projector.cross_attn.v_bias']

In [37]:
msg = intern_model.load_state_dict(clip_ppt_ckpt, strict=False)

In [50]:
len(msg.missing_keys), len(msg.unexpected_keys), len(intern_model.state_dict())

(1006, 33, 1020)

In [42]:
dir(msg)

['__add__',
 '__class__',
 '__class_getitem__',
 '__contains__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getnewargs__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__mul__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__rmul__',
 '__setattr__',
 '__sizeof__',
 '__slots__',
 '__str__',
 '__subclasshook__',
 '_asdict',
 '_field_defaults',
 '_fields',
 '_make',
 '_replace',
 'count',
 'index',
 'missing_keys',
 'unexpected_keys']

In [38]:
set(ckpt_keys).intersection(set(model_keys))

set()

In [39]:
texts, probs = retrieve_text(frames, text_candidates, model=intern_model, topk=5, config=config)

for t, p in zip(texts, probs):
    print(f'text: {t} ~ prob: {p:.4f}')



text: A playful dog slides down a snowy hill, wagging its tail with delight. ~ prob: 0.2822
text: A man in a gray sweater plays fetch with his dog in the snowy yard, throwing a toy and watching it run. ~ prob: 0.2673
text: A person in a blue jacket walks their pet on a leash, enjoying a peaceful winter walk among the trees. ~ prob: 0.2297
text: A pet dog excitedly runs through the snowy yard, chasing a toy thrown by its owner. ~ prob: 0.1814
text: A playful dog and its owner wrestle in the snowy yard, chasing each other with joyous abandon. ~ prob: 0.0197


In [19]:
video_path = "/users/piyush/projects/TimeBound.v1/sample_data/folding_paper.mp4"
video = cv2.VideoCapture(video_path)
frames = [x for x in _frame_from_video(video)]

In [20]:
text_candidates = [
    "Someone is folding a paper.",
    "Someone is unfolding a paper.",
]

In [21]:
texts, probs = retrieve_text(frames, text_candidates, model=intern_model, topk=2, config=config)

for t, p in zip(texts, probs):
    print(f'text: {t} ~ prob: {p:.4f}')

text: Someone is folding a paper. ~ prob: 0.7285
text: Someone is unfolding a paper. ~ prob: 0.2715
