In [1]:
from project.dataset.prepare import MomentRetrievalDataset
from transformers import VideoLlavaProcessor
import logging
from datasets import load_from_disk
from project.dataset.utils import view_sample_with_video

In [2]:
# Load the Video-LLaVA processor identified by 'LanguageBind/Video-LLaVA-7B-hf'.
processor = VideoLlavaProcessor.from_pretrained('LanguageBind/Video-LLaVA-7B-hf')
# Explicitly set two attributes on the processor instance.
# NOTE(review): presumably needed so the processor can size/expand the visual
# placeholder tokens in the prompt — confirm against the installed transformers version.
processor.patch_size = 14
processor.vision_feature_select_strategy = "default"

In [3]:
# Build the moment-retrieval dataset helper around the processor created above.
# num_frames=14 matches the "14_frames" cache directory and the 14-frame prompts
# shown further down in this notebook.
# NOTE(review): "datasets" / "processed" look like input and output directories and
# num_worker like a parallelism setting — confirm against MomentRetrievalDataset.
Mr = MomentRetrievalDataset("datasets", "processed", processor, num_frames=14, num_worker=2)

In [4]:
# Logger for this notebook, obtained from the standard logging module and named by __name__.
logger = logging.getLogger(__name__)

In [5]:
# Reuse the preprocessed dataset cached on disk when it can be loaded;
# otherwise rebuild it with Mr.prepare_dataset (Mr is created in an earlier cell).
try:
    mr = load_from_disk('datasets/processed/moment_retrieval/timestamp/14_frames')
except Exception as load_error:
    # Best-effort cache read: any ordinary failure (missing or unreadable cache)
    # falls back to a rebuild. Catching Exception instead of using a bare
    # `except:` lets KeyboardInterrupt/SystemExit propagate, so interrupting the
    # load no longer silently triggers a rebuild. The reason is logged, not dropped.
    logger.warning("Could not load cached dataset, rebuilding: %s", load_error)
    mr = None

# Identity comparison is the correct None check (and avoids invoking __eq__).
if mr is None:
    mr = Mr.prepare_dataset(use_frame=False)

Loading dataset from disk:   0%|          | 0/21 [00:00<?, ?it/s]

In [6]:
# Display the loaded dataset: its splits, the columns of each split, and row counts.
mr

DatasetDict({
    train: Dataset({
        features: ['pixel_values_videos', 'input_ids', 'attention_mask'],
        num_rows: 1213
    })
    test: Dataset({
        features: ['answer', 'pixel_values_videos', 'input_ids', 'attention_mask', 'ts_info'],
        num_rows: 135
    })
    validation: Dataset({
        features: ['answer', 'pixel_values_videos', 'input_ids', 'attention_mask', 'ts_info'],
        num_rows: 64
    })
})

In [7]:
# Inspect one training sample (index 800); the helper prints the sample's prompt text.
# NOTE(review): per its name the helper presumably also renders the video — confirm.
view_sample_with_video(mr['train'][800], processor)

prompt:
USER: Your task is to determine the timestamp range that best represents an action in the video. Use the provided frame-to-timestamp mapping to associate the timestamps with the actual video frames. Find the most similar continuous sequence of timestamp with the action asked.
Provide your answer as two timestamps in the format "mm:ss, mm:ss" (e.g. "00:10, 00:30"), where the first timestamp is the start time of the action and the second timestamp is the end time of the action. Do not provide any other explanation in your response. 
Video duration: 36 seconds
Frames sampled: 14
The frame-to-timestamp mapping for this video:
Frame 1 at 00:00
Frame 2 at 00:03
Frame 3 at 00:06
Frame 4 at 00:08
Frame 5 at 00:11
Frame 6 at 00:14
Frame 7 at 00:17
Frame 8 at 00:19
Frame 9 at 00:22
Frame 10 at 00:25
Frame 11 at 00:28
Frame 12 at 00:30
Frame 13 at 00:33
Frame 14 at 00:36
Video context: A set of teenage boys are in a room playing with a Rubiks cube. 
Action in question: The time begins and

In [8]:
# Inspect one test sample (index 20); the helper prints the sample's prompt text.
# NOTE(review): per its name the helper presumably also renders the video — confirm.
view_sample_with_video(mr['test'][20], processor)

prompt:
USER: Your task is to determine the timestamp range that best represents an action in the video. Use the provided frame-to-timestamp mapping to associate the timestamps with the actual video frames. Find the most similar continuous sequence of timestamp with the action asked.
Provide your answer as two timestamps in the format "mm:ss, mm:ss" (e.g. "00:10, 00:30"), where the first timestamp is the start time of the action and the second timestamp is the end time of the action. Do not provide any other explanation in your response. 
Video duration: 27 seconds
Frames sampled: 14
The frame-to-timestamp mapping for this video:
Frame 1 at 00:00
Frame 2 at 00:02
Frame 3 at 00:04
Frame 4 at 00:06
Frame 5 at 00:08
Frame 6 at 00:10
Frame 7 at 00:12
Frame 8 at 00:14
Frame 9 at 00:16
Frame 10 at 00:18
Frame 11 at 00:20
Frame 12 at 00:22
Frame 13 at 00:24
Frame 14 at 00:26
Video context: A pot of dried pasta is left to boil on the stove. 
Action in question: A person stirs a bowling pot of 

In [10]:
# Inspect one validation sample (index 50); the helper prints the sample's prompt text.
# NOTE(review): per its name the helper presumably also renders the video — confirm.
view_sample_with_video(mr['validation'][50], processor)

prompt:
USER: Your task is to determine the timestamp range that best represents an action in the video. Use the provided frame-to-timestamp mapping to associate the timestamps with the actual video frames. Find the most similar continuous sequence of timestamp with the action asked.
Provide your answer as two timestamps in the format "mm:ss, mm:ss" (e.g. "00:10, 00:30"), where the first timestamp is the start time of the action and the second timestamp is the end time of the action. Do not provide any other explanation in your response. 
Video duration: 31 seconds
Frames sampled: 14
The frame-to-timestamp mapping for this video:
Frame 1 at 00:00
Frame 2 at 00:02
Frame 3 at 00:05
Frame 4 at 00:07
Frame 5 at 00:10
Frame 6 at 00:12
Frame 7 at 00:14
Frame 8 at 00:17
Frame 9 at 00:19
Frame 10 at 00:21
Frame 11 at 00:24
Frame 12 at 00:26
Frame 13 at 00:29
Frame 14 at 00:31
Video context: A group of small children play foosball at a tiny child table. 
Action in question: They try to pick the

In [11]:
# Drop the notebook's references to the dataset, the processor and the dataset
# helper before the training script is launched in the next cell.
# NOTE(review): presumably done to release kernel memory — confirm intent.
del mr, processor, Mr

In [24]:
# Run the fine-tuning script deepspeed-finetune-mr-stage1.py as a separate Python
# process through the shell; its console output is captured below this cell.
!python deepspeed-finetune-mr-stage1.py

[2024-12-18 08:49:04,525] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)
Loading dataset from disk: 100%|█████████████| 21/21 [00:00<00:00, 35161.83it/s]
The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.
Loading checkpoint shards: 100%|██████████████████| 3/3 [00:04<00:00,  1.53s/it]
12/18/2024 08:49:39 - INFO - {'max_epochs': 2, 'accumulate_grad_batches': 1, 'limit_val_batches': 48, 'val_check_interval': 0.25, 'precision': '16-mixed', 'gradient_clip_val': 1.0, 'num_sanity_val_steps': 6}
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/opt/conda/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py:67: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `lightning.pytorch` package, due to potential co