In [1]:
import os
import av
import re
import bisect
import shutil
import numpy as np
from nltk import edit_distance

from transformers import AutoProcessor
from transformers import BitsAndBytesConfig, VideoLlavaForConditionalGeneration
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model

import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from huggingface_hub import snapshot_download
from datasets import load_dataset, concatenate_datasets

import lightning as L
from lightning.pytorch.callbacks.early_stopping import EarlyStopping
import json
from datasets import Dataset

MAX_LENGTH = 256
MODEL_ID = "LanguageBind/Video-LLaVA-7B-hf"
REPO_ID = "RaushanTurganbay/VideoLLava-demo" # Change to your hf-hub repo

USE_LORA = True
USE_QLORA = True 

2024-10-06 15:38:46.430270: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-10-06 15:38:46.443824: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-06 15:38:46.461410: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-06 15:38:46.466597: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-06 15:38:46.479670: I tensorflow/core/platform/cpu_feature_guar

In [2]:
def read_video_pyav(video_path, start, end):
    """Reads a video for given start-end timestamps interval and uniformly samples 8 frames of it"""
    container = av.open(video_path)
    video = container.streams.get(0)[0]

    av_timestamps = [
        int(packet.pts * video.time_base) for packet in container.demux(video) if packet.pts is not None
    ]

    av_timestamps.sort()
    start_id = bisect.bisect_left(av_timestamps, start)
    end_id = bisect.bisect_left(av_timestamps, end)


    if end_id - start_id < 10:
        end_id = min(len(av_timestamps) - 1, end_id + 10)
        start_id = max(0, start_id - 10)


    end_id = min(len(av_timestamps) - 1, end_id)
    start_id = max(0, start_id)

    num_frames_to_sample = min(2, end_id - start_id + 1)
    indices = np.linspace(start_id, end_id, num_frames_to_sample).astype(int)

    frames = []
    container.seek(0)
    for i, frame in enumerate(container.decode(video=0)):
        if i > end_id:
            break
        if i >= start_id and i in indices:
            frames.append(frame)
    assert len(frames) == 2, f"Got {len(frames)} frames but should be 2. Check the indices: {indices};, start_id: {start_id}, end_id: {end_id}. Len of video is {len(av_timestamps)} frames."
    return np.stack([x.to_ndarray(format="rgb24") for x in frames])

In [3]:
def collate_read_video(example, path):
    # Some datasets have a start-end interval, so we try to get it if exists. Otherwise just set a very large end timestamp
    clip = read_video_pyav(f'{path}/{example["video"]}', example.get("start", 1), example.get("end", 1e+10))
    example["clip"] = clip
    return example

In [4]:
processor = AutoProcessor.from_pretrained(MODEL_ID)
processor.tokenizer.padding_side = "right" # during training, one always uses padding on the right


In [5]:
from torch.utils.data import Dataset

class VideoLlavaDataset(Dataset):
    """
    PyTorch Dataset for VideoLlavaDataset. This class takes a HuggingFace Dataset as input.
    """

    def __init__(self, dataset, video_path):
        super().__init__()
        self.dataset = dataset
        self.video_path = video_path

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx: int):
        sample = self.dataset[idx]
        # Lazy load video clip here
        clip = read_video_pyav(f'{self.video_path}/{sample["video"]}', sample.get("start", 0), sample.get("end", 1e+10))
        answer = sample['conversations'][1]['value']
        tmp_prompt = sample['conversations'][0]['value']

        prompt = f"USER: {tmp_prompt}" \
                 f"\n ASSISTANT: Answer: {answer}"

        return prompt, clip, answer

In [6]:
def train_collate_fn(examples):
    videos = []
    texts = []
    texts, videos, _ = list(zip(*examples))

    batch = processor(text=texts, videos=videos, padding=True, truncation=True, max_length=MAX_LENGTH, return_tensors="pt")

    labels = batch["input_ids"].clone()

    # We don't want to compute loss for pad tokens, lets mask with -100. Some methods also mask the prompt, calculating loss only on the answers/captions/etc
    labels[labels == processor.tokenizer.pad_token_id] = -100
    batch["labels"] = labels

    input_ids = batch["input_ids"]
    attention_mask = batch["attention_mask"]
    pixel_values_videos = batch["pixel_values_videos"]
    labels = batch["labels"]

    return input_ids, attention_mask, pixel_values_videos, labels


def eval_collate_fn(examples):
    # We only feed the prompt to the model
    # Make sure to separate prompt from answers/captions/etc depending on your own task and dataset
    # Otherwise your model will peek into the ground truth
    videos = []
    texts = []
    true_answers = []
    texts, videos, true_answers = list(zip(*examples))
    texts = [text[:-2] for text in texts]  # Get text without answers, so the model has to generate the answers itself during eval

    batch = processor(text=texts, videos=videos, max_length=MAX_LENGTH, return_tensors="pt")

    input_ids = batch["input_ids"]
    attention_mask = batch["attention_mask"]
    pixel_values_videos = batch["pixel_values_videos"]
    # answer_choice = batch["answer"] 

    return input_ids, attention_mask, pixel_values_videos, true_answers



In [7]:
with open('VLM/sample_annotations.json', 'r') as file:
    train_data  = json.load(file)

with open('VLM/sample_annotations.json', 'r') as file:
    test_data  = json.load(file)


In [9]:
# Create dictionary for training data
train_dataset_dict = {
    "id": [item['id'] for item in train_data],
    "video": [item['video'] for item in train_data],
    "conversations": [item['conversations'] for item in train_data],
}

# Create dictionary for testing data
test_dataset_dict = {
    "id": [item['id'] for item in test_data],
    "video": [item['video'] for item in test_data],
    "conversations": [item['conversations'] for item in test_data],
}


In [10]:
from datasets import Dataset

# Convert these dictionaries to HuggingFace datasets
train_dataset_tmp = Dataset.from_dict(train_dataset_dict)
test_dataset_tmp = Dataset.from_dict(test_dataset_dict)

In [11]:
test_dataset_dict

{'id': [10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
 'video': ['airport_hangar_5_2-airport_hangar_5_4.mp4',
  'airport_hangar_5_4-airport_hangar_5_5.mp4',
  'airport_hangar_7_9-airport_hangar_7_5.mp4',
  'airport_hangar_10_5-airport_hangar_10_3.mp4',
  'airport_hangar_10_2-airport_hangar_10_0.mp4',
  'airport_hangar_12_0-airport_hangar_12_1.mp4',
  'airport_hangar_13_4-airport_hangar_13_5.mp4',
  'airport_hangar_14_5-airport_hangar_14_9.mp4',
  'airport_hangar_14_0-airport_hangar_14_7.mp4',
  'airport_hangar_16_1-airport_hangar_16_2.mp4'],
 'conversations': [[{'from': 'human',
    'value': '<video>\n Offer an elaborate explanation of the satellite video, where every frame captures the same location but at different times.'},
   {'from': 'gpt',
    'value': '**Image 1:**  \nThis aerial view showcases a large paved area, likely an airport runway, with several small aircraft parked nearby. Surrounding the runway and aircraft are various buildings, some of which appear to be hangars or servic

In [12]:
train_dataset = VideoLlavaDataset(train_dataset_tmp, "VLM/Videos")
eval_dataset = VideoLlavaDataset(test_dataset_tmp, "VLM/Videos")

In [None]:
prompt, clip, _= train_dataset[0]
prompt

In [13]:
USE_QLORA , USE_LORA

(True, True)

In [14]:
## Load model
# Three options for training, from the lowest precision training to the highest precision training:
# QLoRA: model uses 4-bit quantization, which helps in reducing memory usage while maintaining performance.
# Standard LoRA:  model is loaded with standard LoRA adaptations.
# Full Fine-Tuning: no memory optimization are done. In that case Flash Attention is used to speed up training, if hardware supports it.

if USE_QLORA or USE_LORA:
    if USE_QLORA:
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,
        )
        model = VideoLlavaForConditionalGeneration.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.float16,
        quantization_config=bnb_config,
        device_map="auto",
        # device_map={"": 0},
    )
    model = VideoLlavaForConditionalGeneration.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.float16,
        device_map="auto",
        # device_map={"": 0},
    )
else:
    # for full fine-tuning, we can speed up the model using Flash Attention
    # only available on certain devices, see https://github.com/Dao-AILab/flash-attention?tab=readme-ov-file#installation-and-features
    model = VideoLlavaForConditionalGeneration.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.float16,
        _attn_implementation="flash_attention_2",
        device_map="auto",
        # device_map={"": 0},
    )

The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [15]:
def find_all_linear_names(model):
    cls = torch.nn.Linear
    lora_module_names = set()
    multimodal_keywords = ['multi_modal_projector', 'vision_model']
    for name, module in model.named_modules():
        if any(mm_keyword in name for mm_keyword in multimodal_keywords):
            continue
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])

    if 'lm_head' in lora_module_names: # needed for 16-bit
        lora_module_names.remove('lm_head')
    return list(lora_module_names)


lora_config = LoraConfig(
    r=4,
    lora_alpha=4,
    lora_dropout=0.1,
    target_modules=find_all_linear_names(model),
    init_lora_weights="gaussian",
)

model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)

In [16]:
model

PeftModel(
  (base_model): LoraModel(
    (model): VideoLlavaForConditionalGeneration(
      (video_tower): CLIPVisionModel(
        (vision_model): CLIPVisionTransformer(
          (embeddings): CLIPVisionEmbeddings(
            (patch_embedding): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14), bias=False)
            (position_embedding): Embedding(257, 1024)
          )
          (pre_layrnorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (encoder): CLIPEncoder(
            (layers): ModuleList(
              (0-23): 24 x CLIPEncoderLayer(
                (self_attn): CLIPSdpaAttention(
                  (k_proj): lora.Linear4bit(
                    (base_layer): Linear4bit(in_features=1024, out_features=1024, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=1024, out_

In [None]:
class VideoLlavaModelPLModule(L.LightningModule):
    def __init__(self, config, processor, model):
        super().__init__()
        self.config = config
        self.processor = processor
        self.model = model

        self.batch_size = config.get("batch_size")

        # List to store predictions and ground truth during validation
        self.results = []

    def training_step(self, batch, batch_idx):

        input_ids, attention_mask, pixel_values_videos, labels = batch

        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            pixel_values_videos=pixel_values_videos,
            labels=labels
        )
        loss = outputs.loss

        self.log("train_loss", loss)

        return loss

    def validation_step(self, batch, batch_idx, dataset_idx=0):
        
        input_ids, attention_mask, pixel_values_videos, answers  = batch

        # autoregressively generate token IDs
        generated_ids = self.model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            pixel_values_videos=pixel_values_videos,
            max_new_tokens=MAX_LENGTH,
            do_sample=False,
        )
        # turn them back into text, chopping of the prompt
        predictions = self.processor.batch_decode(generated_ids[:, input_ids.size(1):], skip_special_tokens=True)

        correct = 0
        for pred, answer in zip(predictions, answers):
            # cleaned_text = pred.split("\n ASSISTANT: Answer:", 1)[-1].strip() if "\n ASSISTANT: Answer:" in pred else pred
            # result_entry = {
            #     'true': answer,
            #     'generated': pred
            # }
            # self.results.append(result_entry)
            correct += (pred.strip().lower() == answer.lower())
        self.log("val_accuracy", correct / len(answers))
            
        return correct

    # def on_validation_epoch_end(self):
    #     # Save results at the end of validation
    #     with open('validation_results.json', 'w') as f:
    #         json.dump(self.results, f, indent=4)
        
    #     # Clear results for the next validation epoch
    #     self.results.clear()

    def configure_optimizers(self):
        # you could also add a learning rate scheduler if you want
        optimizer = torch.optim.AdamW(self.parameters(), lr=self.config.get("lr"))

        return optimizer

    def train_dataloader(self):
        return DataLoader(train_dataset, collate_fn=train_collate_fn, batch_size=self.batch_size, shuffle=True, num_workers=4)

    def val_dataloader(self):
        return DataLoader(eval_dataset, collate_fn=eval_collate_fn, batch_size=self.batch_size, shuffle=False, num_workers=4)

In [None]:
config = {"max_epochs": 5,
        #   "val_check_interval": 0.2, # how many times we want to validate during an epoch
          "check_val_every_n_epoch": 1,
          "gradient_clip_val": 1.0,
          "accumulate_grad_batches": 1,
          "lr": 1e-4,
          "batch_size": 1,
          "num_nodes": 1,
          "warmup_steps": 50,
          "save_strategy":"epoch",
}

model_module = VideoLlavaModelPLModule(config, processor, model)
early_stop_callback = EarlyStopping(monitor="train_loss", patience=3, verbose=True, mode="min")

In [None]:
# from huggingface_hub import HfApi

# api = HfApi()

# class PushToHubCallback(Callback):
#     def on_train_epoch_end(self, trainer, pl_module):
#         print(f"Pushing model to the hub, epoch {trainer.current_epoch}")
#         pl_module.model.push_to_hub(REPO_ID,
#                                     commit_message=f"Training in progress, epoch {trainer.current_epoch}")

#     def on_train_end(self, trainer, pl_module):
#         print(f"Pushing model to the hub after training")
#         pl_module.processor.push_to_hub(REPO_ID,
#                                     commit_message=f"Training done")
#         pl_module.model.push_to_hub(REPO_ID,
#                                     commit_message=f"Training done")

# early_stop_callback = EarlyStopping(monitor="train_loss", patience=3, verbose=True, mode="min")

In [None]:
from lightning.pytorch.callbacks import ModelCheckpoint  # Ensure correct import from lightning.pytorch

# Define checkpoint callback to save only the most recent 5 checkpoints
checkpoint_callback = ModelCheckpoint(
    save_top_k=5,  # Keeps only the best 5 checkpoints
    monitor="train_loss",  # Monitor training loss for checkpointing
    mode="min",  # Minimize the train_loss
    save_last=True,  # Always save the latest checkpoint
    dirpath="./video_llava_demo/checkpointss",  # Path to save the checkpoints
    filename="video_llava-{epoch:02d}-{train_loss:.2f}"  # Checkpoint file naming convention
)

trainer = L.Trainer(
    default_root_dir="./video_llava_demo",
    accelerator="gpu",
    devices=[0],
    max_epochs=config.get("max_epochs"),
    accumulate_grad_batches=config.get("accumulate_grad_batches"),
    check_val_every_n_epoch=config.get("check_val_every_n_epoch"),
    gradient_clip_val=config.get("gradient_clip_val"),
    precision="16-mixed",
    limit_val_batches=5,
    num_sanity_val_steps=1,
    callbacks=[early_stop_callback, checkpoint_callback],
    log_every_n_steps=1  # Set to 1 to log every batch
)

In [None]:
trainer.fit(model_module)

In [None]:
# Save the processor and model locally
processor.save_pretrained("./local_model")
model.save_pretrained("./local_model")


In [None]:
from transformers import AutoProcessor, BitsAndBytesConfig, VideoLlavaForConditionalGeneration
import torch

# Load processor and model from local directory
processor = AutoProcessor.from_pretrained("./local_model")

# Define quantization config
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# Load the model from local directory
model = VideoLlavaForConditionalGeneration.from_pretrained(
    "./local_model",  # Load from local path
    torch_dtype=torch.float16,
    quantization_config=quantization_config,
    device_map="auto",
)
# "LanguageBind/Video-LLaVA-7B-hf"


In [17]:
import json
import math

results = []
pattern_to_remove = "\n ASSISTANT: Answer:"
batch_size = 1  # Define your batch size here

processor = AutoProcessor.from_pretrained(MODEL_ID)
processor.tokenizer.padding_side = "left" # during training, one always uses padding on the right
# model.eval()

with torch.no_grad():

    # Split test data into batches
    for i in range(math.ceil(len(test_data) / batch_size)):
        # Define the start and end index for the current batch
        start_idx = i * batch_size
        end_idx = min((i + 1) * batch_size, len(test_data))
        
        # Collect all texts and videos for the current batch
        # batch_texts = [test['conversations'][0]['value'] for test in test_data[start_idx:end_idx]]
        batch_texts = [f"{test['conversations'][0]['value']}" for test in test_data[start_idx:end_idx]]
        batch_videos = [read_video_pyav(f'VLM/Videos/{test["video"]}', 0, 1e+10) for test in test_data[start_idx:end_idx]]
        
        print(batch_texts)

        # Process the entire batch at once
        inputs = processor(text=batch_texts, videos=batch_videos, return_tensors="pt", padding=True).to(model.device)
        # print(inputs)
        generated_ids = model.generate(**inputs, max_new_tokens=1024, do_sample=False)
        generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)
        print(generated_texts)
        
        cleaned_texts = [
            generated_text for generated_text in generated_texts
        ]
        # Create results for the entire batch
        for idx, test in enumerate(test_data[start_idx:end_idx]):
            # print(cleaned_texts[idx])
            true_value = test['conversations'][1]['value']
            result_entry = {
                'id': test['id'],
                'video': test['video'],
                'true': true_value,
                'generated': cleaned_texts[idx]  # Add the cleaned text from the batch
            }
            results.append(result_entry)

# Save results to a JSON file
with open('results.json', 'w') as f:
    json.dump(results, f, indent=4)


Expanding inputs for image tokens in Video-LLaVa should be done in processing. Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. Using processors without these attributes in the config is deprecated and will throw an error in v4.44.


['<video>\n Offer an elaborate explanation of the satellite video, where every frame captures the same location but at different times.']


Expanding inputs for image tokens in Video-LLaVa should be done in processing. Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. Using processors without these attributes in the config is deprecated and will throw an error in v4.47.
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


['\n Offer an elaborate explanation of the satellite video, where every frame captures the same location but at different times.\n\nThe satellite video showcases a busy city intersection with multiple lanes of traffic, including cars, trucks, and buses. The video captures the same location at different times, providing a comprehensive view of the traffic flow. The frames showcase the movement of vehicles, including cars and trucks, as they navigate through the intersection. The video also highlights the presence of pedestrians, with several individuals visible in the frames.\n\nThe video emphasizes the importance of traffic management and safety measures in a bustling city environment. The different times captured in the video demonstrate the dynamic nature of urban life, where traffic patterns and pedestrian movement are constantly changing. The video serves as a visual representation of the challenges faced by city planners and traffic engineers in managing the flow of vehicles and e

In [None]:
import json
from nltk.translate.bleu_score import sentence_bleu
from bert_score import score
from rouge_score import rouge_scorer

# Load the validation results
with open('results1.json', 'r') as file:
    results = json.load(file)

# Extract the true and generated texts from the results
truth = [entry['true'] for entry in results]
predicted = [entry['generated'] for entry in results]

# 1. Calculate BLEU scores
bleu_scores = [sentence_bleu([t.split()], p.split()) for t, p in zip(truth, predicted)]
average_bleu = sum(bleu_scores) / len(bleu_scores)

# 2. Calculate BERTScore
P, R, F1 = score(predicted, truth, lang="en", verbose=True)

# Average BERTScore metrics
mean_precision = P.mean().item()
mean_recall = R.mean().item()
mean_f1 = F1.mean().item()

# 3. Calculate ROUGE scores
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
rouge_scores = [scorer.score(t, p) for t, p in zip(truth, predicted)]

# Aggregate ROUGE scores
rouge1 = sum([score['rouge1'].fmeasure for score in rouge_scores]) / len(rouge_scores)
rouge2 = sum([score['rouge2'].fmeasure for score in rouge_scores]) / len(rouge_scores)
rougeL = sum([score['rougeL'].fmeasure for score in rouge_scores]) / len(rouge_scores)

# Print BLEU, BERTScore, and ROUGE scores
print(f"Mean BLEU Score: {average_bleu}")
print(f"Mean Precision (BERTScore): {mean_precision}")
print(f"Mean Recall (BERTScore): {mean_recall}")
print(f"Mean F1 (BERTScore): {mean_f1}")

print(f"Mean ROUGE-1 Score: {rouge1}")
print(f"Mean ROUGE-2 Score: {rouge2}")
print(f"Mean ROUGE-L Score: {rougeL}")


In [20]:
from transformers import BitsAndBytesConfig, VideoLlavaForConditionalGeneration

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16
)

processor = AutoProcessor.from_pretrained("LanguageBind/Video-LLaVA-7B-hf")
model = VideoLlavaForConditionalGeneration.from_pretrained(
    "LanguageBind/Video-LLaVA-7B-hf",
    quantization_config=quantization_config,
    device_map='auto'
)

results = []

for test in train_data:

    true_value = test['conversations'][1]['value']

    # Generate the predicted response
    inputs = processor(text=test['conversations'][0]['value'], videos=read_video_pyav(f'VLM/Videos/{test["video"]}', 0, 1e+10), padding=True, return_tensors="pt").to(model.device)

    generate_kwargs = {"max_new_tokens": 256, "do_sample": True, "top_p": 0.9}

    output = model.generate(**inputs, **generate_kwargs)
    generated_text = processor.batch_decode(output, skip_special_tokens=True)

    print(generated_text[0])
    result_entry = {
                'id': test['id'],
                'video': test['video'],
                'true': true_value,
                'generated': generated_text[0]  # Add the cleaned text from the batch
            }
    results.append(result_entry)
    
    # Save results to a JSON file
with open('results_base.json', 'w') as f:
    json.dump(results, f, indent=4)

The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]


 Provide a report describing the satellite video, where each frame shows the same location at different time points. The video starts off by showing different vehicles moving around a parking lot. In one frame, we can see a couple of cars driving around while some other people are walking around the area. In the next frame, we see a couple more cars and a bus driving around. This goes on for a few more frames, as we observe more cars and a truck passing through the parking lot. The video captures the busy activity of people and vehicles in the parking lot, providing a snapshot of daily life in the area. The video effectively showcases the dynamic nature of urban life, with people and vehicles constantly moving around the city. The different frames are a great representation of how life changes throughout the day and how people move through various locations within a city.

 Provide a report describing the satellite video, where each frame shows the same location at different time poin

In [21]:
from transformers import BitsAndBytesConfig, LlavaNextVideoForConditionalGeneration, LlavaNextVideoProcessor
import torch

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16
)

processor = LlavaNextVideoProcessor.from_pretrained("llava-hf/LLaVA-NeXT-Video-7B-hf")
model = LlavaNextVideoForConditionalGeneration.from_pretrained(
    "llava-hf/LLaVA-NeXT-Video-7B-hf",
    quantization_config=quantization_config,
    device_map='auto'
)


results = []

for test in train_data:

    true_value = test['conversations'][1]['value']

    # Generate the predicted response
    inputs = processor(text=test['conversations'][0]['value'], videos=read_video_pyav(f'VLM/Videos/{test["video"]}', 0, 1e+10), padding=True, return_tensors="pt").to(model.device)

    generate_kwargs = {"max_new_tokens": 128, "do_sample": True, "top_p": 0.9}

    output = model.generate(**inputs, **generate_kwargs)
    generated_text = processor.batch_decode(output, skip_special_tokens=True)

    print(generated_text[0])
    result_entry = {
                'id': test['id'],
                'video': test['video'],
                'true': true_value,
                'generated': generated_text[0]  # Add the cleaned text from the batch
            }
    results.append(result_entry)
    
    # Save results to a JSON file
with open('results_base_next.json', 'w') as f:
    json.dump(results, f, indent=4)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Expanding inputs for image/video tokens in LLaVa-NeXT-Video should be done in processing. Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. Using processors without these attributes in the config is deprecated and will throw an error in v4.47.
Expanding inputs for image.video tokens in LLaVa-NeXT-Video should be done in processing. Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. Using processors without these attributes in the config is deprecated and will throw an error in v4.47.



 Provide a report describing the satellite video, where each frame shows the same location at different time points.

 Provide a report describing the satellite video, where each frame shows the same location at different time points.

 Provide a report describing the satellite video, where each frame shows the same location at different time points.

 Give a detailed account of the satellite video, with each frame depicting the same location at distinct points in time.
As we look at the satellite video, we can observe a scene of an overcast day with no distinct cloud formations visible. The trees in the area are not densely packed but are spread out across the landscape. The landscape appears to be a mixture of urban and rural areas, as we can see both structures and buildings. The colors in the video are not vivid, suggesting that the satellite does not have a high-definition camera. The frame captures a particular location and we can see the same view from different periods of time

In [None]:
print(generated_text)

In [None]:
test_data[1]['conversations'][0]['value']

In [None]:
true_value

In [None]:
import json
import math

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16
)

processor = AutoProcessor.from_pretrained("llava-hf/LLaVA-NeXT-Video-7B-hf")
model = VideoLlavaForConditionalGeneration.from_pretrained(
    "llava-hf/LLaVA-NeXT-Video-7B-hf",
    quantization_config=quantization_config,
    device_map='auto'
)

results = []
batch_size = 1  # Define your batch size here

# processor.tokenizer.padding_side = "left" # during training, one always uses padding on the right

with torch.no_grad():

    # Split test data into batches
    for i in range(math.ceil(len(test_data) / batch_size)):
        # Define the start and end index for the current batch
        start_idx = i * batch_size
        end_idx = min((i + 1) * batch_size, len(test_data))
        
        # Collect all texts and videos for the current batch
        # batch_texts = [test['conversations'][0]['value'] for test in test_data[start_idx:end_idx]]
        batch_texts = [f"{test['conversations'][0]['value']}" for test in test_data[start_idx:end_idx]]
        batch_videos = [read_video_pyav(f'VLM/Videos/{test["video"]}', 0, 1e+10) for test in test_data[start_idx:end_idx]]
        
        print(batch_texts)

        # Process the entire batch at once
        inputs = processor(text=batch_texts, videos=batch_videos, return_tensors="pt", padding=True).to(model.device)
        # print(inputs)
        generated_ids = model.generate(**inputs, max_new_tokens=1024, do_sample=False)
        generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)
        print(generated_texts)
        
        cleaned_texts = [
            generated_text for generated_text in generated_texts
        ]
        # Create results for the entire batch
        for idx, test in enumerate(test_data[start_idx:end_idx]):
            # print(cleaned_texts[idx])
            true_value = test['conversations'][1]['value']
            result_entry = {
                'id': test['id'],
                'video': test['video'],
                'true': true_value,
                'generated': cleaned_texts[idx]  # Add the cleaned text from the batch
            }
            results.append(result_entry)

# Save results to a JSON file
with open('results.json', 'w') as f:
    json.dump(results, f, indent=4)
