In [1]:
from project.dataset.collate import DataCollatorWithPadding
from project.dataset.prepare import MomentRetrievalDataset
from project.trainer.lightning import VideoLlavaModelPLModule
from project.trainer.peft import find_all_linear_names
from project.dataset.utils import view_sample_with_video

from transformers import (
    VideoLlavaProcessor,
    BitsAndBytesConfig,
    VideoLlavaForConditionalGeneration,
    LlamaForCausalLM
)
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
from datasets import load_from_disk
from torch.utils.data import DataLoader
import torch
from dataclasses import dataclass
from lightning import Trainer
from lightning.pytorch.callbacks.early_stopping import EarlyStopping
from lightning.pytorch.callbacks.model_checkpoint import ModelCheckpoint
import argparse
import logging
import os
import sys
from datetime import datetime
from lightning.pytorch.strategies import DeepSpeedStrategy

[2024-12-18 19:43:12,825] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/opt/conda/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/opt/conda/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status


In [2]:
@dataclass
class Config:
    """Hyperparameters and paths used by the cells of this notebook."""

    # LoRA rank and scaling factor forwarded to LoraConfig.
    lora_r: int = 8
    lora_alpha: int = 16
    # Batch size of the training DataLoader.
    batch_size: int = 2
    # Forwarded to the Trainer as max_epochs / val_check_interval.
    max_epoch: int = 2
    val_check_interval: float = 0.25
    # Learning rate handed to the Lightning module's optimizer.
    learning_rate: float = 2e-5
    # "<base_dir>/<processed_dir>"; split on "/" when building the dataset.
    dataset_dir: str = "datasets/processed"
    # Frames per video; also part of the on-disk dataset cache path.
    num_frames: int = 14
    # NOTE(review): not read by the visible cells, which hard-code 1 worker.
    num_worker: int = 2
    # Hugging Face Hub repository the adapter and processor are pushed to.
    hub_repo: str = "jwnt4/finetune-videollava-qlora"
    # NOTE(review): not read by the visible cells; train_conf hard-codes 1.
    accumulate_grad_batches: int = 4
    # Number of validation batches; floor-divided by batch_size and cast to
    # int before reaching the Trainer, hence annotated as int.
    limit_val_batches: int = 24


# Bind an instance rather than the class itself, so that any later attribute
# assignment on `args` cannot silently change the class-level defaults.
args = Config()

In [3]:
# Use PyTorch's 'high' float32 matmul precision setting.
torch.set_float32_matmul_precision('high')

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Re-running this cell must not stack duplicate handlers on the same logger
# (which would print every record several times).
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()

# exist_ok avoids the check-then-create race of isdir() + makedirs().
os.makedirs("logs", exist_ok=True)

# One log file per run, named after the current timestamp.
log_file = f"logs/{str(datetime.now()).replace(' ', '_')}.log"

log_formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S")

stream_handler = logging.StreamHandler(sys.stderr)
stream_handler.setFormatter(log_formatter)

# The file handler previously had no formatter, so records written to the
# log file lacked the timestamp and level shown on stderr.
file_handler = logging.FileHandler(log_file)
file_handler.setFormatter(log_formatter)

logger.addHandler(stream_handler)
logger.addHandler(file_handler)

In [4]:
# Load the Video-LLaVA processor with the slow (non-fast) tokenizer.
processor = VideoLlavaProcessor.from_pretrained(
    "LanguageBind/Video-LLaVA-7B-hf",
    use_fast=False,
)

# Set the vision attributes explicitly on the processor.
processor.vision_feature_select_strategy = "default"
processor.patch_size = 14

# Flag this tokenizer warning as already handled
# (presumably silences the fast-tokenizer padding notice; confirm).
processor.tokenizer.deprecation_warnings["Asking-to-pad-a-fast-tokenizer"] = True

In [5]:
# dataset_dir has the form "<base_dir>/<processed_dir>".
_dataset_dir_parts = args.dataset_dir.split("/")
base_dir = _dataset_dir_parts[0]
processed_dir = _dataset_dir_parts[1]

# NOTE(review): num_worker is hard-coded to 1 here; args.num_worker is not used.
dp = MomentRetrievalDataset(
    base_dir=base_dir,
    processed_dir=processed_dir,
    num_frames=args.num_frames,
    num_worker=1,
    processor=processor,
)

In [6]:
# Prefer the dataset cached on disk; rebuild it only if loading fails.
dataset = None
try:
    dataset = load_from_disk(f"{args.dataset_dir}/moment_retrieval/timestamp/{args.num_frames}_frames")
except Exception as load_error:
    # Catch Exception (not a bare `except:`) so KeyboardInterrupt/SystemExit
    # still propagate, and record why the cache was not used instead of
    # hiding the failure.
    logger.warning("Could not load the cached dataset (%s); preparing it from scratch.", load_error)

if dataset is None:
    dataset = dp.prepare_dataset(use_frame=False)

In [7]:
train_dataset = dataset['train']
eval_dataset = dataset['validation']

# Training batches use the configured batch size; samples are not shuffled.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=args.batch_size,
    shuffle=False,
    num_workers=1,
    collate_fn=DataCollatorWithPadding(processor),
)

# Validation is run one sample at a time.
eval_dataloader = DataLoader(
    eval_dataset,
    batch_size=1,
    shuffle=False,
    num_workers=1,
    collate_fn=DataCollatorWithPadding(processor),
)


In [8]:
# Grab the first collated validation batch and render it for a visual check.
eval_example = next(iter(eval_dataloader))
view_sample_with_video(
    {
        "pixel_values_videos": eval_example[2],
        "input_ids": eval_example[0],
    },
    processor,
)

prompt:
USER:  <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <video> <

In [9]:
# Show the last two items of the batch (last item first).
# NOTE(review): these look like the frame timestamps and the target moment; confirm against the collator.
(eval_example[-1], eval_example[-2])

([['00:00',
   '00:04',
   '00:07',
   '00:11',
   '00:15',
   '00:19',
   '00:22',
   '00:26',
   '00:30',
   '00:34',
   '00:37',
   '00:41',
   '00:45',
   '00:48']],
 [['00:11', '00:22']])

In [10]:
# Dimensions of the first batch item (passed as input_ids in the preview cell).
eval_example[0].size()

torch.Size([1, 4000])

In [11]:
# 4-bit NF4 quantization with double quantization; compute runs in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the quantized base model and let device_map="auto" place it.
model = VideoLlavaForConditionalGeneration.from_pretrained(
    "LanguageBind/Video-LLaVA-7B-hf",
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

model.config.return_dict = True
# Default generation budget for this model's generate() calls.
model.generation_config.max_new_tokens = 32

The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [12]:
# Collect the target module names from the model before it is prepared and wrapped.
lora_target_modules = find_all_linear_names(model)

lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=args.lora_r,
    lora_alpha=args.lora_alpha,
    lora_dropout=0.1,
    init_lora_weights="gaussian",
    target_modules=lora_target_modules,
)

# Prepare the quantized model for training, then attach the LoRA adapters.
model = get_peft_model(prepare_model_for_kbit_training(model), lora_config)

In [13]:
from lightning import LightningModule
from project.trainer.metrics import ao_exact_score, mr_iou_score
from deepspeed.ops.adam import DeepSpeedCPUAdam
from torch import Tensor
from bitsandbytes.optim.adam import Adam8bit


class VideoLlavaModelPLModule(LightningModule):
    """Lightning module that trains and validates a wrapped video-language model.

    Args:
        config: Mapping of training options; only the ``"lr"`` key is read
            (by ``configure_optimizers``).
        processor: Object whose ``batch_decode`` converts generated token ids
            back into text during validation.
        model: Model trained through its forward pass (which must return an
            object with a ``loss`` attribute) and evaluated via ``generate``.
    """

    def __init__(self, config, processor, model):
        super().__init__()
        # The model itself is excluded from the saved hyperparameters.
        self.save_hyperparameters(ignore=["model"])
        self.config = config
        self.processor = processor
        self.model = model

    def training_step(self, batch):
        """Compute and log the training loss for one batch.

        Args:
            batch: Four-item sequence
                ``(input_ids, attention_mask, pixel_values_videos, labels)``.

        Returns:
            The ``loss`` attribute of the model output.
        """
        input_ids: Tensor
        attention_mask: Tensor
        pixel_values_videos: Tensor
        labels: Tensor
        input_ids, attention_mask, pixel_values_videos, labels = batch

        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            pixel_values_videos=pixel_values_videos,
            labels=labels
        )
        loss = outputs.loss
        # Pass batch_size explicitly so Lightning never has to infer it from
        # the batch collection.
        self.log("train_loss", loss, batch_size=input_ids.size(0))

        return loss

    def validation_step(self, batch):
        """Generate predictions for one batch and log their score.

        Args:
            batch: Five-item sequence ``(input_ids, attention_mask,
                pixel_values_videos, labels, frame_info)``.

        Returns:
            The second value returned by ``mr_iou_score``.
        """
        input_ids, attention_mask, pixel_values_videos, labels, frame_info = batch

        # Autoregressively generate token ids with greedy decoding.
        generated_ids = self.model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            pixel_values_videos=pixel_values_videos,
            max_new_tokens=50,
            do_sample=False,
        )
        # Turn them back into text, chopping off the prompt tokens.
        predictions = self.processor.batch_decode(
            generated_ids[:, input_ids.size(1):],
            skip_special_tokens=True, clean_up_tokenization_spaces=True)
        score, correct = mr_iou_score(predictions, frame_info, labels)

        # The batch mixes tensors with nested lists, so Lightning's batch-size
        # inference is ambiguous (it warns about this); state it explicitly.
        self.log("val_accuracy", score, batch_size=input_ids.size(0))

        return correct

    def configure_optimizers(self):
        """Return an 8-bit Adam optimizer using the learning rate in ``config["lr"]``."""
        optimizer = Adam8bit(self.parameters(), min_8bit_size=4096, lr=self.config.get("lr"))

        return optimizer

In [14]:
# Wrap the LoRA-adapted model for Lightning; the config carries only the learning rate.
module = VideoLlavaModelPLModule(
    model=model,
    processor=processor,
    config={"lr": args.learning_rate},
)

In [15]:
# Round the validation-batch budget down to a whole multiple of the batch size.
limit_val_batches = (args.limit_val_batches // args.batch_size) * args.batch_size

# Keyword arguments later unpacked into the Lightning Trainer.
# NOTE(review): accumulate_grad_batches is fixed at 1 here rather than read from args.
train_conf = dict(
    max_epochs=args.max_epoch,
    accumulate_grad_batches=1,
    limit_val_batches=int(limit_val_batches),
    val_check_interval=args.val_check_interval,
    precision="16-mixed",
    gradient_clip_val=1.0,
    num_sanity_val_steps=None,
)
print(train_conf)

{'max_epochs': 2, 'accumulate_grad_batches': 1, 'limit_val_batches': 24, 'val_check_interval': 0.25, 'precision': '16-mixed', 'gradient_clip_val': 1.0, 'num_sanity_val_steps': None}


In [16]:
# "val_accuracy" is the score logged in validation_step, where a larger value
# is the better result. Both callbacks must therefore maximise it: with
# mode="min" (also ModelCheckpoint's default) early stopping treats every
# improvement as a regression and the checkpoint keeps the worst model.
early_stopping = EarlyStopping(monitor="val_accuracy", verbose=False, mode="max")
model_checkpoint = ModelCheckpoint(
    monitor='val_accuracy',
    mode='max',
    dirpath='output/',
    filename='videollava-7b-ao-{epoch:02d}-{val_accuracy:.2f}'+f"lora_r{args.lora_r}-lora_alpha{args.lora_alpha}"
)
callbacks = [
    early_stopping, model_checkpoint
]

In [17]:
# Single-device trainer (device index 0) using the options and callbacks built above.
trainer = Trainer(
    accelerator="auto",
    devices=[0],
    callbacks=callbacks,
    **train_conf,
)

Using 16bit Automatic Mixed Precision (AMP)
/opt/conda/lib/python3.11/site-packages/lightning/pytorch/plugins/precision/amp.py:54: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/opt/conda/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py:67: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `lightning.pytorch` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default


In [18]:
# Validation score before trainer.fit is called.
trainer.validate(module, dataloaders=eval_dataloader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
/opt/conda/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=63` in the `DataLoader` to improve performance.


Validation: |          | 0/? [00:00<?, ?it/s]

/opt/conda/lib/python3.11/site-packages/lightning/pytorch/utilities/data.py:77: Trying to infer the `batch_size` from an ambiguous collection. The batch size we found is 1. To avoid any miscalculations, use `self.log(..., batch_size=batch_size)`.


────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
     Validate metric           DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
      val_accuracy          0.39441820979118347
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'val_accuracy': 0.39441820979118347}]

In [19]:
# Fine-tune the module, validating periodically on the evaluation loader.
trainer.fit(
    module,
    train_dataloaders=train_dataloader,
    val_dataloaders=eval_dataloader,
)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                 | Params
-----------------------------------------------
0 | model | PeftModelForCausalLM | 3.8 B 
-----------------------------------------------
27.2 M    Trainable params
3.8 B     Non-trainable params
3.8 B     Total params
15,371.895Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/opt/conda/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=63` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

  return fn(*args, **kwargs)
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=2` reached.


In [21]:
# Validation score after trainer.fit has run.
trainer.validate(module, dataloaders=eval_dataloader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Validation: |          | 0/? [00:00<?, ?it/s]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
     Validate metric           DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
      val_accuracy          0.5136803388595581
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'val_accuracy': 0.5136803388595581}]

In [22]:
# Loader over the test split, one sample per batch, in dataset order.
test_dataloader = DataLoader(
    dataset['test'],
    batch_size=1,
    shuffle=False,
    num_workers=1,
    collate_fn=DataCollatorWithPadding(processor),
)

In [26]:
# Upload the model held by the Lightning module to the configured Hub repository.
trainer.model.model.push_to_hub(
    repo_id=Config.hub_repo,
    commit_message="stage-1",
)

adapter_model.safetensors:   0%|          | 0.00/109M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/jwnt4/finetune-videollava-qlora/commit/00e3c2933eb0730cd834938d3f6f787a94a7a181', commit_message='stage-1', commit_description='', oid='00e3c2933eb0730cd834938d3f6f787a94a7a181', pr_url=None, repo_url=RepoUrl('https://huggingface.co/jwnt4/finetune-videollava-qlora', endpoint='https://huggingface.co', repo_type='model', repo_id='jwnt4/finetune-videollava-qlora'), pr_revision=None, pr_num=None)

In [28]:
# Upload the processor to the same Hub repository.
trainer.model.processor.push_to_hub(
    repo_id=Config.hub_repo,
    commit_message="stage-1 processor",
)

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/jwnt4/finetune-videollava-qlora/commit/2a68c3634b86cc140db2042e904c218dbe996c67', commit_message='stage-1 processor', commit_description='', oid='2a68c3634b86cc140db2042e904c218dbe996c67', pr_url=None, repo_url=RepoUrl('https://huggingface.co/jwnt4/finetune-videollava-qlora', endpoint='https://huggingface.co', repo_type='model', repo_id='jwnt4/finetune-videollava-qlora'), pr_revision=None, pr_num=None)