In [1]:
# Enable IPython's autoreload extension in mode 2 so that imported modules
# (e.g. the local `components` package) are reloaded before each cell runs.
%load_ext autoreload
%autoreload 2

In [2]:
# Project root directory (relative to the notebook's working directory);
# it is added to sys.path further down so local packages can be imported.
ROOT_DIR = './'

In [3]:
import sys

# Make the project root importable. The membership check keeps this cell
# idempotent: re-running it in a live kernel no longer appends duplicate
# entries to sys.path.
if ROOT_DIR not in sys.path:
    sys.path.append(ROOT_DIR)

In [4]:
from components.dataset import VideoLlavaDataset
from components.model import get_video_llava_peft_model
from components.collate import Collator
from transformers import AutoProcessor
from components.lightning import VideoLlavaModelPLModule
import torch
from lightning.pytorch.callbacks.early_stopping import EarlyStopping
from lightning.pytorch.callbacks import ModelCheckpoint
from lightning.pytorch import Trainer
from torch.utils.data import DataLoader

In [5]:
# --- Model configuration ---

# Index of the GPU used both for loading the model and for the Trainer.
DEVICE = 0

# Hugging Face model identifier; MODEL_NAME is the part after the last "/".
MODEL_ID = "LanguageBind/Video-LLaVA-7B-hf"
MODEL_NAME = MODEL_ID.rsplit("/", 1)[-1]

# Cache directory handed to the model loader.
CACHE_DIR = "./cache"

# Quantisation / adapter switches handed to the model loader.
USE_QLORA, USE_8BIT = True, False

# LoRA hyperparameters handed to the model loader.
LORA_R, LORA_ALPHA = 64, 128

# Length limit given to the collators (max_length) and reused as
# max_new_tokens in the training config.
MAX_LENGTH = 350

In [6]:
# Training split: CSV file and the directory holding the raw videos.
train_csv_file, train_video_dir = (
    "./data/train/valid_clips.csv",
    "./data/train/raw_videos",
)

# Validation split: same layout as the training split.
val_csv_file, val_video_dir = (
    "./data/validation/valid_clips.csv",
    "./data/validation/raw_videos",
)

In [7]:
# Build one dataset per split from its CSV file and video directory.
train_dataset = VideoLlavaDataset(
    csv_file=train_csv_file,
    video_path=train_video_dir,
)
val_dataset = VideoLlavaDataset(
    csv_file=val_csv_file,
    video_path=val_video_dir,
)

In [8]:
# Load the processor published alongside the pretrained checkpoint.
processor = AutoProcessor.from_pretrained(MODEL_ID)

# Training always pads on the right.
# NOTE(review): the training log warns that `patch_size` and
# `vision_feature_select_strategy` are not set on this processor. Setting them
# presumably changes how many tokens each video expands to, so MAX_LENGTH
# would need to be re-checked first — left unchanged here.
processor.tokenizer.padding_side = "right"

In [9]:
# Two collators that differ only in the is_val flag:
# False for training batches, True for validation batches.
train_collate_fn, val_collate_fn = (
    Collator(processor, is_val=validation_mode, max_length=MAX_LENGTH)
    for validation_mode in (False, True)
)

In [10]:
# Ask PyTorch to release cached, unused GPU memory before the model is loaded
# in the next cell (mainly relevant when re-running the notebook in a live kernel).
torch.cuda.empty_cache()

In [11]:
# Load the Video-LLaVA model wrapped for PEFT fine-tuning, using the model
# constants defined earlier in the notebook.
model = get_video_llava_peft_model(
    model_id=MODEL_ID,
    cache_dir=CACHE_DIR,
    device=DEVICE,
    use_qlora=USE_QLORA,
    use_8bit=USE_8BIT,
    lora_r=LORA_R,
    lora_alpha=LORA_ALPHA,
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [12]:
# Training constants

BATCH_SIZE = 2

# Tags describing the fine-tuning variant; they are used to name the output directory.
if USE_QLORA:
    lora_type = "QLORA"
else:
    lora_type = "LORA"

if USE_8BIT:
    bit_type = "8bit"
else:
    bit_type = "4bit"

# Output directory for checkpoints and saved artifacts,
# e.g. ./outputs/Video-LLaVA-7B-hf_QLORA_4bit_r64_alpha128/
MODEL_PATH = f"./outputs/{MODEL_NAME}_{lora_type}_{bit_type}_r{LORA_R}_alpha{LORA_ALPHA}/"

In [13]:
# Training configuration: handed to VideoLlavaModelPLModule and also read
# when the Trainer is built.
config = dict(
    max_epochs=1,
    # In Lightning terms a float here is the fraction of a training epoch
    # between validation runs (0.2 -> validate 5 times per epoch).
    val_check_interval=0.2,
    check_val_every_n_epoch=1,
    gradient_clip_val=1.0,
    accumulate_grad_batches=1,
    lr=1e-4,
    batch_size=BATCH_SIZE,
    num_nodes=1,
    warmup_steps=50,
    max_new_tokens=MAX_LENGTH,
    num_workers=2,
)

In [14]:
# Lightning module bundling the model, processor, datasets and collators.
model_module = VideoLlavaModelPLModule(
    model=model,
    processor=processor,
    config=config,
    train_dataset=train_dataset,
    train_collate_fn=train_collate_fn,
    val_dataset=val_dataset,
    val_collate_fn=val_collate_fn,
)

# Stop training once "train_loss" has failed to improve for 3 consecutive checks.
early_stop_callback = EarlyStopping(
    monitor="train_loss",
    mode="min",
    patience=3,
    verbose=True,
)

In [15]:
# Checkpointing: keep the 2 checkpoints with the lowest train_loss, and
# additionally always keep the most recent checkpoint (save_last).
checkpoint_callback = ModelCheckpoint(
    dirpath=MODEL_PATH,  # where checkpoint files are written
    filename="videollava-{epoch:02d}-{train_loss:.2f}",  # checkpoint file name template
    monitor="train_loss",  # metric used to rank checkpoints
    mode="min",  # lower train_loss is better
    save_top_k=2,
    save_last=True,
)

In [16]:
# Build the Lightning Trainer for single-GPU, 16-bit mixed-precision fine-tuning.
# Scheduling values are read from `config` so that dict stays the single source
# of truth. `val_check_interval` and `num_nodes` were defined in `config` but
# never forwarded to the Trainer, so they had no effect; both are passed here.
# NOTE(review): with val_check_interval < 1.0 validation runs several times per
# epoch, so the EarlyStopping / ModelCheckpoint callbacks are evaluated more often.
trainer = Trainer(
    default_root_dir=MODEL_PATH,
    accelerator="gpu",
    devices=[DEVICE],
    num_nodes=config.get("num_nodes", 1),
    max_epochs=config.get("max_epochs"),
    accumulate_grad_batches=config.get("accumulate_grad_batches"),
    check_val_every_n_epoch=config.get("check_val_every_n_epoch"),
    val_check_interval=config.get("val_check_interval"),  # None falls back to Lightning's default
    gradient_clip_val=config.get("gradient_clip_val"),
    precision="16-mixed",
    limit_val_batches=1,  # each validation run uses a single batch
    num_sanity_val_steps=1,
    callbacks=[early_stop_callback, checkpoint_callback],
    log_every_n_steps=1,
)

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/ext3/miniforge3/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py:75: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `lightning.pytorch` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
`Trainer(limit_val_batches=1)` was configured so 1 batch will be used.


In [None]:
# Start training. No dataloaders are passed here, so the module is expected
# to supply its own from the datasets and collators it was constructed with.
trainer.fit(model_module)

/ext3/miniforge3/lib/python3.12/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:654: Checkpoint directory /scratch/rr4577/ASL-Interpreter/outputs/Video-LLaVA-7B-hf_QLORA_4bit_r64_alpha128 exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type      | Params | Mode 
--------------------------------------------
0 | model | PeftModel | 4.0 B  | train
--------------------------------------------
178 M     Trainable params
3.8 B     Non-trainable params
4.0 B     Total params
15,978.332Total estimated model params size (MB)
3682      Modules in train mode
1054      Modules in eval mode
SLURM auto-requeueing enabled. Setting signal handlers.


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Expanding inputs for image tokens in Video-LLaVa should be done in processing. Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. Using processors without these attributes in the config is deprecated and will throw an error in v4.44.
Expanding inputs for image tokens in Video-LLaVa should be done in processing. Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. Using processors without these attributes in the config is deprecated and will throw an error in v4.44.
Expanding inputs for image tokens in Video-LLaVa should be done in processing. Please add `patch_size` and `vision_feature_select_strategy` to the model's p

Training: |          | 0/? [00:00<?, ?it/s]

Expanding inputs for image tokens in Video-LLaVa should be done in processing. Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. Using processors without these attributes in the config is deprecated and will throw an error in v4.44.
Expanding inputs for image tokens in Video-LLaVa should be done in processing. Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. Using processors without these attributes in the config is deprecated and will throw an error in v4.44.


Validation: |          | 0/? [00:00<?, ?it/s]

Expanding inputs for image tokens in Video-LLaVa should be done in processing. Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. Using processors without these attributes in the config is deprecated and will throw an error in v4.44.
Expanding inputs for image tokens in Video-LLaVa should be done in processing. Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. Using processors without these attributes in the config is deprecated and will throw an error in v4.44.
Metric train_loss improved. New best score: 4.577


In [None]:
# Save the processor and the model locally, next to the training checkpoints.
# NOTE(review): the training log reports `model` as a PeftModel, so this
# presumably writes the adapter weights rather than the full base model — confirm.
processor.save_pretrained(MODEL_PATH)
model.save_pretrained(MODEL_PATH)