In [None]:
!rm -rf /content/drive


In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)


Mounted at /content/drive


In [None]:
!apt-get install git-lfs
!git lfs install
!git clone https://huggingface.co/datasets/BoKelvin/SLAKE
!cp -r slake /content/drive/MyDrive/UM/slake_dataset/

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git-lfs is already the newest version (3.0.2-1ubuntu0.3).
0 upgraded, 0 newly installed, 0 to remove and 41 not upgraded.
Git LFS initialized.
fatal: destination path 'SLAKE' already exists and is not an empty directory.
cp: cannot stat 'slake': No such file or directory


In [None]:
# extrac to drive
import shutil
import os

zip_path = "/content/drive/MyDrive/UM/slake_dataset/imgs.zip"
extract_dir = "/content/drive/MyDrive/UM/slake_dataset/images"

print(f"Unzipping using shutil...")
try:
    shutil.unpack_archive(zip_path, extract_dir)
    print(f" Successfully extracted to {extract_dir}")

    # Verify extraction
    if os.path.exists(extract_dir):
        file_count = len(os.listdir(extract_dir))
        print(f"Found {file_count} files/directories in {extract_dir}")

except Exception as e:
    print(f" Error: {str(e)}")

In [None]:
# pip install rouge_score


In [None]:
import os
import json
import torch
from PIL import Image
from tqdm.auto import tqdm
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import Blip2Processor, Blip2ForConditionalGeneration
from peft import LoraConfig, get_peft_model
from torch.cuda.amp import GradScaler, autocast
from torch.optim.lr_scheduler import CosineAnnealingLR
import warnings

warnings.filterwarnings('ignore')

# configuration
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {DEVICE}")
if DEVICE == "cuda":
    print(f"GPU: {torch.cuda.get_device_name(0)}")

BASE_DIR = "/content/drive/MyDrive/UM"
DATA_DIR = os.path.join(BASE_DIR, "slake_cache", "SLAKE")
IMG_DIR = os.path.join(DATA_DIR, "images")
SAVE_DIR = os.path.join(BASE_DIR, "slake_models", "blip_model")
os.makedirs(SAVE_DIR, exist_ok=True)

# Training hyperparameters
EPOCHS = 10
LR = 1e-5
MAX_LEN = 256
GRAD_CLIP = 1.0
BATCH_SIZE = 16
ACCUMULATION_STEPS = 4
WEIGHT_DECAY = 0.01
USE_AMP = False
PATIENCE = 3

# Initialize GradScaler for AMP if using CUDA
scaler = GradScaler(enabled=USE_AMP)

# data hepler funtons
def normalize_answer(answer):
    """Normalize answer string."""
    return str(answer).strip().lower()

def load_image(img_name):
    """Load and preprocess image with error handling."""
    img_path = os.path.join(IMG_DIR, img_name)
    try:
        img = Image.open(img_path).convert("RGB")
        img = img.resize((224, 224), Image.Resampling.LANCZOS)
        return img
    except Exception as e:
        print(f"Warning: Could not load image {img_path}. Error: {e}")
        return Image.new("RGB", (224, 224), "black")

def load_json(split):
    """Load JSON data for a given split."""
    json_path = os.path.join(DATA_DIR, f"{split}.json")
    with open(json_path, 'r') as f:
        return json.load(f)

def check_tensor_for_issues(tensor, name=""):
    """Check tensor for NaN, Inf, or extreme values."""
    if tensor is None:
        return False

    has_nan = torch.isnan(tensor).any().item()
    has_inf = torch.isinf(tensor).any().item()

    if has_nan or has_inf:
        print(f"Warning: Tensor {name} has NaN: {has_nan}, Inf: {has_inf}")
        return True

    return False

# dataset and dataloader
class VQADataset(Dataset):
    """Dataset for VQA tasks with filtering."""

    def __init__(self, data, split="train"):
        self.samples = []
        self.split = split

        for sample in data:
            if not all(key in sample for key in ["question", "answer", "answer_type", "img_name", "q_lang"]):
                continue

            if (isinstance(sample["question"], str) and
                isinstance(sample["answer"], str) and
                sample["answer_type"] in ["OPEN", "CLOSED"] and
                sample.get("img_name") and
                sample["q_lang"] == "en" and
                len(sample["answer"].strip()) > 0):

                self.samples.append({
                    "image_name": sample["img_name"],
                    "question": sample["question"],
                    "answer": sample["answer"],
                    "answer_type": sample["answer_type"]
                })

        open_cnt = sum(s["answer_type"] == "OPEN" for s in self.samples)
        closed_cnt = sum(s["answer_type"] == "CLOSED" for s in self.samples)

        print(f"{split.upper():12s} | Total samples: {len(self.samples):5d} | "
              f"Open: {open_cnt:5d} | Closed: {closed_cnt:5d}  "
              )

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        sample = self.samples[idx]
        return {
            "image": load_image(sample["image_name"]),
            "question": sample["question"],
            "answer": normalize_answer(sample["answer"]),
            "answer_type": sample["answer_type"]
        }

def collate_fn(batch):
    """Collate function for DataLoader."""
    return {
        "images": [b["image"] for b in batch],
        "questions": [b["question"] for b in batch],
        "answers": [b["answer"] for b in batch],
        "answer_types": [b["answer_type"] for b in batch]
    }

# Load data
print("-" * 35)
print("Loading datasets...")
train_raw = load_json("train")
val_raw = load_json("validation")

train_dataset = VQADataset(train_raw, "train")
val_dataset = VQADataset(val_raw, "validation")

train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collate_fn,
    num_workers=2,
    pin_memory=True if DEVICE == "cuda" else False
)

val_loader = DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=collate_fn,
    num_workers=2,
    pin_memory=True if DEVICE == "cuda" else False
)

print(f"Train batches: {len(train_loader)}")
print(f"Val batches: {len(val_loader)}")

# mdoel setup
print("-" * 35)
print("Loading BLIP-2 model...")

# Load processor and model
processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl")

# Use float16 for mixed precision training
model_dtype = torch.float16 if USE_AMP else torch.float32
model = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-flan-t5-xl",
    torch_dtype=model_dtype,
    device_map="auto" if DEVICE == "cuda" else None,
    low_cpu_mem_usage=True
)

# Move to device if not using device_map
if not model.device.type == DEVICE:
    model = model.to(DEVICE)

# Unfreeze QFormer to allow fine-tuning
for name, param in model.named_parameters():
    if "qformer" in name:
        param.requires_grad = True
    else:
        param.requires_grad = False

# Apply LoRA to language model
print("Applying LoRA to language model...")
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["q", "v", "k", "o"],
    bias="none",
    task_type="SEQ_2_SEQ_LM"
)

model.language_model = get_peft_model(model.language_model, lora_config)
model.language_model.print_trainable_parameters()

# Enable gradient checkpointing to save memory
model.gradient_checkpointing_enable()

# optimizer and scheduler
trainable_params = [p for p in model.parameters() if p.requires_grad]

optimizer = AdamW(
    trainable_params,
    lr=LR,
    weight_decay=WEIGHT_DECAY,
    betas=(0.9, 0.999),
    eps=1e-8
)

# Use cosine annealing with a warm-up scheduler
total_training_steps = EPOCHS * len(train_loader) // ACCUMULATION_STEPS
warmup_steps = int(0.1 * total_training_steps)  # 10% warmup
scheduler = CosineAnnealingLR(
    optimizer,
    T_max=total_training_steps,
    eta_min=LR * 0.01
)

#helper functions
def format_prompts(questions, answers, answer_types):
    """Format prompts and targets for training."""
    prompts, targets = [], []

    for q, a, ans_type in zip(questions, answers, answer_types):
        prompt = f"Question: {q}\nAnswer:"
        prompts.append(prompt)

        if ans_type == "CLOSED":
            targets.append(a)
        else:
            targets.append(f"The answer is {a}.")

    return prompts, targets

def evaluate_model(model, data_loader, processor, device):
    """Evaluate model on validation set."""
    model.eval()
    total_loss = 0.0
    total_batches = 0

    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluating", leave=False):
            prompts, targets = format_prompts(
                batch["questions"], batch["answers"], batch["answer_types"]
            )

            inputs = processor(
                images=batch["images"],
                text=prompts,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=MAX_LEN
            ).to(device)

            labels = processor.tokenizer(
                targets,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=MAX_LEN
            ).input_ids.to(device)

            labels[labels == processor.tokenizer.pad_token_id] = -100

            # Forward pass
            if USE_AMP:
                with autocast():
                    outputs = model(**inputs, labels=labels)
                    loss = outputs.loss
            else:
                outputs = model(**inputs, labels=labels)
                loss = outputs.loss

            if torch.isnan(loss) or torch.isinf(loss):
                print("Warning: NaN/Inf loss in validation")
                continue

            total_loss += loss.item()
            total_batches += 1

    avg_loss = total_loss / max(total_batches, 1)

    if device == "cuda":
        torch.cuda.empty_cache()

    return avg_loss

# Main Trainingg Loop
print("-" * 35)
print("Starting training...")

history = {
    "train_loss": [],
    "val_loss": [],
    "learning_rates": []
}

best_val_loss = float('inf')
epochs_without_improvement = 0

for epoch in range(EPOCHS):
    print(f"\n{'-'*35}")
    print(f"Epoch {epoch+1}/{EPOCHS}")
    print(f"{'-'*35}")

    model.train()
    optimizer.zero_grad()

    epoch_loss = 0.0
    total_steps = 0
    gradient_norms = []

    progress_bar = tqdm(train_loader, desc=f"Training")
    for step, batch in enumerate(progress_bar):
        prompts, targets = format_prompts(
            batch["questions"], batch["answers"], batch["answer_types"]
        )

        inputs = processor(
            images=batch["images"],
            text=prompts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=MAX_LEN
        ).to(DEVICE)

        labels = processor.tokenizer(
            targets,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=MAX_LEN
        ).input_ids.to(DEVICE)

        labels[labels == processor.tokenizer.pad_token_id] = -100

        if (check_tensor_for_issues(inputs.pixel_values, "pixel_values") or
            check_tensor_for_issues(labels, "labels")):
            print("Skipping batch due to tensor issues")
            continue

        # FIXED: Correct autocast usage
        if USE_AMP:
            with autocast():
                outputs = model(**inputs, labels=labels)
                loss = outputs.loss / ACCUMULATION_STEPS
        else:
            outputs = model(**inputs, labels=labels)
            loss = outputs.loss / ACCUMULATION_STEPS

        if torch.isnan(loss) or torch.isinf(loss):
            print(f"Skipping batch due to invalid loss: {loss}")
            optimizer.zero_grad()
            continue

        # Backward pass
        if USE_AMP:
            scaler.scale(loss).backward()
        else:
            loss.backward()

        epoch_loss += loss.item() * ACCUMULATION_STEPS
        total_steps += 1

        # Gradient accumulation step
        if (step + 1) % ACCUMULATION_STEPS == 0:
            if USE_AMP:
                scaler.unscale_(optimizer)

            total_norm = torch.nn.utils.clip_grad_norm_(
                model.parameters(),
                GRAD_CLIP,
                norm_type=2.0
            )
            gradient_norms.append(total_norm.item())

            if USE_AMP:
                scaler.step(optimizer)
                scaler.update()
            else:
                optimizer.step()

            scheduler.step()
            optimizer.zero_grad()

            current_lr = scheduler.get_last_lr()[0]
            progress_bar.set_postfix({
                "loss": f"{loss.item() * ACCUMULATION_STEPS:.4f}",
                "grad_norm": f"{total_norm:.2f}",
                "lr": f"{current_lr:.2e}"
            })

    if total_steps % ACCUMULATION_STEPS != 0:
        if USE_AMP:
            scaler.unscale_(optimizer)

        torch.nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP)

        if USE_AMP:
            scaler.step(optimizer)
            scaler.update()
        else:
            optimizer.step()

        optimizer.zero_grad()

    avg_train_loss = epoch_loss / max(total_steps, 1)
    avg_grad_norm = sum(gradient_norms) / max(len(gradient_norms), 1) if gradient_norms else 0

    avg_val_loss = evaluate_model(model, val_loader, processor, DEVICE)

    history["train_loss"].append(avg_train_loss)
    history["val_loss"].append(avg_val_loss)
    history["learning_rates"].append(scheduler.get_last_lr()[0])

    print(f"\nEpoch {epoch+1}  Training Loss: {avg_train_loss:.4f}, "
      f"Validation Loss: {avg_val_loss:.4f}, Avg Grad Norm: {avg_grad_norm:.2f} "
      )


    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        epochs_without_improvement = 0

        print(f"   New best model! Saving...")

        # Save model without .pt extension, use save_pretrained
        model.language_model.save_pretrained(SAVE_DIR)
        processor.save_pretrained(SAVE_DIR)

        with open(os.path.join(SAVE_DIR, "training_history.json"), "w") as f:
            json.dump(history, f, indent=2)

    else:
        epochs_without_improvement += 1
        print(f"  No improvement for {epochs_without_improvement} epoch(s)")

    if epochs_without_improvement >= PATIENCE:
        print(f"\nEarly stopping triggered after {epoch+1} epochs")
        break

    if DEVICE == "cuda":
        torch.cuda.empty_cache()




Using device: cuda
GPU: NVIDIA A100-SXM4-80GB
-----------------------------------
Loading datasets...
TRAIN        | Total samples:  4918 | Open:  2975 | Closed:  1943  
VALIDATION   | Total samples:  1053 | Open:   631 | Closed:   422  
Train batches: 308
Val batches: 66
-----------------------------------
Loading BLIP-2 model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Applying LoRA to language model...
trainable params: 18,874,368 || all params: 2,868,631,552 || trainable%: 0.6580
-----------------------------------
Starting training...

-----------------------------------
Epoch 1/10
-----------------------------------


Training:   0%|          | 0/308 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/66 [00:00<?, ?it/s]


Epoch 1  Training Loss: 1.6319, Validation Loss: 0.8769, Avg Grad Norm: 3.10 
   New best model! Saving...

-----------------------------------
Epoch 2/10
-----------------------------------


Training:   0%|          | 0/308 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/66 [00:00<?, ?it/s]


Epoch 2  Training Loss: 0.9093, Validation Loss: 0.7497, Avg Grad Norm: 2.17 
   New best model! Saving...

-----------------------------------
Epoch 3/10
-----------------------------------


Training:   0%|          | 0/308 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/66 [00:00<?, ?it/s]


Epoch 3  Training Loss: 0.7974, Validation Loss: 0.6791, Avg Grad Norm: 1.74 
   New best model! Saving...

-----------------------------------
Epoch 4/10
-----------------------------------


Training:   0%|          | 0/308 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/66 [00:00<?, ?it/s]


Epoch 4  Training Loss: 0.7350, Validation Loss: 0.6399, Avg Grad Norm: 1.54 
   New best model! Saving...

-----------------------------------
Epoch 5/10
-----------------------------------


Training:   0%|          | 0/308 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/66 [00:00<?, ?it/s]


Epoch 5  Training Loss: 0.6880, Validation Loss: 0.6121, Avg Grad Norm: 1.43 
   New best model! Saving...

-----------------------------------
Epoch 6/10
-----------------------------------


Training:   0%|          | 0/308 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/66 [00:00<?, ?it/s]


Epoch 6  Training Loss: 0.6606, Validation Loss: 0.5896, Avg Grad Norm: 1.37 
   New best model! Saving...

-----------------------------------
Epoch 7/10
-----------------------------------


Training:   0%|          | 0/308 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/66 [00:00<?, ?it/s]


Epoch 7  Training Loss: 0.6435, Validation Loss: 0.5772, Avg Grad Norm: 1.37 
   New best model! Saving...

-----------------------------------
Epoch 8/10
-----------------------------------


Training:   0%|          | 0/308 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/66 [00:00<?, ?it/s]


Epoch 8  Training Loss: 0.6331, Validation Loss: 0.5715, Avg Grad Norm: 1.34 
   New best model! Saving...

-----------------------------------
Epoch 9/10
-----------------------------------


Training:   0%|          | 0/308 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/66 [00:00<?, ?it/s]


Epoch 9  Training Loss: 0.6250, Validation Loss: 0.5673, Avg Grad Norm: 1.34 
   New best model! Saving...

-----------------------------------
Epoch 10/10
-----------------------------------


Training:   0%|          | 0/308 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/66 [00:00<?, ?it/s]


Epoch 10  Training Loss: 0.6265, Validation Loss: 0.5664, Avg Grad Norm: 1.39 
   New best model! Saving...


TypeError: unsupported operand type(s) for -: 'str' and 'str'

In [None]:
# save model
print("-"*35)
print("Training complete!")

final_save_dir = os.path.join(BASE_DIR, "slake_models", "blip2_final")
os.makedirs(final_save_dir, exist_ok=True)

model.language_model.save_pretrained(final_save_dir)
processor.save_pretrained(final_save_dir)

with open(os.path.join(final_save_dir, "final_history.json"), "w") as f:
    json.dump(history, f, indent=2)

print(f"Best validation loss: {best_val_loss:.4f}")
print(f"Models saved to: {SAVE_DIR} and {final_save_dir}")
print("-"*35)

print("\nTraining Statistics:")
print(f"Total epochs trained: {len(history['train_loss'])}")
print(f"Final training loss: {history['train_loss'][-1]:.4f}")
print(f"Final validation loss: {history['val_loss'][-1]:.4f}")

if DEVICE == "cuda":
    memory_allocated = torch.cuda.memory_allocated() / 1024**3
    memory_reserved = torch.cuda.memory_reserved() / 1024**3
    print(f"\nGPU Memory Usage:")
    print(f"  Allocated: {memory_allocated:.2f} GB")
    print(f"  Reserved:  {memory_reserved:.2f} GB")
    torch.cuda.empty_cache()

-----------------------------------
Training complete!
Best validation loss: 0.5664
Models saved to: /content/drive/MyDrive/UM/slake_models/blip_model and /content/drive/MyDrive/UM/slake_models/blip2_final
-----------------------------------

Training Statistics:
Total epochs trained: 10
Final training loss: 0.6265
Final validation loss: 0.5664

GPU Memory Usage:
  Allocated: 42.12 GB
  Reserved:  43.27 GB


# ***LLAVA Training***

In [None]:

# LLaVA LoRA VQA TRAINING (SLAKE: OPEN + CLOSED)

import os, json, torch
from PIL import Image
from tqdm.auto import tqdm
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from torch.amp import autocast, GradScaler

from transformers import (
    LlavaForConditionalGeneration,
    AutoProcessor,
    get_linear_schedule_with_warmup
)
from peft import LoraConfig, get_peft_model

# CONFIG
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

BASE_DIR = "/content/drive/MyDrive/UM"
DATA_DIR = os.path.join(BASE_DIR, "slake_cache", "SLAKE")
IMG_DIR  = os.path.join(DATA_DIR, "images")
SAVE_DIR = os.path.join(BASE_DIR, "slake_models", "llava_model2")

MODEL_ID = "llava-hf/llava-1.5-7b-hf"

EPOCHS = 10
LR = 2e-5
BATCH_SIZE = 4
GRAD_ACCUM = 8
GRAD_CLIP = 1.0
DTYPE = torch.float16

os.makedirs(SAVE_DIR, exist_ok=True)

# UTILITIES
def normalize_answer(a):
    return str(a).strip().lower().rstrip(".")

def load_image(name):
    path = os.path.join(IMG_DIR, name)
    try:
        return Image.open(path).convert("RGB")
    except:
        return Image.new("RGB", (224, 224), "black")

def load_json(split):
    with open(os.path.join(DATA_DIR, f"{split}.json")) as f:
        return json.load(f)

# DATASET

class VQADataset(Dataset):
    def __init__(self, data, split):
        self.samples = [
            x for x in data
            if x.get("q_lang") == "en"
            and x.get("answer_type") in ["OPEN", "CLOSED"]
            and isinstance(x.get("question"), str)
            and isinstance(x.get("answer"), str)
            and x.get("img_name")
        ]

        open_cnt = sum(x["answer_type"] == "OPEN" for x in self.samples)
        closed_cnt = sum(x["answer_type"] == "CLOSED" for x in self.samples)

        print(f"{split.upper()} | kept: {len(self.samples)} | OPEN: {open_cnt} | CLOSED: {closed_cnt}")

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        x = self.samples[idx]
        return {
            "image": load_image(x["img_name"]),
            "question": x["question"],
            "answer": normalize_answer(x["answer"])
        }

def collate_fn(batch):
    return {
        "images": [b["image"] for b in batch],
        "questions": [b["question"] for b in batch],
        "answers": [b["answer"] for b in batch],
    }

# LOAD DATA
train_loader = DataLoader(
    VQADataset(load_json("train"), "train"),
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collate_fn
)

val_loader = DataLoader(
    VQADataset(load_json("validation"), "validation"),
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=collate_fn
)

# LOAD MODEL
processor = AutoProcessor.from_pretrained(MODEL_ID)

model = LlavaForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype=DTYPE,
    device_map="auto"
)

# APPLY LoRA
model = get_peft_model(
    model,
    LoraConfig(
        r=8,
        lora_alpha=16,
        lora_dropout=0.05,
        target_modules=["q_proj", "v_proj"],
        task_type="CAUSAL_LM"
    )
)

for name, param in model.named_parameters():
    param.requires_grad = "lora_" in name

# OPTIMIZER
optimizer = AdamW(
    [p for p in model.parameters() if p.requires_grad],
    lr=LR
)
scaler = GradScaler()

# PROMPT + LABEL BUILD
def build_inputs(images, questions, answers):
    texts = [
        f"USER: <image>\n{q}\nASSISTANT: {a}"
        for q, a in zip(questions, answers)
    ]

    enc = processor(
        images=images,
        text=texts,
        padding=True,
        truncation=True,
        return_tensors="pt"
    )

    labels = enc.input_ids.clone()

    for i, q in enumerate(questions):
        prompt = f"USER: <image>\n{q}\nASSISTANT:"
        prompt_ids = processor.tokenizer(
            prompt, return_tensors="pt"
        ).input_ids[0]
        labels[i, :len(prompt_ids)] = -100

    enc["labels"] = labels
    return enc


# TRAINING + VALIDATION
print(" START LLaVA TRAINING")

for epoch in range(EPOCHS):
    #  TRAIN
    model.train()
    train_loss = 0.0
    optimizer.zero_grad()

    pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS} [TRAIN]")
    for i, batch in enumerate(pbar):
        enc = build_inputs(
            batch["images"],
            batch["questions"],
            batch["answers"]
        )
        enc = {k: v.to(DEVICE) for k, v in enc.items()}

        with autocast(device_type="cuda", dtype=torch.float16):
            loss = model(**enc).loss / GRAD_ACCUM

        scaler.scale(loss).backward()

        if (i + 1) % GRAD_ACCUM == 0:
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP)
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()

        true_loss = loss.item() * GRAD_ACCUM
        train_loss += true_loss
        pbar.set_postfix(loss=f"{true_loss:.4f}")

    train_loss /= len(train_loader)

    # VALIDATION
    model.eval()
    val_loss = 0.0

    with torch.no_grad():
        pbar = tqdm(val_loader, desc=f"Epoch {epoch+1}/{EPOCHS} [VAL]")
        for batch in pbar:
            enc = build_inputs(
                batch["images"],
                batch["questions"],
                batch["answers"]
            )
            enc = {k: v.to(DEVICE) for k, v in enc.items()}

            with autocast(device_type="cuda", dtype=torch.float16):
                loss = model(**enc).loss

            val_loss += loss.item()
            pbar.set_postfix(loss=f"{loss.item():.4f}")

    val_loss /= len(val_loader)

    #  LOG
    print(
        f"Epoch {epoch+1} | "
        f"Train Loss: {train_loss:.4f} | "
        f"Val Loss: {val_loss:.4f}"
    )

# SAVE
model.save_pretrained(SAVE_DIR)
processor.save_pretrained(SAVE_DIR)

print("LLaVA TRAINING COMPLETE — CLEAN & VALIDATED")


TRAIN | kept: 4919 | OPEN: 2976 | CLOSED: 1943
VALIDATION | kept: 1053 | OPEN: 631 | CLOSED: 422


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

 START LLaVA TRAINING


Epoch 1/10 [TRAIN]:   0%|          | 0/1230 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Epoch 1/10 [VAL]:   0%|          | 0/264 [00:00<?, ?it/s]

Epoch 1 | Train Loss: 7.0387 | Val Loss: 4.1645


Epoch 2/10 [TRAIN]:   0%|          | 0/1230 [00:00<?, ?it/s]

Epoch 2/10 [VAL]:   0%|          | 0/264 [00:00<?, ?it/s]

Epoch 2 | Train Loss: 3.9924 | Val Loss: 3.8585


Epoch 3/10 [TRAIN]:   0%|          | 0/1230 [00:00<?, ?it/s]

Epoch 3/10 [VAL]:   0%|          | 0/264 [00:00<?, ?it/s]

Epoch 3 | Train Loss: 3.8045 | Val Loss: 3.7779


Epoch 4/10 [TRAIN]:   0%|          | 0/1230 [00:00<?, ?it/s]

Epoch 4/10 [VAL]:   0%|          | 0/264 [00:00<?, ?it/s]

Epoch 4 | Train Loss: 3.7696 | Val Loss: 3.7610


Epoch 5/10 [TRAIN]:   0%|          | 0/1230 [00:00<?, ?it/s]

Epoch 5/10 [VAL]:   0%|          | 0/264 [00:00<?, ?it/s]

Epoch 5 | Train Loss: 3.7584 | Val Loss: 3.7534


Epoch 6/10 [TRAIN]:   0%|          | 0/1230 [00:00<?, ?it/s]

Epoch 6/10 [VAL]:   0%|          | 0/264 [00:00<?, ?it/s]

Epoch 6 | Train Loss: 3.7532 | Val Loss: 3.7499


Epoch 7/10 [TRAIN]:   0%|          | 0/1230 [00:00<?, ?it/s]

Epoch 7/10 [VAL]:   0%|          | 0/264 [00:00<?, ?it/s]

Epoch 7 | Train Loss: 3.7511 | Val Loss: 3.7477


Epoch 8/10 [TRAIN]:   0%|          | 0/1230 [00:00<?, ?it/s]

Epoch 8/10 [VAL]:   0%|          | 0/264 [00:00<?, ?it/s]

Epoch 8 | Train Loss: 3.7495 | Val Loss: 3.7463


Epoch 9/10 [TRAIN]:   0%|          | 0/1230 [00:00<?, ?it/s]

Epoch 9/10 [VAL]:   0%|          | 0/264 [00:00<?, ?it/s]

Epoch 9 | Train Loss: 3.7485 | Val Loss: 3.7456


Epoch 10/10 [TRAIN]:   0%|          | 0/1230 [00:00<?, ?it/s]

Epoch 10/10 [VAL]:   0%|          | 0/264 [00:00<?, ?it/s]

Epoch 10 | Train Loss: 3.7480 | Val Loss: 3.7448
LLaVA TRAINING COMPLETE — CLEAN & VALIDATED
