In [1]:
# ============================================
# Continue Fine-tuning BLIP-base (Second Half)
# Kaggle (Full dataset, last 50%)
# ============================================

import torch
from torch.nn.utils.rnn import pad_sequence
from transformers import BlipProcessor, BlipForConditionalGeneration, Trainer, TrainingArguments
from datasets import Dataset, concatenate_datasets
from PIL import Image
import os, glob
os.environ["HF_DATASETS_CACHE"] = "/kaggle/working/hf_cache"
os.makedirs("/kaggle/working/hf_cache", exist_ok=True)

# --------------------------------------------
# Check GPU
# --------------------------------------------
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

# --------------------------------------------
# Load BLIP with checkpoint
# --------------------------------------------
processor = BlipProcessor.from_pretrained("/kaggle/input/caption-1/blip-anime-half1-final")
model = BlipForConditionalGeneration.from_pretrained(
    "/kaggle/input/caption-1/blip-anime/checkpoint-40000"
).to(device)

# --------------------------------------------
# Training arguments
# (only +45k more steps, total = 85k)
# --------------------------------------------
training_args = TrainingArguments(
    output_dir="./blip-anime",
    per_device_train_batch_size=4,
    fp16=True,
    save_strategy="steps",
    save_steps=10000,
    save_total_limit=2,
    logging_steps=500,
    max_steps=80000,   # 40k already done, +45k = 85k total
    report_to="none",
    remove_unused_columns=False,
)

# --------------------------------------------
# Preprocessing function
# --------------------------------------------
def preprocess(example):
    try:
        image = example["image"]
        if isinstance(image, str):
            image = Image.open(image).convert("RGB")
        else:
            image = image.convert("RGB")

        inputs = processor(images=image, text=example["text"], return_tensors="pt")

        return {
            "input_ids": inputs["input_ids"][0],
            "attention_mask": inputs["attention_mask"][0],
            "pixel_values": inputs["pixel_values"][0],
        }
    except Exception:
        return {}

# --------------------------------------------
# Collator (pads captions dynamically)
# --------------------------------------------
# def collate_fn(batch):
#     batch = [x for x in batch if x]  # drop empties

#     input_ids = [x["input_ids"] for x in batch]
#     attention_masks = [x["attention_mask"] for x in batch]
#     pixel_values = torch.stack([x["pixel_values"] for x in batch])

#     input_ids = pad_sequence(input_ids, batch_first=True, padding_value=processor.tokenizer.pad_token_id)
#     attention_masks = pad_sequence(attention_masks, batch_first=True, padding_value=0)

#     return {
#         "input_ids": input_ids,
#         "attention_mask": attention_masks,
#         "pixel_values": pixel_values,
#         "labels": input_ids.clone(),
#     }
def collate_fn(batch):
    new_batch = []
    for example in batch:
        try:
            image = example["image"]
            if isinstance(image, str):
                image = Image.open(image).convert("RGB")
            else:
                image = image.convert("RGB")

            inputs = processor(images=image, text=example["text"], return_tensors="pt")

            new_batch.append({
                "input_ids": inputs["input_ids"][0],
                "attention_mask": inputs["attention_mask"][0],
                "pixel_values": inputs["pixel_values"][0],
            })
        except Exception:
            continue

    input_ids = [x["input_ids"] for x in new_batch]
    attention_masks = [x["attention_mask"] for x in new_batch]
    pixel_values = torch.stack([x["pixel_values"] for x in new_batch])

    input_ids = pad_sequence(input_ids, batch_first=True, padding_value=processor.tokenizer.pad_token_id)
    attention_masks = pad_sequence(attention_masks, batch_first=True, padding_value=0)

    return {
        "input_ids": input_ids,
        "attention_mask": attention_masks,
        "pixel_values": pixel_values,
        "labels": input_ids.clone(),
    }

# --------------------------------------------
# Dataset (load directly from Kaggle input, no copy)
# --------------------------------------------



src_dir = "/kaggle/input/d/nguyengiabach1810/caption-1/none-yet___anime-captions/default/0.0.0/2f1272a94691fd3c8dede0a3697057ab1d4d2296"
files = sorted(glob.glob(f"{src_dir}/anime-captions-train-*-of-00057.arrow"))

valid_datasets = []
for f in files:
    try:
        ds = Dataset.from_file(f)
        valid_datasets.append(ds)
        print(f"✅ Loaded {f}")
    except Exception as e:
        print(f"⚠ Skipping corrupted shard {f}: {e}")

dataset = concatenate_datasets(valid_datasets)
print("✅ Full dataset size:", len(dataset))

# Take last 50%
half = len(dataset) // 2
half2 = dataset.select(range(half, len(dataset)))
print("✅ Half2 size:", len(half2))

# Preprocess (cache in writable dir)

# --------------------------------------------
# Setup Trainer
# --------------------------------------------
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=half2,
    data_collator=collate_fn,
)

# --------------------------------------------
# Continue training from checkpoint
# --------------------------------------------
trainer.train(resume_from_checkpoint="/kaggle/input/caption-1/blip-anime/checkpoint-40000")

# --------------------------------------------
# Save final model
# --------------------------------------------
model.save_pretrained("./blip-anime-half2-final")
processor.save_pretrained("./blip-anime-half2-final")

print("\n✅ Continued training complete! Final model saved at ./blip-anime-half2-final")


2025-09-20 04:45:09.421978: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1758343509.799780      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1758343509.911205      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Using device: cuda
✅ Loaded /kaggle/input/d/nguyengiabach1810/caption-1/none-yet___anime-captions/default/0.0.0/2f1272a94691fd3c8dede0a3697057ab1d4d2296/anime-captions-train-00000-of-00057.arrow
✅ Loaded /kaggle/input/d/nguyengiabach1810/caption-1/none-yet___anime-captions/default/0.0.0/2f1272a94691fd3c8dede0a3697057ab1d4d2296/anime-captions-train-00001-of-00057.arrow
✅ Loaded /kaggle/input/d/nguyengiabach1810/caption-1/none-yet___anime-captions/default/0.0.0/2f1272a94691fd3c8dede0a3697057ab1d4d2296/anime-captions-train-00002-of-00057.arrow
✅ Loaded /kaggle/input/d/nguyengiabach1810/caption-1/none-yet___anime-captions/default/0.0.0/2f1272a94691fd3c8dede0a3697057ab1d4d2296/anime-captions-train-00003-of-00057.arrow
✅ Loaded /kaggle/input/d/nguyengiabach1810/caption-1/none-yet___anime-captions/default/0.0.0/2f1272a94691fd3c8dede0a3697057ab1d4d2296/anime-captions-train-00004-of-00057.arrow
✅ Loaded /kaggle/input/d/nguyengiabach1810/caption-1/none-yet___anime-captions/default/0.0.0/2f1272a9

There were missing keys in the checkpoint model loaded: ['text_decoder.cls.predictions.decoder.bias'].


Step,Training Loss
40500,0.4758
41000,0.4973
41500,0.5064
42000,0.497
42500,0.494
43000,0.4781
43500,0.4757
44000,0.4852
44500,0.4741
45000,0.4834





✅ Continued training complete! Final model saved at ./blip-anime-half2-final
