In [1]:
!pip install -U datasets



In [2]:
import os
from pathlib import Path

import torch
from torch.utils.data import DataLoader
from torch.cuda.amp import GradScaler
from tqdm import tqdm

from datasets import load_dataset
from transformers import (
    VisionEncoderDecoderModel,
    ViTImageProcessor,
    AutoTokenizer,
    default_data_collator,
    get_cosine_schedule_with_warmup,
)

In [3]:
model_name = "nlpconnect/vit-gpt2-image-captioning" 
out = "vitgpt2_flickr8k_finetuned"
ds_name = "Naveengo/flickr8k"     
split = "train"
cnt = 2000                   
batch_size = 32                      
n_epochs = 1
learning_rate = 2e-5                    
m_targer_len = 32
dev = "cuda" if torch.cuda.is_available() else "cpu"

Path(out).mkdir(parents=True, exist_ok=True)
print(f"dev: {dev}")

torch.backends.cudnn.benchmark = True

Device: cpu


In [4]:
dataset = load_dataset(ds_name, split=split)
if cnt and len(dataset) > cnt:
    dataset = dataset.shuffle(seed=42).select(range(cnt))
print("Loaded", len(dataset), "examples")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loaded 2000 examples


In [5]:
model = VisionEncoderDecoderModel.from_pretrained(model_name).to(dev)
feature_extractor = ViTImageProcessor.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

for p in model.get_encoder().parameters():
    p.requires_grad_(False)

In [6]:
def transform(example):
    pixel = feature_extractor(example["image"], return_tensors="pt").pixel_values[0]
    ids = tokenizer(
        example["text"],
        padding="max_length",
        truncation=True,
        max_length=m_targer_len,
        return_tensors="pt",
    ).input_ids[0]
    ids[ids == tokenizer.pad_token_id] = -100
    return {"pixel_values": pixel, "labels": ids}

train_dataset = dataset.map(transform, remove_columns=dataset.column_names)

train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=default_data_collator,
    num_workers=2,
    pin_memory=True,
)

In [7]:
optimizer = torch.optim.AdamW(model.parameters(), learning_rate=learning_rate)
total_steps = len(train_loader) * n_epochs
scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.1 * total_steps),
    num_training_steps=total_steps,
)

In [8]:
scaler = GradScaler(enabled=dev == "cuda")
model.train()
for epoch in range(n_epochs):
    epoch_loss = 0.0
    progress = tqdm(train_loader, desc=f"Epoch {epoch + 1}/{n_epochs}")
    for batch in progress:
        batch = {k: v.to(dev, non_blocking=True) for k, v in batch.items()}
        with torch.autocast("cuda", dtype=torch.float16, enabled=dev == "cuda"):
            loss = model(**batch).loss
        scaler.scale(loss).backward()

        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad(set_to_none=True)
        scheduler.step()

        epoch_loss += loss.item()
        progress.set_postfix(loss=f"{loss.item():.4f}")

    print(f"Epoch {epoch + 1} mean loss: {epoch_loss / len(train_loader):.4f}")


  scaler = GradScaler(enabled=DEVICE == "cuda")
Epoch 1/1:   0%|          | 0/63 [00:00<?, ?it/s]We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.
Epoch 1/1: 100%|██████████| 63/63 [48:15<00:00, 45.96s/it, loss=5.5471]

Epoch 1 mean loss: 6.2497





In [9]:
model.save_pretrained(out)
feature_extractor.save_pretrained(out)
tokenizer.save_pretrained(out)

('vitgpt2_flickr8k_finetuned/tokenizer_config.json',
 'vitgpt2_flickr8k_finetuned/special_tokens_map.json',
 'vitgpt2_flickr8k_finetuned/vocab.json',
 'vitgpt2_flickr8k_finetuned/merges.txt',
 'vitgpt2_flickr8k_finetuned/added_tokens.json',
 'vitgpt2_flickr8k_finetuned/tokenizer.json')

In [10]:
model.eval()
with torch.no_grad():
    sample = dataset.shuffle(seed=123).select(range(4))
    pixels = feature_extractor(sample["image"], return_tensors="pt").pixel_values.to(dev)
    with torch.autocast("cuda", dtype=torch.float16, enabled=dev == "cuda"):
        gen_ids = model.generate(pixels, max_length=m_targer_len)
    captions = tokenizer.batch_decode(gen_ids, skip_special_tokens=True)

for cap in captions:
    print(cap)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.53.0. You should pass an instance of `Cache` instead, e.g. `past_key_values=DynamicCache.from_legacy_cache(past_key_values)`.


a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
dogs and dogs in grassy area with dog in grassy area 
 girl and girl on a a a a a a a a a a a a a a a a a a a a a a a a a a a


Выводы:

Эта штука на ресурсах коллаба вообще не училась(

По итогу я сначала пытался уменьшить размер датасета (4000 -> 2000)

Не сказать что это сильно помогло, и сам ноутбук падал после первой же эпохи, как следствия я попробовал оставить всего одну эпоху - результаты удручающие

Оставлю как есть, а то тут уже сроки проверки почти подошли, пусть лежит чтоб труды зря не проходили)