In [18]:
import numpy as np
import torch
from datasets import load_dataset
from PIL import Image
import sys
import cv2
from torchvision.io import ImageReadMode, read_image
from transformers import (
    Trainer,
    TrainingArguments,
    SiglipModel,
    AutoTokenizer,
    AutoImageProcessor,
    AutoProcessor,
    EvalPrediction
)
from modeling_siglip import SiglipModel
import albumentations as A
from albumentations.pytorch import ToTensorV2

In [2]:
# Checkpoint directory that both the model weights and the processor are loaded from.
pretrained_model_path = "siglip-finetune-3090-ep10-0.874"

# Load the (locally patched) SiglipModel and move it onto the GPU.
model = SiglipModel.from_pretrained(pretrained_model_path)
model = model.to('cuda')
config = model.config

# A single AutoProcessor is loaded instead of a separate AutoTokenizer / AutoImageProcessor;
# later cells reach the parts through `processor.tokenizer` and `processor.image_processor`.
processor = AutoProcessor.from_pretrained(pretrained_model_path)

In [3]:
def get_ds():
    """Load ``til_siglip_ds.json`` from the current directory via ``load_dataset``.

    A train/val split keyed on the image id used to live here: file names look like
    ``image_1234_1.jpg``, ``1234`` is the image id, ids below 4086 went to train and the
    rest to val. That split is currently disabled, so the loaded dataset is returned
    unsplit and callers index it with ``['train']`` themselves.

    Returns:
        The object produced by ``load_dataset`` for the single JSON data file.
    """
    # If error about missing name, use datasets==2.15.0
    return load_dataset(path='.', data_files="til_siglip_ds.json")

# The train/val split is disabled, so everything returned by get_ds() is used for training.
# Two-way unpacking kept for reference in case the split is re-enabled:
# train_dataset, val_dataset = get_ds()
train_dataset = get_ds()

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [279]:
class Transform(torch.nn.Module):
    """Training-time augmentation and normalisation pipeline built on albumentations.

    The pipeline resizes to ``image_size x image_size``, applies a fixed sequence of
    noise / blur / geometric / photometric / compression augmentations, normalises with
    ``mean`` and ``std`` and finally converts the result to a CHW torch tensor.

    Args:
        image_size: Target height and width passed to ``A.Resize``.
        mean: Per-channel mean passed to ``A.Normalize``.
        std: Per-channel standard deviation passed to ``A.Normalize``.
    """

    def __init__(self, image_size, mean, std):
        super().__init__()
        self.albu_transforms = A.Compose([
            A.Resize(image_size, image_size, interpolation=cv2.INTER_LANCZOS4),
            A.GaussNoise(var_limit=(500, 5000), p=1.0, per_channel=True),
            A.ISONoise(p=1.0, color_shift=(0.02, 0.07)),
            A.MultiplicativeNoise(p=1.0),
            A.AdvancedBlur(blur_limit=(3, 11), p=0.3),
            A.Flip(p=0.5),
            A.RandomRotate90(p=0.5),
            A.CLAHE(p=0.2),
            A.RandomBrightnessContrast(brightness_limit=0.2, contrast_limit=0.5, p=0.5),
            A.RandomGamma(p=0.2),
            A.Perspective(p=0.5),
            A.ImageCompression(quality_range=(25, 75), p=0.8),
            A.Normalize(mean=mean, std=std),
            ToTensorV2()  # CHW
        ])

    @staticmethod
    def _to_hwc_uint8(x) -> np.ndarray:
        """Return `x` as a C-contiguous ``uint8`` numpy array in HWC layout.

        * ``torch.Tensor`` inputs are treated as CHW (what ``read_image`` is used for in
          this notebook) and are permuted to HWC.
        * Anything else (numpy arrays, PIL images, ...) goes through ``np.asarray`` and is
          assumed to be HWC already.
        """
        if isinstance(x, torch.Tensor):
            # detach/cpu so `.numpy()` works for any tensor; permute turns CHW into HWC.
            x = x.detach().cpu().permute(1, 2, 0).numpy()
        else:
            x = np.asarray(x)
        # albumentations only accepts real numpy arrays; a contiguous copy also avoids
        # handing the strided view produced by `permute` to the OpenCV-backed transforms.
        return np.ascontiguousarray(x, dtype=np.uint8)

    def forward(self, x) -> torch.Tensor:
        """Augment one image and return it as a float tensor.

        Args:
            x: A CHW ``torch.Tensor``, or an HWC numpy array / PIL image. Values are cast
                to ``uint8`` before augmentation.

        Returns:
            The output of the albumentations pipeline converted with ``.float()``.
        """
        with torch.no_grad():
            x = self._to_hwc_uint8(x)
            x = self.albu_transforms(image=x)['image']
            x = x.float()
        return x

# Build the image augmentation pipeline used when preprocessing the dataset.
# Its size and normalisation statistics come from the loaded model config and processor.
# Note: the earlier `torch.jit.script(image_transformations)` step is intentionally left disabled.
image_transformations = Transform(
    image_size=config.vision_config.image_size,
    mean=processor.image_processor.image_mean,
    std=processor.image_processor.image_std,
)

In [8]:
def preprocess_dataset(dataset, image_dir='til_siglip_ds/'):
    """Tokenize captions eagerly and attach an on-the-fly image transform.

    Args:
        dataset: Mapping with a ``'train'`` entry whose value exposes ``column_names``,
            ``map`` and ``set_transform`` (the object returned by ``get_ds``). It must
            contain an ``"image"`` column of file names and a ``"label"`` column of captions.
        image_dir: Directory the file names in the ``"image"`` column are relative to.
            Defaults to the previously hard-coded ``'til_siglip_ds/'``.

    Returns:
        The ``'train'`` split with an ``input_ids`` column added, every original column
        except ``"image"`` removed, and a transform installed that adds ``pixel_values``
        whenever rows are accessed.
    """
    import os

    data = dataset['train']
    column_names = data.column_names

    # Column names for input/target.
    image_column = "image"
    caption_column = "label"

    def tokenize_captions(examples):
        """Add fixed-length ``input_ids`` for a batch of captions."""
        captions = list(examples[caption_column])
        # Pad AND truncate to the tokenizer's maximum length: without truncation a single
        # over-long caption yields a longer row, and `collate_fn` can no longer build a
        # rectangular `input_ids` tensor.
        text_inputs = processor.tokenizer(captions, padding="max_length", truncation=True)
        examples["input_ids"] = text_inputs.input_ids
        return examples

    data = data.map(
        function=tokenize_captions,
        batched=True,
        remove_columns=[col for col in column_names if col != image_column],
    )

    def transform_images(examples):
        """Read each image file as RGB and run it through ``image_transformations``."""
        images = [
            read_image(os.path.join(image_dir, image_file), mode=ImageReadMode.RGB)
            for image_file in examples[image_column]
        ]
        examples["pixel_values"] = [image_transformations(image) for image in images]
        return examples

    # Transform images on the fly as doing it on the whole dataset takes too much time.
    data.set_transform(transform_images)
    return data

In [9]:
train_dataset = preprocess_dataset(train_dataset)
# val_dataset = preprocess_dataset(val_dataset)

Map:   0%|          | 0/27913 [00:00<?, ? examples/s]

In [10]:
def collate_fn(examples):
    """Batch a list of preprocessed examples for the model.

    Args:
        examples: Sequence of dicts, each holding a ``"pixel_values"`` tensor and an
            ``"input_ids"`` list of token ids.

    Returns:
        Dict with stacked ``pixel_values``, a ``torch.long`` ``input_ids`` tensor and
        ``return_loss=True``.
    """
    images = []
    token_rows = []
    for sample in examples:
        images.append(sample["pixel_values"])
        token_rows.append(sample["input_ids"])

    batch = {
        "pixel_values": torch.stack(images),
        "input_ids": torch.tensor(token_rows, dtype=torch.long),
    }
    batch["return_loss"] = True
    return batch

In [11]:
from sklearn.metrics import f1_score
def compute_metrics(pred: EvalPrediction):
    """Compute accuracy and macro-averaged F1 for an evaluation pass.

    Args:
        pred: Object exposing ``predictions`` and ``label_ids``.
            NOTE(review): the element-wise comparison below presumes ``predictions``
            already holds class labels comparable to ``label_ids`` - confirm before
            enabling evaluation.

    Returns:
        Dict with keys ``"accuracy"`` and ``'F1'``.
    """
    y_pred, y_true = pred.predictions, pred.label_ids
    accuracy = (y_pred == y_true).mean().item()
    macro_f1 = f1_score(y_true, y_pred, average='macro')
    return {"accuracy": accuracy, 'F1': macro_f1}

In [None]:
# SigLIP loss from https://github.com/google-research/big_vision/blob/01edb81a4716f93a48be43b3a4af14e29cdb3a7f/big_vision/trainers/proj/image_text/siglip.py#L287
# To add into https://github.com/huggingface/transformers/blob/bdb9106f247fca48a71eb384be25dbbd29b065a8/src/transformers/models/siglip/modeling_siglip.py#L1230
# as:
#     if return_loss:
#         loss = siglip_loss(logits_per_text)
def siglip_loss(logits_per_text):
    """Pairwise sigmoid loss over a square text-by-image logit matrix.

    Wrapped in a function so this cell can be executed in the notebook; the bare snippet
    referenced `return_loss` and `logits_per_text`, which only exist inside the model's
    forward pass.

    Args:
        logits_per_text: Square 2-D tensor of logits; entry ``[i, i]`` is treated as the
            matching pair and every other entry as a non-matching pair.

    Returns:
        Scalar tensor: the negative log-likelihood summed over the last dimension and
        averaged over rows.
    """
    eye = torch.eye(logits_per_text.size(0), device=logits_per_text.device)
    # +1 on the diagonal (matching pairs), -1 everywhere else.
    m1_diag1 = -torch.ones_like(logits_per_text) + 2 * eye
    loglik = torch.nn.functional.logsigmoid(m1_diag1 * logits_per_text)
    nll = -torch.sum(loglik, dim=-1)
    return nll.mean()

In [12]:
# Ask PyTorch's CUDA caching allocator to release cached memory it is not currently using,
# so it is free again before the Trainer is built in the next cell.
torch.cuda.empty_cache()

In [13]:
training_args = TrainingArguments(
    # --- output / bookkeeping ---
    output_dir="siglip-finetune",
    resume_from_checkpoint=pretrained_model_path,
    report_to='none',
    logging_strategy="steps",
    logging_steps=10,
    save_strategy="epoch",  # epoch have bug
    save_total_limit=5,
    # Evaluation is switched off (eval bugged causing oom), so these stay disabled:
    #   eval_strategy="epoch", load_best_model_at_end=True,
    #   metric_for_best_model='F1', greater_is_better=True

    # --- batching ---
    per_device_train_batch_size=2,
    gradient_accumulation_steps=16,
    remove_unused_columns=False,

    # --- optimiser / schedule ---
    optim='adamw_torch_fused',  # alternative tried: 'adafactor'
    learning_rate=2e-5,
    weight_decay=1e-4,
    adam_beta1=0.9,
    adam_beta2=0.99,  # decrease from 0.999
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    num_train_epochs=20,

    # --- precision: bf16 where the GPU supports it, fp16 otherwise ---
    bf16=torch.cuda.is_bf16_supported(),
    bf16_full_eval=torch.cuda.is_bf16_supported(),
    fp16=not torch.cuda.is_bf16_supported(),
    fp16_full_eval=not torch.cuda.is_bf16_supported(),
    tf32=True,

    # --- memory / speed ---
    gradient_checkpointing=True,
    torch_compile=(sys.platform == 'linux'),
)

# No eval_dataset is passed (eval_dataset=val_dataset is disabled together with the split).
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collate_fn,
    train_dataset=train_dataset,
    compute_metrics=compute_metrics,
)

train_result = trainer.train()

TypeError: image must be numpy array type

In [11]:
# Save the trainer's current model under "siglip-finetune-ep2".
# NOTE(review): this cell's execution count is out of order relative to the training cell
# above - confirm which training run the saved directory actually corresponds to.
trainer.save_model("siglip-finetune-ep2")