In [1]:
!pip show datasets
!pip show bitsandbytes

Name: datasets
Version: 3.5.1
Summary: HuggingFace community-driven open-source library of datasets
Home-page: https://github.com/huggingface/datasets
Author: HuggingFace Inc.
Author-email: thomas@huggingface.co
License: Apache 2.0
Location: /home/bboulbarss/.local/lib/python3.11/site-packages
Requires: aiohttp, dill, filelock, fsspec, huggingface-hub, multiprocess, numpy, packaging, pandas, pyarrow, pyyaml, requests, tqdm, xxhash
Required-by: 
Name: bitsandbytes
Version: 0.45.5
Summary: k-bit optimizers and matrix multiplication routines.
Home-page: 
Author: 
Author-email: Tim Dettmers <dettmers@cs.washington.edu>
License: MIT License

Copyright (c) Facebook, Inc. and its affiliates.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell

In [2]:
import os
import random
from PIL import Image
import tqdm
import numpy as np
import pandas as pd
import csv
import time
from datetime import datetime
import pytz

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torchvision.datasets import ImageFolder
import torchvision.transforms as transforms

from datasets import Dataset, Features, Image as HFImage, Value, Sequence
from peft import LoraConfig, get_peft_model
from transformers import TrainerCallback, Trainer, TrainingArguments, EarlyStoppingCallback
from transformers import IdeficsForVisionText2Text, AutoProcessor, Trainer, TrainingArguments, BitsAndBytesConfig

# Device setup
device = "cuda" if torch.cuda.is_available() else "cpu"

In [3]:
from transformers.utils import logging
logging.set_verbosity_error()

import warnings
warnings.filterwarnings("ignore")

os.environ["WANDB_DISABLED"] = "true"

In [4]:
def create_distractors_single_object(true_label):
    shapes = ['cube', 'sphere', 'cone', 'cylinder']
    colors = ['blue', 'brown', 'cyan', 'gray', 'green', 'purple', 'red', 'yellow']

    all_labels = [f"A photo of a {color} {shape}" for shape in shapes for color in colors]
    all_labels.remove(true_label)

    random_labels = random.sample(all_labels, k=4)

    return random_labels

In [5]:
def create_distractors_two_object(true_labels):
    shapes = ['cube', 'sphere', 'cone', 'cylinder']
    colors = ['blue', 'brown', 'cyan', 'gray', 'green', 'purple', 'red', 'yellow']

    _, _, _, _, color1, shape1 = true_labels[0].split()
    _, _, _, _, color2, shape2 = true_labels[1].split()

    hard_distractors = [f"A photo of a {color1} {shape2}", f"A photo of a {color2} {shape1}"]

    exclude = set(true_labels + tuple(hard_distractors))
    all_labels = [f"A photo of a {color} {shape}" for shape in shapes for color in colors if f"A photo of a {color} {shape}" not in exclude]
    random_labels = random.sample(all_labels, k=4-len(hard_distractors))

    return hard_distractors + random_labels

In [6]:
def create_distractors_relational(true_label):
    shapes = ['cube', 'sphere', 'cone', 'cylinder']
    relations = {'right': 'left', 'left': 'right'}

    true_parts = true_label.split()
    _, _, _, _, true_shape1, true_relation, _, _, true_shape2 = true_parts  # e.g., 'a', 'photo, 'of', 'a', 'sphere', 'right', 'of', 'a', 'cube'

    # Define hard distractors
    # 1. Shape-swapped: Swap true_shape1 and true_shape2
    shape_swapped = f"A photo of a {true_shape2} {true_relation} of a {true_shape1}"
    # 2. Relation-swapped: Use opposite relation
    relation_swapped = f"A photo of a {true_shape1} {relations[true_relation]} of a {true_shape2}"

    hard_distractors = [shape_swapped, relation_swapped]

    # Generate all possible labels
    all_labels = [f"A photo of a {shape} {rel} of a {other_shape}"
                  for shape in shapes
                  for rel in relations
                  for other_shape in shapes if other_shape != shape]

    # Define the inverse label (already true and must be excluded)
    inverse_label = f" A photo of a {true_shape2} {relations[true_relation]} of a {true_shape1}"

    # Filter out true label, inverse label, and ensure hard distractors are unique
    exclude = {true_label, inverse_label}
    filtered_labels = [label for label in all_labels if label not in exclude]

    # Sample random distractors, excluding hard distractors if they’re already in filtered_labels
    random_labels = random.sample([label for label in filtered_labels if label not in hard_distractors], k=4-len(hard_distractors))

    return hard_distractors + random_labels

In [7]:
class CustomDataset(ImageFolder):
    def __init__(self, root, dataset_name, transform=None):
        super().__init__(root, transform=transform)
        self.dataset_name = dataset_name

    def find_classes(self, directory):
        classes = [d.name for d in os.scandir(directory) if d.is_dir() and not d.name.startswith('.')]
        classes.sort()
        if not classes:
            raise FileNotFoundError(f"Couldn't find any valid class folders in {directory}")
        class_to_idx = {cls_name: i for i, cls_name in enumerate(classes)}
        return classes, class_to_idx

    def __getitem__(self, index):
        path, target = self.samples[index]
        image = self.loader(path)  # Load as PIL Image
        correct_label = self.classes[target]

        if self.dataset_name == 'single_object':
            parts = correct_label.split('_')
            correct_label = f'A photo of a {parts[0]} {parts[1]}'
            labels_list = [correct_label] + create_distractors_single_object(correct_label)
        elif self.dataset_name == 'two_object':
            parts = correct_label.split('_')
            correct_label = f"A photo of a {parts[0]} {parts[1]}"
            filler_label = f"A photo of a {parts[2]} {parts[3]}"
            labels_list = [correct_label] + create_distractors_two_object((correct_label, filler_label))
        elif self.dataset_name == 'relational':
            parts = correct_label.split('_')
            correct_label = f"A photo of a {parts[0]} {parts [1]} of a {parts[2]}"
            labels_list = [correct_label] + create_distractors_relational(correct_label)

        random.shuffle(labels_list)
        correct_index = labels_list.index(correct_label)

        return image, labels_list, correct_index

In [8]:
# Custom data collator to filter unexpected keys
class CustomDataCollator:
    def __init__(self, processor):
        self.processor = processor

    def __call__(self, examples):
        batch = {}
        # Include image_attention_mask as a valid key
        valid_keys = ["input_ids", "attention_mask", "pixel_values", "image_attention_mask", "labels"]
        for key in valid_keys:
            if key in examples[0]:
                batch[key] = torch.stack([example[key] for example in examples])
        return batch

In [9]:
# Define image transform
def convert_to_rgb(image):
    if image.mode == "RGB":
        return image
    image_rgba = image.convert("RGBA")
    background = Image.new("RGBA", image_rgba.size, (255, 255, 255))
    alpha_composite = Image.alpha_composite(background, image_rgba)
    return alpha_composite.convert("RGB")

In [10]:
# def train_and_evaluate(dataset_name, base_path='/content/drive/MyDrive/thesis_small_dataset', seed=42):
dataset_name = 'relational' # 'single_object' 'two_object', 'relational'
base_path = '/home/bboulbarss/large_dataset'
seed = 42

lora_params = (16, 32) #  (8, 16), (16, 32)
lr = 1e-6 #  1e-6, 1e-5

torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
np.random.seed(seed)
random.seed(seed)

# Define paths and dataset
train_root = os.path.join(base_path, dataset_name, 'train')
val_root = os.path.join(base_path, dataset_name, 'ood_val')

for dir_name in os.listdir(train_root):
    if ' ' in dir_name:
        new_name = dir_name.replace(' ', '_')
        os.rename(
            os.path.join(train_root, dir_name),
            os.path.join(train_root, new_name)
        )
for dir_name in os.listdir(val_root):
    if ' ' in dir_name:
        new_name = dir_name.replace(' ', '_')
        os.rename(
            os.path.join(val_root, dir_name),
            os.path.join(val_root, new_name)
        )


train_dataset = CustomDataset(root=train_root, dataset_name=dataset_name, transform=None)
val_dataset = CustomDataset(root=val_root, dataset_name=dataset_name, transform=None)

print('checkpoint 1')

# Convert to Hugging Face datasets
train_data = []
for image, labels_list, correct_index in train_dataset:
    train_data.append({
        'image': image,
        'labels_list': labels_list,
        'correct_index': correct_index
    })

print('checkpoint 2')

val_data = []
if val_dataset:
    for image, labels_list, correct_index in val_dataset:
        val_data.append({
            'image': image,
            'labels_list': labels_list,
            'correct_index': correct_index
        })

features = Features({
    'image': HFImage(),
    'labels_list': Sequence(Value('string')),
    'correct_index': Value('int64'),
})

print('checkpoint 3')

train_ds = Dataset.from_list(train_data, features=features)
eval_ds = Dataset.from_list(val_data, features=features)

checkpoint 1
checkpoint 2
checkpoint 3


In [11]:
# Load model and processor
checkpoint = "HuggingFaceM4/idefics-9b-instruct"
# checkpoint = "HuggingFaceM4/tiny-random-idefics"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    llm_int8_skip_modules=["lm_head", "embed_tokens"],
)
processor = AutoProcessor.from_pretrained(checkpoint)
model = IdeficsForVisionText2Text.from_pretrained(checkpoint, quantization_config=bnb_config, device_map="auto")

# LoRA configuration
config = LoraConfig(
    r=lora_params[0],
    lora_alpha=lora_params[1],
    target_modules="all-linear",
    lora_dropout=0.5,
    bias="none",
)
model = get_peft_model(model, config)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [12]:
image_size = processor.image_processor.image_size
image_mean = processor.image_processor.image_mean
image_std = processor.image_processor.image_std
image_transform = transforms.Compose([
    convert_to_rgb,
    transforms.RandomResizedCrop((image_size, image_size), scale=(0.9, 1.0), interpolation=transforms.InterpolationMode.BICUBIC),
    transforms.ToTensor(),
    transforms.Normalize(mean=image_mean, std=image_std),
])

# Define transformation function with debugging
def my_ds_transforms(example_batch):
    images = example_batch['image']
    labels_lists = example_batch['labels_list']
    correct_indices = example_batch['correct_index']
    prompts = []
    for i in range(len(images)):
        labels_list = labels_lists[i]
        correct_index = correct_indices[i]
        letters = ['A', 'B', 'C', 'D', 'E']
        correct_letter = letters[correct_index]
        choices_text = "\n".join([f"{letters[j]}. {labels_list[j]}" for j in range(5)])
        text = (f"Task: Identify the correct label for this image from the following choices:\n{choices_text}\n"
                f"Answer with the letter of the correct choice.\nAssistant: {correct_letter}")
        prompts.append(["User:", images[i], text])

    # Process prompts with explicit return_dict
    inputs = processor(prompts, transform=image_transform, return_tensors="pt", return_dict=True).to(device)
    inputs["labels"] = inputs["input_ids"]

    # Debug: Inspect image_attention_mask
    # print("Keys in inputs:", list(inputs.keys()))
    if "image_attention_mask" in inputs:
        # print("image_attention_mask value:", inputs["image_attention_mask"])
        if inputs["image_attention_mask"] is None:
            # Manually create image_attention_mask
            batch_size = inputs["input_ids"].shape[0]
            seq_length = inputs["input_ids"].shape[1]
            inputs["image_attention_mask"] = torch.ones((batch_size, seq_length), dtype=torch.long, device=device)
            print("Created manual image_attention_mask with shape:", inputs["image_attention_mask"].shape)

    return inputs

train_ds.set_transform(my_ds_transforms)
eval_ds.set_transform(my_ds_transforms)
# model.print_trainable_parameters()

timestamp = datetime.now(pytz.timezone('Europe/Amsterdam')).strftime('%Y%m%d_%H%M%S')

training_args = TrainingArguments(
    output_dir=f"/home/bboulbarss/finetuned_models/idefics/idefics-{dataset_name}-{timestamp}-TRAININGSAVE",
    learning_rate=lr,
    fp16=True,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=16,
    dataloader_pin_memory=False,
    save_total_limit=3,
    eval_strategy="epoch",     # Fixed typo
    save_strategy="epoch",
    save_steps=40,
    eval_steps=20,
    logging_steps=20,
    num_train_epochs=15,
    remove_unused_columns=False,
    push_to_hub=False,
    label_names=["labels"],
    load_best_model_at_end=True,
    report_to=None,
    optim="paged_adamw_8bit",
    metric_for_best_model="eval_loss",   # Track eval_loss
    greater_is_better=False              # Lower eval_loss is better
)


# Initialize trainer with custom collator and early stopping
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    data_collator=CustomDataCollator(processor),
    # compute_metrics=compute_metrics,  # Only needed if using custom metrics
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

print('training')

# Train
trainer.train()

# Save LoRA adapter weights and processor for the best model
model_save_dir = '/home/bboulbarss/finetuned_models/idefics'
os.makedirs(model_save_dir, exist_ok=True)

best_model_path = os.path.join(model_save_dir, f'idefics_lora_best_{dataset_name}_{seed}_{timestamp}_smalldataset_{lora_params}_{lr}')
best_processor_path = os.path.join(model_save_dir, f'idefics_processor_best_{dataset_name}_{seed}_{timestamp}_smalldataset_{lora_params}_{lr}')

model.save_pretrained(best_model_path)
processor.save_pretrained(best_processor_path)

print(f"Best LoRA adapter saved to: {best_model_path}")
print(f"Processor saved to: {best_processor_path}")

training
{'eval_loss': 2.0477490425109863, 'eval_runtime': 14.3269, 'eval_samples_per_second': 17.45, 'eval_steps_per_second': 4.397, 'epoch': 0.8727272727272727}
{'eval_loss': 2.0178675651550293, 'eval_runtime': 14.3207, 'eval_samples_per_second': 17.457, 'eval_steps_per_second': 4.399, 'epoch': 1.8727272727272726}
{'eval_loss': 1.979794979095459, 'eval_runtime': 14.3011, 'eval_samples_per_second': 17.481, 'eval_steps_per_second': 4.405, 'epoch': 2.8727272727272726}
{'loss': 2.1689, 'grad_norm': 4.454567909240723, 'learning_rate': 8.11111111111111e-07, 'epoch': 3.290909090909091}
{'eval_loss': 1.9368896484375, 'eval_runtime': 14.323, 'eval_samples_per_second': 17.454, 'eval_steps_per_second': 4.399, 'epoch': 3.8727272727272726}
{'eval_loss': 1.8923307657241821, 'eval_runtime': 14.3025, 'eval_samples_per_second': 17.479, 'eval_steps_per_second': 4.405, 'epoch': 4.872727272727273}
{'eval_loss': 1.849433183670044, 'eval_runtime': 14.2562, 'eval_samples_per_second': 17.536, 'eval_steps_pe

In [13]:
# datasets = ['two_object'] #'single_object', 'relational',
# for dataset in datasets:
#     print(f"\nTraining on {dataset}")
#     train_and_evaluate(dataset, seed=42)