In [1]:
import torch
from PIL import Image
import pandas as pd
import os
import random
import csv
import datetime
from pathlib import Path
from zoneinfo import ZoneInfo
from tqdm import tqdm

from peft import PeftModel

from transformers.utils import logging
logging.set_verbosity_error()

device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

cuda


## Predict functions for the models

In [2]:
def clip_predict_label(image_path, labels, model, processor):
    # Load and preprocess the image
    image = Image.open(image_path)

    # Process the inputs
    inputs = processor(text=labels, images=image, return_tensors="pt", padding=True).to(device)

    # Get model outputs
    with torch.no_grad():
        outputs = model(**inputs)

    # Extract the image-text similarity scores
    logits_per_image = outputs.logits_per_image
    # Convert logits to probabilities
    probs = logits_per_image.softmax(dim=1)

    # Get the predicted label
    predicted_label_idx = probs.argmax().item()
    predicted_label = labels[predicted_label_idx]

    return predicted_label, probs.cpu().numpy()

In [3]:
def flava_predict_label(image_path, texts, model, processor):
    # Load and preprocess the image
    image = Image.open(image_path).convert('RGB')

    # Replicate the image to match the number of text inputs (required for FLAVA)
    images = [image] * len(texts)

    # Process the inputs
    inputs = processor(text=texts, images=images, return_tensors="pt", padding=True, max_length=77, return_codebook_pixels=True, return_image_mask=True, return_attention_mask=True).to(device)

    # Get model outputs without gradient tracking for efficiency
    with torch.no_grad():
        output = model(**inputs)

    # Extract the image-text similarity scores
    logits_per_image = output.contrastive_logits_per_image
    # Convert logits to probabilities
    probs = logits_per_image.softmax(dim=1)[0].unsqueeze(0)

    # Get the predicted label
    predicted_label_idx = probs.argmax().item()
    predicted_label = texts[predicted_label_idx]

    return predicted_label, probs.cpu().numpy()

In [4]:
def vilt_predict_label(image_path, texts, model, processor):
    # Load and preprocess the image
    image = Image.open(image_path).convert('RGB')

    # Replicate the image to match the number of text inputs (required for ViLT)
    images = [image] * len(texts)

    # Process the inputs
    inputs = processor(images, texts, return_tensors="pt", padding=True).to(device)

    # Get model outputs without gradient tracking for efficiency
    with torch.no_grad():
        output = model(**inputs)

    # Extract the image-text similarity scores
    logits_per_image = output.logits
    # Convert logits to probabilities
    probs = logits_per_image.T.softmax(dim=1)

    # Get the predicted label
    predicted_label_idx = probs.argmax().item()
    predicted_label = texts[predicted_label_idx]

    return predicted_label, probs.cpu().numpy()

In [5]:
def idefics_predict_label(image_path, texts, model, processor):
    image = Image.open(image_path).convert("RGB")
    
    # Shuffle the texts list to randomize answer position
    shuffled_texts = texts.copy()  # Create a copy to avoid modifying original list
    random.shuffle(shuffled_texts)
    
    # Dynamically generate question string based on shuffled texts
    question_string = "Task: Identify the correct label for this image from the following choices:\n" + "\n".join(
        [f"{chr(65+i)}. {text}" for i, text in enumerate(shuffled_texts)]
    ) + "\nAnswer with the letter of the correct choice.\nAssistant:"

    # Prepare the input prompt
    inputs = processor(["User:", image, question_string], return_tensors="pt").to(device)

    # Generate response
    with torch.no_grad():
        generated_ids = model.generate(**inputs, max_new_tokens=8)

    # Decode and return result
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
    return generated_text, '_'

In [6]:
def ovis_predict_label(image_path, texts, model, processor):
    # Load and prepare the image
    text_tokenizer, visual_tokenizer = processor

    image = Image.open(image_path).convert("RGB")
    images = [image]

    # Shuffle the texts to randomize answer position
    shuffled_texts = texts.copy()
    random.shuffle(shuffled_texts)

    # Generate the question string with choices
    question_string = "Task: Identify the correct label for this image from the following choices:\n" + "\n".join(
        [f"{chr(65+i)}. {text}" for i, text in enumerate(shuffled_texts)]
    ) + "\nAnswer with the letter of the correct choice."
    query = f'<image>\n{question_string}'

    # Preprocess inputs for the Ovis model
    max_partition = 9
    prompt, input_ids, pixel_values = model.preprocess_inputs(query, images, max_partition=max_partition)
    attention_mask = torch.ne(input_ids, text_tokenizer.pad_token_id)

    # Prepare inputs for generation
    input_ids = input_ids.unsqueeze(0).to(device=model.device)
    attention_mask = attention_mask.unsqueeze(0).to(device=model.device)
    if pixel_values is not None:
        pixel_values = pixel_values.to(dtype=visual_tokenizer.dtype, device=visual_tokenizer.device)
        pixel_values = [pixel_values]

    # Define generation parameters
    gen_kwargs = {
        "max_new_tokens": 20,
        "do_sample": False,
        "eos_token_id": model.generation_config.eos_token_id,
        "pad_token_id": text_tokenizer.pad_token_id,
        "use_cache": True
    }

    # Generate the output
    with torch.inference_mode():
        output_ids = model.generate(
            input_ids,
            pixel_values=pixel_values,
            attention_mask=attention_mask,
            **gen_kwargs
        )[0]

    generated_ids = output_ids
    generated_text = text_tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

    # Construct the full sequence by appending generated text to the original query
    full_sequence = query + generated_text

    # Return the result in a format consistent with the Idefics function
    return full_sequence, '_'

## Functions to create distractor labels

In [7]:
def create_distractors_single_object(true_label):
    shapes = ['cube', 'sphere', 'cone', 'cylinder']
    colors = ['blue', 'brown', 'cyan', 'gray', 'green', 'purple', 'red', 'yellow']

    all_labels = [f"A photo of a {color} {shape}" for shape in shapes for color in colors]
    all_labels.remove(true_label)

    random_labels = random.sample(all_labels, k=K)

    return random_labels

In [8]:
def create_distractors_two_object(true_labels):
    shapes = ['cube', 'sphere', 'cone', 'cylinder']
    colors = ['blue', 'brown', 'cyan', 'gray', 'green', 'purple', 'red', 'yellow']

    _, _, _, _, color1, shape1 = true_labels[0].split()
    _, _, _, _, color2, shape2 = true_labels[1].split()

    hard_distractors = [f"A photo of a {color1} {shape2}", f"A photo of a {color2} {shape1}"]

    exclude = set(true_labels + tuple(hard_distractors))
    all_labels = [f"A photo of a {color} {shape}" for shape in shapes for color in colors if f"A photo of a {color} {shape}" not in exclude]
    random_labels = random.sample(all_labels, k=K-len(hard_distractors))

    return hard_distractors + random_labels

In [9]:
def create_distractors_relational(true_label):
    shapes = ['cube', 'sphere', 'cone', 'cylinder']
    relations = {'right': 'left', 'left': 'right'}

    true_parts = true_label.split()
    _, _, _, _, true_shape1, true_relation, _, _, true_shape2 = true_parts  # e.g., 'a', 'photo, 'of', 'a', 'sphere', 'right', 'of', 'a', 'cube'

    # Define hard distractors
    # 1. Shape-swapped: Swap true_shape1 and true_shape2
    shape_swapped = f"A photo of a {true_shape2} {true_relation} of a {true_shape1}"
    # 2. Relation-swapped: Use opposite relation
    relation_swapped = f"A photo of a {true_shape1} {relations[true_relation]} of a {true_shape2}"

    hard_distractors = [shape_swapped, relation_swapped]

    # Generate all possible labels
    all_labels = [f"A photo of a {shape} {rel} of a {other_shape}"
                  for shape in shapes
                  for rel in relations
                  for other_shape in shapes if other_shape != shape]

    # Define the inverse label (already true and must be excluded)
    inverse_label = f" A photo of a {true_shape2} {relations[true_relation]} of a {true_shape1}"

    # Filter out true label, inverse label, and ensure hard distractors are unique
    exclude = {true_label, inverse_label}
    filtered_labels = [label for label in all_labels if label not in exclude]

    # Sample random distractors, excluding hard distractors if they’re already in filtered_labels
    random_labels = random.sample([label for label in filtered_labels if label not in hard_distractors], k=4-len(hard_distractors))

    return hard_distractors + random_labels

## Function to count number of images to predict

In [10]:
def count_files_in_subdirs(directory, dataset):
    root_dir = Path(directory)

    subdir_counts = {}
    total_files = 0

    for subdir in root_dir.iterdir():
        if subdir.is_dir():
            if dataset=='two_object':
                file_count = 0
                for sub_subdir in subdir.iterdir():
                    if sub_subdir.is_dir():
                        file_count += sum(1 for item in sub_subdir.iterdir() if item.is_file())
                subdir_counts[subdir.name] = file_count
                total_files += file_count
            elif dataset=='relational':
                file_count = 0
                for sub_subdir in subdir.iterdir():
                    if sub_subdir.is_dir():
                        file_count += sum(1 for item in sub_subdir.iterdir() if item.is_file())
                subdir_counts[subdir.name] = file_count
                total_files += file_count
                
            else:
                file_count = sum(1 for item in subdir.iterdir() if item.is_file())
                subdir_counts[subdir.name] = file_count
                total_files += file_count

    return total_files, subdir_counts

## Main experiment loop

In [11]:
google_path_data = '/content/drive/MyDrive/thesis_small_dataset'
google_path_experiment = '/content/drive/MyDrive/thesis_experiment_data'

snellius_path_data = '/home/bboulbarss/large_dataset'
snellius_path_experiment = '/home/bboulbarss/large_experiment_data'

In [12]:
def experiment(seed, model_name, dataset, split, model, processor):
    # Define the CSV file name
    now = datetime.datetime.now(ZoneInfo("Europe/Amsterdam"))
    formatted_time = now.strftime('%d-%m-%Y_%H-%M-%S')
    filename = f"{snellius_path_experiment}/{dataset}/{model_name}/output_{formatted_time}_{model_name}_seed_{seed}_{split}.csv"

    if model_name == 'idefics' or model_name == 'ovis' or model_name == 'idefics-ft':
        full_list = ["image_path", "true_label", "completed_prompt"]
    else:
        base_list = ["image_path", "true_label", "predicted_label", "is_correct"]
        dynamic_list = [item for i in range(1, K+2) for item in (f"label{i}", f"prob{i}")]
        full_list = base_list + dynamic_list

    # Create and write the header (only once, at the start)
    with open(filename, mode="w", newline="") as file:
        writer = csv.writer(file)
        writer.writerow(full_list)

    # Initialize variables for evaluation
    total_correct = 0
    total_images = 0

    directory = f'{snellius_path_data}/{dataset}/{split}' 
    # I changed all the names of the directories manually to lower case for consistency

    directories = os.listdir(directory)
    total_images_count, _ = count_files_in_subdirs(directory, dataset)
    print(f"Total images: {total_images_count}")

    # Filter out hidden directories before the loop
    directories = [d for d in directories if not d.startswith('.')]
    print(f"Total directories: {len(directories)}")
    print(f"Images per directory: {int(total_images_count/len(directories))}")

    # Iterate over each directory with a progress bar
    for f in tqdm(directories, desc="Processing directories"):
        if dataset == 'single_object':
            # f = 'blue cube' for example
            true_label = f'A photo of a {f}'

        elif dataset == 'two_object':
            parts = f.split('_')
            true_label = f"{parts[0]} {parts[1]}"
            filler_label = f"{parts[2]} {parts[3]}"
            f += f'/{true_label}'
            true_label = f'A photo of a {parts[0]} {parts[1]}'
            filler_label = f'A photo of a {parts[2]} {parts[3]}'

        elif dataset == 'relational':
            parts = f.split('_')
            directory_name = f'{parts[0]} {parts[1]} {parts[2]}'
            f += f'/{directory_name}'
            true_label = f"A photo of a {parts[0]} {parts [1]} of a {parts[2]}"

        dir = f'{snellius_path_data}/{dataset}/{split}/{f}'
        images_paths = os.listdir(dir)

        for image in images_paths:
            total_images += 1
            full_image_path = dir + '/' + image

            # Generate distractor labels
            if dataset == 'two_object':
                distractor_labels = function_map[dataset]((true_label, filler_label))
            else:
                distractor_labels = function_map[dataset](true_label)

            # Combine true label and distractors
            all_labels = [true_label] + distractor_labels

            # Predict the label
            predicted_label, label_probs = function_map[model_name](full_image_path, all_labels, model, processor)

            if model_name == 'idefics' or model_name == 'ovis' or model_name == 'idefics-ft':
                # Define print functions
                def print_details():
                    print(f"Image: {full_image_path}")
                    print(f"True Label: {true_label}")
                    print(f"Completed Prompt: {predicted_label}")
                    print("-" * 40)

                # Single toggle for all printing
                if PRINT_ALL:
                    print_details()

                # Gather all the data for logging
                data = [full_image_path, true_label, predicted_label]

                with open(filename, mode="a", newline="") as file:  # "a" mode appends data
                    writer = csv.writer(file)
                    writer.writerow(data)

            else:
                # Define print functions
                def print_details():
                    print(f"\nImage: {full_image_path}")
                    for label, prob in zip(all_labels, label_probs[0]):
                        print(f"Label: {label}, Score: {prob:.4f}")
                    print(f"Predicted Label: {predicted_label}")
                    print(f"Correct Prediction: {'Yes' if is_correct else 'No'}")
                    print("-" * 40)

                # Check if the prediction is correct
                is_correct = predicted_label == true_label
                if is_correct:
                    total_correct += 1

                # Single toggle for all printing
                if PRINT_ALL:
                    print_details()

                # Gather all the data for logging
                data = [full_image_path, true_label, predicted_label, is_correct]

                for label, prob in zip(all_labels, label_probs[0]):
                    data.append(label)
                    data.append(prob)

                data = [str(item) for item in data]

                with open(filename, mode="a", newline="") as file:  # "a" mode appends data
                    writer = csv.writer(file)
                    writer.writerow(data)

    # Calculate overall accuracy
    overall_accuracy = total_correct / total_images
    print()
    print(f"Overall Accuracy: {overall_accuracy:.4f}")
    print(f"Model: {model_name}")
    print(f"Dataset: {dataset}")
    print(f"seed: {seed}")
    print('-------------------------------------------')

In [13]:
function_map = {
    'single_object': create_distractors_single_object,
    'two_object': create_distractors_two_object,
    'relational': create_distractors_relational,

    'clip': clip_predict_label,
    'clip-ft': clip_predict_label,
    'flava': flava_predict_label,
    'flava-ft': flava_predict_label,
    'vilt': vilt_predict_label,
    'vilt-ft': vilt_predict_label,
    'idefics': idefics_predict_label,
    'idefics-ft': idefics_predict_label,
    'ovis': ovis_predict_label
}

In [14]:
##################### --- EXPERIMENT SETTINGS --- ######################
split = 'ood_test'       # Options: 'train', 'id_test', 'ood_test', 'id_val', 'ood_val'
model_name = 'vilt-ft'   # Options: 'clip', 'flava', 'vilt', 'idefics' -- 'clip-ft', 'flava-ft', 'vilt-ft', 'idefics-ft'
K = 4                    # Number of distractor labels for each experiment (e.g. if K=3, model will get 4 possible labels. 2 of the K labels will always be hard distractors.)
PRINT_ALL = False        # Whether to print model prediction details for each image
########################################################################

if model_name == 'clip':
    from transformers import CLIPProcessor, CLIPModel
    model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
    processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

elif model_name == 'flava':
    from transformers import FlavaForPreTraining, FlavaProcessor
    model = FlavaForPreTraining.from_pretrained("facebook/flava-full").to(device)
    processor = FlavaProcessor.from_pretrained("facebook/flava-full")

elif model_name == 'vilt':
    from transformers import ViltProcessor, ViltForImageAndTextRetrieval
    model = ViltForImageAndTextRetrieval.from_pretrained("dandelin/vilt-b32-finetuned-coco").to(device)
    processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-coco")

elif model_name == 'idefics':
    from transformers import IdeficsForVisionText2Text, AutoProcessor, BitsAndBytesConfig
    model = IdeficsForVisionText2Text.from_pretrained('HuggingFaceM4/idefics-9b-instruct').to(device)
    processor = AutoProcessor.from_pretrained('HuggingFaceM4/idefics-9b-instruct')
    # model = IdeficsForVisionText2Text.from_pretrained("HuggingFaceM4/tiny-random-idefics", torch_dtype=torch.bfloat16).to(device)
    # processor = AutoProcessor.from_pretrained("HuggingFaceM4/tiny-random-idefics")

elif model_name == 'ovis':
    from transformers import AutoModelForCausalLM, AutoConfig
    # Load and modify the configuration
    config = AutoConfig.from_pretrained(
        "AIDC-AI/Ovis2-8B",
        trust_remote_code=True)
    
    config.llm_attn_implementation = "eager"  # Override to use eager attention
    
    model = AutoModelForCausalLM.from_pretrained(
        "AIDC-AI/Ovis2-8B",
        config=config,  # Pass the modified config
        # torch_dtype=torch.bfloat16,
        multimodal_max_length=32768,
        trust_remote_code=True,
        attn_implementation="eager"  # Explicitly set for safety
    ).cuda()
    
    # Get tokenizers from the model
    text_tokenizer = model.get_text_tokenizer()
    visual_tokenizer = model.get_visual_tokenizer()
    
    processor = text_tokenizer, visual_tokenizer


elif model_name == 'clip-ft':
    from transformers import CLIPProcessor, CLIPModel
    model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
    processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

    # single object
    peft_model_path = '/home/bboulbarss/finetuned_models/clip/clip_lora_best_single_object_42_20250523_104223_(8, 16)_32_1e-06'
    peft_processor_path = '/home/bboulbarss/finetuned_models/clip/clip_processor_best_single_object_42_20250523_104223_(8, 16)_32_1e-06'

    # two object
    #peft_model_path = '/home/bboulbarss/finetuned_models/clip/clip_lora_best_two_object_42_20250505_140611_(8, 16)_32_1e-06'
    #peft_processor_path = '/home/bboulbarss/finetuned_models/clip/clip_processor_best_two_object_42_20250505_140611_(8, 16)_32_1e-06'
    
    # relational
    #peft_model_path = '/home/bboulbarss/finetuned_models/clip/clip_lora_best_relational_42_20250505_140038_(8, 16)_32_1e-05'
    #peft_processor_path = '/home/bboulbarss/finetuned_models/clip/clip_processor_best_relational_42_20250505_140038_(8, 16)_32_1e-05'

    model = PeftModel.from_pretrained(model, peft_model_path, is_trainable=False)
    processor = CLIPProcessor.from_pretrained(peft_processor_path)

elif model_name == 'flava-ft':
    from transformers import FlavaForPreTraining, FlavaProcessor
    model = FlavaForPreTraining.from_pretrained("facebook/flava-full").to(device)
    processor = FlavaProcessor.from_pretrained("facebook/flava-full")

    # single object
    peft_model_path = '/home/bboulbarss/finetuned_models/flava/flava_lora_best_single_object_42_20250523_122954_(8, 16)_32_1e-05'
    peft_processor_path = '/home/bboulbarss/finetuned_models/flava/flava_processor_best_single_object_42_20250523_122954_(8, 16)_32_1e-05'

    # two object
    #peft_model_path = '/home/bboulbarss/finetuned_models/flava/flava_lora_best_two_object_42_20250506_081726_(8, 16)_32_1e-06'
    #peft_processor_path = '/home/bboulbarss/finetuned_models/flava/flava_processor_best_two_object_42_20250506_081726_(8, 16)_32_1e-06'

    # relational
    #peft_model_path = '/home/bboulbarss/finetuned_models/flava/flava_lora_best_relational_42_20250505_143920_(8, 16)_32_1e-06'
    #peft_processor_path = '/home/bboulbarss/finetuned_models/flava/flava_processor_best_relational_42_20250505_143920_(8, 16)_32_1e-06'

    model = PeftModel.from_pretrained(model, peft_model_path, is_trainable=False)
    processor = FlavaProcessor.from_pretrained(peft_processor_path)
    
elif model_name == 'vilt-ft':
    from transformers import ViltProcessor, ViltForImageAndTextRetrieval
    model = ViltForImageAndTextRetrieval.from_pretrained("dandelin/vilt-b32-finetuned-coco").to(device)
    processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-coco")

    # single object
    peft_model_path = '/home/bboulbarss/finetuned_models/vilt/vilt_lora_best_single_object_42_20250523_112846_(16, 32)_32_1e-05'
    peft_processor_path = '/home/bboulbarss/finetuned_models/vilt/vilt_processor_best_single_object_42_20250523_112846_(16, 32)_32_1e-05'

    # two object
    #peft_model_path = '/home/bboulbarss/finetuned_models/vilt/vilt_lora_best_two_object_42_20250507_104656_(16, 32)_32_1e-05'
    #peft_processor_path = '/home/bboulbarss/finetuned_models/vilt/vilt_processor_best_two_object_42_20250507_104656_(16, 32)_32_1e-05'

    # relational
    #peft_model_path = '/home/bboulbarss/finetuned_models/vilt//vilt_lora_best_relational_42_20250505_173458_(8, 16)_32_1e-06'
    #peft_processor_path = '/home/bboulbarss/finetuned_models/vilt/vilt_processor_best_relational_42_20250505_173458_(8, 16)_32_1e-06'

    model = PeftModel.from_pretrained(model, peft_model_path, is_trainable=False)
    processor = ViltProcessor.from_pretrained(peft_processor_path)
    
elif model_name == 'idefics-ft':
    from transformers import IdeficsForVisionText2Text, AutoProcessor, BitsAndBytesConfig
    model = IdeficsForVisionText2Text.from_pretrained('HuggingFaceM4/idefics-9b-instruct').to(device)
    processor = AutoProcessor.from_pretrained('HuggingFaceM4/idefics-9b-instruct')

    # single object
    #peft_model_path = '/home/bboulbarss/finetuned_models/idefics/idefics_lora_best_single_object_42_20250525_144228_smalldataset_(16, 32)_1e-05'
    #peft_processor_path = '/home/bboulbarss/finetuned_models/idefics/idefics_processor_best_single_object_42_20250525_144228_smalldataset_(16, 32)_1e-05'

    # relational
    peft_model_path = '/home/bboulbarss/finetuned_models/idefics/idefics_lora_best_relational_42_20250525_160425_smalldataset_(16, 32)_1e-05'
    peft_processor_path = '/home/bboulbarss/finetuned_models/idefics/idefics_processor_best_relational_42_20250525_160425_smalldataset_(16, 32)_1e-05'

    # two object
    #peft_model_path = '/home/bboulbarss/finetuned_models/idefics/idefics_lora_best_two_object_42_20250524_165224_smalldataset_(16, 32)_1e-05'
    #peft_processor_path = '/home/bboulbarss/finetuned_models/idefics/idefics_processor_best_two_object_42_20250524_165224_smalldataset_(16, 32)_1e-05'
    
    model = PeftModel.from_pretrained(model, peft_model_path, is_trainable=False)
    processor = AutoProcessor.from_pretrained(peft_processor_path)


# Define datasets and seeds for experiments
datasets = ['relational'] # 'single_object', 'two_object', 'relational'
seeds = [11, 21, 31, 41, 51]

# Run experiments for each dataset and seed combination
for dataset in datasets:
    for seed in seeds:
        random.seed(seed)
        torch.manual_seed(seed)
        experiment(seed, model_name, dataset, split, model, processor)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Total images: 400
Total directories: 8
Images per directory: 50


Processing directories: 100%|██████████| 8/8 [03:13<00:00, 24.17s/it]



Overall Accuracy: 0.0000
Model: idefics-ft
Dataset: relational
seed: 11
-------------------------------------------
Total images: 400
Total directories: 8
Images per directory: 50


Processing directories: 100%|██████████| 8/8 [03:12<00:00, 24.08s/it]



Overall Accuracy: 0.0000
Model: idefics-ft
Dataset: relational
seed: 21
-------------------------------------------
Total images: 400
Total directories: 8
Images per directory: 50


Processing directories: 100%|██████████| 8/8 [03:13<00:00, 24.13s/it]



Overall Accuracy: 0.0000
Model: idefics-ft
Dataset: relational
seed: 31
-------------------------------------------
Total images: 400
Total directories: 8
Images per directory: 50


Processing directories: 100%|██████████| 8/8 [03:12<00:00, 24.08s/it]



Overall Accuracy: 0.0000
Model: idefics-ft
Dataset: relational
seed: 41
-------------------------------------------
Total images: 400
Total directories: 8
Images per directory: 50


Processing directories: 100%|██████████| 8/8 [03:12<00:00, 24.08s/it]


Overall Accuracy: 0.0000
Model: idefics-ft
Dataset: relational
seed: 51
-------------------------------------------



