In [None]:
EXPERIMENT_NAME = "augmented-qlora-model"

In [None]:
import boto3
import os
from dotenv import load_dotenv

load_dotenv()
access_key_id = os.getenv("ACCESS_KEY_ID")
secret_access_key = os.getenv("SECRET_ACCESS_KEY")
minio_url = "http://" + os.getenv("S3_API_ENDPOINT")


minio_client = boto3.client(
    "s3",
    aws_access_key_id=access_key_id,
    aws_secret_access_key=secret_access_key,
    endpoint_url=minio_url
)

minio_bucket = "training-preparation-zone"
manifest_name = "dataset_train_augmented.json"
local_file = "./dataset_train_augmented.json"

In [3]:
def download_manifest_from_minio(bucket_name, object_name, local_path):
    try:
        minio_client.download_file(bucket_name, object_name, local_path)
    except Exception as e:
        print(f"Error downloading {object_name} from bucket {bucket_name}: {e}")
    return local_path

downloaded_path = download_manifest_from_minio(minio_bucket, manifest_name, local_file)

In [4]:
import pandas as pd

def load_manifest(manifest_path):
    with open(manifest_path, 'r') as f:
        data = pd.read_json(f)
    
    print(f"Loaded {len(data)} entries from the manifest.")
    return data

df = load_manifest(downloaded_path)
print(df)

Loaded 415 entries from the manifest.
                       image                                     text  \
0    images/ISIC_0027249.png        texts/actinic_keratosis_0_0_0.txt   
1    images/ISIC_0027058.png        texts/actinic_keratosis_0_0_1.txt   
2    images/ISIC_0026152.png        texts/actinic_keratosis_0_0_2.txt   
3    images/ISIC_0026803.png        texts/actinic_keratosis_0_0_3.txt   
4    images/ISIC_0026077.png        texts/actinic_keratosis_0_0_4.txt   
..                       ...                                      ...   
410  images/ISIC_0027710.png  texts/squamous_cell_carcinoma_0_0_2.txt   
411  images/ISIC_0031380.png  texts/squamous_cell_carcinoma_0_0_3.txt   
412  images/ISIC_0032110.png  texts/squamous_cell_carcinoma_0_0_4.txt   
413  images/ISIC_0032110.png  texts/squamous_cell_carcinoma_0_0_5.txt   
414  images/ISIC_0034222.png  texts/squamous_cell_carcinoma_0_0_6.txt   

        score  
0    1.438925  
1    1.429278  
2    1.461677  
3    1.439080  
4    

## Hyperparameters

In [5]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from transformers import CLIPProcessor, CLIPModel
from torch.optim import AdamW
from tqdm import tqdm

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MODEL_ID = "openai/clip-vit-base-patch32"
BATCH_SIZE = 8
LEARNING_RATE = 5e-6
EPOCHS = 3

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
import torch
print(f"PyTorch version: {torch.__version__}")
print(f"Is CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")

PyTorch version: 2.9.1+cu126
Is CUDA available: True
GPU Name: NVIDIA GeForce GTX 1650


## Data retrieval

The inputs variable is defined as it is because the model needs all of those parameters:

- Truncation=True means that if we provide more than 77 tokens (the usual maximum) it truncates the data

- Padding=max_length means that we add zeros to fill the max_length. We need to provide the same length for all the data (specially in text).



In [7]:
import io

class SkinLesionDataset(Dataset):
    def __init__(self, dataframe, processor, minio_client, bucket_name):
        self.df = dataframe
        self.processor = processor
        self.minio_client = minio_client
        self.bucket_name = bucket_name

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        img_key = self.df.iloc[idx]['image']
        txt_key = self.df.iloc[idx]['text']

        img_response = self.minio_client.get_object(Bucket=self.bucket_name, Key=img_key)
        img_bytes = img_response['Body'].read()
        image = Image.open(io.BytesIO(img_bytes)).convert("RGB")

        txt_response = self.minio_client.get_object(Bucket=self.bucket_name, Key=txt_key)
        description = txt_response['Body'].read().decode('utf-8').strip()

        inputs = self.processor(
            text=[description], 
            images=image, 
            return_tensors="pt", 
            padding="max_length", 
            truncation=True
        )
        
        return {k: v.squeeze(0) for k, v in inputs.items()}


## Initialization

Here we train the smaller clip model. We load it from the SkinLesionDataset class we created and the particularity is that we use AdamW. The AdamW is a widely used optimitzer for training Transformers. While the loss function tells the model where it needs to go, the optimitzer decides how fast it goes.

In [10]:
from transformers import CLIPModel, CLIPProcessor, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_type=torch.float16
)
model = CLIPModel.from_pretrained(MODEL_ID, quantization_config=bnb_config, device_map="auto").to(DEVICE)
model.get_input_embeddings = lambda: model.text_model.embeddings.token_embedding
model = prepare_model_for_kbit_training(model)
processor = CLIPProcessor.from_pretrained(MODEL_ID)
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["visual_projection", "text_projection"], 
    lora_dropout=0.05, 
    bias="none"
)
model = get_peft_model(model, lora_config)
dataset = SkinLesionDataset(df, processor, minio_client, minio_bucket)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)
optimizer = AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=LEARNING_RATE, weight_decay=0.1)

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [11]:
def get_trainable_parameters(model):
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    all_params = sum(p.numel() for p in model.parameters())

    print(f"Trainable parameters: {trainable_params}")
    print(f"All parameters: {all_params}")
    print(f"Percentage of trainable parameters: {100 * trainable_params / all_params:.2f}%")

    return trainable_params, all_params

trainable, total = get_trainable_parameters(model)

Trainable parameters: 36864
All parameters: 89775873
Percentage of trainable parameters: 0.04%


## Model training

Here we train the model using the hyperparameters and all the information provided in the previous cells.

In [12]:
def get_peak_vram(device):
    if device == 'cuda':
        return torch.cuda.max_memory_allocated(device) / (1024 ** 3)  # Convert to GB
    else:
        return 0

In [13]:
loss_history = []
model.train()

if DEVICE == 'cuda':
    torch.cuda.reset_peak_memory_stats(DEVICE)

for epoch in range(EPOCHS):
    pbar = tqdm(dataloader, desc=f"Epoch {epoch+1}")
    epoch_loss = 0
    
    for batch in pbar:
        optimizer.zero_grad()
        batch = {k: v.to(DEVICE) for k, v in batch.items()}
        
        outputs = model(
            input_ids=batch['input_ids'],
            pixel_values=batch['pixel_values'],
            attention_mask=batch['attention_mask'],
            return_loss=True
        )
        
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        pbar.set_postfix({"loss": loss.item()})
    
    loss_history.append(epoch_loss / len(dataloader))

peak_mem = get_peak_vram(DEVICE)
print(f"Peak VRAM usage during training: {peak_mem:.2f} GB")

  return fn(*args, **kwargs)
Epoch 1: 100%|██████████| 52/52 [00:26<00:00,  2.00it/s, loss=3.2] 
Epoch 2: 100%|██████████| 52/52 [00:24<00:00,  2.09it/s, loss=2.72]
Epoch 3: 100%|██████████| 52/52 [00:24<00:00,  2.09it/s, loss=2.65]

Peak VRAM usage during training: 0.57 GB





In [14]:
import time
import numpy as np

def calculate_inference_latency(model, dataloader, device, num_samples=50):
    model.eval()
    latencies = []
    
    with torch.no_grad():
        for i, batch in enumerate(dataloader):
            if i >= num_samples: break
            
            batch = {k: v.to(device) for k, v in batch.items() if isinstance(v, torch.Tensor)}
            
            start_time = time.time()
            _ = model.get_image_features(pixel_values=batch['pixel_values'])
            end_time = time.time()
            
            latencies.append((end_time - start_time) * 1000)
            
    avg_latency = np.mean(latencies)
    print(f"Average Inference Latency: {avg_latency:.2f} ms")
    return avg_latency
latency = calculate_inference_latency(model, dataloader, DEVICE)

Average Inference Latency: 73.04 ms


In [15]:
import torch
import torch.nn.functional as F
import numpy as np
from bert_score import score as bert_score_func
from sklearn.metrics import recall_score, f1_score, confusion_matrix

def extract_class_from_path(path):
    return "_".join(path.split("/")[-1].split("_")[:-3])

@torch.no_grad()
def get_comprehensive_metrics(model, dataloader, device):
    model.eval()
    all_image_embeds = []
    all_text_embeds = []
    all_ground_truth_texts = []

    for batch in dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        img_emb = model.get_image_features(pixel_values=batch['pixel_values'])
        txt_emb = model.get_text_features(input_ids=batch['input_ids'], 
                                        attention_mask=batch['attention_mask'])
        
        all_image_embeds.append(F.normalize(img_emb, dim=-1))
        all_text_embeds.append(F.normalize(txt_emb, dim=-1))

    image_embeds = torch.cat(all_image_embeds)
    text_embeds = torch.cat(all_text_embeds)

    # Perspective 1: Text-to-Image Retrieval
    sim_matrix = text_embeds @ image_embeds.T
    
    num_queries = sim_matrix.size(0)
    ranks = []
    
    for i in range(num_queries):
        sorted_indices = torch.argsort(sim_matrix[i], descending=True)
        rank = (sorted_indices == i).nonzero(as_tuple=True)[0].item() + 1
        ranks.append(rank)
    
    ranks = np.array(ranks)

    # Perspective 2: Safety (Clinical Classification)
    # We pass an image, retrieve the best text and check if classes match.
    sim_matrix_i2t = image_embeds @ text_embeds.T
    
    # Map all text files in the dataset to their classes
    all_text_paths = df['text'].tolist()
    text_classes = np.array([extract_class_from_path(p) for p in all_text_paths])
    image_classes = np.array([extract_class_from_path(p) for p in df['text'].tolist()])

    top_text_indices = torch.argmax(sim_matrix_i2t, dim=-1).cpu().numpy()
    predicted_classes = text_classes[top_text_indices]

    sensitivity = recall_score(image_classes, predicted_classes, average='macro')
    f1 = f1_score(image_classes, predicted_classes, average='macro')

    cm = confusion_matrix(image_classes, predicted_classes)
    fp = cm.sum(axis=0) - np.diag(cm)
    fn = cm.sum(axis=1) - np.diag(cm)
    tp = np.diag(cm)
    tn = cm.sum() - (fp + fn + tp)
    specificity = np.mean(tn / (tn + fp + 1e-10))

    def get_text_content(path, client, bucket):
        response = client.get_object(Bucket=bucket, Key=path)
        return response['Body'].read().decode('utf-8').strip()

    sample_indices = np.random.choice(len(df), min(50, len(df)), replace=False)
    gt_texts = [get_text_content(df.iloc[i]['text'], minio_client, minio_bucket) for i in sample_indices]
    predicted_classes_texts = [get_text_content(df.iloc[top_text_indices[i]]['text'], minio_client, minio_bucket) for i in sample_indices]

    P, R, F1 = bert_score_func(predicted_classes_texts, gt_texts, lang='en', verbose=False)

    metrics = {
        # Perspective 1
        "Recall@1":  np.mean(ranks <= 1),
        "Recall@5":  np.mean(ranks <= 5),
        "Recall@10": np.mean(ranks <= 10),
        "Mean Rank": np.mean(ranks),
        "Median Rank": np.median(ranks),
        "MRR": np.mean(1.0 / ranks),
        "NDCG": np.mean([1.0 / np.log2(r + 1) for r in ranks]),
        # Perspective 2
        "Sensitivity": sensitivity,
        "Specificity": specificity,
        "F1 Score": f1,
        # Perspective 3
        "BertScore Precision": P.mean().item(),
        "BertScore Recall": R.mean().item(),
        "BERTScore F1": F1.mean().item(),
        "Trainable Parameters": trainable,
        "Total Parameters": total,
        "Inference Latency (ms)": latency,
        "Peak VRAM Usage (GB)": peak_mem
    }
    
    return metrics

eval_results = get_comprehensive_metrics(model, dataloader, DEVICE)
print("Evaluation Results:", eval_results)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation Results: {'Recall@1': np.float64(0.00963855421686747), 'Recall@5': np.float64(0.03855421686746988), 'Recall@10': np.float64(0.06506024096385542), 'Mean Rank': np.float64(153.8867469879518), 'Median Rank': np.float64(135.0), 'MRR': np.float64(0.035222223988940496), 'NDCG': np.float64(0.17380703444825432), 'Sensitivity': 0.19171593485607855, 'Specificity': np.float64(0.8007611832609356), 'F1 Score': 0.17398886503202257, 'BertScore Precision': 0.8452470302581787, 'BertScore Recall': 0.849989116191864, 'BERTScore F1': 0.8472861647605896, 'Trainable Parameters': 36864, 'Total Parameters': 89775873, 'Inference Latency (ms)': np.float64(73.04405689239502), 'Peak VRAM Usage (GB)': 0.5736546516418457}


In [16]:
import json
import datetime
import matplotlib.pyplot as plt
os.makedirs('../results', exist_ok=True)

final_experiment_data = {
    "metadata": {
        "model_name": MODEL_ID,
        "device_used": DEVICE,
        "hyperparameters": {
            "batch_size": BATCH_SIZE,
            "learning_rate": LEARNING_RATE,
            "epochs": EPOCHS
        }
    },
    "metrics": eval_results,
    "loss_history": loss_history
}

json_filename = f"../results/{EXPERIMENT_NAME}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
png_filename = f"../results/{EXPERIMENT_NAME}_loss_curve_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.png"

with open(json_filename, 'w') as f:
    json.dump(final_experiment_data, f, indent=4)

plt.figure(figsize=(10, 5))
plt.plot(loss_history, marker='o', linestyle='-', color='#2ca02c', label='Training Loss')
plt.title("Skin Cancer Model: Fine-Tuning Learning Curve")
plt.xlabel("Epoch")
plt.ylabel("Average Loss")
plt.grid(True, alpha=0.3)
plt.legend()
plt.savefig(png_filename, bbox_inches='tight')
plt.close()

In [17]:
import json
import datetime
import io

RESULTS_BUCKET = "visualization-zone"

def setup_results_storage(client, bucket_name):
    try:
        client.head_bucket(Bucket=bucket_name)
        print(f"Bucket '{bucket_name}' already exists.")
    except:
        print(f"Creating bucket '{bucket_name}'...")
        client.create_bucket(Bucket=bucket_name)

def upload_experiment_assets(client, bucket_name, results_dir, json_file, png_file):
    try:
        client.head_bucket(Bucket=bucket_name)
    except:
        client.create_bucket(Bucket=bucket_name)
        print(f"Created bucket: {bucket_name}")

    assets = [json_file, png_file]
    
    for asset_name in assets:
        local_path = os.path.join(results_dir, asset_name)
        
        if os.path.exists(local_path):
            # We store them in a folder named after the run_id for the viz page
            object_key = f"{EXPERIMENT_NAME}/{asset_name}"
            try:
                client.upload_file(local_path, bucket_name, object_key)
                print(f"Successfully uploaded {asset_name} to {object_key}")
            except Exception as e:
                print(f"Failed to upload {asset_name}: {e}")
        else:
            print(f"Warning: Asset not found at {local_path}")


json_file = os.path.basename(json_filename)
png_file = os.path.basename(png_filename)

setup_results_storage(minio_client, RESULTS_BUCKET)
upload_experiment_assets(minio_client, RESULTS_BUCKET, "../results", json_file, png_file)

Bucket 'visualization-zone' already exists.
Successfully uploaded qlora-model_20260102_160843.json to qlora-model/qlora-model_20260102_160843.json
Successfully uploaded qlora-model_loss_curve_20260102_160843.png to qlora-model/qlora-model_loss_curve_20260102_160843.png
