In [4]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import requests
from PIL import Image
import io
import os
from bisect import bisect_left
import torch
from tqdm import tqdm
import wandb
from torch.utils.data import Dataset
from torchvision import transforms
from transformers import AutoProcessor, AutoModelForPreTraining

from transformers import BitsAndBytesConfig
from peft import get_peft_model, LoraConfig
import os
import torch
from transformers import AutoModelForCausalLM, AutoProcessor


In [5]:
import shutil
import os

# Clear transformers cache
cache_dir = os.path.expanduser("~/.cache/huggingface/hub")
if os.path.exists(cache_dir):
    print(f"Removing cache from {cache_dir}")
    shutil.rmtree(cache_dir)

In [6]:
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

#Add the file-path here
# 1. Path to images folder
NUM_EPOCHS = 3
BATCH_SIZE = 1
# 2. Path to test.csv
data_dir = "/teamspace/studios/this_studio/images" 
csv_filename = "/teamspace/studios/this_studio/saved_images.csv"
metadata_df = pd.read_csv(csv_filename)
metadata_df["image"] = [x.split('/')[-1] for x in metadata_df["image_link"]]

metadata_df = metadata_df.drop(columns=["image_link", "group_id"])


new_eval = [f'What is the {x}?' for x in metadata_df["entity_name"]]
metadata_df["entity_name"] = new_eval

# This gives you roughly:
# - 80% train
# - 10% validation
# - 10% test

In [7]:
c=0
def clean_value(value):
    value = str(value).strip('[]').strip()
    global c
    try:
        # Split into parts
        parts = value.split()
        
        # Get first number whether it's a range or single number
        number_part = parts[0].split(',')[0]  # Take first number if there's a comma
        number = float(number_part.replace(',', ''))
        
        # Get unit if it exists
        unit = parts[-1] if len(parts) > 1 else ''
        
        # Convert to int if it's a whole number
        if number.is_integer():
            number = int(number)
        
        # Return formatted string
        if unit:
            return f"{number} {unit}"
        return str(number)
    except:
        print(f"Could not process: {value}")
        c+=1
        return value
metadata_df['entity_value']=metadata_df['entity_value'].apply(clean_value)

from sklearn.model_selection import train_test_split

# First split: separate test set
train_val_df, test_df = train_test_split(metadata_df, test_size=0.166, random_state=7,stratify=metadata_df['entity_name'],)

# Second split: separate validation from train
train_df, val_df = train_test_split(train_val_df, test_size=0.8, random_state=7,stratify=train_val_df['entity_name'],)


In [8]:
print(len(train_df))

5004


In [9]:
class VQADataset(Dataset):
    def __init__(self, df_val, img_folder, transform=None):
        self.data = df_val
        self.img_folder = img_folder
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        try:
            img_name = self.data.iloc[idx]['image']
            prefix = self.data.iloc[idx]['entity_name']
            suffix = self.data.iloc[idx]['entity_value']
            
            img_path = os.path.join(self.img_folder, img_name)
            image = Image.open(img_path).convert("RGB")
            
            if self.transform:
                image = self.transform(image)
                
            if image.shape != (3, 448, 448):
                print(f"Skipping image {img_name} due to wrong shape: {image.shape}")
                return None

            return {"image": image, "entity_name": prefix, "entity_value": suffix}
        except Exception as e:
            print(f"Error with image {img_name}: {str(e)}")
            return None
def collate_fn(examples):
    # Filter out None values
    examples = [ex for ex in examples if ex is not None]
    if not examples:
        return None
    
    # Get input texts and images
    texts = [example['entity_name'] for example in examples]
    images = [example["image"] for example in examples]
    
    # Get target texts (answers)
    target_texts = [example['entity_value'] for example in examples]
    
    # Process inputs (questions and images)
    inputs = processor(
        text=texts, 
        images=images, 
        return_tensors="pt", 
        padding="longest",
    )
    
    # Process targets (answers) separately
    target_tokens = processor.tokenizer(
        target_texts,
        padding=True,
        return_tensors="pt",
    )
    
    # Create decoder_input_ids (shifted right)
    decoder_input_ids = target_tokens.input_ids.clone()
    decoder_input_ids = torch.cat(
        [
            torch.ones((decoder_input_ids.shape[0], 1), dtype=torch.long) * processor.tokenizer.bos_token_id,
            decoder_input_ids[:, :-1]
        ],
        dim=-1
    )
    
    # Add decoder inputs and labels to the inputs dict
    inputs['decoder_input_ids'] = decoder_input_ids
    inputs['labels'] = target_tokens.input_ids
    device='cuda'
    # Move everything to device
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    return inputs
device = "cuda"
model_id = "google/paligemma-3b-ft-docvqa-448"
processor = AutoProcessor.from_pretrained(model_id, do_rescale=False)
def collate_fn(examples):
    
    prefixes = [example['entity_name'] for example in examples]
    suffixes = [example['entity_value'] for example in examples]
    images = [example["image"] for example in examples]

    images = torch.stack(images)
    tokens = processor(text=prefixes, images=images, suffix=suffixes,
                       return_tensors="pt", padding="longest")

    tokens = tokens.to(torch.bfloat16).to(device)

    return tokens
transform = transforms.Compose([
    transforms.Resize((448, 448)),
    transforms.ToTensor()
])

#Initializing the dataset
train_dataset = VQADataset(train_df, data_dir,  transform)
val_dataset = VQADataset(val_df, data_dir,  transform)
test_dataset = VQADataset(test_df, data_dir,  transform)
dataset_size = len(train_dataset)
print(f"Train size: {dataset_size}")

#Loading the model


Train size: 5004


In [10]:
import torch
from transformers import AutoModelForCausalLM, AutoProcessor

# CHECKPOINT = "microsoft/Florence-2-base-ft"
# CHECKPOINT='prithivMLmods/Florence-2-VLM-Doc-VQA'
# CHECKPOINT='adamchanadam/Test_Florence-2-FT-DocVQA'
# CHECKPOINT='microsoft/Florence-2-base'
CHECKPOINT="google/paligemma-3b-ft-docvqa-448"
REVISION = 'refs/pr/6'
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_type=torch.bfloat16,
    skip_modules_not_needed=True,
    offload_to_cpu=True,
)
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# model = AutoModelForCausalLM.from_pretrained(
#     CHECKPOINT, trust_remote_code=True,quantization_config=bnb_config
#  ).to(DEVICE)
# processor = AutoProcessor.from_pretrained(
#     CHECKPOINT, trust_remote_code=True)
from peft import LoraConfig, get_peft_model

# TARGET_MODULES = [
#     "q_proj", # Only attention queries
#     "v_proj", # Only attention values
#     "fc2"     # FFN down-projections
# ]
TARGET_MODULES=["q_proj", "v_proj",  "up_proj", "down_proj"]
model = AutoModelForPreTraining.from_pretrained(CHECKPOINT, quantization_config=bnb_config, device_map={"":0})
config = LoraConfig(
    r=8,
    lora_alpha=8,
    target_modules=TARGET_MODULES,
    task_type="CAUSAL_LM"

)

model = get_peft_model(model, config)
for param in model.vision_tower.parameters():
    param.requires_grad = False

for param in model.multi_modal_projector.parameters():
    param.requires_grad = False
    
print(model.print_trainable_parameters())
model.print_trainable_parameters()

Unused kwargs: ['bnb_4bit_compute_type', 'skip_modules_not_needed', 'offload_to_cpu']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

trainable params: 6,230,016 || all params: 2,931,576,560 || trainable%: 0.2125
None
trainable params: 6,230,016 || all params: 2,931,576,560 || trainable%: 0.2125


In [16]:
import torch
torch.cuda.empty_cache()

In [12]:
wandb.login(key='a2750455a136a8fa22fd4f037a3b5c5b68f3426f')
#wandb logging parameters
wandb.init(
    project="paligemma_finetuning-003",
    config={
    "learning_rate": 2e-5,
    "architecture": "florence_vqa-02",
    "dataset": "amazon_entities",
    "epochs": 10,
    }
)

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.


[34m[1mwandb[0m: Currently logged in as: [33madithyabalagoni11[0m ([33madithyabalagoni11-vasavi-college-of-engineering[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /teamspace/studios/this_studio/.netrc


In [13]:
os.environ["HUGGING_FACE_HUB_TOKEN"] ='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'

from huggingface_hub import login

login(token=os.environ['HUGGING_FACE_HUB_TOKEN'])
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = predictions[0].argmax(axis=-1)
    
    # Decode all predictions and labels
    decoded_preds = []
    decoded_labels = []
    
    for pred, label in zip(predictions, labels):
        # Remove padding (-100) from labels
        label = label[label != -100]
        
        # Decode and clean up
        pred_text = processor.tokenizer.decode(pred, skip_special_tokens=True).strip()
        label_text = processor.tokenizer.decode(label, skip_special_tokens=True).strip()
        
        decoded_preds.append(pred_text)
        decoded_labels.append(label_text)
    
    # Initialize counters for F1 score calculation
    true_positives = 0
    false_positives = 0
    false_negatives = 0
    true_negatives = 0
    
    # Calculate metrics based on the problem's criteria
    for pred, gt in zip(decoded_preds, decoded_labels):
        if pred != "" and gt != "":
            if pred == gt:
                true_positives += 1
            else:
                false_positives += 1
        elif pred != "" and gt == "":
            false_positives += 1
        elif pred == "" and gt != "":
            false_negatives += 1
        else:  # pred == "" and gt == ""
            true_negatives += 1
    
    # Calculate precision and recall
    precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
    recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
    
    # Calculate F1 score
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    
    # Compile metrics
    metrics = {
        "precision": precision,
        "recall": recall,
        "f1_score": f1_score,
        "exact_match": true_positives 
    }
    
    # Print some examples and metrics

    
    print("\nExample Predictions:")
    import random
    # Get 3 random indices
    sample_indices = random.sample(range(len(decoded_preds)), min(50, len(decoded_preds)))
    for idx in sample_indices:
        print(f"Predicted: {decoded_preds[idx]}")
        print(f"Actual: {decoded_labels[idx]}")
        print("-" * 50)
    
    return metrics

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = predictions[0].argmax(axis=-1)  # Get most likely token predictions
    
    decoded_preds = []
    decoded_labels = []
    
    for pred, label in zip(predictions, labels):
        # Remove padding and special tokens
        label = label[label != -100]  # Remove padding
        
        # Decode using processor's tokenizer
        pred_text = processor.decode(pred, skip_special_tokens=True).strip()
        label_text = processor.decode(label, skip_special_tokens=True).strip()
        
        # Clean up any extra whitespace
        pred_text = " ".join(pred_text.split())
        label_text = " ".join(label_text.split())
        
        decoded_preds.append(pred_text)
        decoded_labels.append(label_text)
    
    # Calculate metrics
    metrics = {
        "exact_match": sum(p == l for p, l in zip(decoded_preds, decoded_labels)) / len(decoded_preds)
    }
    
    # Print sample predictions
    print("\nSample Predictions:")
    for i in range(min(3, len(decoded_preds))):
        print(f"Prediction: {decoded_preds[i]}")
        print(f"Label: {decoded_labels[i]}")
        print("-" * 50)
        
    return metrics    
#
# Setting up the training
BATCH_SIZE=1
NUM_EPOCHS=1
from transformers import Trainer, TrainingArguments
training_args = TrainingArguments(
    num_train_epochs=NUM_EPOCHS,
    remove_unused_columns=False,
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=1,
    # evaluation_strategy="steps",  # or "epoch"
    # eval_steps=2,   
    logging_steps=50,
    save_steps=100,
    save_total_limit=1,


           # if using "steps"
    warmup_steps=2,
    learning_rate=5e-6,
    weight_decay=1e-6,
    adam_beta2=0.999,
    optim="adamw_hf",
    save_strategy="steps",
    push_to_hub=True,
    output_dir="pali-gemma-ft-ml-challenge",
    bf16=True,
    report_to=["wandb"],
    dataloader_pin_memory=False
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    # eval_dataset=val_dataset,  # Add validation dataset
    data_collator=collate_fn,
    # compute_metrics=compute_metrics
)
trainer.train()

# Train the model


Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


Step,Training Loss
50,2.9042
100,2.6667
150,3.0691
200,2.5642
250,2.6868
300,2.1369
350,1.7191
400,1.8388
450,1.2289
500,1.3128


TrainOutput(global_step=5004, training_loss=0.7744381612629818, metrics={'train_runtime': 4708.3901, 'train_samples_per_second': 1.063, 'train_steps_per_second': 1.063, 'total_flos': 7.481653546575254e+16, 'train_loss': 0.7744381612629818, 'epoch': 1.0})

In [14]:
trainer.push_to_hub()

CommitInfo(commit_url='https://huggingface.co/adithyabalagoni11/pali-gemma-ft-ml-challenge/commit/052daaacde27d3b630be5c88c4c8a392669a2676', commit_message='End of training', commit_description='', oid='052daaacde27d3b630be5c88c4c8a392669a2676', pr_url=None, repo_url=RepoUrl('https://huggingface.co/adithyabalagoni11/pali-gemma-ft-ml-challenge', endpoint='https://huggingface.co', repo_type='model', repo_id='adithyabalagoni11/pali-gemma-ft-ml-challenge'), pr_revision=None, pr_num=None)

In [None]:
def new_compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = predictions[0].argmax(axis=-1)
    
    # Decode all predictions and labels
    decoded_preds = []
    decoded_labels = []
    
    for pred, label in zip(predictions, labels):
        # Remove padding (-100) from labels
        label = label[label != -100]
        
        # Decode and clean up
        pred_text = processor.tokenizer.decode(pred, skip_special_tokens=True).strip()
        label_text = processor.tokenizer.decode(label, skip_special_tokens=True).strip()
        
        decoded_preds.append(pred_text)
        decoded_labels.append(label_text)
    
    # Initialize counters for F1 score calculation
    true_positives = 0
    false_positives = 0
    false_negatives = 0
    true_negatives = 0
    
    # Calculate metrics based on the problem's criteria
    for pred, gt in zip(decoded_preds, decoded_labels):
        if pred != "" and gt != "":
            if pred == gt:
                true_positives += 1
            else:
                false_positives += 1
        elif pred != "" and gt == "":
            false_positives += 1
        elif pred == "" and gt != "":
            false_negatives += 1
        else:  # pred == "" and gt == ""
            true_negatives += 1
    
    # Calculate precision and recall
    precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
    recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
    
    # Calculate F1 score
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    
    # Compile metrics
    metrics = {
        "precision": precision,
        "recall": recall,
        "f1_score": f1_score,
        "exact_match": true_positives 
    }
    
    # Print some examples and metrics

    
    print("\nExample Predictions:")
    import random
    # Get 3 random indices
    # sample_indices = random.sample(range(len(decoded_preds)), min(3, len(decoded_preds)))
    for idx in sample_indices[:200]:
        print(f"Predicted: {decoded_preds[idx]}")
        print(f"Actual: {decoded_labels[idx]}")
        print("-" * 50)
    
    return metrics

trainer.compute_metrics=new_compute_metrics
from tqdm import tqdm

# Add progress bar manually
print("Evaluating on test set...")
with tqdm(total=len(test_dataset)) as pbar:
    test_results = trainer.evaluate(
        eval_dataset=test_dataset,
        metric_key_prefix="test"
    )
    pbar.update(len(test_dataset))