In [13]:
# Install Pytorch & other libraries
%pip install "torch==2.4.0" tensorboard pillow

# Install Hugging Face libraries
%pip install  --upgrade \
  "transformers==4.45.1" \
  "datasets==3.0.1" \
  "accelerate==0.34.2" \
  "evaluate==0.4.3" \
  "bitsandbytes==0.44.0" \
  "trl==0.11.1" \
  "peft==0.13.0" \
  "qwen-vl-utils" \
  "wandb" \
  "scikit-learn"

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


### Complete Code for evaluation

In [49]:
from huggingface_hub import login
import evaluate
from datasets import load_dataset
import numpy as np
from transformers import TrainerCallback

login(
  token="YOUR-TOKEN", # ADD YOUR TOKEN HERE
  add_to_git_credential=True
)

prompt = """Create a 150 to 300 words sustainable tourism guide for ##TITLE## and ##QUESTION## based on the image.

##title##: {title}
##question##: {question}"""

system_message = """You are a sustainable tourism expert creating precise 300-word guides for Liechtenstein destinations. Your expertise is showcasing responsible travel experiences while:
- Maintaining exactly 150 to 300 words 
- Focusing on sustainable tourism guide
- Providing specific, actionable details
- Excluding headers, hashtags, or meta-text

Each description must integrate every sustainability aspect while accurately representing the destination."""

# Create a custom callback to compute metrics during training
class EvaluationCallback(TrainerCallback):
    def __init__(self, eval_dataset, processor, tokenizer):
        self.eval_dataset = eval_dataset
        self.processor = processor
        self.tokenizer = tokenizer
        self.bleu = evaluate.load("bleu")
        self.rouge = evaluate.load("rouge")
        self.meteor = evaluate.load("meteor")
        
    def on_epoch_end(self, args, state, control, model, **kwargs):
        print("\nComputing evaluation metrics...")
        
        # Take a sample of evaluation dataset for faster computation
        eval_sample = self.eval_dataset[:20]  # Reduced sample size for efficiency
        
        predictions = []
        references = []
        
        model.eval()
        for example in eval_sample:
            try:
                # Get the reference text
                reference = example["messages"][-1]["content"][0]["text"]
                
                # Process image first
                image_input = process_vision_info(example["messages"])[0]
                
                # Process text
                text = self.processor.apply_chat_template(
                    example["messages"][:-1],
                    tokenize=False,
                    add_generation_prompt=False
                )
                
                # Create inputs batch
                inputs = self.processor(
                    text=text,
                    images=image_input,
                    return_tensors="pt",
                    padding=True
                )
                
                # Convert input tensors to bfloat16 where needed
                for key in inputs:
                    if torch.is_tensor(inputs[key]):
                        if inputs[key].dtype == torch.float32:
                            inputs[key] = inputs[key].to(dtype=torch.bfloat16)
                        inputs[key] = inputs[key].to(model.device)
                
                with torch.cuda.amp.autocast(dtype=torch.bfloat16):
                    with torch.no_grad():
                        outputs = model.generate(
                            **inputs,
                            max_new_tokens=300,
                            do_sample=True,  # Enable sampling
                            temperature=0.7,  # Lower temperature for more focused sampling
                            top_p=0.9,       # Nucleus sampling
                            pad_token_id=self.tokenizer.pad_token_id,
                            eos_token_id=self.tokenizer.eos_token_id
                        )
                
                prediction = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
                predictions.append(prediction)
                references.append(reference)
                
            except Exception as e:
                print(f"Error processing example: {str(e)}")
                continue
        
        if len(predictions) > 0:
            try:
                # Calculate metrics
                bleu_score = self.bleu.compute(predictions=predictions, references=[[r] for r in references])
                rouge_score = self.rouge.compute(predictions=predictions, references=references)
                meteor_score = self.meteor.compute(predictions=predictions, references=references)
                
                # Log metrics
                print(f"\nEvaluation Metrics after epoch {state.epoch}:")
                print(f"Number of successful generations: {len(predictions)}/{len(eval_sample)}")
                print(f"BLEU: {bleu_score['bleu']:.4f}")
                print(f"ROUGE-L F1: {rouge_score['rougeL']:.4f}")
                print(f"METEOR: {meteor_score['meteor']:.4f}")
                
                # Sample output comparison
                if len(predictions) > 0:
                    print("\nSample Output Comparison:")
                    print("Reference:", references[0][:200] + "...")
                    print("Prediction:", predictions[0][:200] + "...")
                
                # Log to tensorboard
                state.log_history.append({
                    "eval/bleu": bleu_score["bleu"],
                    "eval/rougeL": rouge_score["rougeL"],
                    "eval/meteor": meteor_score["meteor"],
                    "eval/successful_generations": len(predictions),
                    "epoch": state.epoch
                })
            except Exception as e:
                print(f"Error computing metrics: {str(e)}")

from datasets import load_dataset

# Convert dataset to OAI messages       
def format_data(sample):
    return {"messages": [
                {
                    "role": "system",
                    "content": [{"type": "text", "text": system_message}],
                },
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": prompt.format(title=sample["title"], question=sample["question"]),
                        },{
                            "type": "image",
                            "image": sample["image"],
                        }
                    ],
                },
                {
                    "role": "assistant",
                    "content": [{"type": "text", "text": sample["description"]}],
                },
            ],
        }

# Load dataset from the hub
dataset_id = "arad1367/Liechtenstein_Tourist_OCR_Big_Dataset"
dataset = load_dataset("arad1367/Liechtenstein_Tourist_OCR_Big_Dataset", split="train")

# Convert dataset to OAI messages
# need to use list comprehension to keep Pil.Image type, .mape convert image to bytes
dataset = [format_data(sample) for sample in dataset]

print(dataset[345]["messages"])

# Split your dataset into train and eval
train_size = int(0.9 * len(dataset))
indices = list(range(len(dataset)))
np.random.shuffle(indices)
train_indices = indices[:train_size]
eval_indices = indices[train_size:]

train_dataset = [dataset[i] for i in train_indices]
eval_dataset = [dataset[i] for i in eval_indices]

# Create the evaluation callback
evaluation_callback = EvaluationCallback(eval_dataset, processor, processor.tokenizer)

import torch
from transformers import AutoModelForVision2Seq, AutoProcessor, BitsAndBytesConfig

# Hugging Face model id
model_id = "Qwen/Qwen2-VL-2B-Instruct" 

# BitsAndBytesConfig int-4 config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
)

# Load model and tokenizer
model = AutoModelForVision2Seq.from_pretrained(
    model_id,
    device_map="auto",
    # attn_implementation="flash_attention_2", # not supported for training
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config
)
processor = AutoProcessor.from_pretrained(model_id)

# Preparation for inference
text = processor.apply_chat_template(
    dataset[2]["messages"], tokenize=False, add_generation_prompt=False
)
print(text)

from peft import LoraConfig

# LoRA config based on QLoRA paper & Sebastian Raschka experiment
peft_config = LoraConfig(
        lora_alpha=16,
        lora_dropout=0.05,
        r=8,
        bias="none",
        target_modules=["q_proj", "v_proj"],
        task_type="CAUSAL_LM", 
)

from trl import SFTConfig
from transformers import Qwen2VLProcessor
from qwen_vl_utils import process_vision_info

args = SFTConfig(
    output_dir="TRL-sustainable-tourist-FL-Evaluate", # directory to save and repository id
    num_train_epochs=3,                     # number of training epochs
    per_device_train_batch_size=4,          # batch size per device during training
    gradient_accumulation_steps=8,          # number of steps before performing a backward/update pass
    gradient_checkpointing=True,            # use gradient checkpointing to save memory
    optim="adamw_torch_fused",              # use fused adamw optimizer
    logging_steps=5,                       # log every 10 steps
    save_strategy="epoch",                  # save checkpoint every epoch
    learning_rate=2e-4,                     # learning rate, based on QLoRA paper
    bf16=True,                              # use bfloat16 precision
    tf32=True,                              # use tf32 precision
    max_grad_norm=0.3,                      # max gradient norm based on QLoRA paper
    warmup_ratio=0.03,                      # warmup ratio based on QLoRA paper
    lr_scheduler_type="constant",           # use constant learning rate scheduler
    push_to_hub=True,                       # push model to hub
    report_to="tensorboard",                # report metrics to tensorboard
    gradient_checkpointing_kwargs = {"use_reentrant": False}, # use reentrant checkpointing
    dataset_text_field="", # need a dummy field for collator
    dataset_kwargs = {"skip_prepare_dataset": True} # important for collator
)
args.remove_unused_columns=False

# Create a data collator to encode text and image pairs
def collate_fn(examples):
    # Get the texts and images, and apply the chat template
    texts = [processor.apply_chat_template(example["messages"], tokenize=False) for example in examples]
    image_inputs = [process_vision_info(example["messages"])[0] for example in examples]

    # Tokenize the texts and process the images
    batch = processor(text=texts, images=image_inputs, return_tensors="pt", padding=True)

    # The labels are the input_ids, and we mask the padding tokens in the loss computation
    labels = batch["input_ids"].clone()
    labels[labels == processor.tokenizer.pad_token_id] = -100  #
    # Ignore the image token index in the loss computation (model specific)
    if isinstance(processor, Qwen2VLProcessor):
        image_tokens = [151652,151653,151655]
    else: 
        image_tokens = [processor.tokenizer.convert_tokens_to_ids(processor.image_token)]
    for image_token_id in image_tokens:
        labels[labels == image_token_id] = -100
    batch["labels"] = labels

    return batch

from trl import SFTTrainer

trainer = SFTTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    data_collator=collate_fn,
    dataset_text_field="",  # needs dummy value
    peft_config=peft_config,
    tokenizer=processor.tokenizer,
    callbacks=[evaluation_callback]  # Add the evaluation callback
)

# start training, the model will be automatically saved to the hub and the output directory
trainer.train()

# save model 
trainer.save_model(args.output_dir)

Token is valid (permission: fineGrained).
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your terminal in case you want to set the 'store' credential helper as default.

git config --global credential.helper store

Read https://git-scm.com/book/en/v2/Git-Tools-Credential-Storage for more details.[0m
Token has not been saved to git credential helper.
Your token has been saved to /home/pejman/.cache/huggingface/token
Login successful
[{'role': 'system', 'content': [{'type': 'text', 'text': 'You are a sustainable tourism expert creating precise 300-word guides for Liechtenstein destinations. Your expertise is showcasing responsible travel experiences while:\n- Maintaining exactly 150 to 300 words \n- Focusing on sustainable tourism guide\n- Providing specific, actionable details\n- Excluding headers, hashtags, or meta-text\n\nEach description 

[nltk_data] Downloading package wordnet to /home/pejman/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/pejman/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/pejman/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
Unrecognized keys in `rope_scaling` for 'rope_type'='default': {'mrope_section'}


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

<|im_start|>system
You are a sustainable tourism expert creating precise 300-word guides for Liechtenstein destinations. Your expertise is showcasing responsible travel experiences while:
- Maintaining exactly 150 to 300 words 
- Focusing on sustainable tourism guide
- Providing specific, actionable details
- Excluding headers, hashtags, or meta-text

Each description must integrate every sustainability aspect while accurately representing the destination.<|im_end|>
<|im_start|>user
Create a 150 to 300 words sustainable tourism guide for ##TITLE## and ##QUESTION## based on the image.

##title##: Schellenberg Castle
##question##: What is the name of the tourist destination shown in this image from Liechtenstein?<|vision_start|><|image_pad|><|vision_end|><|im_end|>
<|im_start|>assistant
Schellenberg Castle in Liechtenstein is a historic hilltop castle offering panoramic views. It dates back to the 12th century and is a significant cultural landmark in the region.<|im_end|>




Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
  with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context:  # type: ignore[attr-defined]


Step,Training Loss
5,2.9896
10,2.7532
15,2.4612
20,2.1298
25,1.7484
30,1.3529
35,0.9734
40,0.6805
45,0.5347
50,0.4449



Computing evaluation metrics...


  with torch.cuda.amp.autocast(dtype=torch.bfloat16):



Evaluation Metrics after epoch 1.0:
Number of successful generations: 20/20
BLEU: 0.0477
ROUGE-L F1: 0.1797
METEOR: 0.3768

Sample Output Comparison:
Reference: Kunstmuseum Liechtenstein is a modern art museum located in Vaduz Liechtenstein. It features contemporary and modern art exhibitions and is known for its striking black cube architecture and cultural ...
Prediction: system
You are a sustainable tourism expert creating precise 300-word guides for Liechtenstein destinations. Your expertise is showcasing responsible travel experiences while:
- Maintaining exactly 15...


Unrecognized keys in `rope_scaling` for 'rope_type'='default': {'mrope_section'}
  with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context:  # type: ignore[attr-defined]



Computing evaluation metrics...


  with torch.cuda.amp.autocast(dtype=torch.bfloat16):



Evaluation Metrics after epoch 2.0:
Number of successful generations: 20/20
BLEU: 0.1495
ROUGE-L F1: 0.3251
METEOR: 0.6228

Sample Output Comparison:
Reference: Kunstmuseum Liechtenstein is a modern art museum located in Vaduz Liechtenstein. It features contemporary and modern art exhibitions and is known for its striking black cube architecture and cultural ...
Prediction: system
You are a sustainable tourism expert creating precise 300-word guides for Liechtenstein destinations. Your expertise is showcasing responsible travel experiences while:
- Maintaining exactly 15...


Unrecognized keys in `rope_scaling` for 'rope_type'='default': {'mrope_section'}
  with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context:  # type: ignore[attr-defined]
Unrecognized keys in `rope_scaling` for 'rope_type'='default': {'mrope_section'}



Computing evaluation metrics...


  with torch.cuda.amp.autocast(dtype=torch.bfloat16):



Evaluation Metrics after epoch 3.0:
Number of successful generations: 20/20
BLEU: 0.1552
ROUGE-L F1: 0.3268
METEOR: 0.6246

Sample Output Comparison:
Reference: Kunstmuseum Liechtenstein is a modern art museum located in Vaduz Liechtenstein. It features contemporary and modern art exhibitions and is known for its striking black cube architecture and cultural ...
Prediction: system
You are a sustainable tourism expert creating precise 300-word guides for Liechtenstein destinations. Your expertise is showcasing responsible travel experiences while:
- Maintaining exactly 15...


Unrecognized keys in `rope_scaling` for 'rope_type'='default': {'mrope_section'}
Unrecognized keys in `rope_scaling` for 'rope_type'='default': {'mrope_section'}
Unrecognized keys in `rope_scaling` for 'rope_type'='default': {'mrope_section'}


In [51]:
# free the memory again
del model
del trainer
torch.cuda.empty_cache()