Ref:

Batch Inference Section: https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct

In [None]:
import os

# Pin the process to GPU 0. This is set before torch is imported so the
# restriction is guaranteed to be in place before any CUDA initialisation.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import torch

# Show which conda environment the kernel is running in.
print(os.getenv("CONDA_DEFAULT_ENV"))

In [None]:
import json
import pickle
import random
import datasets
import numpy as np
from tqdm.auto import tqdm
from datasets import Dataset
from datasets import load_dataset

### Force Determinism

In [None]:
# Seed every random number generator used in this notebook with the same
# fixed value: Python's `random`, NumPy, PyTorch (CPU) and PyTorch (all CUDA devices).
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
torch.cuda.manual_seed_all(42)

### Load Dataset

In [None]:
dataset = load_dataset("dutta18/Quantity-Reasoning-VQA-23K")

In [None]:
dataset = dataset['train']

In [None]:
dataset

In [None]:
dataset = dataset.select(range(4450))

In [None]:
dataset

### Load COT Think Data

In [None]:
# Load the pre-computed chain-of-thought data from a local pickle file.
# NOTE: unpickling can execute arbitrary code, so only load this file if it
# comes from a trusted source.
with open('./qty-reasoning-cot-data-8000.pkl', 'rb') as file:
    cot_think_data = pickle.load(file)

In [None]:
# Keep exactly as many CoT entries as there are dataset rows, so the column
# added next always matches the dataset length (instead of repeating the
# hard-coded subset size used for `dataset.select`).
cot_think_data = cot_think_data[:len(dataset)]

In [None]:
dataset = dataset.add_column("cot_think_data", cot_think_data)

In [None]:
dataset

### Split Into Train, Test & Val

In [None]:
from datasets import DatasetDict

In [None]:
# 1. First create train (75%) and a held-out temp set (25%)
train_test = dataset.train_test_split(test_size=0.25, seed=42)

# 2. Split the temp set into validation (40% of temp = 10% overall) and test (60% of temp = 15% overall)
test_val = train_test['test'].train_test_split(test_size=0.6, seed=42)

In [None]:
# Collect the three partitions under their conventional split names.
# The 'train' half of the second split serves as validation, its 'test' half as test.
splits = dict(
    train=train_test['train'],
    validation=test_val['train'],
    test=test_val['test'],
)

dataset_dict = DatasetDict(splits)

In [None]:
# Expose each split under its own name for the cells below.
train_set = dataset_dict['train']
val_set = dataset_dict['validation']
test_set = dataset_dict['test']

In [None]:
test_set

### Inference

In [None]:
from peft import PeftModel
from util.vision_util import process_vision_info
from peft import get_peft_model, LoraConfig, prepare_model_for_kbit_training
from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor

In [None]:
pretrained_base_model_path = "Qwen/Qwen2.5-VL-3B-Instruct"
QLORA_finetuned_model_path = '/home/aritrad/main/Qwen2.5-VL-3B/GRPO/chkpts/qwen2.5-qty-chkpt'

In [None]:
# Load the base model in bfloat16 with FlashAttention-2. The checkpoint id
# comes from `pretrained_base_model_path` so the model and the processor are
# guaranteed to be loaded from the same checkpoint.
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    pretrained_base_model_path,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    attn_implementation="flash_attention_2",
    device_map='auto',
)

# Load processor.
# The default range for the number of visual tokens per image in the model is 4-16384.
# min_pixels / max_pixels can be set to balance speed and memory usage; the values
# below correspond to a token count range of 256-512 (one token per 28x28 pixels).
# padding_side="left" keeps every prompt right-aligned in the batch, so the
# generated tokens directly follow the prompt of each sample.
processor = AutoProcessor.from_pretrained(
    pretrained_base_model_path,
    min_pixels=256*28*28,
    max_pixels=512*28*28,
    padding_side="left",
    use_fast=True,
)

In [None]:
# Load the QLORA-Trained Model.

peft_trained_model = PeftModel.from_pretrained(model, QLORA_finetuned_model_path)

### Prepare Chat Messages

In [None]:
# Build one single-turn chat message (image followed by question) per test sample.
messageList = list()

# Iterate over the rows directly so each sample is fetched from the dataset
# once, instead of once for the image and once more for the question.
for sample in tqdm(test_set):

    message = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": sample['image']
                },
                {
                    "type": "text",
                    "text": sample['question']
                },
            ],
        }
    ]

    messageList.append(message)

In [None]:
# Define the batch size.

batch_size = 16

In [None]:
# Iterate through the dataset in batches.
# `resultGeneratedAnswers` ends up with one decoded string per entry of
# `messageList`, in the same order.
resultGeneratedAnswers = list()

for i in tqdm(range(0, len(messageList), batch_size)):
    
    # Slice the dataset for the current batch
    messageBatch = messageList[i:i + batch_size]

    # Preparation for batch inference
    # Render each message with the chat template as plain text (not tokenized),
    # including the generation prompt.
    texts = [
        processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True)
        for msg in messageBatch
    ]
    # Collect the image/video inputs referenced by the messages of this batch.
    image_inputs, video_inputs = process_vision_info(messageBatch)
    # Tokenize the text and preprocess the vision inputs together, padded to a
    # common length and returned as PyTorch tensors.
    inputs = processor(
        text=texts,
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    # Batch Inference
    # At most 256 new tokens and a single returned sequence per prompt.
    generated_ids = peft_trained_model.generate(
        **inputs, 
        max_new_tokens=256,
        num_return_sequences=1,
    )
        
    # Drop the first len(prompt) ids of every output sequence so that only the
    # newly generated continuation is decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_texts = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    #print(output_texts, '\n\n')
    resultGeneratedAnswers.extend(output_texts)

In [None]:
print(resultGeneratedAnswers[0])

### Manual Verification

In [None]:
len(resultGeneratedAnswers)

In [None]:
idx = 11

In [None]:
print(test_set[idx]['question'],'\n\n', test_set[idx]['answer'],'\n\n', test_set[idx]['cot_think_data'])

In [None]:
print(resultGeneratedAnswers[idx])

In [None]:
test_set[idx]['image']

### Parsing Outputs

In [None]:
decoded_ouputs = resultGeneratedAnswers

In [None]:
# Split every decoded output into its chain of thought and its short answer.
# An output is only usable when the 'Final Answer:' marker occurs exactly
# once; any other output is recorded as 'NULL' in both lists so that the
# indices stay aligned with the generated answers.
COT_generated, shortAnswers_generated = [], []

for output in decoded_ouputs:
    parts = output.split('Final Answer:')

    # Explicit check instead of a bare `except:` around tuple unpacking, which
    # would also have swallowed unrelated errors (even KeyboardInterrupt).
    if len(parts) != 2:
        COT_generated.append('NULL')
        shortAnswers_generated.append('NULL')
        continue

    CoTAnswer, shortAnswer = parts
    COT_generated.append(CoTAnswer.strip())
    # NOTE(review): replace('.', '') removes every period, including decimal
    # points inside numbers ("2.5" -> "25") — confirm this is intended.
    shortAnswers_generated.append(shortAnswer.lower().strip().replace('.', ''))

In [None]:
# GT Labels:
# The predictions were generated for `test_set`, so the reference answers
# must come from that same split (not from the validation split).
shortAnswers_groundTruth = test_set['answer']

# Reference chains of thought, used by the text-similarity metrics in this notebook.
# NOTE(review): assumes the `cot_think_data` column holds the reference CoT text — confirm.
COT_groundtruth = test_set['cot_think_data']

In [None]:
len(COT_generated), len(shortAnswers_generated), len(shortAnswers_groundTruth), len(COT_groundtruth)

## Calculate Accuracy

### Reasoning Group wise Exact Match (GEM)

In [None]:
from collections import defaultdict

In [None]:
match_counts = defaultdict(int)
total_counts = defaultdict(int)

In [None]:
# Count exact matches (case- and whitespace-insensitive) per reasoning type.
# Answers and reasoning types are read from `test_set`, the split the
# predictions were generated for; pairing them with another split would
# compare unrelated samples and silently truncate to the shorter length.
for gt, pred, rtype in zip(test_set['answer'], shortAnswers_generated, test_set['reasoning_type']):
    total_counts[rtype] += 1
    if pred.strip().lower() == gt.strip().lower():
        match_counts[rtype] += 1

# Fraction of exact matches for every reasoning type that occurred.
accuracy_per_type = {
    rtype: match_counts[rtype] / total_counts[rtype]
    for rtype in total_counts
}

In [None]:
for rtype, acc in accuracy_per_type.items():
    print(f"{rtype.capitalize()}: {acc:.2%} ({match_counts[rtype]}/{total_counts[rtype]})")

### Evaluate Exact String Match (EM)

In [None]:
# Exact-match accuracy over all short answers; both sides are stripped and
# lower-cased before comparison.
total_predictions = len(shortAnswers_generated)
correct_predictions = 0

for i, predicted in enumerate(shortAnswers_generated):
    expected = shortAnswers_groundTruth[i]
    if predicted.strip().lower() == expected.strip().lower():
        correct_predictions += 1

# Express the hit rate as a percentage.
accuracy = (correct_predictions / total_predictions) * 100

print(f"Accuracy: {accuracy:.2f} %")

## Evaluating with BERT Score

Precision (P): How much of the candidate's content is relevant.

Recall (R): How much of the reference's content is covered by the candidate.

F1 Score (F1): Harmonic mean of Precision and Recall, commonly used as the final metric.

In [None]:
from bert_score import score

# Example references and candidates
# references = ['stool','no','person','stool','sign','bronze','door','no','red','chair','red','black']
# candidates = ['stool','no','child','stool','sign','gold','picture','no','brown','chair','brown','black']

In [None]:
# Compute BERTScore: first argument = candidates (COT_generated, the model output),
# second argument = references (COT_groundtruth).
P, R, F1 = score(COT_generated, COT_groundtruth, lang="en", verbose=True, device='cuda')

In [None]:
# Print scores
print("Mean Precision:", np.round(np.mean(P.tolist())*100, 2) )
print("Mean Recall:", np.round(np.mean(R.tolist())*100, 2) )
print("Mean F1 Score:", np.round(np.mean(F1.tolist())*100, 2) )

### Evaluating with BLEU-1 Score

In [None]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

In [None]:
# Per-sample BLEU-1 scores of the most recent `calculate_bleu_1_score` call.
bleu_scores = []


def _as_tokens(text):
    """Return *text* as a list of tokens.

    Strings are split on whitespace; any other sequence is assumed to be
    tokenized already and is copied as-is.
    """
    return text.split() if isinstance(text, str) else list(text)


# Function to compute BLEU-1 score for a list of ground truth and predicted answers
def calculate_bleu_1_score(ground_truth, predicted):
    """Return the average sentence-level BLEU-1 score over paired texts.

    Args:
        ground_truth: Iterable of reference texts (strings or token lists).
        predicted: Iterable of hypothesis texts (strings or token lists),
            paired position by position with ``ground_truth``.

    Returns:
        The mean BLEU-1 score as a float, or ``0.0`` when there are no pairs.

    Side effects:
        The module-level ``bleu_scores`` list is reset and then filled with
        the per-pair scores of this call.
    """
    # Reset first: the list used to keep growing across calls, so a second
    # call averaged over the scores of every earlier call as well.
    bleu_scores.clear()

    pairs = list(zip(ground_truth, predicted))
    if not pairs:
        # Nothing to score; avoids a division by zero below.
        return 0.0

    # This sets BLEU-1 to only consider unigram precision
    weights = [1.0] + [0.0] * 3

    # Smoothing function to handle cases with no n-gram matches
    smoothing_function = SmoothingFunction().method1

    for gt, pred in pairs:
        # sentence_bleu works on token sequences; a raw string would be
        # treated as a sequence of characters, giving character-level BLEU.
        bleu_scores.append(
            sentence_bleu(
                [_as_tokens(gt)],
                _as_tokens(pred),
                weights=weights,
                smoothing_function=smoothing_function,
            )
        )

    return sum(bleu_scores) / len(bleu_scores)

In [None]:
# Calculate the BLEU score.
# Argument order follows the (ground_truth, predicted) signature: references
# first, model output second. BLEU measures the precision of the hypothesis
# against the reference, so swapping the two changes the result.
avg_bleu = calculate_bleu_1_score(COT_groundtruth, COT_generated)
print(f"Average BLEU score: {np.round(avg_bleu*100, 2)}")

### ROUGE Score

In [None]:
from rouge_score import rouge_scorer

In [None]:
def calculate_avg_rouge_scores(references, candidates):
    """Return the mean ROUGE-1, ROUGE-2 and ROUGE-L F-measures.

    Args:
        references: Iterable of reference texts.
        candidates: Iterable of candidate texts, paired position by position
            with ``references``.

    Returns:
        A dict with the keys "ROUGE-1", "ROUGE-2" and "ROUGE-L", each mapped
        to the mean F-measure over all pairs, rounded to 4 decimals.
    """
    # Scorer metric key -> label used in the returned dict.
    metric_labels = {'rouge1': "ROUGE-1", 'rouge2': "ROUGE-2", 'rougeL': "ROUGE-L"}
    scorer = rouge_scorer.RougeScorer(list(metric_labels), use_stemmer=True)

    f_measures = {key: [] for key in metric_labels}
    for ref, cand in zip(references, candidates):
        result = scorer.score(ref, cand)
        for key in metric_labels:
            f_measures[key].append(result[key].fmeasure)

    return {
        label: np.round(np.mean(f_measures[key]), 4)
        for key, label in metric_labels.items()
    }

In [None]:
# Argument order follows the (references, candidates) signature: ground
# truth first, model output second.
avg_rouge = calculate_avg_rouge_scores(COT_groundtruth, COT_generated)
# Convert the averaged F-measures to percentages with two decimals.
avg_rouge = {k: round(v*100, 2) for k, v in avg_rouge.items()}
print("Average ROUGE scores:", avg_rouge)

### Cosine Function

In [None]:
from sentence_transformers import SentenceTransformer, util
sbert = SentenceTransformer('all-mpnet-base-v2', device = 'cuda')

In [None]:
def findCosSim(word1: str, word2: str) -> float:
    """Return the cosine similarity of two texts, rounded to 2 decimals.

    Both texts are embedded with the module-level ``sbert`` model and the
    embeddings are compared with ``util.pytorch_cos_sim``.

    Args:
        word1: First text.
        word2: Second text.

    Returns:
        The cosine similarity as a float rounded to two decimal places.
    """
    # Compute the embeddings
    embedding1 = sbert.encode(word1, convert_to_tensor=True)
    embedding2 = sbert.encode(word2, convert_to_tensor=True)

    # Compute cosine similarity; `.item()` extracts the single value as a Python float.
    cosine_score = util.pytorch_cos_sim(embedding1, embedding2)
    return round(cosine_score.item(), 2)

### Cosine Accuracy - COT Chain

In [None]:
# A generated chain counts as a hit (1) when its cosine similarity to the
# reference chain is above 0.8; unparsable outputs ('NULL') always count as 0.
COTCosineAccuracy = []

# The names `sample_idx` and `is_match` are deliberate: a global `score`
# would overwrite the `score` function imported from bert_score, and a global
# `idx` would overwrite the index chosen in the manual-verification cells.
for sample_idx in tqdm(range(len(COT_generated))):
    if COT_generated[sample_idx] == 'NULL':
        is_match = 0
    else:
        cos_sim = findCosSim(COT_generated[sample_idx], COT_groundtruth[sample_idx])
        is_match = 1 if cos_sim > 0.8 else 0

    COTCosineAccuracy.append(is_match)

In [None]:
round( ( sum(COTCosineAccuracy) / len(COTCosineAccuracy) ) * 100, 2)

### Cosine Accuracy - Short Answers

In [None]:
# One boolean per sample: True when the cosine similarity between the
# generated and the ground-truth short answer is above 0.71.
cosineAccuracy = [
    findCosSim(shortAnswers_generated[idx], shortAnswers_groundTruth[idx]) > 0.71
    for idx in tqdm(range(len(shortAnswers_groundTruth)))
]

In [None]:
round( ( sum(cosineAccuracy) / len(cosineAccuracy) ) * 100, 2)