In [1]:
!pip install nltk  
!pip install bert-score
!pip install rouge-score
!pip install peft
!pip install nltk
!pip install rouge-score

Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch>=1.0.0->bert-score)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cusparse-cu12==12.3.1.170 (from torch>=1.0.0->be

In [2]:
# import kagglehub

# # Download latest version
# path = kagglehub.dataset_download("naval2024099/vqa-vr")

# print("Path to dataset files:", path)

In [3]:

import os
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from peft import LoraConfig, get_peft_model
from sklearn.metrics import accuracy_score, f1_score
from PIL import Image
import logging
from transformers import AutoProcessor, AutoModelForVisualQuestionAnswering

2025-05-17 07:29:22.030118: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747466962.218153      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747466962.271887      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# ==============================================================================================================================================

# LoRA Finetuning and Testing

# =================================================================================================================================================


In [4]:
import os
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoProcessor, AutoModelForVisualQuestionAnswering
from peft import LoraConfig, get_peft_model
from sklearn.metrics import accuracy_score, f1_score
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from bert_score import score as bert_score
import numpy as np
from PIL import Image
import logging
import torch.nn.functional as F
from tqdm import tqdm
import gc

# Set up logging for debugging and tracking
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# ========== PARAMETERS ==========
base_model_name = "Salesforce/blip-vqa-base"  # Hugging Face checkpoint the LoRA adapters are attached to
num_epochs = 5                                # epochs per training chunk
learning_rate = 5e-5                          # AdamW learning rate
batch_size = 100                              # DataLoader batch size (training and test inference)
lora_r = 16                                   # LoRA rank
lora_alpha = 32                               # LoRA scaling factor
lora_dropout = 0.05                           # dropout applied inside the LoRA layers
train_chunk_sizes = [10000]                   # max validated questions per training chunk
test_chunk_size = 50                          # max validated questions used for test inference
save_dir = "/kaggle/working/lora_adapters_refined"         # adapters + per-chunk checkpoints
output_csv = "/kaggle/working/vqa_results_refined.csv"     # test predictions
accumulation_steps = 2                        # batches accumulated per optimizer step
previous_lora_dir = "/kaggle/working/lora_adapters"        # optional warm-start weights (see setup_model)
seed = 42                                     # RNG seed for shuffling / LoRA initialisation

# Paths
csv_base_path = '/kaggle/input/vqa-vr/'
img_base_path = '/kaggle/input/vqa-vr/filtered_images_corrected/'
train_csv_path = os.path.join(csv_base_path, 'train_main_image_id.csv')
test_csv_path = os.path.join(csv_base_path, 'test_main_image_id.csv')

# Create directories if they don't exist
os.makedirs(save_dir, exist_ok=True)

# Seed the RNGs so that DataLoader shuffling and adapter initialisation are
# repeatable between runs (GPU kernels may still be non-deterministic).
torch.manual_seed(seed)
np.random.seed(seed)

# ========== CHECK CUDA/GPU ==========
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
if device.type == "cuda":
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")
    print(f"CUDA Version: {torch.version.cuda}")
    print(f"PyTorch Version: {torch.__version__}")
    print(f"CUDA Device Count: {torch.cuda.device_count()}")
    # Start from an empty allocator cache in case an earlier cell left memory behind
    torch.cuda.empty_cache()
else:
    print("WARNING: CUDA not available, falling back to CPU.")

# Log training parameters
logger.info("Training Parameters:")
logger.info(f"Number of Epochs: {num_epochs}")
logger.info(f"Batch Size: {batch_size}")
logger.info(f"Learning Rate: {learning_rate}")
logger.info(f"LoRA Parameters: r={lora_r}, alpha={lora_alpha}, dropout={lora_dropout}")
logger.info(f"Gradient Accumulation Steps: {accumulation_steps}")
logger.info(f"Training chunk sizes: {train_chunk_sizes}")
logger.info(f"Total training questions: {sum(train_chunk_sizes)}")
logger.info(f"Test chunk size: {test_chunk_size}")
logger.info(f"Random seed: {seed}")

# Custom Dataset Class for VQA
class VQADataset(Dataset):
    """Image/question/answer dataset backed by a CSV file, used for training and testing.

    The CSV must provide the columns ``path``, ``question`` and ``correct_answer``.
    On construction every row is test-loaded (image opened and run through the
    processor) and only rows that succeed are kept, up to ``max_questions``.

    Args:
        csv_path: Path of the CSV file to read.
        img_base_path: Directory that the ``path`` column is relative to.
        processor: Callable used as ``processor(image, question, ...)``; its
            ``tokenizer`` attribute is used to encode the answer.
        max_questions: Optional cap on the number of valid rows to keep.
    """

    def __init__(self, csv_path, img_base_path, processor, max_questions=None):
        self.df = pd.read_csv(csv_path)
        self.df = self.df[['path', 'question', 'correct_answer']]
        self.img_base_path = img_base_path
        self.processor = processor
        # Row labels of self.df that loaded and processed successfully
        self.valid_indices = self._validate_dataset(max_questions)
        self.unique_images = len(set(self.df['path'].iloc[self.valid_indices]))
        logger.info(f"Loaded {len(self.valid_indices)} valid samples from {csv_path}")
        logger.info(f"Number of unique images: {self.unique_images}")

    def _load_image(self, row_idx):
        """Return an RGB copy of the image referenced by row ``row_idx``.

        The file is opened in a context manager so the handle is released as
        soon as the converted copy has been made.
        """
        img_path = os.path.join(self.img_base_path, self.df['path'][row_idx])
        with Image.open(img_path) as img:
            return img.convert('RGB')

    def _validate_dataset(self, max_questions=None):
        """Scan rows from the top and collect those that can be loaded and processed.

        Stops as soon as ``max_questions`` valid rows have been found. Rows that
        fail are skipped with a warning that includes the failure reason.

        Returns:
            List of row indices into ``self.df``.
        """
        valid_indices = []
        for idx in range(len(self.df)):
            if max_questions is not None and len(valid_indices) >= max_questions:
                break
            try:
                image = self._load_image(idx)
                question = self.df['question'][idx]
                inputs = self.processor(image, question, return_tensors="pt", padding=True, truncation=True)
                if 'pixel_values' in inputs and inputs['pixel_values'] is not None:
                    valid_indices.append(idx)
                else:
                    logger.warning(f"Skipping index {idx}: No pixel_values for image")
            except Exception as e:
                # Best-effort scan: keep going, but record why the row was dropped
                logger.warning(
                    f"Skipping index {idx}: Failed to load/process image ({type(e).__name__}: {e})"
                )
        return valid_indices

    def __len__(self):
        """Number of rows that passed validation."""
        return len(self.valid_indices)

    def __getitem__(self, idx):
        """Return the processor outputs for sample ``idx`` plus tokenised ``labels``.

        The answer is lower-cased before tokenisation. Any failure is logged and
        re-raised.
        """
        actual_idx = self.valid_indices[idx]
        try:
            image = self._load_image(actual_idx)
        except Exception:
            logger.error(f"Failed to load image at index {actual_idx}")
            raise
        question = self.df['question'][actual_idx]
        answer = str(self.df['correct_answer'][actual_idx]).lower()
        try:
            inputs = self.processor(image, question, return_tensors="pt", padding=True, truncation=True)
            # Drop the leading batch dimension added by return_tensors="pt"
            inputs = {k: v.squeeze(0) for k, v in inputs.items()}
            labels = self.processor.tokenizer.encode(answer, return_tensors="pt", padding=True, truncation=True).squeeze(0)
            inputs['labels'] = labels
            return inputs
        except Exception:
            logger.error(f"Failed to process sample at index {actual_idx}")
            raise

# Custom collate function
def custom_collate_fn(batch):
    """Merge a list of per-sample dicts into one dict of batched tensors.

    ``input_ids``, ``attention_mask`` and ``labels`` are right-padded with zeros
    to the longest sequence in the batch and returned as ``torch.long``; every
    other field is combined with ``torch.stack``.
    """
    sequence_fields = ('input_ids', 'attention_mask', 'labels')
    collated = {}
    for field in batch[0].keys():
        samples = [example[field] for example in batch]
        # Size of the last dimension of the widest sample (evaluated for every field)
        widest = max(sample.shape[-1] for sample in samples)
        if field in sequence_fields:
            padded = torch.zeros(len(samples), widest, dtype=torch.long)
            for row, sample in enumerate(samples):
                padded[row, :len(sample)] = sample
            collated[field] = padded
        else:
            # pixel_values and any other field: samples already share a shape
            collated[field] = torch.stack(samples)
    return collated

# Setup model with LoRA
def setup_model():
    """Load the base VQA model and processor and wrap the model with LoRA adapters.

    LoRA is attached to the modules named ``query`` and ``value``. If
    ``previous_lora_dir`` contains a ``pytorch_model.bin`` file, its weights are
    loaded non-strictly as a warm start; otherwise training starts from the
    freshly initialised adapters.

    Returns:
        Tuple ``(model, processor)``.
    """
    model = AutoModelForVisualQuestionAnswering.from_pretrained(base_model_name)
    processor = AutoProcessor.from_pretrained(base_model_name)
    lora_config = LoraConfig(
        r=lora_r,
        lora_alpha=lora_alpha,
        target_modules=["query", "value"],
        lora_dropout=lora_dropout,
    )
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()
    logger.info("Model with LoRA setup complete")

    previous_weights = os.path.join(previous_lora_dir, "pytorch_model.bin")
    if os.path.isfile(previous_weights):
        # Load on CPU first so this works regardless of where the file was saved;
        # the model is moved to the target device later by the training code.
        state_dict = torch.load(previous_weights, map_location="cpu")
        load_result = model.load_state_dict(state_dict, strict=False)
        logger.info(f"Loaded previous LoRA weights")
        # strict=False silently ignores mismatches, so surface them explicitly
        if load_result.unexpected_keys:
            logger.warning(
                f"{len(load_result.unexpected_keys)} keys in {previous_weights} did not match the model"
            )
    elif os.path.exists(previous_lora_dir):
        logger.warning(
            f"{previous_lora_dir} exists but has no pytorch_model.bin; starting from fresh LoRA weights"
        )
    return model, processor

# Train model
def train_model(model, processor, train_dataset, checkpoint_path, epochs=num_epochs, batch_size=batch_size):
    """Train ``model`` on ``train_dataset`` with gradient accumulation.

    If ``checkpoint_path`` exists, model and optimizer state are restored from
    it and training continues with the epoch after the one stored. After every
    epoch a checkpoint is written to ``checkpoint_path`` and the adapters are
    saved to ``save_dir``.

    Args:
        model: Model to train; moved to ``device``.
        processor: Not used by this function; kept for interface compatibility.
        train_dataset: Dataset whose items are collated by ``custom_collate_fn``.
        checkpoint_path: File used both for resuming and for saving checkpoints.
        epochs: Total number of epochs (including already completed ones).
        batch_size: DataLoader batch size.
    """
    model.to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
    start_epoch = 0
    if os.path.exists(checkpoint_path):
        # map_location lets a checkpoint written on another device be resumed here
        checkpoint = torch.load(checkpoint_path, map_location=device)
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        start_epoch = checkpoint['epoch'] + 1
        logger.info(f"Resumed training from epoch {start_epoch}")
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0, collate_fn=custom_collate_fn)
    num_batches = len(train_loader)
    model.train()
    for epoch in range(start_epoch, epochs):
        total_loss = 0
        optimizer.zero_grad()
        for batch_idx, batch in enumerate(tqdm(train_loader, desc=f"Training Epoch {epoch + 1}/{epochs}")):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            # Track the unscaled loss for reporting; back-propagate the scaled one
            total_loss += outputs.loss.item()
            (outputs.loss / accumulation_steps).backward()
            # Step every `accumulation_steps` batches, and once more on the final batch
            if (batch_idx + 1) % accumulation_steps == 0 or (batch_idx + 1) == num_batches:
                optimizer.step()
                optimizer.zero_grad()
        avg_loss = total_loss / num_batches
        logger.info(f"Epoch {epoch + 1}/{epochs}, Average Loss: {avg_loss}")
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': avg_loss,
        }, checkpoint_path)
        logger.info(f"Checkpoint saved")
        model.save_pretrained(save_dir)
        logger.info(f"LoRA adapters saved")

# Inference and save
def inference_and_save(model, processor, test_dataset, output_csv_path, batch_size):
    """Generate an answer for every sample of ``test_dataset``.

    Returns a list of dicts with the keys ``path``, ``question``,
    ``correct_answer`` and ``predicted_answer`` (both answers lower-cased).
    ``output_csv_path`` is accepted but not used here; nothing is written to
    disk by this function.
    """
    model.to(device)
    model.eval()
    loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=0, collate_fn=custom_collate_fn)
    frame = test_dataset.df
    results = []
    with torch.no_grad():
        for batch_no, batch in enumerate(tqdm(loader, desc="Inference")):
            # Labels are not needed for generation
            model_inputs = {name: tensor.to(device) for name, tensor in batch.items() if name != 'labels'}
            generated = model.generate(**model_inputs)
            predicted_answers = [
                processor.decode(sequence, skip_special_tokens=True).lower()
                for sequence in generated
            ]
            # The loader is unshuffled, so batch position maps straight back to dataset rows
            first = batch_no * batch_size
            last = min(first + len(predicted_answers), len(test_dataset))
            rows = test_dataset.valid_indices[first:last]
            for row, predicted in zip(rows, predicted_answers):
                results.append({
                    'path': frame['path'][row],
                    'question': frame['question'][row],
                    'correct_answer': str(frame['correct_answer'][row]).lower(),
                    'predicted_answer': predicted
                })
    return results

# Compute evaluation metrics
def compute_metrics(results_df):
    """Score predictions against reference answers and print a summary.

    Args:
        results_df: DataFrame with string columns ``correct_answer`` and
            ``predicted_answer``. May be empty.

    Returns:
        Dict with the keys ``accuracy``, ``f1_score`` (weighted), ``bleu_score``,
        ``rouge1``, ``rougeL``, ``bertscore_precision``, ``bertscore_recall`` and
        ``bertscore_f1``. All values are 0.0 when there is nothing to evaluate.
    """
    logger.info("Computing evaluation metrics...")
    metrics = {
        'accuracy': 0.0,
        'f1_score': 0.0,
        'bleu_score': 0.0,
        'rouge1': 0.0,
        'rougeL': 0.0,
        'bertscore_precision': 0.0,
        'bertscore_recall': 0.0,
        'bertscore_f1': 0.0
    }

    if len(results_df) == 0:
        # An empty frame may not even have the expected columns; report zeros instead of failing
        logger.warning("No valid predictions to evaluate.")
    else:
        y_true = results_df['correct_answer'].tolist()
        y_pred = results_df['predicted_answer'].tolist()

        # Accuracy and F1 Score (exact string match; each distinct answer is a class)
        metrics['accuracy'] = float(accuracy_score(y_true, y_pred))
        metrics['f1_score'] = float(f1_score(y_true, y_pred, average='weighted'))
        logger.info(f"Accuracy: {metrics['accuracy']:.4f}")
        logger.info(f"F1 Score: {metrics['f1_score']:.4f}")

        # BLEU Score
        # NOTE(review): sentence_bleu is called with its default n-gram weights; for
        # very short answers the score is presumably dominated by smoothing — confirm
        # whether a unigram BLEU is what is wanted here.
        smoothie = SmoothingFunction().method1
        bleu_scores = [
            sentence_bleu([true.split()], pred.split(), smoothing_function=smoothie)
            for true, pred in zip(y_true, y_pred)
        ]
        metrics['bleu_score'] = float(np.mean(bleu_scores))
        logger.info(f"Average BLEU Score: {metrics['bleu_score']:.4f}")

        # ROUGE Score
        rouge = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
        rouge1_scores = []
        rougeL_scores = []
        for true, pred in zip(y_true, y_pred):
            scores = rouge.score(true, pred)
            rouge1_scores.append(scores['rouge1'].fmeasure)
            rougeL_scores.append(scores['rougeL'].fmeasure)
        metrics['rouge1'] = float(np.mean(rouge1_scores))
        metrics['rougeL'] = float(np.mean(rougeL_scores))
        logger.info(f"Average ROUGE-1 F1 Score: {metrics['rouge1']:.4f}")
        logger.info(f"Average ROUGE-L F1 Score: {metrics['rougeL']:.4f}")

        # BERTScore (candidates first, references second)
        P, R, F1 = bert_score(y_pred, y_true, lang="en", verbose=False)
        metrics['bertscore_precision'] = float(np.mean(P.numpy()))
        metrics['bertscore_recall'] = float(np.mean(R.numpy()))
        metrics['bertscore_f1'] = float(np.mean(F1.numpy()))
        logger.info(f"Average BERTScore Precision: {metrics['bertscore_precision']:.4f}")
        logger.info(f"Average BERTScore Recall: {metrics['bertscore_recall']:.4f}")
        logger.info(f"Average BERTScore F1: {metrics['bertscore_f1']:.4f}")

    # Print evaluation results
    print("========== EVALUATION RESULTS ==========")
    print(f"Accuracy: {metrics['accuracy']:.4f}")
    print(f"F1 Score: {metrics['f1_score']:.4f}")
    print(f"Average BLEU Score: {metrics['bleu_score']:.4f}")
    print(f"Average ROUGE-1 F1 Score: {metrics['rouge1']:.4f}")
    print(f"Average ROUGE-L F1 Score: {metrics['rougeL']:.4f}")
    print(f"Average BERTScore Precision: {metrics['bertscore_precision']:.4f}")
    print(f"Average BERTScore Recall: {metrics['bertscore_recall']:.4f}")
    print(f"Average BERTScore F1: {metrics['bertscore_f1']:.4f}")
    print("========================================")

    return metrics

# Clean up GPU memory
def cleanup_gpu():
    """Run the garbage collector, then release PyTorch's cached CUDA memory."""
    # Collect first: objects freed by the collector hand their memory back to the
    # caching allocator, and only then can empty_cache() release it.
    gc.collect()
    torch.cuda.empty_cache()
    logger.info("GPU memory cleaned up")

# Main function
def main():
    """Run the full pipeline: set up the model, train per chunk, test, and score.

    Test predictions are written to ``output_csv`` before metrics are computed.
    The model reference is dropped and GPU memory cleaned up even when a step
    fails.
    """
    print("========== CODE STARTED ==========")
    # Bound up front so the cleanup in `finally` cannot fail (and hide the real
    # error) when setup_model() itself raises.
    model = None
    try:
        model, processor = setup_model()
        train_df = pd.read_csv(train_csv_path)
        total_train_samples = len(train_df)
        logger.info(f"Total training samples: {total_train_samples}")
        total_unique_train_images = len(set(train_df['path']))
        logger.info(f"Total unique training images: {total_unique_train_images}")
        total_processed_train_images = 0
        # NOTE(review): each chunk builds its dataset from the top of the training
        # CSV (VQADataset takes no start offset), so with more than one entry in
        # train_chunk_sizes the chunks would overlap — confirm this is intended.
        for i, chunk_size in enumerate(train_chunk_sizes):
            logger.info(f"Training on chunk {i+1}: validating up to {chunk_size} questions")
            train_dataset = VQADataset(train_csv_path, img_base_path, processor, max_questions=chunk_size)
            total_processed_train_images += train_dataset.unique_images
            logger.info(f"Chunk {i+1} unique images: {train_dataset.unique_images}")
            checkpoint_path = os.path.join(save_dir, f"checkpoint_chunk_{i+1}.pth")
            train_model(model, processor, train_dataset, checkpoint_path, epochs=num_epochs, batch_size=batch_size)
        logger.info(f"Total unique training images processed: {total_processed_train_images}")
        test_df = pd.read_csv(test_csv_path)
        total_test_samples = len(test_df)
        logger.info(f"Total test samples: {total_test_samples}")
        total_unique_test_images = len(set(test_df['path']))
        logger.info(f"Total unique test images: {total_unique_test_images}")
        logger.info(f"Processing test data: validating up to {test_chunk_size} questions")
        test_dataset = VQADataset(test_csv_path, img_base_path, processor, max_questions=test_chunk_size)
        logger.info(f"Test data unique images: {test_dataset.unique_images}")
        test_results = inference_and_save(model, processor, test_dataset, output_csv, batch_size=batch_size)
        results_df = pd.DataFrame(test_results)
        results_df.to_csv(output_csv, index=False)
        logger.info(f"Results saved to {output_csv}")
        compute_metrics(results_df)
    finally:
        # Drop the model reference before cleaning up so its memory can be reclaimed
        del model
        logger.info("Model deleted to free up memory")
        cleanup_gpu()
        print("========== CODE ENDED ==========")

# Entry point: run the training + test pipeline when this code is executed as
# the main module.
if __name__ == "__main__":
    main()

Using device: cuda
GPU Name: Tesla T4
CUDA Version: 12.4
PyTorch Version: 2.6.0+cu124
CUDA Device Count: 2


config.json:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.54G [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/445 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

trainable params: 2,359,296 || all params: 387,031,868 || trainable%: 0.6096


Training Epoch 1/5:   0%|          | 0/100 [00:00<?, ?it/s]We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.
Training Epoch 1/5: 100%|██████████| 100/100 [11:01<00:00,  6.62s/it]
Training Epoch 2/5: 100%|██████████| 100/100 [11:08<00:00,  6.68s/it]
Training Epoch 3/5: 100%|██████████| 100/100 [11:05<00:00,  6.65s/it]
Training Epoch 4/5: 100%|██████████| 100/100 [11:04<00:00,  6.65s/it]
Training Epoch 5/5: 100%|██████████| 100/100 [11:04<00:00,  6.64s/it]
Inference: 100%|██████████| 1/1 [00:03<00:00,  3.53s/it]


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Accuracy: 0.5600
F1 Score: 0.5294
Average BLEU Score: 0.0996
Average ROUGE-1 F1 Score: 0.5600
Average ROUGE-L F1 Score: 0.5600
Average BERTScore F1: 0.9715


# ====================================================================================================================================

# Validation Test 


# ================================================================================================================================================

In [5]:
print("========== CODE STARTED ==========")
print("started validation")
import torch
import pandas as pd
from torch.utils.data import Dataset  # base class of VQADataset below
from transformers import AutoProcessor, AutoModelForVisualQuestionAnswering
from peft import PeftModel
from PIL import Image
import logging
from sklearn.metrics import accuracy_score, f1_score
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from bert_score import score as bert_score
import numpy as np
from tqdm import tqdm
import os

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Define paths, device, and parameters
base_model_name = "Salesforce/blip-vqa-base"
# Directory holding the saved adapters and the per-chunk training checkpoints
lora_adapters_path = "/kaggle/working/lora_adapters_refined"
val_csv_path = "/kaggle/input/vqa-vr/val_main_image_id.csv"
img_base_path = "/kaggle/input/vqa-vr/filtered_images_corrected/"
output_csv = "/kaggle/working/validation_of_fine_tuned_model.csv"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Default cap on validation questions; the __main__ block at the end of this
# cell assigns its own value before calling validate().
test_chunk_size = 20
logger.info(f"Using device: {device}")

# Custom Dataset Class for Validation
class VQADataset(Dataset):
    """Validation dataset yielding raw images with their question and answer.

    The CSV must provide the columns ``path``, ``question`` and
    ``correct_answer``; answers are converted to lower-case strings on load.
    On construction every row is test-loaded (image opened and run through the
    processor) and only rows that succeed are kept, up to ``max_questions``.

    Args:
        csv_path: Path of the CSV file to read.
        img_base_path: Directory that the ``path`` column is relative to.
        processor: Callable used as ``processor(image, question, ...)`` during validation.
        max_questions: Optional cap on the number of valid rows to keep.
    """

    def __init__(self, csv_path, img_base_path, processor, max_questions=None):
        self.df = pd.read_csv(csv_path)
        # Work on an explicit copy so the column assignment below does not hit
        # pandas' chained-assignment (SettingWithCopy) warning.
        self.df = self.df[['path', 'question', 'correct_answer']].copy()
        self.df['correct_answer'] = self.df['correct_answer'].astype(str).str.lower()
        self.img_base_path = img_base_path
        self.processor = processor
        # Row labels of self.df that loaded and processed successfully
        self.valid_indices = self._validate_dataset(max_questions)
        self.unique_images = len(set(self.df['path'].iloc[self.valid_indices]))
        logger.info(f"Loaded {len(self.valid_indices)} valid samples from {csv_path}")
        logger.info(f"Number of unique images: {self.unique_images}")

    def _load_image(self, row_idx):
        """Return an RGB copy of the image referenced by row ``row_idx``.

        The file is opened in a context manager so the handle is released as
        soon as the converted copy has been made.
        """
        img_path = os.path.join(self.img_base_path, self.df['path'][row_idx])
        with Image.open(img_path) as img:
            return img.convert('RGB')

    def _validate_dataset(self, max_questions=None):
        """Scan rows from the top and collect those that can be loaded and processed.

        Stops as soon as ``max_questions`` valid rows have been found. Rows that
        fail are skipped with a warning that includes the failure reason.

        Returns:
            List of row indices into ``self.df``.
        """
        valid_indices = []
        for idx in range(len(self.df)):
            if max_questions is not None and len(valid_indices) >= max_questions:
                break
            try:
                image = self._load_image(idx)
                question = self.df['question'][idx]
                inputs = self.processor(image, question, return_tensors="pt", padding=True, truncation=True)
                if 'pixel_values' in inputs and inputs['pixel_values'] is not None:
                    valid_indices.append(idx)
                else:
                    logger.warning(f"Skipping index {idx}: No pixel_values")
            except Exception as e:
                # Best-effort scan: keep going, but record why the row was dropped
                logger.warning(
                    f"Skipping index {idx}: Failed to load/process image ({type(e).__name__}: {e})"
                )
        return valid_indices

    def __len__(self):
        """Number of rows that passed validation."""
        return len(self.valid_indices)

    def __getitem__(self, idx):
        """Return a dict with ``image``, ``question``, ``correct_answer`` and ``path``.

        A failure to load the image is logged and re-raised.
        """
        actual_idx = self.valid_indices[idx]
        try:
            image = self._load_image(actual_idx)
        except Exception:
            logger.error(f"Failed to load image at index {actual_idx}")
            raise
        question = self.df['question'][actual_idx]
        correct_answer = self.df['correct_answer'][actual_idx]
        return {
            'image': image,
            'question': question,
            'correct_answer': correct_answer,
            'path': self.df['path'][actual_idx]
        }

# Function to load the model for a specific chunk
def load_model_for_chunk(chunk_number):
    """Load the base model, apply the saved LoRA adapters and a chunk checkpoint.

    Args:
        chunk_number: Which ``checkpoint_chunk_<n>.pth`` under
            ``lora_adapters_path`` to load; only 1, 2 or 3 are accepted.

    Returns:
        Tuple ``(model, processor)`` with the model on ``device`` in eval mode.

    Raises:
        ValueError: If ``chunk_number`` is not 1, 2 or 3.
        FileNotFoundError: If the checkpoint file does not exist.
    """
    if chunk_number not in [1, 2, 3]:
        raise ValueError("Chunk number must be 1, 2, or 3.")
    # Fail fast: verify the checkpoint is there before loading the (large) base model
    checkpoint_path = os.path.join(lora_adapters_path, f"checkpoint_chunk_{chunk_number}.pth")
    if not os.path.exists(checkpoint_path):
        raise FileNotFoundError(f"Checkpoint for chunk {chunk_number} not found")
    logger.info("Loading base model and processor...")
    base_model = AutoModelForVisualQuestionAnswering.from_pretrained(base_model_name)
    processor = AutoProcessor.from_pretrained(base_model_name)
    logger.info(f"Loading LoRA adapters from {lora_adapters_path}...")
    model = PeftModel.from_pretrained(base_model, lora_adapters_path)
    logger.info(f"Loading checkpoint for chunk {chunk_number}...")
    checkpoint = torch.load(checkpoint_path, map_location=device)
    # Non-strict: keys that do not match the wrapped model are ignored
    model.load_state_dict(checkpoint['model_state_dict'], strict=False)
    model.to(device)
    model.eval()
    logger.info(f"Model with checkpoint for chunk {chunk_number} loaded successfully.")
    return model, processor

# Perform inference on the validation dataset
def perform_inference(model, processor, val_dataset):
    """Predict an answer for each sample of ``val_dataset``, one sample at a time.

    Returns a list of dicts with the keys ``path``, ``question``,
    ``correct_answer`` and ``predicted_answer``. A sample whose processing or
    generation fails gets the literal prediction ``"error"``.
    """
    def inference_single(image, question):
        """Return the lower-cased generated answer, or "error" on any failure."""
        try:
            encoded = processor(image, question, return_tensors="pt", padding=True, truncation=True)
            encoded = {name: tensor.to(device) for name, tensor in encoded.items()}
            with torch.no_grad():
                generated = model.generate(**encoded)
            return processor.decode(generated[0], skip_special_tokens=True).lower()
        except Exception as e:
            logger.error(f"Failed to process image: {str(e)}")
            return "error"

    logger.info("Performing inference on validation dataset...")
    records = []
    for sample in tqdm(val_dataset, total=len(val_dataset), desc="Inference"):
        record = {
            'path': sample['path'],
            'question': sample['question'],
            'correct_answer': sample['correct_answer'],
        }
        record['predicted_answer'] = inference_single(sample['image'], sample['question'])
        records.append(record)
    return records

# Main function to run validation
def _score_validation_predictions(y_true, y_pred):
    """Compute and log accuracy, weighted F1, BLEU, ROUGE and BERTScore for paired answers."""
    scores = {}

    # Accuracy and F1 Score
    scores['accuracy'] = accuracy_score(y_true, y_pred)
    scores['f1_score'] = f1_score(y_true, y_pred, average='weighted')
    logger.info(f"Accuracy: {scores['accuracy']:.4f}")
    logger.info(f"F1 Score: {scores['f1_score']:.4f}")

    # BLEU Score
    smoothie = SmoothingFunction().method1
    per_sample_bleu = [
        sentence_bleu([reference.split()], candidate.split(), smoothing_function=smoothie)
        for reference, candidate in zip(y_true, y_pred)
    ]
    scores['bleu_score'] = np.mean(per_sample_bleu)
    logger.info(f"Average BLEU Score: {scores['bleu_score']:.4f}")

    # ROUGE Score
    rouge = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
    per_sample_rouge = [rouge.score(reference, candidate) for reference, candidate in zip(y_true, y_pred)]
    scores['rouge1'] = np.mean([item['rouge1'].fmeasure for item in per_sample_rouge])
    scores['rougeL'] = np.mean([item['rougeL'].fmeasure for item in per_sample_rouge])
    logger.info(f"Average ROUGE-1 F1 Score: {scores['rouge1']:.4f}")
    logger.info(f"Average ROUGE-L F1 Score: {scores['rougeL']:.4f}")

    # BERTScore (candidates first, references second)
    precision, recall, f1 = bert_score(y_pred, y_true, lang="en", verbose=False)
    scores['bertscore_precision'] = np.mean(precision.numpy())
    scores['bertscore_recall'] = np.mean(recall.numpy())
    scores['bertscore_f1'] = np.mean(f1.numpy())
    logger.info(f"Average BERTScore Precision: {scores['bertscore_precision']:.4f}")
    logger.info(f"Average BERTScore Recall: {scores['bertscore_recall']:.4f}")
    logger.info(f"Average BERTScore F1: {scores['bertscore_f1']:.4f}")
    return scores


def validate(chunk_number, test_chunk_size):
    """Run the fine-tuned model on the validation set, save predictions and print metrics.

    Args:
        chunk_number: Training chunk whose checkpoint is loaded.
        test_chunk_size: Maximum number of validated questions to evaluate.

    Predictions are written to ``output_csv``. Rows whose prediction is the
    literal ``"error"`` are left out of the metrics; when no rows remain, every
    metric is reported as 0.0.
    """
    print(f"========== VALIDATING CHUNK {chunk_number} ==========")
    model, processor = load_model_for_chunk(chunk_number)
    logger.info(f"Loading validation dataset with up to {test_chunk_size} questions...")
    val_dataset = VQADataset(val_csv_path, img_base_path, processor, max_questions=test_chunk_size)
    results_df = pd.DataFrame(perform_inference(model, processor, val_dataset))
    results_df.to_csv(output_csv, index=False)
    logger.info(f"Validation results saved to {output_csv}")

    logger.info("Computing evaluation metrics...")
    metric_names = (
        'accuracy',
        'f1_score',
        'bleu_score',
        'rouge1',
        'rougeL',
        'bertscore_precision',
        'bertscore_recall',
        'bertscore_f1',
    )
    metrics = dict.fromkeys(metric_names, 0.0)
    scored_rows = results_df[results_df['predicted_answer'] != "error"]
    if len(scored_rows) > 0:
        metrics.update(
            _score_validation_predictions(
                scored_rows['correct_answer'].tolist(),
                scored_rows['predicted_answer'].tolist(),
            )
        )
    else:
        logger.warning("No valid predictions to evaluate.")

    # Print evaluation results
    print("========== EVALUATION RESULTS ==========")
    print(f"Accuracy: {metrics['accuracy']:.4f}")
    print(f"F1 Score: {metrics['f1_score']:.4f}")
    print(f"Average BLEU Score: {metrics['bleu_score']:.4f}")
    print(f"Average ROUGE-1 F1 Score: {metrics['rouge1']:.4f}")
    print(f"Average ROUGE-L F1 Score: {metrics['rougeL']:.4f}")
    print(f"Average BERTScore Precision: {metrics['bertscore_precision']:.4f}")
    print(f"Average BERTScore Recall: {metrics['bertscore_recall']:.4f}")
    print(f"Average BERTScore F1: {metrics['bertscore_f1']:.4f}")
    print("========================================")

    torch.cuda.empty_cache()
    logger.info("GPU memory cleaned up.")

# Run validation
# Entry point of the validation cell: evaluate the chunk-1 checkpoint on up to
# 200 validated questions. The end banner is printed even if validation fails.
if __name__ == "__main__":
    try:
        chunk_number = 1
        # Rebinds the module-level test_chunk_size set in the configuration above
        test_chunk_size = 200
        validate(chunk_number, test_chunk_size)
    finally:
        print("========== CODE ENDED ==========")

started validation


Inference: 100%|██████████| 200/200 [00:21<00:00,  9.37it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Accuracy: 0.4850
F1 Score: 0.4285
Average BLEU Score: 0.0862
Average ROUGE-1 F1 Score: 0.5025
Average ROUGE-L F1 Score: 0.5025
Average BERTScore Precision: 0.9623
Average BERTScore Recall: 0.9566
Average BERTScore F1: 0.9587
