In [1]:
!pip install --upgrade pip
!pip install -q git+https://github.com/huggingface/peft.git transformers bitsandbytes datasets 
!pip install git+https://github.com/huggingface/transformers

Collecting pip
  Using cached pip-24.2-py3-none-any.whl.metadata (3.6 kB)
Using cached pip-24.2-py3-none-any.whl (1.8 MB)
Installing collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.0
    Uninstalling pip-24.0:
      Successfully uninstalled pip-24.0
Successfully installed pip-24.2
Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-o4s95vp8
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers /tmp/pip-req-build-o4s95vp8
  Resolved https://github.com/huggingface/transformers to commit c63a3d0f1791e018de447ac570fc7029d1ea19bd
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: transformers
  Building wheel for transformers (pyproject.toml) ... [?25ldone
[?25h  Creat

In [2]:
import os
import base64
import torch
import pickle
from PIL import Image
from io import BytesIO
from torch.utils.data import DataLoader, random_split
from transformers import Blip2Processor, Blip2ForConditionalGeneration
from datasets import load_dataset
from tqdm import tqdm
from sklearn.metrics import f1_score, accuracy_score

# Load the BLIP2 model and processor
processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b")

# # Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

torch.cuda.empty_cache()
torch.manual_seed(42)

2024-08-17 17:14:07.687363: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-08-17 17:14:07.730918: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-17 17:14:07.730987: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-17 17:14:07.732136: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-08-17 17:14:07.739160: I tensorflow/core/platform/cpu_feature_guar

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

<torch._C.Generator at 0x7f2fac1f3250>

In [3]:
import json
import os
from PIL import Image
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoProcessor

class ChartQADataset(Dataset):
    """ChartQA dataset."""
    def __init__(self, json_file, image_folder, processor, image_size=(512, 512)):
        self.image_folder = image_folder
        self.image_size = image_size
        self.processor = processor

        # Load JSON data
        with open(json_file, 'r') as f:
            self.dataset = json.load(f)

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        data = self.dataset[idx]

        # Load image
        img_path = os.path.join(self.image_folder, data['imgname'])
        image = Image.open(img_path).convert("RGB")
        
        # Resize image
        image = image.resize(self.image_size)

        # Get text data
        query = "Question: " + data['query']
        label = data['label']
        
        # Process image and text
        encodings = self.processor(images=image, text=query, padding='max_length', truncation=True, max_length=100, return_tensors="pt")
        
        # Process labels
        labels = self.processor.tokenizer.encode(label, max_length=6, padding='max_length', truncation=True, return_tensors='pt')
        
        encodings["labels"] = labels.squeeze()
        
        return encodings

# File paths
train_json_file = 'chartqa/train/train_merged.json'
val_json_file = 'chartqa/val/val_merged.json'
test_json_file = 'chartqa/test/test_merged.json'

train_image_folder = 'chartqa/train/png'
val_image_folder = 'chartqa/val/png'
test_image_folder = 'chartqa/test/png'

# Create dataset instances
train_dataset = ChartQADataset(train_json_file, train_image_folder, processor)
val_dataset = ChartQADataset(val_json_file, val_image_folder, processor)
test_dataset = ChartQADataset(test_json_file, test_image_folder, processor)


In [4]:
# Define dataloaders and other components
batch_size = 3  # Reduced batch size
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, pin_memory=True)
valid_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=3, shuffle=False, pin_memory=True)

In [14]:
import torch
import pickle
import os
from tqdm import tqdm
import matplotlib.pyplot as plt

# Define optimizer and scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-6)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.15, patience=2, verbose=True)

# Training parameters
num_epochs = 25
patience = 3
min_eval_loss = float("inf")
early_stopping_hook = 0
tracking_information = []
train_losses = []
val_losses = []

# Training loop
for epoch in range(num_epochs):
    epoch_loss = 0
    model.train()
    optimizer.zero_grad()  # Ensure gradients are zeroed at the start of each epoch

    for batch in tqdm(train_dataloader, desc='Training batch: ...'):
        input_ids = batch.pop('input_ids').squeeze(1).to(device)
        pixel_values = batch.pop('pixel_values').squeeze(1).to(device)
        attention_mask = batch.pop('attention_mask').squeeze(1).to(device)
        labels = batch.pop('labels').to(device)
        
        with torch.cuda.amp.autocast(dtype=torch.float16):
            outputs = model(input_ids=input_ids,
                            pixel_values=pixel_values,
                            labels=labels)
            
        loss = outputs.loss
        epoch_loss += loss.item()

        loss.backward()

        # Gradient Clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        optimizer.step()  # Perform optimizer step
        optimizer.zero_grad()  # Reset gradients after step

        # Optionally, clear cache to manage memory
        torch.cuda.empty_cache()

    avg_train_loss = epoch_loss / len(train_dataloader)
    train_losses.append(avg_train_loss)

    # Evaluation loop
    model.eval()
    eval_loss = 0
    eval_accuracy = 0  # Initialize accuracy metric
    with torch.no_grad():
        for batch in tqdm(valid_dataloader, desc='Validating batch: ...'):
            input_ids = batch.pop('input_ids').squeeze(1).to(device)
            pixel_values = batch.pop('pixel_values').squeeze(1).to(device)
            attention_mask = batch.pop('attention_mask').squeeze(1).to(device)
            labels = batch.pop('labels').to(device)

            with torch.cuda.amp.autocast(dtype=torch.float16):
                outputs = model(input_ids=input_ids,
                                pixel_values=pixel_values,
                                labels=labels)
            
            loss = outputs.loss
            eval_loss += loss.item()

            # Calculate accuracy (if relevant for your task)
            preds = torch.argmax(outputs.logits, dim=-1)
            eval_accuracy += (preds == labels).sum().item() / len(labels)

    avg_val_loss = eval_loss / len(valid_dataloader)
    avg_val_accuracy = eval_accuracy / len(valid_dataloader)  # Average accuracy
    val_losses.append(avg_val_loss)

    tracking_information.append((avg_train_loss, avg_val_loss, avg_val_accuracy, optimizer.param_groups[0]["lr"]))
    print(f"Epoch: {epoch+1} - Training loss: {avg_train_loss:.4f} - Eval Loss: {avg_val_loss:.4f} - Accuracy: {avg_val_accuracy:.4f} - LR: {optimizer.param_groups[0]['lr']:.6f}")
    
    # Adjust learning rate based on validation loss
    scheduler.step(avg_val_loss)
   
    # Save the best model
    try:
        if avg_val_loss < min_eval_loss:
            torch.save(model.state_dict(), 'vqa_best_model.pth') 
            print("Saved model to vqa_best_model.pth")
            min_eval_loss = avg_val_loss
            early_stopping_hook = 0
        else:
            early_stopping_hook += 1
            if early_stopping_hook > patience:
                print("Early stopping triggered.")
                break
        # Save the last model
        torch.save(model.state_dict(), 'vqa_last_model.pth')
    except Exception as e:
        print(f"Error saving the model: {e}")

# Save tracking information
with open("chartqa_tracking_information.pkl", "wb") as f:
    pickle.dump(tracking_information, f)
print("The fine-tuning process has completed!")

epochs_completed = len(train_losses)

plt.figure(figsize=(14, 6))

# Plot Losses
plt.plot(range(1, epochs_completed + 1), train_losses, label='Training Loss', color='blue')
plt.plot(range(1, epochs_completed + 1), val_losses, label='Validation Loss', color='red')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training and Validation Loss')
plt.legend()
plt.grid(True)

plt.tight_layout()
plt.savefig('loss_plot.png')  # Save the plot as a .png file
plt.show()  # Show the plot


In [6]:
!pip install nltk word2number

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[0mCollecting nltk
  Downloading nltk-3.9-py3-none-any.whl.metadata (2.9 kB)
Collecting word2number
  Using cached word2number-1.1-py3-none-any.whl
Downloading nltk-3.9-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m32.8 MB/s[0m eta [36m0:00:00[0m
[0mInstalling collected packages: word2number, nltk
[0mSuccessfully installed nltk-3.9 word2number-1.1


In [13]:
import torch
from tqdm import tqdm
import numpy as np
import difflib
from nltk.corpus import wordnet
from word2number import w2n
import nltk

# Download WordNet data for nltk
nltk.download('wordnet')

def get_synonyms(word):
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonyms.add(lemma.name().lower())
    return synonyms

def word_to_num(word):
    try:
        return w2n.word_to_num(word)
    except ValueError:
        return None

def relaxed_correctness(prediction, ground_truth, num_threshold=0.05, text_similarity_threshold=0.8):
    prediction = prediction.lower().strip()
    ground_truth = ground_truth.lower().strip()

    # Try to handle numbers
    pred_num = word_to_num(prediction)
    gt_num = word_to_num(ground_truth)

    if pred_num is not None and gt_num is not None:
        lower_bound = gt_num * (1 - num_threshold)
        upper_bound = gt_num * (1 + num_threshold)
        if lower_bound <= pred_num <= upper_bound:
            return 1.0

    if pred_num is not None and ground_truth.isdigit():
        lower_bound = int(ground_truth) * (1 - num_threshold)
        upper_bound = int(ground_truth) * (1 + num_threshold)
        if lower_bound <= pred_num <= upper_bound:
            return 1.0

    if gt_num is not None and prediction.isdigit():
        lower_bound = gt_num * (1 - num_threshold)
        upper_bound = gt_num * (1 + num_threshold)
        if lower_bound <= int(prediction) <= upper_bound:
            return 1.0

    # Exact match
    if prediction == ground_truth:
        return 1.0

    # Synonym match
    prediction_synonyms = get_synonyms(prediction)
    ground_truth_synonyms = get_synonyms(ground_truth)
    if prediction in ground_truth_synonyms or ground_truth in prediction_synonyms:
        return 1.0

    # Approximate string matching
    similarity = difflib.SequenceMatcher(None, prediction, ground_truth).ratio()
    if similarity > text_similarity_threshold:  # Threshold for relaxed match
        return similarity

    return 0.0

def evaluate(model, loader, processor, device, num_examples=15):
    model.eval()
    total_loss = 0
    num_batches = 0
    all_labels = []
    all_preds = []
    all_relaxed_correctness_scores = []

    example_inputs = []
    example_predictions = []
    example_ground_truths = []

    with torch.no_grad():
        for batch in tqdm(loader, desc='Evaluating batch:'):
            input_ids = batch['input_ids'].squeeze(1).to(device)
            pixel_values = batch['pixel_values'].squeeze(1).to(device)
            labels = batch['labels'].to(device)
            
            model.to(device)
            torch.cuda.empty_cache()

            outputs = model(input_ids=input_ids, pixel_values=pixel_values, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()
            num_batches += 1

            logits = outputs.logits
            preds = torch.argmax(logits, dim=-1)

            all_labels.append(labels.cpu().numpy())
            all_preds.append(preds.cpu().numpy())

            if len(example_inputs) < num_examples:
                example_inputs.extend(input_ids.cpu().numpy())
                example_predictions.extend(preds.cpu().numpy())
                example_ground_truths.extend(labels.cpu().numpy())

            for i in range(len(labels)):
                try:
                    model_answer = processor.tokenizer.decode(preds[i], skip_special_tokens=True)
                    ground_truth = processor.tokenizer.decode(labels[i], skip_special_tokens=True)
                    correctness = relaxed_correctness(model_answer, ground_truth)
                    all_relaxed_correctness_scores.append(correctness)
                except Exception as e:
                    print(f"Error calculating correctness for sample {i}: {e}")
                    all_relaxed_correctness_scores.append(0.0)

    all_labels = np.concatenate(all_labels, axis=0)
    all_preds = np.concatenate(all_preds, axis=0)

    average_loss = total_loss / num_batches
    accuracy = np.mean([np.mean(all_labels[:, i] == all_preds[:, i]) for i in range(all_labels.shape[1])])
    average_relaxed_correctness = np.mean(all_relaxed_correctness_scores)

    for i in range(min(num_examples, len(example_inputs))):
        input_text = processor.tokenizer.decode(example_inputs[i], skip_special_tokens=True)
        predicted_label = processor.tokenizer.decode(example_predictions[i], skip_special_tokens=True)
        actual_label = processor.tokenizer.decode(example_ground_truths[i], skip_special_tokens=True)
        print(f"Input: {input_text}")
        print(f"Predicted Label: {predicted_label}")
        print(f"Actual Label: {actual_label}")
        print()

    return average_loss, accuracy, average_relaxed_correctness


# Load the best model for final evaluation
model.load_state_dict(torch.load('vqa_last_model.pth'))

# Final Evaluation on Test Set
test_loss, test_accuracy, test_relaxed_correctness = evaluate(model, test_loader, processor, device)
print(f"Test Loss: {test_loss:.4f}, Accuracy: {test_accuracy * 100:.2f}%, Relaxed Correctness: {test_relaxed_correctness * 100:.2f}%")


[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Evaluating batch:: 100%|██████████| 834/834 [07:00<00:00,  1.98it/s]

Input: Question: How many stores did Saint Laurent operate in Western Europe in 2020?
Predicted Label: 71
Actual Label: 47

Input: Question: In what year did online sales make up 6.8 percent of retail sales of jewelry, watches and accessories in Germany?
Predicted Label: 2013
Actual Label: 2013

Input: Question: What percentage of the retail sales of jewelry, watches and accessories in Germany were online in 2013?
Predicted Label: 11.
Actual Label: 6.8

Input: Question: What is the predicted increase in online sales of jewelry, watches and accessories in Germany by 2018?
Predicted Label: 11
Actual Label: 11

Input: Question: How many companies were in Hungary's insurance market in 2013?
Predicted Label: 28
Actual Label: 36

Input: Question: How many companies were in Hungary's insurance market in 2019?
Predicted Label: 28
Actual Label: 23

Input: Question: How many drone strikes did the U.S. carry out in Somalia in 2019?
Predicted Label: 12
Actual Label: 63

Input: Question: In what ye




# Code to use the model

In [2]:
import os
import base64
import torch
import pickle
from PIL import Image
from io import BytesIO
from torch.utils.data import DataLoader, random_split
from transformers import Blip2Processor, Blip2ForConditionalGeneration
from datasets import load_dataset
from tqdm import tqdm
from sklearn.metrics import f1_score, accuracy_score

# Load the BLIP2 model and processor
processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b")

# # Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

torch.cuda.empty_cache()
torch.manual_seed(42)
# Load the fine-tuned weights
model.load_state_dict(torch.load('vqa_last_model.pth'))

# Set the model to evaluation mode
model.eval()

# Move model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

from PIL import Image

# Example question and image
question = "In what year did online sales make up 6.8 percent of retail sales of jewelry, watches and accessories in Germany?"
image_path = "chartqa/test/png/multi_col_20436.png"  # Replace with the path to your image

# Load and preprocess the image
image = Image.open(image_path).convert("RGB")
inputs = processor(text=question, images=image, return_tensors="pt").to(device)


2024-08-20 19:17:39.039387: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-08-20 19:17:39.090474: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-20 19:17:39.090542: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-20 19:17:39.092093: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-08-20 19:17:39.099301: I tensorflow/core/platform/cpu_feature_guar

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Expanding inputs for image tokens in BLIP-2 should be done in processing. Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your BLIP-2 model. Using processors without these attributes in the config is deprecated and will throw an error in v4.47.


In [9]:
# Check if input_ids is correctly generated
if "input_ids" in inputs and inputs["input_ids"].size(1) > 0:
    # Make the prediction
    generated_ids = model.generate(**inputs, max_length=100)  # You can set max_length to a suitable value
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
    print(generated_text)
else:
    print("Error: input_ids were not generated correctly. Please check the input data.")

That was 2013.
