In [12]:
import random
import torch
from datasets import load_dataset,Dataset
from transformers import AutoProcessor, LlavaForConditionalGeneration, AutoModelForCausalLM, AutoTokenizer, AutoModelForSeq2SeqLM
from accelerate import Accelerator
from PIL import Image
import os
import itertools
import pandas as pd
# import tiktoken
import matplotlib.pyplot as plt
# Create the Accelerate helper with fp16 mixed precision requested.
# It is used once, in the model-loading cell, to wrap the LLaVA model via accelerator.prepare(...).
accelerator = Accelerator(mixed_precision="fp16")


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [3]:
import os

from huggingface_hub import login

# Never embed the access token in the notebook: it would be saved with the file
# and leak through version control or shared copies. Read it from the environment.
# An unset or empty HF_TOKEN becomes None, which makes `login` fall back to its
# interactive prompt instead of attempting to authenticate with an empty string.
login(token=os.environ.get("HF_TOKEN") or None)

In [4]:

# Step 1: Load the VQAv2 validation split.
# streaming=True yields examples lazily, so only the entries actually consumed
# by the sampling cell are fetched.
print("Loading the VQAv2 dataset...")
try:
    streamed_dataset = load_dataset("lmms-lab/VQAv2", split="validation", streaming=True)
except Exception as exc:
    # `except Exception` (not a bare `except`) lets KeyboardInterrupt/SystemExit through,
    # and `from exc` keeps the real cause (network, auth, wrong repo id) in the traceback.
    raise ValueError("VQAv2 dataset not found. Ensure it's correctly downloaded or accessible.") from exc


Loading the VQAv2 dataset...


In [5]:
# Step 2: Take the first `num_samples` entries from the streamed dataset.
# islice consumes the stream in order, so this is a head-of-stream slice, not a random sample.
num_samples = 50
# Shuffled alternative kept for reference: dataset.shuffle(seed=40).select(range(num_samples))
samples1 = list(itertools.islice(streamed_dataset, num_samples))

# Step 3: Convert the list of samples to an in-memory Hugging Face Dataset
dataset = Dataset.from_list(samples1)
samples = dataset  # alias iterated by the evaluation loop further down


In [6]:
# Load the LLaVA-1.5-7B checkpoint with float16 weights; device_map="auto" delegates device placement to the loader.
model = LlavaForConditionalGeneration.from_pretrained("llava-hf/llava-1.5-7b-hf", torch_dtype=torch.float16, device_map="auto")
# The processor is what the evaluation loop uses to build chat prompts and to encode image + text together.
processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")
# NOTE(review): the model has already been placed by device_map="auto" above — confirm that
# wrapping it with accelerator.prepare is still required for inference-only use.
model = accelerator.prepare(model)
tokenizer = processor.tokenizer  # Use the tokenizer from the processor
model.device  # last expression: displayed as the cell output to show where the model lives

Loading checkpoint shards: 100%|██████████| 3/3 [00:11<00:00,  3.95s/it]
Some kwargs in processor config are unused and will not have any effect: num_additional_image_tokens. 


device(type='cuda', index=0)

In [7]:
# tokenizer_llama = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
# model_llama = AutoModelForCausalLM.from_pretrained(
#     "meta-llama/Llama-2-7b-chat-hf",
#     device_map="auto",
#     use_cache=None,
#     attn_implementation=None,
# )

In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def _extract_between_delimiters(text, start_delimiter="### START ###", end_delimiter="### END ###"):
    """Return the stripped text between the LAST start delimiter and the next end delimiter, or None."""
    start_index = text.rfind(start_delimiter)
    if start_index == -1:
        return None
    start_index += len(start_delimiter)
    end_index = text.find(end_delimiter, start_index)
    if end_index == -1:
        return None
    return text[start_index:end_index].strip()


# Step 3: Define a function to generate text
def generate_formatted_response(tokenizer, model, prompt, max_new_tokens=100):
    """
    Generate a completion for `prompt` and return the text found between the
    last "### START ###" marker and the following "### END ###" marker.

    The decoded output contains the prompt followed by the newly generated
    tokens, so searching for the LAST start marker skips any worked example
    embedded in the prompt and lands on the marker the model is asked to complete.

    Parameters:
    - tokenizer: callable tokenizer; must return a mapping with "input_ids"
      (and optionally "attention_mask") that supports `.to(device)`, and must
      provide `.decode(...)`.
    - model: object exposing `.generate(...)`.
    - prompt (str): full prompt text.
    - max_new_tokens (int): upper bound on generated tokens.

    Returns:
    - str: the extracted, whitespace-stripped text, or the literal
      "No content found between delimiters." when either marker is missing
      (for example when generation is cut off before the end marker).
    """
    # Send inputs to the device the model actually lives on; fall back to the
    # notebook-level default when the model object does not expose `.device`.
    target_device = getattr(model, "device", device)
    inputs = tokenizer(prompt, return_tensors="pt").to(target_device)

    outputs = model.generate(
        inputs["input_ids"],
        # Forward the mask produced by the tokenizer so generate() does not have to guess it.
        attention_mask=inputs.get("attention_mask"),
        max_new_tokens=max_new_tokens,
        # Single-beam decoding. `early_stopping` is a beam-search option and has
        # no effect with num_beams=1, so it is intentionally not passed.
        num_beams=1,
    )

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    extracted_text = _extract_between_delimiters(response)
    if extracted_text is None:
        return "No content found between delimiters."
    return extracted_text

In [80]:
# Exact-match scoring helper
def exact_match(pred, target):
    """Return 1 when `pred` and `target` are equal after trimming whitespace and lower-casing, else 0."""
    normalized_pred = pred.strip().lower()
    normalized_target = target.strip().lower()
    return 1 if normalized_pred == normalized_target else 0

# Running tallies for the accuracy calculation, all starting at zero
total_predictions, correct_predictions_orig, correct_predictions_paraphrase = 0, 0, 0
# One row per evaluated sample is appended here by the evaluation loop
data = list()

In [81]:
# Lookup table of paraphrased questions. The evaluation loop reads its
# "Original question" and "Paraphrased Questions" columns via find_value.
df_para = pd.read_csv("./Paraphrased_Questions.csv")
def find_value(df, search_value, column_to_search, column_to_return):
    """
    Look up `search_value` in `column_to_search` and hand back the value that
    sits in `column_to_return` on the first matching row.

    Raises ValueError when no row matches.
    """
    row_mask = df[column_to_search] == search_value
    matches = df.loc[row_mask, column_to_return]
    if matches.empty:
        raise ValueError(f"Error: Value '{search_value}' not found in column '{column_to_search}'.")
    # Several rows may match; only the first one is used.
    return matches.values[0]

In [82]:
import cv2
import numpy as np

def add_gaussian_noise(image, mean=0, var=0.01):
    """
    Add Gaussian noise to an image.

    Parameters:
    - image: anything `np.array` accepts (for example a PIL image or an ndarray).
      Pixel values are divided by 255, i.e. 8-bit data is assumed.
    - mean (float): mean of the noise, on the normalized [0, 1] scale.
    - var (float): variance of the noise, on the normalized [0, 1] scale.

    Returns:
    - np.ndarray: uint8 array with the same shape as the input. Note that this
      is a NumPy array, not a PIL image.
    """
    # Normalize to [0, 1] so `mean` and `var` are independent of the 0-255 range.
    pixels = np.array(image).astype(np.float32) / 255.0

    # Noise is drawn per element (per channel value), with standard deviation sqrt(var).
    noise = np.random.normal(mean, var**0.5, pixels.shape)

    # Keep the result inside the valid normalized range.
    noisy_image = np.clip(pixels + noise, 0, 1)

    # Round to nearest before casting: a plain astype(uint8) truncates, and the
    # float32 round trip (x / 255 * 255) often lands just below x, which would
    # darken untouched pixels by one level.
    return np.rint(noisy_image * 255).astype(np.uint8)

def add_salt_and_pepper_noise(image, amount=0.05):
    """
    Add salt (white) and pepper (black) noise to an image.

    Parameters:
    - image: ndarray, or anything `np.array` accepts (for example a PIL image).
      Both H x W (grayscale) and H x W x C layouts are supported.
    - amount (float): fraction of pixel positions to corrupt; half of the draws
      become salt and half become pepper.

    Returns:
    - np.ndarray: noisy copy of the input; the input itself is not modified.

    Notes:
    - Positions are drawn independently (with replacement), so the number of
      distinct corrupted pixels can be lower than requested, and a pepper draw
      overwrites a salt draw at the same position.
    - The draw count is based on the number of pixels (H * W), not on the
      number of array elements, because each draw overwrites a whole pixel.
    """
    # Ensure the image is a NumPy array
    if not isinstance(image, np.ndarray):
        image = np.array(image)

    noisy_image = np.copy(image)

    height, width = image.shape[:2]
    num_pixels = height * width
    num_salt = int(amount * num_pixels * 0.5)
    num_pepper = int(amount * num_pixels * 0.5)

    # randint's upper bound is exclusive, so passing the dimension itself
    # makes every row/column (including the last one) reachable.
    # Indexing with [rows, cols] sets all channels of a pixel and also works
    # for 2-D grayscale arrays.
    rows = np.random.randint(0, height, num_salt)
    cols = np.random.randint(0, width, num_salt)
    noisy_image[rows, cols] = 255

    rows = np.random.randint(0, height, num_pepper)
    cols = np.random.randint(0, width, num_pepper)
    noisy_image[rows, cols] = 0

    return noisy_image


def make_random_pixels_black_pil(image, amount=0.05):
    """
    Adds random black pixels to a PIL image.

    Parameters:
    - image (PIL.Image.Image): Input image. Both single-channel (H x W) and
      multi-channel (H x W x C) images are supported.
    - amount (float): Fraction of pixel positions to turn black.

    Returns:
    - PIL.Image.Image: Image with random black pixels.

    Notes:
    - Positions are drawn independently (with replacement), so the number of
      distinct black pixels can be lower than `amount` suggests.
    - Every channel of a selected pixel is set to 0.
      NOTE(review): for images with an alpha channel this also zeroes alpha;
      confirm that is acceptable if such images are ever passed in.
    """
    # np.array already returns a fresh array; the explicit copy keeps the
    # "never touch the caller's data" intent obvious.
    image_np = np.array(image)
    noisy_image = np.copy(image_np)

    # Count pixel positions from the spatial dimensions only, so the function
    # does not depend on a channel axis being present.
    height, width = image_np.shape[:2]
    num_black = int(amount * height * width)

    # Random (row, col) coordinates; randint's upper bound is exclusive.
    rows = np.random.randint(0, height, num_black)
    cols = np.random.randint(0, width, num_black)

    # A scalar 0 broadcasts over however many channels the image has.
    noisy_image[rows, cols] = 0

    # Convert back to PIL image
    return Image.fromarray(noisy_image)

def rotate_image_pil(image, angle):
    """
    Return a rotated version of a PIL image.

    Parameters:
    - image (PIL.Image.Image): Image to rotate.
    - angle (float): Rotation in degrees; positive values turn the image counterclockwise.

    Returns:
    - PIL.Image.Image: The rotated image, produced with bicubic resampling and expand=True.
    """
    rotation_options = {"resample": Image.BICUBIC, "expand": True}
    rotated = image.rotate(angle, **rotation_options)
    return rotated

def horizontal_flip_pil(image):
    """
    Mirror a PIL image left-to-right.

    Parameters:
    - image (PIL.Image.Image): Image to mirror.

    Returns:
    - PIL.Image.Image: The mirrored image.
    """
    mirrored = image.transpose(Image.FLIP_LEFT_RIGHT)
    return mirrored


In [83]:
# --- Evaluation: original question + original image vs. paraphrased question + rotated image ---

IMAGE_DIR = "./vqa_image"                             # where the (RGB) input images are dumped
ANSWER_PREFIX = "Answer within 3 words maximum:- "    # instruction prepended to every question
ROTATION_DEGREES = 45                                 # perturbation applied to the image in the second run
MAX_NEW_TOKENS = 6                                    # generation budget for the short answers


def _generate_response(image, question_text, max_new_tokens=MAX_NEW_TOKENS):
    """Run LLaVA on one image/question pair and return the full decoded text (prompt + reply)."""
    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": ANSWER_PREFIX + question_text},
            ],
        },
    ]
    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
    inputs = processor(images=image, text=prompt, padding=True, return_tensors="pt").to(model.device, torch.float16)
    generate_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
    return processor.batch_decode(generate_ids, skip_special_tokens=True)[0]


def _assistant_reply(decoded_response):
    """Return only the text after the last "ASSISTANT:" marker, stripped of surrounding whitespace."""
    return decoded_response.split("ASSISTANT:")[-1].strip()


# Reset the tallies here so that re-running this cell starts from zero
# instead of silently accumulating on top of a previous run.
total_predictions = 0
correct_predictions_orig = 0
correct_predictions_paraphrase = 0
data = []

# image.save does not create missing directories.
os.makedirs(IMAGE_DIR, exist_ok=True)

for a, sample in enumerate(samples):
    print(a)
    question = sample["question"]
    answer = sample["multiple_choice_answer"]

    image = sample["image"]
    # Convert BEFORE saving: the JPEG writer rejects modes such as RGBA or P.
    if image.mode != "RGB":
        print(f"Image mode is {image.mode}. Converting to RGB.")
        image = image.convert("RGB")
    image.save(f"{IMAGE_DIR}/{a+1}.jpg")

    # Image perturbation for the second run. Alternatives defined above:
    # add_gaussian_noise(image), make_random_pixels_black_pil(image).
    image_noise = rotate_image_pil(image, ROTATION_DEGREES)

    # Paraphrases are looked up from the CSV rather than generated on the fly.
    # NOTE(review): presumably they were produced earlier with
    # generate_formatted_response and a Llama-2 chat model — confirm provenance.
    # find_value raises ValueError if the question has no entry in the CSV.
    rephrased_llama = find_value(df_para, question, "Original question", "Paraphrased Questions")

    # Run 1: original question, original image.
    response = _generate_response(image, question)
    response_orig = _assistant_reply(response)

    # Run 2: paraphrased question AND perturbed image (both changes are applied together).
    response = _generate_response(image_noise, rephrased_llama)
    assistant_response = _assistant_reply(response)

    # Every sample counts towards the shared denominator; an empty reply simply
    # scores 0 in exact_match, so both accuracies are computed over the same set.
    total_predictions += 1
    correct_predictions_orig += exact_match(response_orig, answer)
    correct_predictions_paraphrase += exact_match(assistant_response, answer)

    data.append([question, rephrased_llama, response_orig, assistant_response, answer])
    
    

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49


In [84]:
# Denominator shared by both accuracy figures computed in the next two cells.
total_predictions

50

In [85]:
# Exact-match accuracy for the original question on the original image.
# Raises ZeroDivisionError if the evaluation loop has not counted any prediction.
correct_predictions_orig/total_predictions

0.66

In [86]:
# Exact-match accuracy for the second run of the evaluation loop, which uses the
# paraphrased question together with the rotated image (two perturbations at once,
# so a difference from the original accuracy cannot be attributed to either one alone).
correct_predictions_paraphrase/total_predictions

0.7

In [87]:
import pandas as pd
# One row per evaluated sample, in the order the evaluation loop appended them:
# question, paraphrase, reply to the original prompt, reply to the perturbed prompt, reference answer.
df = pd.DataFrame(data, columns=["Original question", "Paraphrased question", "Model response-orig", "Model-response-para", "answer"])

# Export DataFrame to Excel (the openpyxl engine is requested explicitly, so that package must be installed)
df.to_excel("./output_paraphrased_rotate_45.xlsx", index=False, engine="openpyxl")

In [88]:
# Bundle the images saved by the evaluation loop into a single archive.
# NOTE(review): the recursive zip also picks up ./vqa_image/.ipynb_checkpoints/ (visible in the
# recorded output) — consider excluding it if the archive is shared.
!zip -r vqa_images.zip ./vqa_image/

  adding: vqa_image/ (stored 0%)
  adding: vqa_image/5.jpg (deflated 0%)
  adding: vqa_image/7.jpg (deflated 0%)
  adding: vqa_image/17.jpg (deflated 0%)
  adding: vqa_image/10.jpg (deflated 0%)
  adding: vqa_image/.ipynb_checkpoints/ (stored 0%)
  adding: vqa_image/.ipynb_checkpoints/1-checkpoint.jpg (deflated 7%)
  adding: vqa_image/40.jpg (deflated 2%)
  adding: vqa_image/31.jpg (deflated 2%)
  adding: vqa_image/18.jpg (deflated 0%)
  adding: vqa_image/26.jpg (deflated 2%)
  adding: vqa_image/15.jpg (deflated 0%)
  adding: vqa_image/41.jpg (deflated 2%)
  adding: vqa_image/22.jpg (deflated 2%)
  adding: vqa_image/12.jpg (deflated 0%)
  adding: vqa_image/13.jpg (deflated 0%)
  adding: vqa_image/33.jpg (deflated 2%)
  adding: vqa_image/32.jpg (deflated 2%)
  adding: vqa_image/35.jpg (deflated 2%)
  adding: vqa_image/36.jpg (deflated 2%)
  adding: vqa_image/27.jpg (deflated 2%)
  adding: vqa_image/30.jpg (deflated 2%)
  adding: vqa_image/2.jpg (deflated 7%)
  adding: vqa_image/8.jpg (d

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [70]:

# NOTE(review): leftover debug cell. Its execution count (70) predates the evaluation loop (83),
# so the recorded output comes from an earlier kernel state; `response` is whatever was last
# assigned to that name when the cell runs.
print("Rephrased Output:", response)

Rephrased Output: How is Martha doing?
