In [None]:
from huggingface_hub import notebook_login

notebook_login()

In [None]:
from transformers import AutoProcessor, PaliGemmaForConditionalGeneration, PaliGemmaProcessor,  BitsAndBytesConfig
import torch



bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model_id = "google/paligemma2-3b-pt-896"
model = PaliGemmaForConditionalGeneration.from_pretrained(model_id,
                                                         quantization_config=bnb_config,
                                                         device_map={"":3})
processor = AutoProcessor.from_pretrained(model_id)

In [None]:
!nvidia-smi

In [8]:
import json

data = []
with open('DataVLM/mmqa_extended/two_tables/mmqa_extended_two_tables.jsonl') as f:
    for line in map(str.strip, f):
        if not line:
            continue
        try:
            item = json.loads(line)
            data.append(item)
            
        except json.JSONDecodeError:
            pass
        
print(data[0])

{'question': "Show the name and number of employees for the departments managed by heads whose temporary acting value is 'Yes'?", 'answer': {'columns': ['Name', 'Num_Employees'], 'index': [0, 1, 2], 'data': [['Treasury', 115897.0], ['Homeland Security', 208000.0], ['Treasury', 115897.0]]}, 'table_names': ['department', 'management'], 'table_image_ids': ['TableImg_1qdjq_15.png', 'TableImg_56w5t_5.png'], 'original_data_index': 0}


In [None]:
from PIL import Image 
import torch
images = data[0]['table_image_ids']

image2 = Image.open(f"DataVLM/mmqa_extended/two_tables/table_images/{data[0]['table_image_ids'][0]}").resize((448,448))
img1 = Image.open(f"DataVLM/mmqa_extended/two_tables/table_images/{data[0]['table_image_ids'][1]}").resize((448, 448))

prompt = f" Provide natural language answer to the question: {data[0]['question']} \n answer:"
text=prompt
inputs = processor(images=[[image2, img1]], text=prompt, return_tensors="pt").to(model.device)


In [None]:
inputs.input_ids.shape, inputs.attention_mask.shape

In [None]:
print(processor.decode(output[0], skip_special_tokens=True))

In [39]:
output = model.generate(**inputs)

In [2]:
! uv pip install google-genai

[2mUsing Python 3.13.1 environment at: vlmenv[0m
[2mAudited [1m1 package[0m [2min 5ms[0m[0m


In [6]:
import os

import google.generativeai as genai

genai.configure(api_key="AIzaSyBMLGJr-55FedA7vul19WYPbKpfIpg1I5w")

def upload_to_gemini(path, mime_type=None):
  """Uploads the given file to Gemini.

  See https://ai.google.dev/gemini-api/docs/prompting_with_media
  """
  file = genai.upload_file(path, mime_type=mime_type)
  print(f"Uploaded file '{file.display_name}' as: {file.uri}")
  return file



# Create the model
generation_config = {
  "temperature": 1,
  "top_p": 0.95,
  "top_k": 64,
  "max_output_tokens": 8192,
  "response_mime_type": "text/plain",
}


SYSTEMS_INSTRUCTIONS = """
You are an intelligent assistant capable of understanding and reasoning about tabular data. You will be presented with one or more tables containing information on a specific topic. 
You will then be asked a question that requires you to analyze the data in the table(s) and provide a correct answer.
\n**Your task is to:**\n\n
1. Carefully examine the provided table(s).Pay close attention to the column headers, the data types within each column, and the relationships between tables if multiple tables are given.
2. Understand the question being asked. Identify the specific information being requested and determine which table(s) and columns are relevant to answering the question.
3. Extract the necessary information from the table(s).** Perform any required filtering, joining, aggregation, or calculations on the data to arrive at the answer.
4. Formulate a clear and concise answer in natural language.** The answer should be directly responsive to the question and presented in a human-readable format. It may involve listing data, presenting a single value, or explaining a derived insight.
5. Do not include any SQL queries in the answer.** Your response should be in natural language only, as if you were explaining the answer to a human.
6. Be accurate and avoid hallucinations.** Your answer should be completely based on the data in the provided table(s). Do not introduce any external information or make assumptions not supported by the data.
7. Be specific and follow the instructions in the question.** If the question ask to get specific columns, return only mentioned columns, otherwise return all columns.
8. If the question is unanswerable** based on the provided tables, state "The question cannot be answered based on the provided data.
9. Give answer in json format like this { ['ans1', ans1], ['ans1', ans1], ['ans1', ans1] }",
)
"""

model = genai.GenerativeModel(
  model_name="gemini-exp-1206",
  generation_config=generation_config,
  system_instruction=SYSTEMS_INSTRUCTIONS)

In [None]:
import tqdm
with open("evals/results/my_results.jsonl", "w", encoding="utf-8") as out_file:
  for i, row in enumerate(tqdm(data)):
    files = [
      upload_to_gemini(f"DataVLM/mmqa_extended/two_tables/table_images/{img}", mime_type="image/png")
      for img in row["table_image_ids"]
    ]
    prompt = f"Question: {row['question']} \n answer:"
    chat_session = model.start_chat(
      history=[
        {
          "role": "user",
          "parts": files,
        },
      ],
    )
    response = chat_session.send_message(prompt)
    
    result = {
      "question": row["question"],
      "golden_answer": row["answer"],
      "table_image_ids": row["table_image_ids"],
      "gemini_response": response.text,
    }
    out_file.write(json.dumps(result) + "\n")
    

Uploaded file 'TableImg_1qdjq_15.png' as: https://generativelanguage.googleapis.com/v1beta/files/sfz3oxyy0ti3
Uploaded file 'TableImg_56w5t_5.png' as: https://generativelanguage.googleapis.com/v1beta/files/pxy7e5l5bdhz
```json
{
  "ans": [
    [
      "Treasury",
      "115897.000000"
    ],
    [
      "Homeland Security",
      "208000.000000"
    ]
  ]
}
```
Uploaded file 'TableImg_1ft36_15.png' as: https://generativelanguage.googleapis.com/v1beta/files/9tb6w4owgpqe
Uploaded file 'TableImg_Sab76_5.png' as: https://generativelanguage.googleapis.com/v1beta/files/5xcvh4jjxskm
```json
{
  "ans": [
    "10"
  ]
}
```
Uploaded file 'TableImg_34u3i_10.png' as: https://generativelanguage.googleapis.com/v1beta/files/12hufwltz61t
Uploaded file 'TableImg_3lekt_5.png' as: https://generativelanguage.googleapis.com/v1beta/files/7c4hbx6i4nn3
```json
{
  "ans": [
    "53.000000",
    "52.000000",
    "69.000000"
  ]
}
```


In [13]:
data[1]

{'question': 'How many departments are led by heads who are not mentioned?',
 'answer': {'columns': ['count(*)'], 'index': [0], 'data': [[11]]},
 'table_names': ['department', 'management'],
 'table_image_ids': ['TableImg_1ft36_15.png', 'TableImg_Sab76_5.png'],
 'original_data_index': 1}

In [None]:
chat_session = model.start_chat(
  history=[
    {
      "role": "user",
      "parts": [
        files[0],
        files[1],
        "Question: Show the name and number of employees for the departments managed by heads whose temporary acting value is 'Yes'?\n\nAnswer:",
      ],
    },
  ]
)

response = chat_session.send_message("INSERT_INPUT_HERE")

print(response.text)

In [None]:
import os
import google.generativeai as genai

genai.configure(api_key=os.environ["GEMINI_API_KEY"])

def upload_to_gemini(path, mime_type=None):
  """Uploads the given file to Gemini.

  See https://ai.google.dev/gemini-api/docs/prompting_with_media
  """
  file = genai.upload_file(path, mime_type=mime_type)
  print(f"Uploaded file '{file.display_name}' as: {file.uri}")
  return file

# Create the model
generation_config = {
  "temperature": 1,
  "top_p": 0.95,
  "top_k": 64,
  "max_output_tokens": 8192,
  "response_mime_type": "text/plain",
}

model = genai.GenerativeModel(
  model_name="gemini-exp-1206",
  generation_config=generation_config,
)

# TODO Make these files available on the local file system
# You may need to update the file paths
files = [
  upload_to_gemini("TableImg_1qdjq_15.png", mime_type="image/png"),
  upload_to_gemini("TableImg_56w5t_5.png", mime_type="image/png"),
]

chat_session = model.start_chat(
  history=[
    {
      "role": "user",
      "parts": [
        files[0],
        files[1],
        "Provide natural language answer to the question : Show the name and number of employees for the departments managed by heads whose temporary acting value is 'Yes'? Answer: ",
      ],
    },
    {
      "role": "model",
      "parts": [
        "The departments managed by heads with a temporary acting value of 'Yes' are the **Treasury** department, which has **115897.0** employees, and the **Homeland Security** department, which has **208000.0** employees.",
      ],
    },
  ]
)

response = chat_session.send_message("INSERT_INPUT_HERE")

print(response.text)

In [None]:
import torch
from PIL import Image
from transformers import MllamaForConditionalGeneration, AutoProcessor

model_id = "meta-llama/Llama-3.2-11B-Vision"

model = MllamaForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(model_id)



In [None]:
url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg"
image = Image.open(requests.get(url, stream=True).raw)

prompt = "<|image|><|image|><|begin_of_text|>If I had to write a haiku for this one"
inputs = processor(image, prompt, return_tensors="pt").to(model.device)

output = model.generate(**inputs, max_new_tokens=30)
print(processor.decode(output[0]))

In [14]:
! uv pip install qwen-vl-utils

[2mUsing Python 3.13.1 environment at: vlmenv[0m
[2K[2mResolved [1m9 packages[0m [2min 484ms[0m[0m                                         [0m
[2K[2mPrepared [1m2 packages[0m [2min 1.67s[0m[0m                                             
[2K[2mInstalled [1m2 packages[0m [2min 51ms[0m[0m                                [0m
 [32m+[39m [1mav[0m[2m==14.0.1[0m
 [32m+[39m [1mqwen-vl-utils[0m[2m==0.0.8[0m


In [2]:
import json

data = []
with open('DataVLM/mmqa_extended/two_tables/mmqa_extended_two_tables.jsonl') as f:
    for line in map(str.strip, f):
        if not line:
            continue
        try:
            item = json.loads(line)
            data.append(item)
            
        except json.JSONDecodeError:
            pass
        
print(data[0])

{'question': "Show the name and number of employees for the departments managed by heads whose temporary acting value is 'Yes'?", 'answer': {'columns': ['Name', 'Num_Employees'], 'index': [0, 1, 2], 'data': [['Treasury', 115897.0], ['Homeland Security', 208000.0], ['Treasury', 115897.0]]}, 'table_names': ['department', 'management'], 'table_image_ids': ['TableImg_1qdjq_15.png', 'TableImg_56w5t_5.png'], 'original_data_index': 0}


In [18]:
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor, BitsAndBytesConfig
from qwen_vl_utils import process_vision_info
import torch
from PIL import Image


bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
# # default: Load the model on the available device(s)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",torch_dtype=torch.bfloat16).to("cuda:3")

# Load the model on the specified device(s) 



# We recommend enabling flash_attention_2 for better acceleration and memory saving, especially in multi-image and video scenarios.
# model = Qwen2VLForConditionalGeneration.from_pretrained(
#     "Qwen/Qwen2-VL-7B-Instruct",
#     torch_dtype=torch.bfloat16,
#     attn_implementation="flash_attention_2",
#     device_map="auto",
# )

# default processer
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")

# The default range for the number of visual tokens per image in the model is 4-16384. You can set min_pixels and max_pixels according to your needs, such as a token count range of 256-1280, to balance speed and memory usage.
# min_pixels = 256*28*28
# max_pixels = 1280*28*28
# processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels)
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": Image.open(f"DataVLM/mmqa_extended/two_tables/table_images/{data[0]['table_image_ids'][0]}").resize((448, 448)),
            },
            {
                "type": "image",
                "image": Image.open(f"DataVLM/mmqa_extended/two_tables/table_images/{data[1]['table_image_ids'][0]}").resize((448, 448)),
            },
            {"type": "text", "text": f"{SYSTEMS_INSTRUCTIONS}  \n Question: {data[0]['question']}  \nAnswer: "},
        ],
    }
]

# Preparation for inference
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
image_inputs, Video = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    padding=True,
    return_tensors="pt",
)
inputs = inputs.to(model.device)

# Inference: Generation of the output
generated_ids = model.generate(**inputs, max_new_tokens=1000)
generated_ids_trimmed = [
    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

['[\n    {\n        "name": "State",\n        "num_employees": 3026600000\n    },\n    {\n        "name": "Justice",\n        "num_employees": 11255700000\n    },\n    {\n        "name": "Labor",\n        "num_employees": 17347000000\n    },\n    {\n        "name": "Veterans Affairs",\n        "num_employees": 23500000000\n    }\n]']


In [15]:
SYSTEMS_INSTRUCTIONS = """
You are an intelligent assistant capable of understanding and reasoning about tabular data. You will be presented with one or more tables containing information on a specific topic. 
You will then be asked a question that requires you to analyze the data in the table(s) and provide a correct answer.
\n**Your task is to:**\n\n
1. Carefully examine the provided table(s).Pay close attention to the column headers, the data types within each column, and the relationships between tables if multiple tables are given.
2. Understand the question being asked. Identify the specific information being requested and determine which table(s) and columns are relevant to answering the question.
3. Extract the necessary information from the table(s).** Perform any required filtering, joining, aggregation, or calculations on the data to arrive at the answer.
4. Formulate a clear and concise answer in natural language.** The answer should be directly responsive to the question and presented in a human-readable format. It may involve listing data, presenting a single value, or explaining a derived insight.
5. Do not include any SQL queries in the answer.** Your response should be in natural language only, as if you were explaining the answer to a human.
6. Be accurate and avoid hallucinations.** Your answer should be completely based on the data in the provided table(s). Do not introduce any external information or make assumptions not supported by the data.
7. Be specific and follow the instructions in the question.** If the question ask to get specific columns, return only mentioned columns, otherwise return all columns.
8. If the question is unanswerable** based on the provided tables, state "The question cannot be answered based on the provided data.
9. Give answer directly in json format.,
)
"""

In [3]:
! nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Tue Jan 21 23:12:07 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 555.42.02              Driver Version: 555.42.02      CUDA Version: 12.5     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA TITAN X (Pascal)        Off |   00000000:02:00.0 Off |                  N/A |
| 23%   25C    P8              8W /  250W |   11955MiB /  12288MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  NVIDIA TITAN X (Pascal)        Off |   00

In [1]:
!pip install torchvision==0.17.0

[31mERROR: Ignored the following yanked versions: 0.1.6, 0.1.7, 0.1.8, 0.1.9, 0.2.0, 0.2.1, 0.2.2, 0.2.2.post2, 0.2.2.post3[0m[31m
[0m[31mERROR: Could not find a version that satisfies the requirement torchvision==0.17.0 (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for torchvision==0.17.0[0m[31m
[0m

In [4]:
import gc

gc.collect()
torch.cuda.empty_cache()

In [4]:
!nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Tue Jan 21 23:09:49 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 555.42.02              Driver Version: 555.42.02      CUDA Version: 12.5     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA TITAN X (Pascal)        Off |   00000000:02:00.0 Off |                  N/A |
| 23%   25C    P8              8W /  250W |   11955MiB /  12288MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  NVIDIA TITAN X (Pascal)        Off |   00

In [4]:
!kill 3790572

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


: 

: 

: 

In [1]:
!nvidia-smi

Tue Jan 21 23:10:08 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 555.42.02              Driver Version: 555.42.02      CUDA Version: 12.5     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA TITAN X (Pascal)        Off |   00000000:02:00.0 Off |                  N/A |
| 23%   25C    P8              8W /  250W |   11955MiB /  12288MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  NVIDIA TITAN X (Pascal)        Off |   00