In [2]:
import requests

import torch
from PIL import Image

from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor, AutoModelForCausalLM, AutoModelForImageTextToText

# Image-Text

## Qwen/Qwen2-VL-7B-Instruct

In [None]:
!pip install qwen-vl-utils

Collecting qwen-vl-utils
  Downloading qwen_vl_utils-0.0.10-py3-none-any.whl.metadata (6.3 kB)
Collecting av (from qwen-vl-utils)
  Downloading av-14.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.7 kB)
Downloading qwen_vl_utils-0.0.10-py3-none-any.whl (6.7 kB)
Downloading av-14.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (39.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m39.5/39.5 MB[0m [31m55.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: av, qwen-vl-utils
Successfully installed av-14.1.0 qwen-vl-utils-0.0.10


In [None]:
from qwen_vl_utils import process_vision_info

In [None]:
# default: Load the model on the available device(s)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-7B-Instruct", torch_dtype="auto", device_map="auto"
)

# default processer
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")

messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": "https://raw.githubusercontent.com/vis-nlp/ChartQA/main/ChartQA%20Dataset/val/png/multi_col_1229.png",
            },
            {"type": "text", "text": "Give me detailed insights of the picture then generate some interesting business insights."},
        ],
    }
]

# Preparation for inference
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)

# Inference: Generation of the output
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [
    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text)



Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

['The bar chart illustrates the distribution of Facebook Messenger and WhatsApp users across different age groups in the United States as of 2021. \n\nKey insights from the chart include:\n\n1. **Facebook Messenger Dominance**: Facebook Messenger is significantly more popular than WhatsApp across all age groups. The highest percentage of Facebook Messenger users is among the 18-29 age group, with 73% of respondents using it. This indicates that Facebook Messenger is particularly popular among younger adults.\n\n2. **WhatsApp Usage**: WhatsApp usage is highest among the 30-59 age group, with 66% of respondents using it']


## microsoft/Florence-2-large

In [None]:
model_id = 'microsoft/Florence-2-large'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True, torch_dtype=torch_dtype).to(device)
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)

config.json:   0%|          | 0.00/2.44k [00:00<?, ?B/s]

configuration_florence2.py:   0%|          | 0.00/15.1k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Florence-2-large:
- configuration_florence2.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_florence2.py:   0%|          | 0.00/127k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Florence-2-large:
- modeling_florence2.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


pytorch_model.bin:   0%|          | 0.00/1.56G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/51.0 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/806 [00:00<?, ?B/s]

processing_florence2.py:   0%|          | 0.00/48.7k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Florence-2-large:
- processing_florence2.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


tokenizer_config.json:   0%|          | 0.00/34.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.10M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
def run_example(task_prompt, image=None, text_input=None):
    if text_input is None:
        prompt = task_prompt
    else:
        prompt = task_prompt + text_input

    image = image.convert('RGB')
    inputs = processor(text=prompt, images=image, return_tensors="pt").to(device)
    generated_ids = model.generate(
      input_ids=inputs["input_ids"],
      pixel_values=inputs["pixel_values"],
      max_new_tokens=1024,
      early_stopping=False,
      do_sample=False,
      num_beams=3,
    )
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
    parsed_answer = processor.post_process_generation(
        generated_text,
        task=task_prompt,
        image_size=(image.width, image.height)
    )

    parsed_answer_cleaned = parsed_answer.get('<MORE_DETAILED_CAPTION>', parsed_answer)

    formatted_answer = "\n\n".join([p.strip() for p in parsed_answer_cleaned.split("\n") if p.strip()])

    return print(formatted_answer)

url = "https://raw.githubusercontent.com/vis-nlp/ChartQA/main/ChartQA%20Dataset/val/png/multi_col_1229.png"
image = Image.open(requests.get(url, stream=True).raw)
task_prompt = '<MORE_DETAILED_CAPTION>'
run_example(task_prompt, image)

The image is a bar graph that shows the percentage of respondents who have received Facebook Messenger and WhatsApp messages. The x-axis of the graph is divided into four sections, each representing a different percentage.

The first section, representing Facebook Messenger, shows that the majority of respondents have received the same percentage as the other three. The second section, represented by WhatsApp, shows a percentage of the respondents. The third section, from 18-29 to 30-59, shows an increase in the percentage. The fourth section, titled "Facebook Messenger", shows a decrease in the number of respondents with the highest percentage.

There are also several icons on the top right corner of the image, including a search bar, a chat icon, and a social media icon. The text on the image reads "Additional Information" and "Statista 2021".


## Salesforce/blip-image-captioning-base

In [None]:
processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = AutoModelForImageTextToText.from_pretrained("Salesforce/blip-image-captioning-base")

img_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg'
raw_image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB')

# unconditional image captioning
inputs = processor(raw_image, return_tensors="pt")

out = model.generate(**inputs)
print(processor.decode(out[0], skip_special_tokens=True))

a woman sitting on the beach with her dog


In [None]:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")

pipe('https://raw.githubusercontent.com/vis-nlp/ChartQA/main/ChartQA%20Dataset/val/png/multi_col_1229.png')

Device set to use cpu


[{'generated_text': 'a bar chart showing the percentage of facebook users who have been on their account'}]

# Text-Text

## google/flan-t5-large

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-large")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-large")

input_text = "create business insights from this chart description: The image is a bar graph that shows the percentage of respondents who have received Facebook Messenger and WhatsApp messages. The x-axis of the graph is divided into four sections, each representing a different percentage.\n\nThe first section, representing Facebook Messenger, shows that the majority of respondents have received the same percentage as the other three. The second section, represented by WhatsApp, shows a percentage of the respondents. The third section, from 18-29 to 30-59, shows an increase in the percentage. The fourth section, titled Facebook Messenger, shows a decrease in the number of respondents with the highest percentage. \n\nThere are also several icons on the top right corner of the image, including a search bar, a chat icon, and a social media icon. The text on the image reads Additional Information and Statista 2021"
input_ids = tokenizer(input_text, return_tensors="pt").input_ids

outputs = model.generate(input_ids)
print(tokenizer.decode(outputs[0]))

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

<pad>The image is a bar graph that shows the percentage of respondents who have received Facebook Messenger and WhatsApp


## openai-community/gpt2

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load pre-trained model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Your text to summarize
text = "create business insights and suggest actions for companies according to this chart description: The image is a bar graph that shows the percentage of respondents who have received Facebook Messenger and WhatsApp messages. The x-axis of the graph is divided into four sections, each representing a different percentage.\n\nThe first section, representing Facebook Messenger, shows that the majority of respondents have received the same percentage as the other three. The second section, represented by WhatsApp, shows a percentage of the respondents. The third section, from 18-29 to 30-59, shows an increase in the percentage. The fourth section, titled Facebook Messenger, shows a decrease in the number of respondents with the highest percentage. \n\nThere are also several icons on the top right corner of the image, including a search bar, a chat icon, and a social media icon. The text on the image reads Additional Information and Statista 2021"

# Tokenize input text
encoded_input = tokenizer.encode(text, return_tensors='pt')

# Generate summary (using the model's text generation capability)
summary_ids = model.generate(encoded_input, max_new_tokens=50, num_beams=5, no_repeat_ngram_size=2, early_stopping=True)

# Decode the generated tokens back to text
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Print the summary
print(summary)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


create business insights and suggest actions for companies according to this chart description: The image is a bar graph that shows the percentage of respondents who have received Facebook Messenger and WhatsApp messages. The x-axis of the graph is divided into four sections, each representing a different percentage.

The first section, representing Facebook Messenger, shows that the majority of respondents have received the same percentage as the other three. The second section, represented by WhatsApp, shows a percentage of the respondents. The third section, from 18-29 to 30-59, shows an increase in the percentage. The fourth section, titled Facebook Messenger, shows a decrease in the number of respondents with the highest percentage. 

There are also several icons on the top right corner of the image, including a search bar, a chat icon, and a social media icon. The text on the image reads Additional Information and Statista 2021, which can be found at: https://en.wikipedia.org/wik

## facebook/bart-large-cnn

In [None]:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("summarization", model="facebook/bart-large-cnn")

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cpu


In [None]:
from transformers import pipeline

summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

ARTICLE = """ The image is a bar graph that shows the percentage of respondents who have received Facebook Messenger and WhatsApp messages.
              The x-axis of the graph is divided into four sections, each representing a different percentage.
              The first section, representing Facebook Messenger, shows that the majority of respondents have received the same percentage as the other three.
              The second section, represented by WhatsApp, shows a percentage of the respondents.
              The third section, from 18-29 to 30-59, shows an increase in the percentage.
              The fourth section, titled Facebook Messenger, shows a decrease in the number of respondents with the highest percentage.
              There are also several icons on the top right corner of the image, including a search bar, a chat icon, and a social media icon.
              The text on the image reads Additional Information and Statista 2021.
"""
print(summarizer(ARTICLE, max_length=130, min_length=30, do_sample=False))

Device set to use cpu


[{'summary_text': 'The image is a bar graph that shows the percentage of respondents who have received Facebook Messenger and WhatsApp messages. The x-axis of the graph is divided into four sections, each representing a different percentage.'}]


# Chart-Text

## google/matcha-chart2text-pew

In [None]:
from PIL import Image
import requests
import torch
from transformers import AutoProcessor, AutoModelForImageTextToText

# Load the processor and model
processor = AutoProcessor.from_pretrained("google/matcha-chart2text-pew")
model = AutoModelForImageTextToText.from_pretrained("google/matcha-chart2text-pew")

# Download a sample chart image
url = "https://raw.githubusercontent.com/vis-nlp/ChartQA/main/ChartQA%20Dataset/val/png/20294671002019.png"
image = Image.open(requests.get(url, stream=True).raw)

# Define a question related to the chart
question = "What is the percentage of children in the world?"

# Process the input
inputs = processor(images=image, header_text=question, return_tensors="pt")

# Generate prediction
with torch.no_grad():  # Disable gradient calculations for inference
    predictions = model.generate(**inputs, max_new_tokens=150)

# Decode and display the result
output_text = processor.decode(predictions[0], skip_special_tokens=True)
print("Extracted Answer:", output_text)


Extracted Answer: This chart shows the7-percentage-child deaths among the world’s 116 million children. For a country with a population of 1.3 million, that’s two-to-one. (180% of all births in the world are to unmarried children). In addition, 14% of children are to be65.1% of the population.


## ahmed-masry/chartgemma

In [None]:
from PIL import Image
import requests
import torch
from transformers import AutoProcessor, AutoModelForImageTextToText

processor = AutoProcessor.from_pretrained("ahmed-masry/chartgemma")
model = AutoModelForImageTextToText.from_pretrained("ahmed-masry/chartgemma")

torch.hub.download_url_to_file('https://raw.githubusercontent.com/vis-nlp/ChartQA/main/ChartQA%20Dataset/val/png/multi_col_1229.png', 'chart_example_1.png')

image_path = "/content/chart_example_1.png"
input_text ="program of thought: what is the sum of Facebook Messenger and Whatsapp values in the 18-29 age group?"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Process Inputs
image = Image.open(image_path).convert('RGB')
inputs = processor(text=input_text, images=image, return_tensors="pt")
prompt_length = inputs['input_ids'].shape[1]
inputs = {k: v.to(device) for k, v in inputs.items()}

# Generate
generate_ids = model.generate(**inputs, num_beams=4, max_new_tokens=512)
output_text = processor.batch_decode(generate_ids[:, prompt_length:], skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
print(output_text)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

100%|██████████| 34.2k/34.2k [00:00<00:00, 17.0MB/s]
You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text. For this call, we will infer how many images each text has and add special tokens.


facebook_messenger_18_29 = 73
whatsapp_value_18_29 = 30
sum = facebook_messenger_18_29 + whatsapp_value_18_29
print(sum)
