# Responses## Evaluation of large vision-language models on technical illustration
This notebook collects the responses from different multi-modal AI models for further evaluation of their ability to understand technical illustrations. The images are collected in the folder `data`, and the questions about these images are in the file `questions.csv`. Each image can have 1 to 3 questions. The `results.csv` has all the responses from all the models.

### Models
- `MiniCPM-V`
- `moondream2`
- `imp-v1-3b`


**Note: Don't run multiple model cells at the same time, because the GPU memory will overflow.**

## MiniCPM-V

In [None]:
!pip install -r MiniCPM-V_req.txt

In [None]:
import torch
import os
import csv
from PIL import Image
from transformers import AutoModel, AutoTokenizer

# Load MiniCPM-V and its tokenizer; the model is moved to the GPU in bfloat16.
model = AutoModel.from_pretrained('openbmb/MiniCPM-V', trust_remote_code=True)
model = model.to(device='cuda', dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-V', trust_remote_code=True)
model.eval()

# Column layout of the per-model results file.
fieldnames = ['Image', 'Question', 'Response', 'Reference_answer']

with open('results/MiniCPM-V.csv', 'w', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    # Every row of answers.csv is read as (image name, question, reference answer).
    with open('answers.csv', 'r') as file:
        for row in csv.reader(file):
            image_name, question, reference_answer = row[0], row[1], row[2]

            # Images are stored as data/<name>.png; spaces are stripped from the path.
            image_path = os.path.join("data", image_name + ".png").replace(" ", "")
            image = Image.open(image_path).convert('RGB')

            # Single-turn chat with sampling enabled; only the answer text is kept.
            chat_kwargs = dict(
                image=image,
                msgs=[{'role': 'user', 'content': question}],
                context=None,
                tokenizer=tokenizer,
                sampling=True,
                temperature=0.7,
            )
            res, _, _ = model.chat(**chat_kwargs)

            # Store the answer next to the reference answer for later scoring.
            writer.writerow({
                'Image': image_name,
                'Question': question,
                'Response': res,
                'Reference_answer': reference_answer,
            })

print("Results saved in MiniCPM-V.csv")

### Moondream2

In [None]:
!pip install transformers timm einops

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from PIL import Image
import os
import csv

# Load a pinned revision of moondream2 so results stay reproducible.
model_id = "vikhyatk/moondream2"
revision = "2024-03-06"
model = AutoModelForCausalLM.from_pretrained(
    model_id, trust_remote_code=True, revision=revision
).to("cuda").eval()
tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)

# Create CSV file for results
with open('results/moondream2.csv', 'w', newline='') as csvfile:
    fieldnames = ['Image', 'Question', 'Response', 'Reference_answer']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    # Every row of answers.csv is read as (image name, question, reference answer).
    with open('answers.csv', 'r') as file:
        reader = csv.reader(file)
        for row in reader:
            # Spaces are stripped from the image name before building the path.
            image_path = row[0]
            image_path = image_path.replace(" ","")
            question = row[1]
            reference_answer = row[2]

            # Load image
            image = Image.open("data/"+image_path+".png")
            # Encode the image
            enc_image = model.encode_image(image)

            # Generate response
            response = model.answer_question(enc_image, question, tokenizer)
            # Write result to CSV
            writer.writerow({'Image': os.path.basename(image_path), 'Question': question, 'Response': response , 'Reference_answer': reference_answer})

# Report the file that was actually written (the old message named results.csv).
print("Results saved in results/moondream2.csv")


### MILVLG/imp-v1-3b

In [None]:
!pip install transformers # latest version is ok, but we recommend v4.31.0
!pip install -q pillow accelerate einops
!pip install accelerate

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from PIL import Image
import os
import csv

torch.set_default_device("cuda")

# Load imp-v1-3b in half precision together with its tokenizer.
model = AutoModelForCausalLM.from_pretrained(
    "MILVLG/imp-v1-3b",
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained("MILVLG/imp-v1-3b", trust_remote_code=True)

# Column layout of the per-model results file.
fieldnames = ['Image', 'Question', 'Response' , 'Reference_answer']

with open('results/imp-v1-3b.csv', 'w', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    # Every row of answers.csv is read as (image name, question, reference answer).
    with open('answers.csv', 'r') as file:
        for row in csv.reader(file):
            image_path, question, reference_answer = row[0], row[1], row[2]

            # Images are stored as data/<name>.png with spaces removed from the name.
            image_path = image_path.replace(" ","")
            image = Image.open("data/"+image_path+".png")

            # Wrap the question in the chat prompt and preprocess the image.
            text = f"A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\n{question} ASSISTANT:"
            input_ids = tokenizer(text, return_tensors='pt').input_ids
            image_tensor = model.image_preprocess(image)

            # Generate, then decode only the tokens that follow the prompt.
            generated = model.generate(
                input_ids,
                max_new_tokens=100,
                images=image_tensor,
                use_cache=True,
            )
            output_ids = generated[0]
            prompt_length = input_ids.shape[1]
            response = tokenizer.decode(output_ids[prompt_length:], skip_special_tokens=True).strip()

            writer.writerow({
                'Image': os.path.basename(image_path),
                'Question': question,
                'Response': response,
                'Reference_answer': reference_answer,
            })

print("Results saved.")


# Salesforce/instructblip-vicuna-7b

In [None]:
from transformers import InstructBlipProcessor, InstructBlipForConditionalGeneration
import torch
from PIL import Image
import requests
import os
import csv

# Load InstructBLIP (Vicuna-7B) together with its processor.
model = InstructBlipForConditionalGeneration.from_pretrained("Salesforce/instructblip-vicuna-7b")
processor = InstructBlipProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")

# This cell is pinned to the CPU; the automatic CUDA selection is left disabled.
device = "cpu"#"cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Column layout of the per-model results file.
fieldnames = ['Image', 'Question', 'Response' , 'Reference_answer']

with open('results/instructblip.csv', 'w', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    # Every row of answers.csv is read as (image name, question, reference answer).
    with open('answers.csv', 'r') as file:
        for row in csv.reader(file):
            image_path, question, reference_answer = row[0], row[1], row[2]

            # Images are stored as data/<name>.png with spaces removed from the name.
            image_path = image_path.replace(" ","")
            image = Image.open("data/"+image_path+".png").convert("RGB")

            # The question is used as the prompt without any template.
            prompt = question
            inputs = processor(images=image, text=prompt, return_tensors="pt").to(device)

            # Beam search (do_sample=False) with a repetition penalty.
            outputs = model.generate(
                **inputs,
                do_sample=False,
                num_beams=5,
                max_length=256,
                min_length=1,
                top_p=0.9,
                repetition_penalty=1.5,
                length_penalty=1.0,
                temperature=1,
            )
            decoded = processor.batch_decode(outputs, skip_special_tokens=True)
            generated_text = decoded[0].strip()
            print(generated_text)

            writer.writerow({
                'Image': os.path.basename(image_path),
                'Question': question,
                'Response': generated_text,
                'Reference_answer': reference_answer,
            })

print("Results saved.")


In [None]:
from transformers import InstructBlipProcessor, InstructBlipForConditionalGeneration
import torch
from PIL import Image
import requests

# Smoke test: run InstructBLIP on one sample image fetched from the LAVIS repository.
model = InstructBlipForConditionalGeneration.from_pretrained("Salesforce/instructblip-vicuna-7b")
processor = InstructBlipProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")

device ="cpu"
model.to(device)

# Download the sample picture and normalise it to RGB.
url = "https://raw.githubusercontent.com/salesforce/LAVIS/main/docs/_static/Confusing-Pictures.jpg"
raw_stream = requests.get(url, stream=True).raw
image = Image.open(raw_stream).convert("RGB")

prompt = "In steps, explain how to make this photo?"
inputs = processor(images=image, text=prompt, return_tensors="pt").to(device)

# Beam search (do_sample=False) with a repetition penalty.
outputs = model.generate(
    **inputs,
    do_sample=False,
    num_beams=5,
    max_length=256,
    min_length=1,
    top_p=0.9,
    repetition_penalty=1.5,
    length_penalty=1.0,
    temperature=1,
)
decoded = processor.batch_decode(outputs, skip_special_tokens=True)
generated_text = decoded[0].strip()
print(generated_text)


# start from here

# THUDM/glm-4v-9b

In [1]:
!pip install transformers
!pip3 install torch torchvision torchaudio
!pip install tiktoken
!pip install accelerate

Looking in indexes: https://nexus.iisys.de/repository/ki-awz-pypi-group/simple, https://pypi.org/simple
Looking in indexes: https://nexus.iisys.de/repository/ki-awz-pypi-group/simple, https://pypi.org/simple
Looking in indexes: https://nexus.iisys.de/repository/ki-awz-pypi-group/simple, https://pypi.org/simple
Looking in indexes: https://nexus.iisys.de/repository/ki-awz-pypi-group/simple, https://pypi.org/simple


In [2]:
import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoTokenizer

device = "cuda"

# Tokenizer and model both come from the THUDM/glm-4v-9b repository.
tokenizer = AutoTokenizer.from_pretrained("THUDM/glm-4v-9b", trust_remote_code=True)

# Load the weights in bfloat16, then move the model to the GPU in inference mode.
model = AutoModelForCausalLM.from_pretrained(
    "THUDM/glm-4v-9b",
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    trust_remote_code=True,
)
model = model.to(device)
model = model.eval()


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/15 [00:00<?, ?it/s]

In [3]:
# Ask GLM-4V a single question about one sample image.
query = 'describe the image'
image = Image.open("data/vr_12_1.png").convert('RGB')

# Build the chat-mode inputs (image + question) and move them to the model device.
messages = [{"role": "user", "image": image, "content": query}]
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_tensors="pt",
    return_dict=True,
)
inputs = inputs.to(device)

gen_kwargs = {"max_length": 2500, "do_sample": True, "top_k": 1}
with torch.no_grad():
    outputs = model.generate(**inputs, **gen_kwargs)
    # Drop the prompt tokens so that only the newly generated answer is printed.
    prompt_length = inputs['input_ids'].shape[1]
    outputs = outputs[:, prompt_length:]
    print(tokenizer.decode(outputs[0]))


ValueError: too many values to unpack (expected 2)

In [None]:
import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoTokenizer
import os
import csv

device = "cuda"

tokenizer = AutoTokenizer.from_pretrained("THUDM/glm-4v-9b", trust_remote_code=True)

# Load the model ONCE, before iterating over the questions. The previous version
# called from_pretrained() inside the loop, reloading the checkpoint for every row.
model = AutoModelForCausalLM.from_pretrained(
    "THUDM/glm-4v-9b",
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    trust_remote_code=True
).to(device).eval()

# Generation settings are identical for every question, so build them once.
gen_kwargs = {"max_length": 2500, "do_sample": True, "top_k": 1}

# Create CSV file for results
with open('results/glm-4v-9b.csv', 'w', newline='') as csvfile:
    fieldnames = ['Image', 'Question', 'Response' , 'Reference_answer']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    # Every row of answers.csv is read as (image name, question, reference answer).
    with open('answers.csv', 'r') as file:
        reader = csv.reader(file)
        for row in reader:
            image_path = row[0]
            question = row[1]
            reference_answer = row[2]

            # Images are stored as data/<name>.png with spaces removed from the name.
            image_path = image_path.replace(" ","")
            image = Image.open("data/"+image_path+".png").convert("RGB")

            # Build the chat-mode inputs (image + question) and move them to the GPU.
            query = question
            inputs = tokenizer.apply_chat_template([{"role": "user", "image": image, "content": query}],
                                                   add_generation_prompt=True, tokenize=True, return_tensors="pt",
                                                   return_dict=True)  # chat mode
            inputs = inputs.to(device)

            with torch.no_grad():
                outputs = model.generate(**inputs, **gen_kwargs)
                # Drop the prompt tokens so that only the generated answer is decoded.
                outputs = outputs[:, inputs['input_ids'].shape[1]:]
                generated_text = tokenizer.decode(outputs[0])
                print(generated_text)
            # Write result to CSV
            writer.writerow({'Image': os.path.basename(image_path), 'Question': question, 'Response': generated_text, 'Reference_answer':reference_answer})

print("Results saved.")


# openbmb/MiniCPM-Llama3-V-2_5

In [None]:
!pip install -r requirement/MiniCPM-Llama3.txt

In [None]:
import torch
from PIL import Image
from transformers import AutoModel, AutoTokenizer
import os
import csv

# Load MiniCPM-Llama3-V 2.5 in float16 on the GPU together with its tokenizer.
model = AutoModel.from_pretrained('openbmb/MiniCPM-Llama3-V-2_5', trust_remote_code=True, torch_dtype=torch.float16)
model = model.to(device='cuda')

tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-Llama3-V-2_5', trust_remote_code=True)
model.eval()

# Create CSV file for results
with open('results/MiniCPM-Llama3-V-2_5.csv', 'w', newline='') as csvfile:
    fieldnames = ['Image', 'Question', 'Response', 'Reference_answer']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    # Every row of answers.csv is read as (image name, question, reference answer).
    with open('answers.csv', 'r') as file:
        reader = csv.reader(file)
        for row in reader:
            # Spaces are stripped from the image name before building the path.
            image_path = row[0]
            image_path = image_path.replace(" ","")
            question = row[1]
            reference_answer = row[2]

            # Load image
            image = Image.open("data/"+image_path+".png").convert('RGB')
            msgs = [{'role': 'user', 'content': question}]

            res = model.chat(
                image=image,
                msgs=msgs,
                tokenizer=tokenizer,
                sampling=True, # if sampling=False, beam_search will be used by default
                temperature=0.7,
                # system_prompt='' # pass system_prompt if needed
            )
            print(res)
            # Write result to CSV
            writer.writerow({'Image': os.path.basename(image_path), 'Question': question, 'Response': res , 'Reference_answer': reference_answer})

# Report the file that was actually written (the old message named results.csv).
print("Results saved in results/MiniCPM-Llama3-V-2_5.csv")


# WeMM

In [1]:
!pip install -r requirement/WeMM.txt

Looking in indexes: https://nexus.iisys.de/repository/ki-awz-pypi-group/simple, https://pypi.org/simple
Collecting transformers<=4.40.0 (from -r requirement/WeMM.txt (line 1))
  Obtaining dependency information for transformers<=4.40.0 from https://files.pythonhosted.org/packages/09/c8/844d5518a6aeb4ffdc0cf0cae65ae13dbe5838306728c5c640b5a6e2a0c9/transformers-4.40.0-py3-none-any.whl.metadata
  Using cached transformers-4.40.0-py3-none-any.whl.metadata (137 kB)
Collecting sentencepiece==0.1.99 (from -r requirement/WeMM.txt (line 2))
  Obtaining dependency information for sentencepiece==0.1.99 from https://files.pythonhosted.org/packages/7f/e5/323dc813b3e1339305f888d035e2f3725084fc4dcf051995b366dd26cc90/sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Using cached sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)
Collecting numpy==1.23.5 (from -r requirement/WeMM.txt (line 3))
  Obtaining dependency 

In [3]:
!pip install torch==2.1.0+cu118 torchvision==0.16.0+cu118 torchaudio==2.1.0

Looking in indexes: https://nexus.iisys.de/repository/ki-awz-pypi-group/simple, https://pypi.org/simple
[0m

In [1]:
import torch
from PIL import Image
from transformers import AutoModel, GenerationConfig

# Load WeMM in bfloat16 and switch it to inference mode on the GPU.
model_path = 'feipengma/WeMM' # the path to the model
wemm = AutoModel.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)
wemm.cuda()
wemm.eval()

# Smoke test on one sample image; mm_generate is called with the image path and the query.
image = 'data/airconditioner0_16_1.png'
query = 'Describe this image.'
pred = wemm.mm_generate(image, query)
print(pred)



ImportError: libcudart.so.12: cannot open shared object file: No such file or directory

# THUDM/cogvlm2-llama3-chat-19B

In [1]:
!pip install torch==2.1.0
!pip install xformers
!pip install transformers==4.40.0
!pip install einops
!pip install torchvision

Looking in indexes: https://nexus.iisys.de/repository/ki-awz-pypi-group/simple, https://pypi.org/simple
Collecting torch==2.1.0
  Using cached torch-2.1.0-cp310-cp310-manylinux1_x86_64.whl.metadata (25 kB)
Collecting filelock (from torch==2.1.0)
  Using cached filelock-3.15.4-py3-none-any.whl.metadata (2.9 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch==2.1.0)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch==2.1.0)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch==2.1.0)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch==2.1.0)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch==2

In [2]:
import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_PATH = "THUDM/cogvlm2-llama3-chat-19B"

# Use the GPU when available; bfloat16 is only chosen for compute capability >= 8,
# otherwise (older GPU or CPU) float16 is used.
if torch.cuda.is_available():
    DEVICE = 'cuda'
    TORCH_TYPE = torch.bfloat16 if torch.cuda.get_device_capability()[0] >= 8 else torch.float16
else:
    DEVICE = 'cpu'
    TORCH_TYPE = torch.float16

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    torch_dtype=TORCH_TYPE,
    trust_remote_code=True,
)
model = model.to(DEVICE).eval()


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

In [3]:

# Interactive CogVLM2 chat. The outer loop asks for an image path (empty input means a
# text-only conversation); the inner loop chats until the user types "clear".
text_only_template = "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: {} ASSISTANT:"

while True:
    # The argument of input() is only the prompt shown to the user; the path itself is
    # typed in (e.g. data/airconditioner0_16.png).
    image_path = input("image path >>>>> ")
    if image_path == '':
        print('You did not enter image path, the following will be a plain text conversation.')
        image = None
        text_only_first_query = True
    else:
        image = Image.open(image_path).convert('RGB')

    history = []

    while True:
        query = input("Human:")
        if query == "clear":
            break

        if image is None:
            if text_only_first_query:
                # First text-only turn: wrap the query in the system template.
                query = text_only_template.format(query)
                text_only_first_query = False
            else:
                # Later text-only turns: replay the previous turns ahead of the new query.
                old_prompt = ''
                for _, (old_query, response) in enumerate(history):
                    old_prompt += old_query + " " + response + "\n"
                # "{}" is a positional field; the former "{describe the image}" was a
                # named field and made str.format raise KeyError.
                query = old_prompt + "USER: {} ASSISTANT:".format(query)
        if image is None:
            input_by_model = model.build_conversation_input_ids(
                tokenizer,
                query=query,
                history=history,
                template_version='chat'
            )
        else:
            input_by_model = model.build_conversation_input_ids(
                tokenizer,
                query=query,
                history=history,
                images=[image],
                template_version='chat'
            )
        # Add a batch dimension and move every tensor to the model device.
        inputs = {
            'input_ids': input_by_model['input_ids'].unsqueeze(0).to(DEVICE),
            'token_type_ids': input_by_model['token_type_ids'].unsqueeze(0).to(DEVICE),
            'attention_mask': input_by_model['attention_mask'].unsqueeze(0).to(DEVICE),
            'images': [[input_by_model['images'][0].to(DEVICE).to(TORCH_TYPE)]] if image is not None else None,
        }
        gen_kwargs = {
            "max_new_tokens": 2048,
            "pad_token_id": 128002,
        }
        with torch.no_grad():
            outputs = model.generate(**inputs, **gen_kwargs)
            # Keep only the generated tokens and cut the text at the end-of-text marker.
            outputs = outputs[:, inputs['input_ids'].shape[1]:]
            response = tokenizer.decode(outputs[0])
            response = response.split("<|end_of_text|>")[0]
            print("\nCogVLM2:", response)
        history.append((query, response))


data/airconditioner0_16.png data/airconditioner0_16.png
Human: describe the image


RuntimeError: NVML_SUCCESS == r INTERNAL ASSERT FAILED at "../c10/cuda/CUDACachingAllocator.cpp":844, please report a bug to PyTorch. 

In [1]:
## int 4 
!pip install torch==2.1.0 torchvision transformers==4.40.0 xformers PILLOW accelerate bitsandbytes einops

Looking in indexes: https://nexus.iisys.de/repository/ki-awz-pypi-group/simple, https://pypi.org/simple
Collecting torch==2.1.0
  Using cached torch-2.1.0-cp310-cp310-manylinux1_x86_64.whl.metadata (25 kB)
Collecting torchvision
  Using cached torchvision-0.18.1-cp310-cp310-manylinux1_x86_64.whl.metadata (6.6 kB)
Collecting transformers==4.40.0
  Using cached transformers-4.40.0-py3-none-any.whl.metadata (137 kB)
Collecting xformers
  Using cached xformers-0.0.27-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.0 kB)
Collecting accelerate
  Using cached accelerate-0.32.1-py3-none-any.whl.metadata (18 kB)
Collecting bitsandbytes
  Using cached bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl.metadata (2.2 kB)
Collecting einops
  Using cached einops-0.8.0-py3-none-any.whl.metadata (12 kB)
Collecting filelock (from torch==2.1.0)
  Using cached filelock-3.15.4-py3-none-any.whl.metadata (2.9 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch==2.1.0)
  Using cached nvidia_cuda

In [2]:
import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_PATH = "THUDM/cogvlm2-llama3-chat-19B-int4"

# Use the GPU when available; bfloat16 is only chosen for compute capability >= 8,
# otherwise (older GPU or CPU) float16 is used.
if torch.cuda.is_available():
    DEVICE = 'cuda'
    TORCH_TYPE = torch.bfloat16 if torch.cuda.get_device_capability()[0] >= 8 else torch.float16
else:
    DEVICE = 'cpu'
    TORCH_TYPE = torch.float16

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)

# Unlike the full-precision cell, this model is not moved with .to(DEVICE) here.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    torch_dtype=TORCH_TYPE,
    trust_remote_code=True,
    low_cpu_mem_usage=True,
)
model = model.eval()


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
  return self.fget.__get__(instance, owner)()


JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [None]:

# Interactive CogVLM2 chat. The outer loop asks for an image path (empty input means a
# text-only conversation); the inner loop chats until the user types "clear".
text_only_template = "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: {} ASSISTANT:"

while True:
    image_path = input("image path >>>>> ")
    if image_path != '':
        image = Image.open(image_path).convert('RGB')
    else:
        print('You did not enter image path, the following will be a plain text conversation.')
        image = None
        text_only_first_query = True

    history = []

    while True:
        query = input("Human:")
        if query == "clear":
            break

        has_image = image is not None

        if not has_image:
            if text_only_first_query:
                # First text-only turn: wrap the query in the system template.
                query = text_only_template.format(query)
                text_only_first_query = False
            else:
                # Later text-only turns: replay the previous turns ahead of the new query.
                old_prompt = ''.join(
                    old_query + " " + old_response + "\n"
                    for old_query, old_response in history
                )
                query = old_prompt + "USER: {} ASSISTANT:".format(query)

        # The image is only passed along when one was loaded for this conversation.
        conversation_kwargs = dict(query=query, history=history, template_version='chat')
        if has_image:
            conversation_kwargs['images'] = [image]
        input_by_model = model.build_conversation_input_ids(tokenizer, **conversation_kwargs)

        # Add a batch dimension and move every tensor to the model device.
        inputs = {
            key: input_by_model[key].unsqueeze(0).to(DEVICE)
            for key in ('input_ids', 'token_type_ids', 'attention_mask')
        }
        if has_image:
            inputs['images'] = [[input_by_model['images'][0].to(DEVICE).to(TORCH_TYPE)]]
        else:
            inputs['images'] = None

        gen_kwargs = {
            "max_new_tokens": 2048,
            "pad_token_id": 128002,
        }
        with torch.no_grad():
            outputs = model.generate(**inputs, **gen_kwargs)
            # Keep only the generated tokens and cut the text at the end-of-text marker.
            prompt_length = inputs['input_ids'].shape[1]
            outputs = outputs[:, prompt_length:]
            response = tokenizer.decode(outputs[0]).split("<|end_of_text|>")[0]
            print("\nCogVLM2:", response)
        history.append((query, response))


# internlm/internlm-xcomposer2-vl-7b

In [None]:
!pip install transformers
!pip install torch torchvision torchaudio
!pip install einops sentencepiece

In [None]:
import torch
from transformers import AutoModel, AutoTokenizer

# Inference only: switch gradient tracking off globally.
torch.set_grad_enabled(False)

model_name='internlm/internlm-xcomposer2-vl-7b'

# Load the model onto the GPU in eval mode, then its tokenizer.
model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
model = model.cuda().eval()
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)


In [None]:
import os
import csv

# Create CSV file for results
with open('results/internlm-xcomposer2.csv', 'w', newline='') as csvfile:
    fieldnames = ['Image', 'Question', 'Response', 'Reference_answer']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    # Every row of answers.csv is read as (image name, question, reference answer).
    with open('answers.csv', 'r') as file:
        reader = csv.reader(file)
        for row in reader:
            # Spaces are stripped from the image name before building the path.
            image_path = row[0]
            image_path = image_path.replace(" ","")
            question = row[1]
            reference_answer = row[2]

            # The query is prefixed with the <ImageHere> placeholder; the image is
            # passed to model.chat as a file path.
            query = '<ImageHere>'+ question
            image = "data/"+image_path+".png"

            # NOTE: per the original author this should be CUDA autocast, but the
            # CPU variant is used because of a lack of GPU memory.
            with torch.cpu.amp.autocast():
                response, _ = model.chat(tokenizer, query=query, image=image, history=[], do_sample=False)
            print(response)
            # Write result to CSV
            writer.writerow({'Image': os.path.basename(image_path), 'Question': question, 'Response': response , 'Reference_answer': reference_answer})

# Report the file that was actually written (the old message named results.csv).
print("Results saved in results/internlm-xcomposer2.csv")


# OpenGVLab/Mini-InternVL-Chat-4B-V1-5

In [None]:
!pip install decord
!pip install einops flash_attn timm
!pip install transformers==4.37.2
!pip install accelerate sentencepiece

In [None]:
import numpy as np
import torch
import torchvision.transforms as T
from decord import cpu
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
import os
import csv

# Per-channel mean and standard deviation handed to T.Normalize in build_transform
# (applied after the image has been converted to RGB).
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)


def build_transform(input_size):
    """Return the torchvision pipeline that turns one PIL image into a normalized tensor.

    The pipeline converts the image to RGB if needed, resizes it to
    ``input_size`` x ``input_size`` with bicubic interpolation, converts it to a
    tensor and normalizes it with IMAGENET_MEAN / IMAGENET_STD.
    """
    def _ensure_rgb(img):
        # Leave RGB images untouched; convert every other mode.
        return img if img.mode == 'RGB' else img.convert('RGB')

    steps = [
        T.Lambda(_ensure_rgb),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ]
    return T.Compose(steps)


def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    """Pick the (columns, rows) grid whose aspect ratio is closest to ``aspect_ratio``.

    Args:
        aspect_ratio: width / height of the source image.
        target_ratios: iterable of (columns, rows) candidates, visited in order.
        width, height: source image size in pixels, used only to break ties.
        image_size: edge length of one square tile in pixels.

    Returns:
        The chosen candidate, or ``(1, 1)`` when ``target_ratios`` is empty.
    """
    image_area = width * height
    chosen = (1, 1)
    smallest_gap = float('inf')

    for candidate in target_ratios:
        gap = abs(aspect_ratio - candidate[0] / candidate[1])

        if gap < smallest_gap:
            # Strictly better match: remember it.
            smallest_gap, chosen = gap, candidate
            continue

        if gap != smallest_gap:
            continue

        # Tie: switch to this candidate only if the image covers more than half
        # of the pixel area that the candidate grid would span.
        if image_area > 0.5 * image_size * image_size * candidate[0] * candidate[1]:
            chosen = candidate

    return chosen


def dynamic_preprocess(image, min_num=1, max_num=6, image_size=448, use_thumbnail=False):
    """Resize ``image`` to a grid of square tiles and return the tiles as a list.

    Args:
        image: PIL image to split.
        min_num, max_num: bounds on the number of tiles (columns * rows).
        image_size: edge length of each square tile in pixels.
        use_thumbnail: when true and more than one tile was produced, a copy of
            the whole image resized to one tile is appended at the end.

    Returns:
        List of tiles in row-major order, optionally followed by the thumbnail.
    """
    src_width, src_height = image.size
    src_aspect = src_width / src_height

    # Enumerate every (columns, rows) grid whose tile count lies within
    # [min_num, max_num], ordered by tile count.
    grid_candidates = set(
        (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
        i * j <= max_num and i * j >= min_num)
    grid_candidates = sorted(grid_candidates, key=lambda grid: grid[0] * grid[1])

    # Choose the grid whose aspect ratio best matches the source image.
    best_grid = find_closest_aspect_ratio(
        src_aspect, grid_candidates, src_width, src_height, image_size)

    # Size of the resized image, and the number of tiles it is cut into.
    target_width = image_size * best_grid[0]
    target_height = image_size * best_grid[1]
    blocks = best_grid[0] * best_grid[1]

    resized_img = image.resize((target_width, target_height))
    tiles_per_row = target_width // image_size

    processed_images = []
    for index in range(blocks):
        # Row-major position of this tile inside the grid.
        tile_row, tile_col = divmod(index, tiles_per_row)
        box = (
            tile_col * image_size,
            tile_row * image_size,
            (tile_col + 1) * image_size,
            (tile_row + 1) * image_size,
        )
        processed_images.append(resized_img.crop(box))
    assert len(processed_images) == blocks

    # A global thumbnail is only added when the image was actually split.
    if use_thumbnail and len(processed_images) != 1:
        processed_images.append(image.resize((image_size, image_size)))
    return processed_images


def load_image(image_file, input_size=448, max_num=6):
    """Open an image file and return its tiles as one stacked tensor.

    The image is converted to RGB, split by ``dynamic_preprocess`` (thumbnail
    enabled, at most ``max_num`` tiles), and each tile is passed through the
    pipeline from ``build_transform`` before the results are stacked.
    """
    rgb_image = Image.open(image_file).convert('RGB')
    to_tensor = build_transform(input_size=input_size)
    tiles = dynamic_preprocess(rgb_image, image_size=input_size, use_thumbnail=True, max_num=max_num)
    return torch.stack([to_tensor(tile) for tile in tiles])


# Load Mini-InternVL-Chat-4B in bfloat16, switch it to eval mode and move it to the GPU.
path = 'OpenGVLab/Mini-InternVL-Chat-4B-V1-5'
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    trust_remote_code=True,
)
model = model.eval().cuda()

tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)

#  # set the max number of tiles in `max_num`
# pixel_values = load_image("data/airconditioner0_16_1.png", max_num=6).to(torch.bfloat16).cuda()
            
# generation_config = dict(
#     num_beams=1,
#     max_new_tokens=1024,
#     do_sample=False,
# )
            
#             # single-image single-round conversation (单图单轮对话)
# response = model.chat(tokenizer, pixel_values, '<image>\n'+question, generation_config)
# print(response)


# Create CSV file for results
with open('results/Mini-InternVL-Chat-4B-V1-5.csv', 'w', newline='') as csvfile:
    fieldnames = ['Image', 'Question', 'Response' , 'Reference_answer']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    # Every row of answers.csv is read as (image name, question, reference answer).
    with open('answers.csv', 'r') as file:
        reader = csv.reader(file)
        for row in reader:
            # Strip spaces from the image name, as every other model cell in this
            # notebook does, so the same answers.csv entries resolve to the same files.
            image_name = row[0].replace(" ", "")
            image_path = "data/"+image_name+".png"
            question = row[1]
            reference_answer = row[2]

            # set the max number of tiles in `max_num`
            pixel_values = load_image(image_path, max_num=6).to(torch.bfloat16).cuda()

            # Greedy decoding: a single beam and no sampling.
            generation_config = dict(
                num_beams=1,
                max_new_tokens=1024,
                do_sample=False,
            )

            # single-image single-round conversation
            response = model.chat(tokenizer, pixel_values, '<image>\n'+question, generation_config)

            # Write result to CSV. The Image column holds the bare image name (no
            # "data/" prefix, no ".png" suffix), matching the other models' result files.
            writer.writerow({'Image': image_name, 'Question': question, 'Response': response, 'Reference_answer':reference_answer})

print("Results saved.")


# OpenGVLab/Mini-InternVL-Chat-2B-V1-5

In [None]:
!pip install transformers==4.37.2
!pip3 install torch torchvision torchaudio
!pip install decord
!pip install einops flash_attn timm sentencepiece
!pip install accelerate

In [None]:
import numpy as np
import torch
import torchvision.transforms as T
from decord import cpu
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
import os
import csv

# Per-channel mean and standard deviation passed to T.Normalize in
# build_transform (images are converted to RGB first, so the order is R, G, B).
# The names indicate these are the ImageNet statistics.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)


def build_transform(input_size):
    """Build the preprocessing pipeline applied to every image tile.

    The pipeline converts the image to RGB when it is in another mode,
    resizes it to ``input_size`` x ``input_size`` with bicubic interpolation,
    converts it to a tensor and normalizes it with IMAGENET_MEAN / IMAGENET_STD.

    Args:
        input_size: Side length, in pixels, of the square output.

    Returns:
        A ``T.Compose`` object chaining the four steps above.
    """
    def _ensure_rgb(img):
        # Leave images that are already RGB untouched.
        return img if img.mode == 'RGB' else img.convert('RGB')

    steps = [
        T.Lambda(_ensure_rgb),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ]
    return T.Compose(steps)


def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    """Pick the candidate grid whose aspect ratio is closest to the image's.

    Args:
        aspect_ratio: Width / height of the original image.
        target_ratios: Iterable of ``(columns, rows)`` candidates, scanned in order.
        width: Original image width in pixels.
        height: Original image height in pixels.
        image_size: Side length of one square tile in pixels.

    Returns:
        The chosen ``(columns, rows)`` candidate, or ``(1, 1)`` when no
        candidate improves on the initial infinite distance (e.g. an empty
        iterable).

    On an exact tie with the current best distance, the later candidate
    replaces the current choice only when the original image area exceeds
    half of the area covered by that candidate's tiles.
    """
    image_area = width * height
    smallest_gap = float('inf')
    chosen = (1, 1)

    for candidate in target_ratios:
        cols, rows = candidate[0], candidate[1]
        gap = abs(aspect_ratio - cols / rows)

        if gap < smallest_gap:
            # Strictly better match: always take it.
            smallest_gap = gap
            chosen = candidate
            continue

        # Tie: prefer the later candidate only if the image is large enough
        # to fill at least half of its tile area.
        if gap == smallest_gap and image_area > 0.5 * image_size * image_size * cols * rows:
            chosen = candidate

    return chosen


def dynamic_preprocess(image, min_num=1, max_num=6, image_size=448, use_thumbnail=False):
    """Resize an image onto a grid of square tiles and cut it into those tiles.

    Args:
        image: Image object exposing ``size``, ``resize`` and ``crop``
            (a PIL image in this notebook).
        min_num: Minimum number of tiles a candidate grid may contain.
        max_num: Maximum number of tiles a candidate grid may contain.
        image_size: Side length, in pixels, of each square tile.
        use_thumbnail: When true and more than one tile was produced, append
            the whole image resized to a single tile.

    Returns:
        List of tiles in row-major order, optionally followed by the thumbnail.
    """
    orig_width, orig_height = image.size
    aspect_ratio = orig_width / orig_height

    # every (columns, rows) grid whose tile count lies in [min_num, max_num],
    # ordered by tile count
    candidate_grids = set(
        (cols, rows)
        for n in range(min_num, max_num + 1)
        for cols in range(1, n + 1)
        for rows in range(1, n + 1)
        if min_num <= cols * rows <= max_num)
    candidate_grids = sorted(candidate_grids, key=lambda grid: grid[0] * grid[1])

    # grid whose aspect ratio best matches the image
    grid = find_closest_aspect_ratio(
        aspect_ratio, candidate_grids, orig_width, orig_height, image_size)

    # pixel dimensions of the resized image and the number of tiles it holds
    target_width = image_size * grid[0]
    target_height = image_size * grid[1]
    blocks = grid[0] * grid[1]

    resized_img = image.resize((target_width, target_height))
    tiles_per_row = target_width // image_size

    # cut the resized image into tiles, left to right, top to bottom
    processed_images = []
    for index in range(blocks):
        row, col = divmod(index, tiles_per_row)
        box = (
            col * image_size,
            row * image_size,
            (col + 1) * image_size,
            (row + 1) * image_size,
        )
        processed_images.append(resized_img.crop(box))
    assert len(processed_images) == blocks

    # a single tile already shows the whole image, so no thumbnail is added
    if use_thumbnail and len(processed_images) != 1:
        processed_images.append(image.resize((image_size, image_size)))
    return processed_images


def load_image(image_file, input_size=448, max_num=6):
    """Load an image and turn it into a stacked tensor of preprocessed tiles.

    Args:
        image_file: Path or file object accepted by ``PIL.Image.open``.
        input_size: Side length, in pixels, of every square tile.
        max_num: Maximum number of tiles cut from the image; a thumbnail tile
            is appended when more than one tile is produced.

    Returns:
        Tensor produced by ``torch.stack`` over the transformed tiles, i.e.
        one entry along the first dimension per tile.
    """
    # Release the file handle as soon as the pixels are decoded: convert()
    # returns an independent in-memory image, so nothing else needs the file.
    # This function is called once per CSV row, so leaked handles add up.
    with Image.open(image_file) as source:
        image = source.convert('RGB')

    transform = build_transform(input_size=input_size)
    tiles = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
    return torch.stack([transform(tile) for tile in tiles])


# Load the Mini-InternVL-Chat-2B-V1-5 checkpoint in bfloat16, switch it to
# evaluation mode and move it to the GPU, then load the matching tokenizer.
path = 'OpenGVLab/Mini-InternVL-Chat-2B-V1-5'

model = AutoModel.from_pretrained(
    path,
    trust_remote_code=True,
    low_cpu_mem_usage=True,
    torch_dtype=torch.bfloat16,
)
model = model.eval()
model = model.cuda()

tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)



# Run Mini-InternVL-Chat-2B-V1-5 over every (image, question, reference) row
# of answers.csv and store the model's responses next to the reference answers.

# Deterministic decoding (single beam, no sampling). The settings are the
# same for every question, so they are built once instead of once per row.
generation_config = dict(
    num_beams=1,
    max_new_tokens=1024,
    do_sample=False,
)

# Explicit UTF-8 so non-ASCII model output cannot fail on a platform whose
# default encoding is narrower.
with open('results/Mini-InternVL-Chat-2B-V1-5.csv', 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['Image', 'Question', 'Response', 'Reference_answer']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    # newline='' lets the csv module itself interpret line breaks, which is
    # required for quoted fields that contain embedded newlines.
    with open('answers.csv', 'r', newline='', encoding='utf-8') as file:
        reader = csv.reader(file)
        for row in reader:
            if not row:
                # Blank line in answers.csv: there is nothing to ask.
                continue

            image_path = "data/" + row[0] + ".png"
            question = row[1]
            reference_answer = row[2]

            # set the max number of tiles in `max_num`
            pixel_values = load_image(image_path, max_num=6).to(torch.bfloat16).cuda()

            # single-image single-round conversation
            # The image placeholder goes into a separate prompt variable so
            # the CSV keeps the question exactly as written in answers.csv
            # rather than the '<image>\n'-prefixed prompt.
            prompt = '<image>\n' + question
            response = model.chat(tokenizer, pixel_values, prompt, generation_config)
            print(f'Assistant: {response}')

            # Write result to CSV
            writer.writerow({
                'Image': os.path.basename(image_path),
                'Question': question,
                'Response': response,
                'Reference_answer': reference_answer,
            })

print("Results saved.")

# HuggingFaceM4/idefics2-8b

In [None]:
!pip install -q git+https://github.com/huggingface/transformers.git
!pip install -q accelerate datasets peft bitsandbytes

In [None]:
import requests
import torch
from PIL import Image
from io import BytesIO

from transformers import AutoProcessor, AutoModelForVision2Seq
from transformers.image_utils import load_image

# Single-image smoke test for HuggingFaceM4/idefics2-8b: load one sample
# image, ask one fixed question and print the decoded answer.

# Device that the model and all input tensors are moved to.
DEVICE = "cuda:0"

# Note that passing the image urls (instead of the actual pil images) to the processor is also possible
# `load_image` here is the helper imported from transformers.image_utils in
# this cell, not the tiling loader defined for the Mini-InternVL cells.
image = load_image("data/airconditioner0_16_1.png")

# Processor and model are loaded from the same checkpoint name.
processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics2-8b")
model = AutoModelForVision2Seq.from_pretrained(
    "HuggingFaceM4/idefics2-8b",
).to(DEVICE)

# Create inputs
# One user turn made of an image placeholder followed by the text question;
# the actual image is supplied separately to the processor below.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "What do we see in this image?"},
        ]
    },    
]
# Render the chat turn into a prompt string, then build the model inputs
# from that prompt and the loaded image.
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(text=prompt, images=[image], return_tensors="pt")
# Move every entry of the processor output to the model's device.
inputs = {k: v.to(DEVICE) for k, v in inputs.items()}

# Generate
# At most 500 new tokens; special tokens are dropped when decoding.
generated_ids = model.generate(**inputs, max_new_tokens=500)
generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)

print(generated_texts)