In [1]:
import argparse
import base64
import os
import re
from io import BytesIO

# os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import openai
import requests
from PIL import Image
from tqdm import tqdm

from visual_scoring.score import (
    UnifiedQAModel,
    VQAModel,
    VS_score_single,
    filter_question_and_answers,
    get_question_and_answers,
)

2025-01-07 21:30:48 | INFO | fairseq.tasks.text_to_speech | Please install tensorboardX: pip install tensorboardX


In [2]:
# Input parameters

data_type = "shape"
begin_idx = 0
device = "cuda:0"
print(
    f"Data type: {data_type}",
    f"Device: {device}",
    f"Begin index: {begin_idx}",
    sep="\n",
)

Data type: shape
Device: cuda:0
Begin index: 0


In [3]:
# Credentials are read from the environment so that no secret is stored in the
# notebook. Export OPENAI_API_KEY (and optionally OPENAI_API_BASE) before
# starting the kernel.
api_key = os.environ.get("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it before running this notebook."
    )
api_base = os.environ.get("OPENAI_API_BASE", "https://ai98.vip/v1")
# Export the base URL too, for any library code that reads it from the environment.
os.environ["OPENAI_API_BASE"] = api_base
openai.api_key = api_key
openai.base_url = api_base
client = openai.OpenAI(api_key=api_key, base_url=api_base)

def openai_completion(prompt, engine="gpt-4o", max_tokens=700, temperature=0):
    """Send ``prompt`` as a single user message and return the reply text.

    Args:
        prompt: Text placed in the one user message of the chat request.
        engine: Model name passed as ``model``.
        max_tokens: Upper bound on generated tokens.
        temperature: Sampling temperature.

    Returns:
        The ``content`` of the first choice's message.
    """
    request = {
        "model": engine,
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": max_tokens,
        "temperature": temperature,
        # Generation stops at the first blank line or end-of-text marker.
        "stop": ["\n\n", "<|endoftext|>"],
    }
    completion = client.chat.completions.create(**request)
    first_choice = completion.choices[0]
    return first_choice.message.content

In [4]:
openai_completion("What is the capital of France?")

2025-01-07 21:31:24 | INFO | httpx | HTTP Request: POST https://ai98.vip/v1/chat/completions "HTTP/1.1 200 OK"


'The capital of France is Paris.'

In [5]:
def get_image_from_url(url: str, timeout: float = 30.0):
    """Download an image and return it as a 224x224 RGB PIL image.

    Args:
        url: Address of the image to fetch.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the notebook indefinitely.

    Returns:
        The downloaded image, resized to 224x224 and converted to RGB.

    Raises:
        requests.exceptions.RequestException: On connection errors, timeouts,
            or a non-success HTTP status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail on HTTP errors here instead of handing an error page to the decoder.
    response.raise_for_status()
    img = Image.open(BytesIO(response.content))
    img = img.resize((224, 224))
    img = img.convert("RGB")
    return img


def get_image_from_path(file_path: str):
    """Open the image at ``file_path``, resize it to 224x224, and convert it to RGB."""
    target_size = (224, 224)
    return Image.open(file_path).resize(target_size).convert("RGB")


def encode_image_from_path(image_path):
    """
    Base64-encode an image file.

    Input:
         - image_path: path of the image file
    Output:
         - the Base64-encoded string
    """
    # Read the file in binary mode.
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
    # Decode the Base64 bytes to a UTF-8 string so they can be used in text contexts.
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")


def encode_image_from_PIL_image(image):
    """Serialize ``image`` as JPEG in memory and return the bytes Base64-encoded as a string."""
    # Save the image into an in-memory buffer as JPEG.
    with BytesIO() as jpeg_buffer:
        image.save(jpeg_buffer, format="JPEG")
        jpeg_bytes = jpeg_buffer.getvalue()
    # Base64-encode the buffered bytes and return them as text.
    return base64.b64encode(jpeg_bytes).decode("utf-8")


# Load the two scoring models onto `device`: the UnifiedQA checkpoint is passed
# to filter_question_and_answers, and the mPLUG VQA model to VS_score_single.
unifiedqa_model = UnifiedQAModel("allenai/unifiedqa-v2-t5-large-1363200", device=device)
vqa_model = VQAModel("mplug-large", device=device)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
2025-01-07 21:32:51,369 - modelscope - INFO - initiate model from /root/autodl-tmp/models/iic/mplug_visual-question-answering_coco_large_en/


Loading mplug-large...


2025-01-07 21:32:51,372 - modelscope - INFO - initiate model from location /root/autodl-tmp/models/iic/mplug_visual-question-answering_coco_large_en/.
2025-01-07 21:32:51,374 - modelscope - INFO - initialize model from /root/autodl-tmp/models/iic/mplug_visual-question-answering_coco_large_en/


load checkpoint from /root/autodl-tmp/models/iic/mplug_visual-question-answering_coco_large_en/pytorch_model.bin
<All keys matched successfully>
Finish loading mplug-large
Using SBERT on GPU


In [7]:
# Generate the question/answer set for one example caption and display it.
gpt_questions = get_question_and_answers("One full pitcher of beer with an elephant's trunk in it.")
gpt_questions

2025-01-07 21:34:45 | INFO | httpx | HTTP Request: POST https://ai98.vip/v1/chat/completions "HTTP/1.1 200 OK"


[{'caption': "One full pitcher of beer with an elephant's trunk in it.",
  'element': 'pitcher of beer',
  'question': 'Is there a pitcher of beer?  ',
  'choices': ['yes', 'no  '],
  'answer': 'yes  ',
  'element_type': 'object'},
 {'caption': "One full pitcher of beer with an elephant's trunk in it.",
  'element': 'pitcher of beer',
  'question': 'What drink is in the pitcher?  ',
  'choices': ['beer', 'water', 'juice', 'milk  '],
  'answer': 'beer  ',
  'element_type': 'object'},
 {'caption': "One full pitcher of beer with an elephant's trunk in it.",
  'element': "elephant's trunk",
  'question': "Is there an elephant's trunk in the pitcher?  ",
  'choices': ['yes', 'no  '],
  'answer': 'yes  ',
  'element_type': 'object'},
 {'caption': "One full pitcher of beer with an elephant's trunk in it.",
  'element': "elephant's trunk",
  'question': 'What is inside the pitcher along with the beer?  ',
  'choices': ["elephant's trunk", 'straw', 'stirrer', 'spoon  '],
  'answer': "elephant's

In [8]:
def get_VS_result(text, img_path, filtered_questions=None):
    """Score an image against a caption with VS_score_single.

    Args:
        text: Caption used to generate questions when none are supplied.
        img_path: Image argument forwarded to ``VS_score_single``.
        filtered_questions: Previously filtered questions to reuse. When this
            is ``None`` or empty, a fresh set is generated and filtered.

    Returns:
        ``result`` alone when ``filtered_questions`` was supplied; otherwise the
        pair ``(filtered_questions, result)`` so the questions can be reused.
    """
    if filtered_questions:
        # Reuse the caller's questions and calculate the VS score only.
        return VS_score_single(vqa_model, filtered_questions, img_path)

    # Generate questions with GPT, then filter them with UnifiedQA.
    candidate_questions = get_question_and_answers(text)
    filtered_questions = filter_question_and_answers(unifiedqa_model, candidate_questions)

    # Calculate the VS score on the freshly filtered questions.
    score = VS_score_single(vqa_model, filtered_questions, img_path)
    return filtered_questions, score
    
def generate_image(prompt, model="dall-e-3", size="1024x1024", quality="standard", n=1):
    """Request an image for ``prompt`` and return the first result as a PIL image.

    The request parameters are forwarded to ``client.images.generate``. Only the
    first entry of the response is downloaded, whatever ``n`` is.
    """
    request = {
        "model": model,
        "prompt": prompt,
        "size": size,
        "quality": quality,
        "n": n,
    }
    generation = client.images.generate(**request)
    first_url = generation.data[0].url
    return get_image_from_url(first_url)

In [9]:
def format_prompt_to_message(
    user_prompt, previous_prompts, generated_image, num_solutions, result
):
    """Build the chat message asking the model to paraphrase ``user_prompt``.

    Args:
        user_prompt: The initial human-written prompt.
        previous_prompts: Inserted verbatim under "## Previous Prompts".
        generated_image: Image passed to ``encode_image_from_PIL_image`` and
            embedded as a base64 JPEG data URL.
        num_solutions: Number of paraphrases requested in the instructions.
        result: Mapping whose ``"question_details"`` maps each question to a
            dict with string ``"answer"`` and ``"free_form_vqa"`` entries.

    Returns:
        A one-element list holding a user message whose content is
        ``[text before the image, image part, text after the image]``.
    """
    image_b64 = encode_image_from_PIL_image(generated_image)

    vqa_intro = "In the image generated from above prompt, the VQA model identified infer that the answer to the question is: "
    # One text block per question; blocks are separated by a blank line.
    element_report = "\n".join(
        "".join(
            (
                "Element ", str(index), "\n",
                "Question: ", question, "\n",
                "Ground Truth: ", details["answer"], "\n",
                vqa_intro, details["free_form_vqa"], "\n",
            )
        )
        for index, (question, details) in enumerate(result["question_details"].items())
    )

    prompt = f"""
You are an expert prompt optimizer for text-to-image models. Text-to-image models take a text prompt as input and generate images depicting the prompt as output. You are responsible for transforming human-written prompts into improved prompts for text-to-image models. Your responses should be concise and effective.

Your task is to optimize the human initial prompt: "{user_prompt}". Below are some previous prompts along with a breakdown of their visual elements. Each element is paired with a score indicating its presence in the generated image. A score of 1 indicates visual elements matching the human initial prompt, while a score of 0 indicates no match.

Here is the image that the text-to-image model generated based on the initial prompt:
{{image_placeholder}}

Here are the previous prompts and their visual element scores:
## Previous Prompts
{previous_prompts}
## Visual Element Scores
{element_report}

Generate {num_solutions} paraphrases of the initial prompt which retain the semantic meaning and have higher scores than all the previous prompts. Prioritize optimizing for objects with the lowest scores. Prefer substitutions and reorderings over additions. Please respond with each new prompt in between <PROMPT> and </PROMPT>, for example:
1. <PROMPT>paraphrase 1</PROMPT>
2. <PROMPT>paraphrase 2</PROMPT>
...
{num_solutions}. <PROMPT>paraphrase {num_solutions}</PROMPT>
"""
    # The literal placeholder marks where the image part goes between the two text parts.
    segments = prompt.split("{image_placeholder}")
    image_part = {
        "type": "image_url",
        "image_url": {
            "url": f"data:image/jpeg;base64,{image_b64}",
            "detail": "high",
        },
    }
    return [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": segments[0]},
                image_part,
                {"type": "text", "text": segments[1]},
            ],
        }
    ]

In [10]:
def generate_image_chat_response(messages_template, client):
    """Send ``messages_template`` to ``gpt-4o`` and return the reply text.

    Args:
        messages_template: Chat messages, forwarded unchanged as ``messages``.
        client: Object exposing ``chat.completions.create``.

    Returns:
        The ``content`` of the first choice's message.
    """
    # Call the OpenAI API to generate the reply (temperature 0 and a fixed seed).
    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=messages_template,
        max_tokens=1600,
        temperature=0,
        seed=2024,
    )

    # Return the generated text.
    first_choice = completion.choices[0]
    return first_choice.message.content


def extract_prompts(text):
    """Return every string enclosed in ``<PROMPT>...</PROMPT>`` tags, in order.

    Args:
        text: Model reply to scan.

    Returns:
        A list of the tag contents; empty when no tag pair is present.
    """
    pattern = r"<PROMPT>(.*?)</PROMPT>"
    # DOTALL lets a prompt that spans several lines match instead of being dropped.
    prompts = re.findall(pattern, text, flags=re.DOTALL)
    return prompts

In [10]:
max_retries = 10  # Maximum number of attempts for each step


def _attempt_with_retries(step):
    """Run ``step()`` up to ``max_retries`` times, logging every failure.

    Returns ``(True, result)`` on the first success and ``(False, None)`` once
    all attempts have failed.
    """
    for attempt in range(1, max_retries + 1):
        try:
            return True, step()
        except Exception as e:
            print(f"Error: {e}")
            if attempt < max_retries:
                print(f"Retrying... ({attempt}/{max_retries})")
            else:
                print("Max retries reached. Exiting.")
    return False, None


def DALLE3_VS(prompt):
    """Generate an image, refine the prompt once from its VS score, and keep the better image.

    Steps: generate an image for ``prompt``; score it with ``get_VS_result``;
    ask the chat model for paraphrases and take the first one; generate a second
    image from that paraphrase; score it with the same questions.

    Args:
        prompt: The initial text prompt.

    Returns:
        ``None`` when the first image cannot be generated. The first image when
        any later step fails or the second image does not score strictly
        higher. Otherwise the second image.
    """
    print(f"Generating image for prompt: {prompt}")
    generated, image = _attempt_with_retries(lambda: generate_image(prompt=prompt))
    if not generated:
        print("Failed to generate image. Exiting.")
        return
    print("Image generated successfully!")

    def _score_first_image():
        questions, scores = get_VS_result(prompt, image)
        print(f"\nVS score: {scores['VS_score']}")
        return questions, scores

    print("Calculating VS score...")
    scored, scoring = _attempt_with_retries(_score_first_image)
    if not scored:
        print("Failed to calculate VS score. Exiting.")
        return image
    filtered_questions, VS_result = scoring

    def _refine_prompt():
        formatted_prompt = format_prompt_to_message(
            user_prompt=prompt,
            previous_prompts=prompt,
            generated_image=image,
            num_solutions=3,
            result=VS_result,
        )
        generate_prompts = generate_image_chat_response(formatted_prompt, client)
        # An empty list raises IndexError here and is treated as a failed attempt.
        return extract_prompts(generate_prompts)[0]

    print("Generating new prompt...")
    refined, new_regional_prompt = _attempt_with_retries(_refine_prompt)
    if not refined:
        print("Failed to generate new prompt. Exiting.")
        return image
    print("Prompt formatted successfully!")

    print(f"New prompt generated: {new_regional_prompt}")
    try:
        # Render the refined prompt; rendering the original again would make
        # the refinement step pointless.
        new_image = generate_image(prompt=new_regional_prompt)
        # Reuse the first image's questions so the two scores are comparable.
        new_VS_result = get_VS_result(prompt, new_image, filtered_questions)
    except Exception as e:
        print(f"Error: {e}")
        return image

    print(f"\nVS score: {new_VS_result['VS_score']}")

    if new_VS_result["VS_score"] > VS_result["VS_score"]:
        return new_image
    else:
        return image

In [11]:
# Run the single-pass DALLE3_VS pipeline on one example caption.
prompt = "One full pitcher of beer with an elephant's trunk in it."
DALLE3_VS(prompt)

NameError: name 'DALLE3_VS' is not defined

In [15]:
max_retries = 10  # Maximum number of attempts


def generate_image_robust(prompt):
    """Call ``generate_image`` with retries and return the image.

    Args:
        prompt: Text prompt forwarded to ``generate_image``.

    Returns:
        The image from the first successful attempt.

    Raises:
        Exception: When all ``max_retries`` attempts fail; the last underlying
            error is attached as the cause.
    """
    print(f"Generating image for prompt: {prompt}")
    last_error = None
    for attempt in range(1, max_retries + 1):
        try:
            image = generate_image(prompt=prompt)
        except Exception as e:
            last_error = e
            print(f"Error: {e}")
            if attempt < max_retries:
                print(f"Retrying... ({attempt}/{max_retries})")
            else:
                print("Max retries reached. Exiting.")
        else:
            # Report success once per call.
            print("Image generated successfully!")
            return image

    print("Failed to generate image. Exiting.")
    raise Exception("Failed to generate image") from last_error

In [16]:
class Reviewer:
    """
    Agent A: Reviewer
    - Reads/reviews the initial solution or idea and assesses its correctness and completeness;
    - Mainly identifies strengths and weaknesses, but does not necessarily raise deep challenges or propose fixes.
    """

    def __init__(self):
        super().__init__()
        print("\nReviewer initialized.")
        print("----------------------")

    def calculate_VS_score(self, prompt, image):
        """Score ``image`` against ``prompt`` with ``get_VS_result``, retrying on errors.

        Args:
            prompt: Caption the questions are generated from.
            image: Image to score.

        Returns:
            ``(filtered_questions, VS_result)`` from the first successful attempt.

        Raises:
            Exception: When all ``max_retries`` attempts fail; the last
                underlying error is attached as the cause.
        """
        print("Calculating VS score...")
        last_error = None
        for attempt in range(1, max_retries + 1):
            try:
                filtered_questions, VS_result = get_VS_result(prompt, image)
                print(f"\nVS score: {VS_result['VS_score']}")
            except Exception as e:
                last_error = e
                print(f"Error: {e}")
                if attempt < max_retries:
                    print(f"Retrying... ({attempt}/{max_retries})")
                else:
                    print("Max retries reached. Exiting.")
            else:
                print("VS score calculated successfully!")
                return filtered_questions, VS_result

        print("Failed to calculate VS score. Exiting.")
        raise Exception("Failed to calculate VS score.") from last_error

    def format_prompt_to_message(
        self, user_prompt, previous_prompts, generated_image, vs_result
    ):
        """Build the chat message that asks for a review of the prompt(s).

        Args:
            user_prompt: The initial human-written prompt.
            previous_prompts: Inserted verbatim under "## Previous Prompts";
                ``None`` is rendered as the text ``None``.
            generated_image: Image passed to ``encode_image_from_PIL_image``
                and embedded as a base64 JPEG data URL.
            vs_result: Mapping whose ``"question_details"`` maps each question
                to a dict with string ``"answer"`` and ``"free_form_vqa"`` entries.

        Returns:
            A one-element list holding a user message whose content is
            ``[text before the image, image part, text after the image]``.
        """
        image = encode_image_from_PIL_image(generated_image)

        # One text block per question; blocks are separated by a blank line.
        VS_results = []
        for i, (key, value) in enumerate(vs_result["question_details"].items()):
            VS_result = "Element " + str(i) + "\n"
            VS_result += "Question: " + key + "\n"
            VS_result += "Ground Truth: " + value["answer"] + "\n"
            VS_result += (
                "In the image generated from above prompt, the VQA model identified infer that the answer to the question is: "
                + value["free_form_vqa"]
                + "\n"
            )

            VS_results.append(VS_result)

        VS_results = "\n".join(VS_results)

        prompt = f"""
You are a prompt reviewer for text-to-image models. Your role is to evaluate both the initial human-written prompt and previous prompts based on their effectiveness in conveying visual elements that match the generated images. Consider the scores assigned to each visual element in the outputs, with 1 indicating a perfect match and 0 indicating no match.

Your task is to review the initial prompt: "{user_prompt}". Additionally, provide an evaluation of the previous prompts given.

Here is the image that the text-to-image model generated based on the initial prompt:
{{image_placeholder}}

Here are the previous prompts and their visual element scores:
## Previous Prompts
{previous_prompts}
## Visual Element Scores
{VS_results}

Provide a comprehensive evaluation of the initial prompt and each of the previous prompts. Focus on the correctness and completeness of each prompt in relation to the generated images, highlighting strengths and weaknesses. Depth questioning or suggested alterations are not necessary, but insightful commentary is encouraged.
If there are no previous prompts, simply provide an evaluation for the initial prompt. Respond with each evaluation in between <EVALUATION> and </EVALUATION> as follows:

1. <EVALUATION>Your Evaluation for initial prompt</EVALUATION>
2. <EVALUATION>Your Evaluation for previous prompt 1</EVALUATION>
...
n. <EVALUATION>Your Evaluation for previous prompt n</EVALUATION>

"""

        # The literal placeholder marks where the image part goes between the two text parts.
        text_prompts = prompt.split("{image_placeholder}")

        user_content = [{"type": "text", "text": text_prompts[0]}]
        base64_images = [
            {
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/jpeg;base64,{image}",
                    "detail": "high",
                },
            }
        ]
        user_content.extend(base64_images)
        user_content.append({"type": "text", "text": text_prompts[1]})
        messages_template = [{"role": "user", "content": user_content}]

        return messages_template

    def generate_response(self, user_prompt, generated_image, previous_prompts=None):
        """Score the image, then ask the chat model for a written evaluation.

        Args:
            user_prompt: The initial human-written prompt.
            generated_image: Image generated for ``user_prompt``.
            previous_prompts: Optional earlier prompts to include in the request.

        Returns:
            ``(filtered_questions, VS_result, response)`` where ``response`` is
            the reply text from ``generate_image_chat_response``.

        Raises:
            Exception: Propagated from ``calculate_VS_score`` when scoring fails.
        """
        filtered_questions, VS_result = self.calculate_VS_score(user_prompt, generated_image)
        formatted_prompt = self.format_prompt_to_message(
            user_prompt=user_prompt,
            generated_image=generated_image,
            previous_prompts=previous_prompts,
            vs_result=VS_result,
        )
        print("Generating evaluation response...")
        response = generate_image_chat_response(formatted_prompt, client)
        return filtered_questions, VS_result, response
    

In [17]:
# Run the Reviewer agent on one caption: generate an image, then score it and
# request a written evaluation. The last line displays the evaluation text.
prompt = "One full pitcher of beer with an elephant's trunk in it."
reviewer = Reviewer()
image = generate_image_robust(prompt)
filtered_questions, VS_result, reviewer_evaluation = reviewer.generate_response(prompt, image)
reviewer_evaluation


Reviewer initialized.
----------------------
Generating image for prompt: One full pitcher of beer with an elephant's trunk in it.


2025-01-07 21:37:33 | INFO | httpx | HTTP Request: POST https://ai98.vip/v1/images/generations "HTTP/1.1 200 OK"


Image generated successfully!
Image generated successfully!
Calculating VS score...
Calculating VS score...


2025-01-07 21:38:20 | INFO | httpx | HTTP Request: POST https://ai98.vip/v1/chat/completions "HTTP/1.1 200 OK"
0it [00:00, ?it/s]


Error: mean requires at least one data point
Retrying... (1/10)


2025-01-07 21:38:26 | INFO | httpx | HTTP Request: POST https://ai98.vip/v1/chat/completions "HTTP/1.1 200 OK"
0it [00:00, ?it/s]


Error: mean requires at least one data point
Retrying... (2/10)


2025-01-07 21:38:32 | INFO | httpx | HTTP Request: POST https://ai98.vip/v1/chat/completions "HTTP/1.1 200 OK"
0it [00:00, ?it/s]


Error: mean requires at least one data point
Retrying... (3/10)


2025-01-07 21:38:40 | INFO | httpx | HTTP Request: POST https://ai98.vip/v1/chat/completions "HTTP/1.1 200 OK"
100%|██████████| 8/8 [00:03<00:00,  2.02it/s]



VS score: 0.625
VS score calculated successfully!
Generating evaluation response...


2025-01-07 21:38:50 | INFO | httpx | HTTP Request: POST https://ai98.vip/v1/chat/completions "HTTP/1.1 200 OK"


"<EVALUATION>The initial prompt “One full pitcher of beer with an elephant's trunk in it.” has a mixed effectiveness in conveying visual elements to match the generated image. \n\nStrengths:\n- The image clearly shows a beer and an elephant's trunk, which aligns with the elements described in the prompt (Element 0 and Element 4 both scored well).\n- The beverage is correctly identified as beer (Element 3).\n\nWeaknesses:\n- The prompt specifies a “pitcher” of beer, but the image shows a mug, which does not match the description (Element 1). \n- Additionally, while the prompt indicates that the pitcher is full, the model identified it as not full (Element 6), which is inconsistent.\n\nOverall, the prompt conveys key elements effectively, but the type of container and its fullness were mismatched in the generated image. The visual depiction of the elephant’s trunk is accurate according to the prompt.</EVALUATION>"

In [18]:
filtered_questions, VS_result

([{'caption': "One full pitcher of beer with an elephant's trunk in it.",
   'element': 'pitcher',
   'question': 'is there a pitcher of beer?',
   'choices': ['yes', 'no'],
   'answer': 'yes',
   'element_type': 'object',
   'free_form_vqa': 'yes',
   'multiple_choice_vqa': 'yes',
   'scores': 1},
  {'caption': "One full pitcher of beer with an elephant's trunk in it.",
   'element': 'pitcher',
   'question': 'what type of container is holding the beer?',
   'choices': ['pitcher', 'bottle', 'glass', 'cup'],
   'answer': 'pitcher',
   'element_type': 'object',
   'free_form_vqa': 'mug',
   'multiple_choice_vqa': 'cup',
   'scores': 0},
  {'caption': "One full pitcher of beer with an elephant's trunk in it.",
   'element': 'beer',
   'question': 'is the pitcher filled with beer?',
   'choices': ['yes', 'no'],
   'answer': 'yes',
   'element_type': 'food',
   'free_form_vqa': 'yes',
   'multiple_choice_vqa': 'yes',
   'scores': 1},
  {'caption': "One full pitcher of beer with an elephant

In [38]:
print(reviewer_evaluation)

1. <EVALUATION>The initial prompt "The brown dog is lying on the blue pillow." effectively conveys visual elements that closely match the generated image. The visual element scores indicate that the key aspects of the prompt were successfully captured: there is a dog, it is brown, there is a pillow, and the pillow is blue. Most importantly, the brown dog is on the pillow, which aligns with the prompt's description, although there is a minor discrepancy in element 9 where it is noted as "next to it" instead of "on." Overall, the initial prompt is highly accurate and comprehensive in representing the elements present in the generated image.</EVALUATION>


In [39]:
class Challenger:
    """
    Agent B: Challenger
    - Questions or "attacks" the solution given so far, looking for potential flaws and unmet constraints;
    - May propose ideas for improvement, or raise new counterexamples/constraints to test the current solution.
    """

    def __init__(self):
        super().__init__()
        print("\nChallenger initialized.\n----------------------")

    def format_prompt_to_message(
        self, user_prompt, previous_prompts, generated_image, vs_result, reviewer_evaluation
    ):
        """Build the chat message that asks for a critique of the prompt(s).

        The message carries the initial prompt, the previous prompts, the
        per-question VQA results from ``vs_result["question_details"]``, the
        reviewer's evaluation, and the generated image as a base64 JPEG data URL.

        Returns:
            A one-element list holding a user message whose content is
            ``[text before the image, image part, text after the image]``.
        """
        image_b64 = encode_image_from_PIL_image(generated_image)

        vqa_intro = "In the image generated from above prompt, the VQA model identified infer that the answer to the question is: "
        # One text block per question; blocks are separated by a blank line.
        element_report = "\n".join(
            "".join(
                (
                    "Element ", str(index), "\n",
                    "Question: ", question, "\n",
                    "Ground Truth: ", details["answer"], "\n",
                    vqa_intro, details["free_form_vqa"], "\n",
                )
            )
            for index, (question, details) in enumerate(vs_result["question_details"].items())
        )

        prompt = f"""
You are a prompt challenger for text-to-image models. Your role is to critically evaluate the initial human-written prompt and previous prompts, identifying potential flaws and constraints that are not met based on the evaluation of the reviewer. Consider the scores assigned to each visual element in the outputs, with 1 indicating a perfect match and 0 indicating no match.

Your task is to challenge the initial prompt: "{user_prompt}". Additionally, provide a critique of the previous prompts given.

Here is the image that the text-to-image model generated based on the initial prompt:
{{image_placeholder}}

Here are the previous prompts and their visual element scores:
## Previous Prompts
{previous_prompts}
## Visual Element Scores
{element_report}
## Reviewer's Evaluation
{reviewer_evaluation}

Based on the correctness and completeness of each prompt in relation to the generated images, identify potential weaknesses and unmet constraints. Propose improvement ideas or introduce new counterexamples and constraints to test the current solutions.
If there are no previous prompts, focus on challenging the initial prompt. Respond with each challenge in between <CHALLENGE> and </CHALLENGE> as follows:

1. <CHALLENGE>Your Challenge for initial prompt</CHALLENGE>
2. <CHALLENGE>Your Challenge for previous prompt 1</CHALLENGE>
...
n. <CHALLENGE>Your Challenge for previous prompt n</CHALLENGE>

"""
        # The literal placeholder marks where the image part goes between the two text parts.
        segments = prompt.split("{image_placeholder}")
        image_part = {
            "type": "image_url",
            "image_url": {
                "url": f"data:image/jpeg;base64,{image_b64}",
                "detail": "high",
            },
        }
        return [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": segments[0]},
                    image_part,
                    {"type": "text", "text": segments[1]},
                ],
            }
        ]

    def generate_response(self, user_prompt, generated_image, filtered_questions, VS_result, reviewer_evaluation, previous_prompts=None):
        """Ask the chat model to challenge the prompt and return its reply text.

        ``filtered_questions`` is accepted but not used by this method.
        """
        messages = self.format_prompt_to_message(
            user_prompt=user_prompt,
            previous_prompts=previous_prompts,
            generated_image=generated_image,
            vs_result=VS_result,
            reviewer_evaluation=reviewer_evaluation,
        )

        print("Generating challenge response...")
        return generate_image_chat_response(messages, client)
    

In [40]:
# Ask the Challenger agent to critique the prompt, given the reviewer's evaluation.
challenger = Challenger()
challenger_response = challenger.generate_response(prompt, image, filtered_questions, VS_result, reviewer_evaluation)


Challenger initialized.
----------------------
Generating challenge response...


2024-12-30 10:46:14 | INFO | httpx | HTTP Request: POST https://ai98.vip/v1/chat/completions "HTTP/1.1 200 OK"


In [41]:
challenger_response

'1. <CHALLENGE>The initial prompt "The brown dog is lying on the blue pillow." generally aligns well with the generated image. However, there is a minor discrepancy in Element 9, where it is noted that the dog is "next to it" rather than explicitly "on" the pillow. To improve clarity and precision, the prompt could specify the dog\'s position more explicitly to avoid ambiguity, such as "The brown dog is lying directly atop the blue pillow, resting fully on it."</CHALLENGE>'

In [42]:
class Refiner:
    '''
    Agent C: Refiner / Fixer
    - After receiving feedback from the reviewer and the challenger, modifies, patches, or restructures the current solution;
    - The goal is to raise the quality of the solution so that it better meets the target requirements or constraints.
    '''

    def __init__(self):
        super().__init__()
        print("\nRefiner initialized.\n----------------------")

    def format_prompt_to_message(
        self, user_prompt, previous_prompts, generated_image, vs_result, reviewer_evaluation, challenger_response
    ):
        """Build the chat message that asks for a refined prompt.

        The message carries the initial prompt, the previous prompts, the
        per-question VQA results from ``vs_result["question_details"]``, the
        reviewer's evaluation, the challenger's response, and the generated
        image as a base64 JPEG data URL.

        Returns:
            A one-element list holding a user message whose content is
            ``[text before the image, image part, text after the image]``.
        """
        encoded_image = encode_image_from_PIL_image(generated_image)

        # One text block per question; blocks are separated by a blank line.
        element_blocks = [
            "Element " + str(position) + "\n"
            + "Question: " + question + "\n"
            + "Ground Truth: " + info["answer"] + "\n"
            + "In the image generated from above prompt, the VQA model identified infer that the answer to the question is: "
            + info["free_form_vqa"]
            + "\n"
            for position, (question, info) in enumerate(vs_result["question_details"].items())
        ]
        element_report = "\n".join(element_blocks)

        prompt = f"""
You are a prompt refiner for text-to-image models. Your role is to improve the quality of the initial human-written prompt and previous prompts by incorporating feedback received from the reviewer and challenger. Your goal is to adjust, refine, and reconstruct the prompts to better meet the intended requirements and constraints.

Your task is to refine the initial prompt: "{user_prompt}" and the previous prompts based on the feedback received.

Here is the image that the text-to-image model generated based on the initial prompt:
{{image_placeholder}}

Here are the previous prompts and their visual element scores:
## Previous Prompts
{previous_prompts}
## Visual Element Scores
{element_report}
## Reviewer's Evaluation
{reviewer_evaluation}
## Challenger's Challenge
{challenger_response}

Using the feedback from both the reviewer and the challenger, modify and enhance the prompts to address weaknesses and fulfill unmet constraints. Generate improved prompts that capture the intended visual elements more effectively.
If there are no previous prompts, focus on refining the initial prompt. Respond with each refined prompt in between <REFINED_PROMPT> and </REFINED_PROMPT> as follows:

<REFINED_PROMPT>Your Refined prompt</REFINED_PROMPT>
"""
        # The literal placeholder marks where the image part goes between the two text parts.
        pieces = prompt.split("{image_placeholder}")
        content_parts = [
            {"type": "text", "text": pieces[0]},
            {
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/jpeg;base64,{encoded_image}",
                    "detail": "high",
                },
            },
            {"type": "text", "text": pieces[1]},
        ]
        return [{"role": "user", "content": content_parts}]

    def generate_response(self, user_prompt, generated_image, filtered_questions, VS_result, reviewer_evaluation, challenger_response, previous_prompts=None):
        """Ask the chat model for a refined prompt and return its reply text.

        ``filtered_questions`` is accepted but not used by this method.
        """
        messages = self.format_prompt_to_message(
            user_prompt=user_prompt,
            previous_prompts=previous_prompts,
            generated_image=generated_image,
            vs_result=VS_result,
            reviewer_evaluation=reviewer_evaluation,
            challenger_response=challenger_response,
        )

        print("Generating refiner response...")
        return generate_image_chat_response(messages, client)

In [43]:
# Ask the Refiner agent for an improved prompt, given both agents' feedback.
refiner = Refiner()
refiner_response = refiner.generate_response(prompt, image, filtered_questions, VS_result, reviewer_evaluation, challenger_response)


Refiner initialized.
----------------------
Generating refiner response...


2024-12-30 10:46:22 | INFO | httpx | HTTP Request: POST https://ai98.vip/v1/chat/completions "HTTP/1.1 200 OK"


In [44]:
def extract_refine_prompts(text):
    """Return the first string enclosed in ``<REFINED_PROMPT>...</REFINED_PROMPT>``.

    Args:
        text: Model reply to scan.

    Returns:
        The content of the first tag pair.

    Raises:
        IndexError: When ``text`` contains no complete tag pair.
    """
    pattern = r"<REFINED_PROMPT>(.*?)</REFINED_PROMPT>"
    # DOTALL lets a refined prompt that spans several lines match.
    prompts = re.findall(pattern, text, flags=re.DOTALL)
    if not prompts:
        raise IndexError("no <REFINED_PROMPT>...</REFINED_PROMPT> block found in the response")
    return prompts[0]

# Pull the refined prompt text out of the Refiner's reply and display it.
refine_prompts = extract_refine_prompts(refiner_response)
refine_prompts

'The brown dog is lying directly atop the blue pillow, resting fully on it.'

In [45]:
# Generate an image from the refined prompt.
new_image = generate_image_robust(refine_prompts)

# Score it against the original caption, reusing the questions from the first
# image; with questions supplied, get_VS_result returns only the result dict.
new_VS_result = get_VS_result(prompt, new_image, filtered_questions)

new_VS_result

Generating image for prompt: The brown dog is lying directly atop the blue pillow, resting fully on it.


2024-12-30 10:46:38 | INFO | httpx | HTTP Request: POST https://ai98.vip/v1/images/generations "HTTP/1.1 200 OK"


Image generated successfully!
Image generated successfully!


100%|██████████| 10/10 [00:00<00:00, 12.06it/s]


{'VS_score': 0.9,
 'question_details': {'Is there a dog?': {'caption': 'The brown dog is lying on the blue pillow.',
   'element': 'dog',
   'question': 'Is there a dog?',
   'choices': ['yes', 'no'],
   'answer': 'yes',
   'element_type': 'animal/human',
   'free_form_vqa': 'yes',
   'multiple_choice_vqa': 'yes',
   'scores': 1},
  'What animal is featured in the description?': {'caption': 'The brown dog is lying on the blue pillow.',
   'element': 'dog',
   'question': 'What animal is featured in the description?',
   'choices': ['dog', 'cat', 'bird', 'fish'],
   'answer': 'dog',
   'element_type': 'animal/human',
   'free_form_vqa': 'dog',
   'multiple_choice_vqa': 'dog',
   'scores': 1},
  'Is there a pillow?': {'caption': 'The brown dog is lying on the blue pillow.',
   'element': 'pillow',
   'question': 'Is there a pillow?',
   'choices': ['yes', 'no'],
   'answer': 'yes',
   'element_type': 'object',
   'free_form_vqa': 'yes',
   'multiple_choice_vqa': 'yes',
   'scores': 1},
 

In [46]:
def choose_best_image(image, new_image, VS_result, new_VS_result):
    """Return ``new_image`` when its ``"VS_score"`` is strictly higher, else ``image``.

    Ties keep the original ``image``.
    """
    improved = new_VS_result["VS_score"] > VS_result["VS_score"]
    return new_image if improved else image

# Image.save() returns None, so select the image first and save it as a
# separate step; this keeps `best_image` bound to the image itself.
best_image = choose_best_image(image, new_image, VS_result, new_VS_result)
best_image.save("best_image.jpg")