add openai_based_image_eval in evaluator

YiVal · Nov 14, 2023 · 3db4a85 · 3db4a85
1 parent 6e93038
commit 3db4a85
Show file tree

Hide file tree

Showing 8 changed files with 345 additions and 51 deletions.
diff --git a/demo/building_design.py b/demo/building_design.py
@@ -1,8 +1,11 @@
 '''This script is used to generate image from a building design prompt'''
 
 import os
+from io import BytesIO
 
+import requests
 from openai import OpenAI
+from PIL import Image
 
 from yival.logger.token_logger import TokenLogger
 from yival.schemas.experiment_config import MultimodalOutput
@@ -37,26 +40,29 @@ def prompt_generation(prompt: str) -> str:
     return res
 
 
-# def load_image(response):
-#     '''load image from response'''
-#     print("[INFO] start load images")
-#     url = f"{BASE_URL}/getImage"
-#     image_urls = response['response']['imageUrls']
-#     image_list = []
-#     for image_url in image_urls:
-#         payload = json.dumps({"imgUrl": image_url})
-#         response = s.post(url, headers=HEADERS, data=payload)
-#         if response.status_code == 200:
-#             image_data = response.content
-#             image = Image.open(io.BytesIO(image_data))
-#             image_list.append(image)
-#         else:
-#             print(
-#                 f"[Error] Failed to load image from {image_url}. Response code: {response.status_code}"
-#             )
-#     print("[INFO] Successfully load images.")
-
-#     return image_list
+def load_image(images, timeout=30):
+    '''Download each generated image and return them keyed by URL.
+
+    Args:
+        images: iterable of objects exposing a ``url`` attribute (the caller
+            passes ``response.data`` from ``client.images.generate``).
+        timeout: seconds to wait on each HTTP request. Without a timeout
+            ``requests.get`` may block indefinitely on a stalled server.
+
+    Returns:
+        dict mapping every successfully fetched URL to its opened PIL image.
+        URLs that fail to download or decode are reported and skipped.
+    '''
+    print("[INFO] start load images")
+
+    image_dict = {}
+    for item in images:
+        image_url = item.url
+        try:
+            response = requests.get(image_url, timeout=timeout)
+            if response.status_code == 200:
+                image_dict[image_url] = Image.open(BytesIO(response.content))
+            else:
+                print(
+                    f"[Error] Failed to load image from {image_url}. Response code: {response.status_code}"
+                )
+        # Deliberately broad: loading is best-effort, so one bad image
+        # (network error, timeout, undecodable payload) must not abort the rest.
+        except Exception as e:
+            print(
+                f"[Error] Failed to load image from {image_url}. Error: {str(e)}"
+            )
+
+    print("[INFO] Successfully load images.")
+    return image_dict
 
 
 def building_design(location: str, function: str, state: ExperimentState):
@@ -75,17 +81,17 @@ def building_design(location: str, function: str, state: ExperimentState):
             )
         )
     )
-    print(f"prompt: {prompt}")
+
     response = client.images.generate(
         model="dall-e-3", prompt=prompt, n=1, size="1024x1024"
     )
-    print(f"response: {response}")
-    # image_res = MultimodalOutput(
-    #     text_output=response['response']['content'],
-    #     image_output=response['response']['imageUrls'],
-    # )
-    # return image_res
-    return response
+    print(f"\nresponse: {response}\n")
+    image_res = MultimodalOutput(
+        #     text_output=response.data.revised_prompt,
+        text_output=prompt,
+        image_output=load_image(response.data),
+    )
+    return image_res
 
 
 def main():

diff --git a/demo/configs/building_design.yml b/demo/configs/building_design.yml
@@ -6,13 +6,12 @@ dataset:
       chunk_size: 1000
       diversify: true
       prompt:
-          "Please provide a concrete and realistic test case as a dictionary for function invocation using the ** operator.
-          Only include parameters, excluding description and name.
-          Ensure it's succinct and well-structured.
-          **Only provide the dictionary.**"
+        "Please provide a concrete and realistic test case as a dictionary for function invocation using the ** operator.
+        Only include parameters, excluding description and name.
+        Ensure it's succinct and well-structured.
+        **Only provide the dictionary.**"
       input_function:
-        description:
-          Given the geographical location of the building and the actual function of the building, generate a description of the exterior scene of the building.
+        description: Given the geographical location of the building and the actual function of the building, generate a description of the exterior scene of the building.
         name: building_design_generation
         parameters:
           location: str
@@ -30,21 +29,21 @@ variations:
       diversify: false
       max_tokens: 2000
       variables: null
-      prompt: 
+      prompt:
         - content: |-
-                Your objective is to construct a concise instruction prompt for GPT-4. This prompt will instruct GPT-4 as an innovative, architectural designer to create diverse building design variations based on geographical location and building function.
+            Your objective is to construct a concise instruction prompt for GPT-4. This prompt will instruct GPT-4 as an innovative, architectural designer to create diverse building design variations based on geographical location and building function.
 
-                Points to emphasize in your instruction:
-                  - GPT-4 responses should include a brief design concept and should be sufficiently imaginative. Responses should focus on the geographical location and the function of the building. The design can be contrary to common architectural norms, dream-like or realistic.
-                  - The design description must be kept lively and as short, short, short, short, short as possible. The design should have a variety of features and fully emphasize the geographical location; it can be dreamlike and generally bizarre.
-                  - Be sure to use words that are as easy to understand as possible so TEENAGERS can easily understand the design concept.
-                  - Please limit the output to within 300 words
-                  - {building_location} represents the geographical location of the building
-                  - {building_function} represents the function of the building
-                  - At all times, GPT-4 must give a response as a building design variation
+            Points to emphasize in your instruction:
+              - GPT-4 responses should include a brief design concept and should be sufficiently imaginative. Responses should focus on the geographical location and the function of the building. The design can be contrary to common architectural norms, dream-like or realistic.
+              - The design description must be kept lively and as short, short, short, short, short as possible. The design should have a variety of features and fully emphasize the geographical location; it can be dreamlike and generally bizarre.
+              - Be sure to use words that are as easy to understand as possible so TEENAGERS can easily understand the design concept.
+              - Please limit the output to within 300 words
+              - {building_location} represents the geographical location of the building
+              - {building_function} represents the function of the building
+              - At all times, GPT-4 must give a response as a building design variation
 
-                Craft your instruction. Ensure that GPT-4 is able to fully understand the geographical location and function of the building and create engaging and fully pictorial designs.
-                Keep your output crisp: only the prompt, devoid of any extraneous content.
+            Craft your instruction. Ensure that GPT-4 is able to fully understand the geographical location and function of the building and create engaging and fully pictorial designs.
+            Keep your output crisp: only the prompt, devoid of any extraneous content.
 
           role: system
 
@@ -77,7 +76,33 @@ evaluators:
       D: 3
       E: 4
 
-
+  - evaluator_type: individual
+    metric_calculators:
+      - method: AVERAGE
+    name: openai_prompt_based_image_evaluator
+    display_name: relevance_quality
+    prompt: |-
+      You are assessing a submitted building design description based on a given task and criterion. Here is the data:
+      - Task: Given the geographical location and function of a building, generate a design description and a corresponding image.
+      - Does the image clearly illustrate the content described in the text? The image should immediately make sense to anyone who reads the text. A lack of relevance can lead to confusion and may impact the understanding of the text.
+      [Input]: building_location : {location}, building_function: {function}
+      [Result]: {text_output}
+      Answer the question by selecting one of the following options:
+      A The image fails to match the text content at all.
+      B The image somewhat matches the text content, but there is significant room for improvement.
+      C The image matches the text content to a satisfactory degree.
+      D The image matches the text content very well.
+      E The image matches the text content exceptionally well, with little to no room for improvement.
+    choices: ["A", "B", "C", "D", "E"]
+    model_name: gpt-4-vision-preview
+    description: "evaluate the relevance of the generated building design text and image"
+    scale_description: "0-4"
+    choice_scores:
+      A: 0
+      B: 1
+      C: 2
+      D: 3
+      E: 4
 
 selection_strategy:
   ahp_selection:

diff --git a/src/yival/cli/init.py b/src/yival/cli/init.py
@@ -11,6 +11,7 @@
 from ..evaluators.bertscore_evaluator import BertScoreEvaluator
 from ..evaluators.openai_elo_evaluator import OpenAIEloEvaluator
 from ..evaluators.openai_prompt_based_evaluator import OpenAIPromptBasedEvaluator
+from ..evaluators.openai_prompt_based_image_evaluator import OpenAIPromptBasedImageEvaluator
 from ..evaluators.rouge_evaluator import RougeEvaluator
 from ..evaluators.string_expected_result_evaluator import StringExpectedResultEvaluator
 from ..result_selectors.ahp_selection import AHPSelection
@@ -47,6 +48,7 @@ def _prevent_unused_imports():
     _ = BertScoreEvaluator
     _ = OpenAIEloEvaluator
     _ = OpenAIPromptBasedEvaluator
+    _ = OpenAIPromptBasedImageEvaluator
 
     #Enhancer
     _ = OpenAIPromptBasedCombinationEnhancer