# Agent Chat with Multimodal Models

The session implementation is based on [`babyagi`](../../camel/societies/babyagi_playing.py).

In [1]:
import os

# Paste your OpenAI API key between the quotes before running the cells below.
# NOTE(review): this assignment replaces any OPENAI_API_KEY already present in
# the environment, including with the empty placeholder — confirm that is intended.
os.environ["OPENAI_API_KEY"] = ""

In [4]:
import io
import re
from copy import deepcopy
from pathlib import Path

from colorama import Fore
from IPython.core.display import Markdown
from PIL import Image

from camel.agents import ChatAgent
from camel.configs import FunctionCallingConfig
from camel.functions import T2I_FUNCS
from camel.messages import BaseMessage
from camel.prompts import PromptTemplateGenerator
from camel.responses import ChatAgentResponse
from camel.types import TaskType, RoleType, ModelType
from camel.utils import print_text_animated

class MMChat:
    r"""Multimodal chat session pairing an image-drawing artist with a critic.

    The artist agent is given the functions in ``T2I_FUNCS`` and asked to
    draw an image from a text prompt. The critic agent is shown the
    resulting image(s) and replies with a critique and an updated prompt,
    which is sent back to the artist for another drawing.

    Attributes:
        artist: Agent that draws the images (created by :meth:`init_agents`).
        critic: Agent that reviews the images (created by
            :meth:`init_agents`).
        artist_sys_msg: System message of the artist agent.
        critic_sys_msg: System message of the critic agent.
    """

    # Matches a parenthesised link target whose file name is
    # "<uuid>.jpg" or "<uuid>.png"; group 1 is the UUID, group 2 the extension.
    _IMAGE_LINK_PATTERN = re.compile(
        r'\(.*?/([0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-'
        r'[0-9a-fA-F]{4}-[0-9a-fA-F]{12})(\.jpg|\.png)\)'
    )

    # Size the images are scaled to before they are shown to the critic.
    _CRITIC_IMAGE_SIZE = (128, 128)

    def __init__(
        self,
    ) -> None:
        self.critic = None
        self.artist = None
        critic_sys = """You need to describe what you see in the figure and improve the prompt of it.
Reply with the following format:

CRITICS: the image needs to improve...
PROMPT: here is the updated prompt!
        """
        self.critic_sys_msg = BaseMessage.make_assistant_message(
            role_name='critic',
            content=critic_sys,
        )

        self.artist_sys_msg = BaseMessage.make_assistant_message(
            role_name="Artist",
            content=PromptTemplateGenerator().get_prompt_from_key(
                TaskType.MULTI_CONDITION_IMAGE_CRAFT,
                RoleType.ASSISTANT,
            ),
        )

        self.init_agents()

    def init_agents(self):
        r"""Initialize artist and critic agents with their system messages.

        The artist is configured for function calling with ``T2I_FUNCS`` at
        temperature 0.0; both agents use ``ModelType.GPT_4_TURBO_VISION`` and
        are reset after creation.
        """
        function_list = [*T2I_FUNCS]
        assistant_model_config = (
            FunctionCallingConfig.from_openai_function_list(
                function_list=function_list,
                kwargs=dict(temperature=0.0),
            )
        )

        self.artist = ChatAgent(
            system_message=self.artist_sys_msg,
            model_type=ModelType.GPT_4_TURBO_VISION,
            model_config=assistant_model_config,
            function_list=function_list,
        )
        self.artist.reset()

        self.critic = ChatAgent(
            system_message=self.critic_sys_msg,
            model_type=ModelType.GPT_4_TURBO_VISION,
        )
        self.critic.reset()

    @staticmethod
    def _banner(color: str, title: str) -> str:
        r"""Return the colored ``==========TITLE==========`` header line."""
        return color + "=" * 10 + title + "=" * 10 + "\n"

    @classmethod
    def _localize_image_links(cls, text: str) -> "tuple[str, list[str]]":
        r"""Point every UUID-named image link in ``text`` at ``./img/``.

        Args:
            text (str): Text that may contain ``(.../<uuid>.png)`` or
                ``(.../<uuid>.jpg)`` link targets.

        Returns:
            tuple: The text with each matched target replaced by
                ``(./img/<uuid><ext>)``, and the list of those local paths
                in order of appearance.
        """
        image_paths = []

        def _to_local(match):
            local_path = f"./img/{match.group(1)}{match.group(2)}"
            image_paths.append(local_path)
            return "(" + local_path + ")"

        rewritten = cls._IMAGE_LINK_PATTERN.sub(_to_local, text)
        return rewritten, image_paths

    @classmethod
    def _load_critic_images(cls, image_paths):
        r"""Load the images at ``image_paths`` as 128x128 PNG images.

        Each image is resized and then re-encoded as PNG through an in-memory
        buffer. The re-encoding is the reason the original code saved and
        reopened the images ("for maintaining the image format"); doing it in
        memory avoids writing, overwriting and deleting ``tmp_<n>.png`` files
        in the working directory.

        Args:
            image_paths (List[str]): Paths of the images to load.

        Returns:
            list: One resized, PNG-decoded image per input path.
        """
        thumbnails = []
        for image_path in image_paths:
            with Image.open(image_path) as source:
                resized = source.resize(
                    cls._CRITIC_IMAGE_SIZE, Image.Resampling.LANCZOS
                )
            buffer = io.BytesIO()
            resized.save(buffer, "PNG")
            buffer.seek(0)
            thumbnail = Image.open(buffer)
            # Image.open is lazy; read the pixel data now so the image no
            # longer depends on the buffer position.
            thumbnail.load()
            thumbnails.append(thumbnail)
        return thumbnails

    def _draw(self, artist_user_msg):
        r"""Send one message to the artist and show its reply.

        Returns:
            tuple: The artist's response and the local paths of the images
                referenced in it.
        """
        response = self.artist.step(artist_user_msg)
        response_msg, image_paths = self._localize_image_links(
            response.msg.content
        )
        print_text_animated(
            self._banner(Fore.BLUE, "ARTIST RES") + response_msg
        )
        print()
        # NOTE(review): `display` is not imported in this file; presumably it
        # is supplied by the IPython/Jupyter namespace — confirm.
        display(Markdown(response_msg))
        return response, image_paths

    def step(self, initialPrompt: str, iter_num=2) -> "ChatAgentResponse":
        r"""Process of the drawing and criticising.

        The artist first draws from ``initialPrompt``. Then, ``iter_num``
        times, the critic is shown the latest image(s) and its reply is sent
        to the artist as the prompt for a new drawing. All messages are
        printed along the way.

        Args:
            initialPrompt (str): Text prompt for the first drawing.
            iter_num (int): Number of critique-and-redraw rounds run after
                the first drawing. (default: :obj:`2`)

        Returns:
            ChatAgentResponse: it contains the response message of the artist
                agent in the last iteration.
        """
        artist_user_msg = BaseMessage.make_user_message(
            role_name="User",
            content=initialPrompt,
        )
        print(
            self._banner(Fore.MAGENTA, "ARTIST SYS")
            + self.artist_sys_msg.content
        )
        print()
        print(
            self._banner(Fore.YELLOW, "ARTIST USR") + artist_user_msg.content
        )
        print("\n")

        response, image_paths = self._draw(artist_user_msg)

        for _ in range(iter_num):
            critic_user_msg = BaseMessage.make_user_message(
                role_name="User",
                content="image:",
                image_list=self._load_critic_images(image_paths),
                image_detail="low",
            )
            print(
                self._banner(Fore.GREEN, "CRITIC SYS")
                + self.critic_sys_msg.content
            )
            print()
            print(
                self._banner(Fore.RED, "CRITIC USR")
                + critic_user_msg.content
            )
            prompt = self.critic.step(critic_user_msg).msg.content
            print()
            print_text_animated(
                self._banner(Fore.CYAN, "CRITIC RES") + prompt + Fore.RESET
            )
            print()

            # The critic's whole reply (critique and updated prompt) becomes
            # the artist's next instruction.
            artist_user_msg = BaseMessage.make_user_message(
                role_name="User",
                content=(
                    "Please generate a image based on the following prompt: \n"
                    + prompt
                ),
            )
            response, image_paths = self._draw(artist_user_msg)

        return response

In [5]:
# Run one drawing followed by a single critique-and-redraw round.
session = MMChat()
res = session.step(
    initialPrompt=(
        "Create an image with pink background, "
        "a dog is showing a sign with 'I Love Camel'."
    ),
    iter_num=1,
)

You are tasked with creating an image based on the provided text and images conditions. Please use your imagination and artistic capabilities to visualize and draw the images and explain what you are thinking about.

Create an image with pink background, a dog is showing a sign with 'I Love Camel'.

Here is the image of a cute dog holding a sign that says "I Love Camel" on a pink background:
![Dog with Sign](./img/8e7eb094-a08e-4f6b-896c-400ba464d23e.png)


Here is the image of a cute dog holding a sign that says "I Love Camel" on a pink background:

![Dog with Sign](./img/8e7eb094-a08e-4f6b-896c-400ba464d23e.png)

You need to describe what you see in the figure and improve the prompt of it.
Reply with the following format:

CRITICS: the image needs to improve...
PROMPT: here is the updated prompt!
        

image:

CRITICS: The image needs to improve the clarity of the message it intends to convey. The sign held by the dog with the text "I ❤️ CAMEL" might be confusing without context. It's unclear whether it refers to the animal, a brand, or something else.
PROMPT: Here is the updated prompt!
"Create an image of a cute beagle puppy sitting against a vibrant pink background. The puppy is holding a sign that reads 'I ❤️ CAMEL'. Ensure the text is bold and clear. This image could be used for humorous or promotional content, so consider adding elements that clarify the context, like a small camel graphic on the sign or a playful background element related to the theme."

Here is the updated image based on your prompt:
![Cute Beagle Puppy with Sign](./img/1c2c4e0b-86aa-4389-9deb-f7e1303071d2.png)

Here is the updated image based on your prompt:

![Cute Beagle Puppy with Sign](./img/1c2c4e0b-86aa-4389-9deb-f7e1303071d2.png)

This image features a cute beagle puppy sitting against a vibrant pink background, holding a sign that reads "I ❤️ CAMEL". The sign includes a small camel graphic to clarify the context, and the background has playful elements related to the theme.

- Drawbacks
    - Image Path Processing

- TODO
    - Multimodal Memory
    - Multimodality in Recommendation Systems