In [None]:
import base64
import os
import textwrap

import autogen


# ------------------------------------------------------------------ helpers
def save_b64_png(b64_str, fname="generated.png"):
    """Decode a base64 payload and write the resulting bytes to ``fname``.

    The payload is decoded *before* the destination is opened, so input that
    cannot be decoded raises without creating (or truncating) ``fname``.
    The decoded bytes are written as-is; nothing checks that they are
    actually PNG data.

    Args:
        b64_str: Base64-encoded data to decode.
        fname: Destination path, opened in binary write mode.
            Defaults to ``"generated.png"``.

    Raises:
        binascii.Error: If ``b64_str`` is incorrectly padded.
        OSError: If ``fname`` cannot be opened for writing.
    """
    # Decode first: a failure here must not leave an empty file behind.
    image_bytes = base64.b64decode(b64_str)
    with open(fname, "wb") as f:
        f.write(image_bytes)
    print(f"image saved → {fname}")


# ------------------------------------------------------------------ LLM config
# Single-entry config list: one OpenAI endpoint, key read from the environment
# (the key is None when OPENAI_API_KEY is not set).
llm_cfg = dict(
    config_list=[
        dict(
            api_type="responses",  # use /responses
            model="gpt-4o",  # supports vision + images
            api_key=os.environ.get("OPENAI_API_KEY"),
        ),
    ],
)

# ------------------------------------------------------------------ agents
# Human-side proxy: configured to ask for input at every turn.
user = autogen.UserProxyAgent(human_input_mode="ALWAYS", name="User")

# LLM-backed agent; the system prompt is dedented and stripped so it starts
# and ends without surrounding blank lines.
assistant = autogen.AssistantAgent(
    name="ArtBot",
    system_message=textwrap.dedent(
        """\
        You are an assistant that can reason over images and
        use the built-in image_generation tool. When generating
        an image, return ONLY the tool call result you receive.
        """
    ).strip(),
    llm_config=llm_cfg,
)

# ------------------------------------------------------------------ initial image (URL or data-URI)
IMAGE_URL = "https://upload.wikimedia.org/wikipedia/commons/3/3b/BlkStdSchnauzer2.jpg"

# ------------------------------------------------------------------ chat sequence
# chat1: multimodal user message (a text part followed by an image part).
chat1 = dict(
    role="user",
    content=[
        dict(type="input_text", text="Describe this image in one sentence."),
        dict(type="input_image", image_url=IMAGE_URL),
    ],
)
# chat2: plain-text user message asking for a generated image.
chat2 = dict(
    role="user",
    content="Generate a watercolor painting of a lighthouse at sunset.",
)

In [None]:
# First conversation: send the structured text + image message (chat1) and
# stop after a single turn; the summary is taken from the last message.
result1 = user.initiate_chat(
    recipient=assistant,
    message=chat1,
    summary_method="last_msg",
    max_turns=1,
)
print("Assistant description →", result1.summary)

In [None]:
#
# Second conversation: ask for a watercolor image, passing the built-in
# image_generation tool spec along with the request.
result2 = user.initiate_chat(
    recipient=assistant,
    message=chat2,
    tools=[{"type": "image_generation"}],
    summary_method="last_msg",
    max_turns=1,  # one turn only: no follow-up round in this chat
)

# ------------------------------------------------------------------ extract & save the generated image
# Disabled draft, kept for reference (not executed). Per the original note,
# the Responses API returns an output item of type "image_generation_call".
# The chat_history layout assumed below is unverified — TODO confirm before
# enabling.
#
# print("Assistant (raw) →", result2.summary[:120], "...")
# for item in result2.chat_history:
#     if isinstance(item, dict) and item.get("tool_responses"):
#         # Iterate through tool responses
#         for tr in item["tool_responses"]:
#             if tr.get("type") == "image_generation_call":
#                 save_b64_png(tr["result"])
#                 break

In [None]:
# Plain-text variant of the description request: the image is referenced by
# its URL inside the prompt instead of as a structured input_image part.
# The URL comes from IMAGE_URL so it stays in sync with the structured message.
# NOTE: this rebinds result1, replacing the result of the earlier chat1 cell.
result1 = user.initiate_chat(
    assistant,
    message=f"Describe this image {IMAGE_URL} in one sentence.",
    max_turns=1,
    summary_method="last_msg",
)
print(result1)

In [None]:
# Plain-text variant of the image-generation request: asks for a watercolor
# version of the image referenced by URL, with the image_generation tool spec
# passed along. The URL comes from IMAGE_URL rather than a duplicated literal.
# NOTE: this rebinds result2, replacing the result of the earlier chat2 cell.
result2 = user.initiate_chat(
    assistant,
    message=f"Generate a watercolor version of this image {IMAGE_URL}",
    max_turns=2,  # allow assistant’s tool call & follow-up
    summary_method="last_msg",
    tools=[{"type": "image_generation"}],
)