# Lab | Multi-modal output agent

**Create another version of the prompts below to query the DALL·E image-generation model, following the example shown in this notebook.**

# Multi-modal outputs: Image & Text

This notebook shows how tools that produce non-text output can be used to create multi-modal agents.

This example is limited to text and image outputs.

In [1]:
!pip install langchain langchain_openai langchain_community



In [2]:
from langchain.agents import AgentType, initialize_agent
from langchain.agents import AgentExecutor, create_react_agent
from langchain_openai import OpenAI
from langchain import hub
from langchain.tools import SteamshipImageGenerationTool as OriginalSteamshipImageGenerationTool



In [3]:
import os
from dotenv import find_dotenv, load_dotenv

# Search for a .env file and load its entries into the process environment.
# The return value is not needed, so it is bound to the throwaway name `_`.
_ = load_dotenv(find_dotenv())

# Read the key from the environment; this is None when the variable is not set.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')


## Dall-E

In [4]:
import openai
from langchain.tools import tool
from langchain.agents import initialize_agent, AgentType
from langchain.chat_models import ChatOpenAI
import sys

In [5]:
# Module-level slot holding the URL of the most recently generated image,
# so it can be read back after the agent has finished running.
latest_image_url = None

@tool
def generate_image(prompt: str) -> str:
    """Generate an image using DALL·E based on the given prompt."""
    # NOTE(review): per the LangChain docs, @tool exposes the docstring above to the
    # LLM as the tool description, so its wording is part of runtime behavior.
    global latest_image_url

    # Request a single 1024x1024 image from the dall-e-3 model.
    request_args = {
        "model": "dall-e-3",
        "prompt": prompt,
        "n": 1,
        "size": "1024x1024",
    }
    first_image = openai.images.generate(**request_args).data[0]

    # Remember the URL globally and hand the same value back to the agent.
    latest_image_url = first_image.url
    return latest_image_url


In [6]:
# Build the agent from a chat model (temperature set to 0) and the image tool
llm = ChatOpenAI(temperature=0)
tools = [generate_image]
mrkl = initialize_agent(
    tools,
    llm,
    agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
    verbose=True,
)

# Ask the question; verbose=True prints the agent's trace while it runs
result = mrkl.invoke("How would you visualize a parrot playing soccer?")


  llm = ChatOpenAI(temperature=0)
  mrkl = initialize_agent(




[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mI should use the generate_image tool to create an image of a parrot playing soccer.
Action: generate_image
Action Input: "parrot playing soccer"[0m
Observation: [36;1m[1;3mhttps://oaidalleapiprodscus.blob.core.windows.net/private/org-OVvY4duw7lR6hU7qipfjdf9w/user-CDUbvCq7Hys4B2ofYPHU8Rax/img-1PH3bBR4GReFLR24k9FtWNh8.png?st=2025-05-08T02%3A54%3A29Z&se=2025-05-08T04%3A54%3A29Z&sp=r&sv=2024-08-04&sr=b&rscd=inline&rsct=image/png&skoid=cc612491-d948-4d2e-9821-2683df3719f5&sktid=a48cca56-e6da-484e-a814-9c849652bcb3&skt=2025-05-07T21%3A58%3A55Z&ske=2025-05-08T21%3A58%3A55Z&sks=b&skv=2024-08-04&sig=ClufBzk44EXijAsojO/Ea1HMrMkgTadM7yWDT7WHLWk%3D[0m
Thought:[32;1m[1;3mI have successfully generated an image of a parrot playing soccer.
Final Answer: The image of a parrot playing soccer can be viewed here: [link to the generated image][0m

[1m> Finished chain.[0m


## StableDiffusion

In [9]:
pip install diffusers

Collecting diffusers
  Downloading diffusers-0.33.1-py3-none-any.whl.metadata (19 kB)
Collecting filelock (from diffusers)
  Downloading filelock-3.18.0-py3-none-any.whl.metadata (2.9 kB)
Collecting huggingface-hub>=0.27.0 (from diffusers)
  Downloading huggingface_hub-0.31.1-py3-none-any.whl.metadata (13 kB)
Collecting safetensors>=0.3.1 (from diffusers)
  Using cached safetensors-0.5.3-cp38-abi3-macosx_11_0_arm64.whl.metadata (3.8 kB)
Collecting Pillow (from diffusers)
  Downloading pillow-11.2.1-cp39-cp39-macosx_11_0_arm64.whl.metadata (8.9 kB)
Collecting fsspec>=2023.5.0 (from huggingface-hub>=0.27.0->diffusers)
  Downloading fsspec-2025.3.2-py3-none-any.whl.metadata (11 kB)
Collecting hf-xet<2.0.0,>=1.1.0 (from huggingface-hub>=0.27.0->diffusers)
  Downloading hf_xet-1.1.0-cp37-abi3-macosx_11_0_arm64.whl.metadata (494 bytes)
Downloading diffusers-0.33.1-py3-none-any.whl (3.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m11.9 MB/s[0m eta [3

In [11]:
pip install torch

Collecting torch
  Downloading torch-2.7.0-cp39-none-macosx_11_0_arm64.whl.metadata (29 kB)
Collecting sympy>=1.13.3 (from torch)
  Using cached sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Collecting networkx (from torch)
  Downloading networkx-3.2.1-py3-none-any.whl.metadata (5.2 kB)
Collecting jinja2 (from torch)
  Using cached jinja2-3.1.6-py3-none-any.whl.metadata (2.9 kB)
Collecting mpmath<1.4,>=1.1.0 (from sympy>=1.13.3->torch)
  Downloading mpmath-1.3.0-py3-none-any.whl.metadata (8.6 kB)
Collecting MarkupSafe>=2.0 (from jinja2->torch)
  Downloading MarkupSafe-3.0.2-cp39-cp39-macosx_11_0_arm64.whl.metadata (4.0 kB)
Downloading torch-2.7.0-cp39-none-macosx_11_0_arm64.whl (68.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.6/68.6 MB[0m [31m34.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading sympy-1.14.0-py3-none-any.whl (6.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m35.4 MB/s[0m eta [36m0:0

In [13]:
pip install matplotlib

Collecting matplotlib
  Downloading matplotlib-3.9.4-cp39-cp39-macosx_11_0_arm64.whl.metadata (11 kB)
Collecting contourpy>=1.0.1 (from matplotlib)
  Downloading contourpy-1.3.0-cp39-cp39-macosx_11_0_arm64.whl.metadata (5.4 kB)
Collecting cycler>=0.10 (from matplotlib)
  Using cached cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting fonttools>=4.22.0 (from matplotlib)
  Downloading fonttools-4.57.0-cp39-cp39-macosx_10_9_universal2.whl.metadata (102 kB)
Collecting kiwisolver>=1.3.1 (from matplotlib)
  Downloading kiwisolver-1.4.7-cp39-cp39-macosx_11_0_arm64.whl.metadata (6.3 kB)
Collecting pyparsing>=2.3.1 (from matplotlib)
  Downloading pyparsing-3.2.3-py3-none-any.whl.metadata (5.0 kB)
Collecting importlib-resources>=3.2.0 (from matplotlib)
  Using cached importlib_resources-6.5.2-py3-none-any.whl.metadata (3.9 kB)
Downloading matplotlib-3.9.4-cp39-cp39-macosx_11_0_arm64.whl (7.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m22.2 MB/s

In [14]:
from diffusers import StableDiffusionPipeline
import torch
import matplotlib.pyplot as plt

In [20]:
pip install transformers


Note: you may need to restart the kernel to use updated packages.


In [None]:
model_id = "runwayml/stable-diffusion-v1-5"

# Choose the device at runtime instead of assuming an NVIDIA GPU: pipe.to("cuda")
# raises on machines without CUDA (e.g. Apple Silicon, CPU-only hosts).
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

# Keep half precision on CUDA (as before); fall back to float32 elsewhere.
torch_dtype = torch.float16 if device == "cuda" else torch.float32

pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch_dtype)
pipe = pipe.to(device)

# Typo fixed ("parot" -> "parrot") so the model receives the intended subject.
prompt = "How would you visualize a parrot playing soccer?"
image = pipe(prompt).images[0]

# Display the image using matplotlib
plt.imshow(image)
plt.axis('off')
plt.show()



## AutoPipelineForText2Image

In [None]:
from diffusers import AutoPipelineForText2Image

# Choose the device at runtime instead of assuming an NVIDIA GPU: .to("cuda")
# raises on machines without CUDA (e.g. Apple Silicon, CPU-only hosts).
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

# Keep half precision on CUDA (as before); fall back to float32 elsewhere.
torch_dtype = torch.float16 if device == "cuda" else torch.float32

pipe = AutoPipelineForText2Image.from_pretrained(
    "stabilityai/sdxl-turbo",
    torch_dtype=torch_dtype
).to(device)

# Typo fixed ("parot" -> "parrot") so the model receives the intended subject.
image = pipe("How would you visualize a parrot playing soccer?").images[0]

# Display the image using matplotlib
plt.imshow(image)
plt.axis('off')
plt.show()


