# 🦜🔗 Multimodale LLMs (Vision)


In [1]:
from langchain_community.utilities.dalle_image_generator import DallEAPIWrapper
from langchain.schema import StrOutputParser
from langchain.prompts import (
    MessagesPlaceholder,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
    ChatPromptTemplate,
)
from helpers import llm, formatted_output_streamer

#### Bilder malen mit Dall-E


In [2]:
# Step 1: have the chat model turn a short scene description into an
# image-generation prompt. `llm` comes from the local `helpers` module.
image_prompt_message = HumanMessagePromptTemplate.from_template(
    "Generate a short prompt to generate an image based on the following description: {image_desc}"
)
prompt = ChatPromptTemplate.from_messages([image_prompt_message])

# Prompt -> model (temperature 0.9) -> plain string.
chain = prompt | llm(temperature=0.9) | StrOutputParser()

instruction = chain.invoke({"image_desc": "halloween night at a haunted museum"})
print(instruction)

# Step 2 (disabled): send the generated instruction to DALL-E.
# image_url = DallEAPIWrapper(model="dall-e-2", size="256x256").run(
#     instruction
# )  # "run" is an old interface; the DallEAPIWrapper seems to be out of date. The old "run" is the counterpart of the newer "invoke".
# print(str(image_url))

"Create an image depicting Halloween night at a haunted museum, with eerie lighting, ghostly apparitions among the exhibits, and visitors dressed in costumes, exploring the spooky surroundings with a mix of fear and excitement."


#### Bilder beschreiben mit gpt-4-vision


In [3]:
import os

from langfuse.callback import CallbackHandler

# Credentials must not live in the notebook: read them from the environment.
# A missing key raises KeyError here instead of failing later inside a chain.
# NOTE(review): the key pair that used to be hard-coded in this cell should be
# treated as leaked and rotated in Langfuse.
langfuse_handler = CallbackHandler(
    public_key=os.environ["LANGFUSE_PUBLIC_KEY"],
    secret_key=os.environ["LANGFUSE_SECRET_KEY"],
    # Falls back to the local instance this notebook used originally.
    host=os.environ.get("LANGFUSE_HOST", "http://localhost:3000"),
)

In [4]:
# Chat model used for image understanding; the answer is capped at 1024 tokens.
# NOTE(review): "gpt-4-vision-preview" is a preview model name - confirm it is
# still available for the backend that `helpers.llm` targets.
vision_llm = llm(model="gpt-4-vision-preview", max_tokens=1024)

# One human message with two content parts: the question text and the image
# reference. Both are filled in from the chain input.
vision_message = HumanMessagePromptTemplate.from_template(
    ["{input}", {"image_url": "{image_url}"}]
)
vision_prompt = ChatPromptTemplate.from_messages([vision_message])
vision_chain = vision_prompt | vision_llm | StrOutputParser()

# Example run against a publicly hosted image, traced through Langfuse.
inputs = {
    "input": "What's in this image?",
    "image_url": "https://joscha.com/data/media/cartoons/130608.png",
}
vision_answer = vision_chain.invoke(inputs, config={"callbacks": [langfuse_handler]})
print(vision_answer)

  warn_beta(


The image is a cartoon featuring two characters near a photo booth. The booth has a sign that reads "FOTOS." One character is standing inside the booth with just his legs visible; he has elephant feet, which is not immediately apparent to the other character outside the booth. This character outside the booth is holding a photo and saying something that suggests he's dissatisfied with the photo booth because there's an elephant in all his photos, unaware that the person in the booth actually has elephant feet. The individual in the booth responds with a shushing gesture, saying "Hihi... Hey! Nicht verraten!" which translates to "Hehe... Hey! Don't give it away!" in English. The humor here is based on the misunderstanding: the man outside the booth complains about an elephant appearing in his photos, not realizing that the person inside the booth has elephant feet, which is the actual reason for the elephant's appearance in the photos.


#### Man kann so etwas natürlich auch als Tool in einem Agenten einsetzen


In [5]:
import base64


def encode_image(image_path):
    """Read the file at ``image_path`` and return its bytes as a Base64 string.

    The file is opened in binary mode; the Base64 bytes are decoded as UTF-8
    so the result can be embedded in text such as a data URL.
    """
    with open(image_path, mode="rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")

In [6]:
from langchain.tools import StructuredTool


def analyze_image(image_path: str, question: str) -> str:
    """This tool can extract general information from an image given a query."""
    # The docstring above is deliberately unchanged: this function is wrapped
    # with StructuredTool.from_function, which presumably uses the docstring as
    # the tool description shown to the model - verify before editing it.
    #
    # image_path: path of a local image file.
    # question:   what the vision model should answer about that image.
    # Returns the string produced by `vision_chain`.
    import mimetypes

    # Derive the MIME type from the file name instead of always claiming JPEG,
    # so PNG/GIF/WebP files get a matching data URL. Unknown or non-image
    # guesses fall back to the previous hard-coded "image/jpeg".
    mime_type, _ = mimetypes.guess_type(image_path)
    if not mime_type or not mime_type.startswith("image/"):
        mime_type = "image/jpeg"

    base64_image = encode_image(image_path)
    inputs = {
        "input": f"{question}",
        "image_url": f"data:{mime_type};base64,{base64_image}",
    }  # A Base64 data URL is the right syntax for local images
    return vision_chain.invoke(inputs)


# Wrap the image analysis function as a structured tool and collect it in the
# tool list that is handed to the agent.
image_analysis_tool = StructuredTool.from_function(analyze_image)
tools = [image_analysis_tool]

In [7]:
from langgraph.prebuilt import create_agent_executor
from langchain.agents import create_openai_functions_agent

# Agent prompt: fixed system instruction, the user's input, and the scratchpad
# placeholder that the functions agent fills with its intermediate steps.
agent_messages = [
    SystemMessagePromptTemplate.from_template("You are a helpful assisstant."),
    HumanMessagePromptTemplate.from_template("{input}"),
    MessagesPlaceholder("agent_scratchpad"),
]
prompt = ChatPromptTemplate.from_messages(agent_messages)

# Model with temperature 0 drives the tool-calling agent.
agent_llm = llm(temperature=0)
agent_runnable = create_openai_functions_agent(agent_llm, tools, prompt)

# Run the agent with its tools, then pipe each emitted step through the
# formatter imported from `helpers`.
raw_executor = create_agent_executor(agent_runnable, tools)
agent_executor = raw_executor | formatted_output_streamer

In [10]:
inputs = {
    "input": "Mein Ticket ist hier: DBTicket_low.jpg. Wann sollte ich wo am Bahnhof sein?"
}
async for chunk in agent_executor.astream(
    inputs, config={"callbacks": [langfuse_handler]}
):
    print(chunk)

Agent log:

Invoking: `analyze_image` with `{'image_path': 'DBTicket_low.jpg', 'question': 'Wann und wo sollte der Passagier am Bahnhof sein?'}`

----------------------------------------------------------------------------------------


Tool log:

Der Passagier sollte am 04.03.2024 am Bahnhof sein. Die Fahrt beginnt in Nürnberg Hbf (Hauptbahnhof) und der Zug (ICE) fährt um 15:30 Uhr ab. Es ist immer ratsam, etwas früher am Bahnhof zu sein, um genügend Zeit für Orientierung, eventuelle Wartezeiten beim Einchecken oder unvorhergesehene Verzögerungen zu haben. Ein guter Richtwert wäre, mindestens 15-30 Minuten vor der Abfahrtszeit am Bahnhof zu sein.

----------------------------------------------------------------------------------------


Agent finished:

Für Ihre Reise sollten Sie am 04.03.2024 am Nürnberg Hauptbahnhof sein. Der ICE-Zug, mit dem Sie fahren, verlässt den Bahnhof um 15:30 Uhr. Es ist empfehlenswert, mindestens 15-30 Minuten vor der Abfahrtszeit am Bahnhof zu sein, um gen

#### Persönliche Daten?


In [None]:
inputs = {
    "input": "Mein Ticket ist hier: DBTicket.jpg. Ich kann schlecht lesen, gib mir bitte die Auftragsnummer zurück?"
}
async for chunk in agent_executor.astream(inputs):
    print(chunk)