# 🦜🔗 Multimodale LLMs (Vision)


In [None]:
from langchain_community.utilities.dalle_image_generator import DallEAPIWrapper
from langchain.schema import StrOutputParser
from langchain.prompts import (
    MessagesPlaceholder,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
    ChatPromptTemplate,
)
from helpers import llm, formatted_output_streamer

#### Bilder malen mit Dall-E


In [None]:
# Step 1: have the chat model turn a rough description into an image prompt.
image_desc_message = HumanMessagePromptTemplate.from_template(
    "Generate a short prompt to generate an image based on the following description: {image_desc}"
)
prompt = ChatPromptTemplate.from_messages([image_desc_message])
chain = prompt | llm(temperature=0.9) | StrOutputParser()

instruction = chain.invoke({"image_desc": "halloween night at a haunted museum"})
print(instruction)

# Step 2: hand the generated prompt to DALL-E and print the resulting value.
# `run` is a legacy interface; the DallEAPIWrapper apparently has not been
# updated yet. The old `run` is the counterpart of the newer `invoke`.
dalle = DallEAPIWrapper(model="dall-e-2", size="256x256")
image_url = dalle.run(instruction)
print(str(image_url))

#### Bilder beschreiben mit gpt-4-vision


In [None]:
import os

from langfuse.callback import CallbackHandler

# Langfuse tracing callback. Credentials are read from the environment instead
# of being hard-coded in the notebook, so they do not end up in version control.
# Set LANGFUSE_PUBLIC_KEY and LANGFUSE_SECRET_KEY before running this cell;
# LANGFUSE_HOST is optional and defaults to a local Langfuse instance.
# NOTE(review): keys that were committed with earlier versions of this cell
# should be treated as leaked and rotated.
langfuse_handler = CallbackHandler(
    public_key=os.environ.get("LANGFUSE_PUBLIC_KEY"),
    secret_key=os.environ.get("LANGFUSE_SECRET_KEY"),
    host=os.environ.get("LANGFUSE_HOST", "http://localhost:3000"),
)

In [None]:
# Chat model used for image understanding.
vision_llm = llm(model="gpt-4-vision-preview", max_tokens=1024)

# One human message whose template is a list of two parts: the text question
# and a reference to the image (filled in from the `image_url` input).
vision_message = HumanMessagePromptTemplate.from_template(
    ["{input}", {"image_url": "{image_url}"}]
)
vision_prompt = ChatPromptTemplate.from_messages([vision_message])
vision_chain = vision_prompt | vision_llm | StrOutputParser()

inputs = {
    "input": "What's in this image?",
    "image_url": "https://joscha.com/data/media/cartoons/130608.png",
}
# To trace this call with Langfuse, additionally pass
# config={"callbacks": [langfuse_handler]} to `invoke`.
answer = vision_chain.invoke(inputs)
print(answer)

#### Man kann so etwas natürlich auch als Tool in einem Agenten einsetzen


In [None]:
import base64


def encode_image(image_path):
    """Return the contents of the file at ``image_path`` as a Base64 string.

    The file is read in binary mode, Base64-encoded, and the encoded bytes
    are decoded as UTF-8 so the result is a plain ``str``.
    """
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")

In [None]:
from langchain.tools import StructuredTool


def analyze_image(image_path: str, question: str) -> str:
    """This tool can extract general information from an image given a query."""
    # NOTE: the docstring above is picked up by StructuredTool.from_function as
    # the tool description shown to the model, so it is intentionally left as is.
    #
    # image_path: path to a local image file.
    # question:   the question to ask about the image.
    # Returns the string produced by `vision_chain` for that question and image.
    import mimetypes

    # Derive the MIME type from the file name instead of always claiming JPEG;
    # fall back to JPEG (the previous fixed value) when it cannot be determined
    # or is not an image type.
    mime_type, _ = mimetypes.guess_type(image_path)
    if mime_type is None or not mime_type.startswith("image/"):
        mime_type = "image/jpeg"

    base64_image = encode_image(image_path)
    # A Base64 data URL is the right syntax for passing local images.
    inputs = {
        "input": question,
        "image_url": f"data:{mime_type};base64,{base64_image}",
    }
    return vision_chain.invoke(inputs)


# Tools offered to the agent: the image analysis function wrapped as a tool.
tools = [StructuredTool.from_function(func=analyze_image)]

In [None]:
from langgraph.prebuilt import create_agent_executor
from langchain.agents import create_openai_functions_agent

# Agent prompt: system instruction, the user's input, and a placeholder for
# the agent's intermediate messages ("agent_scratchpad").
# NOTE(review): "assisstant" is misspelled in the system prompt; the string is
# left untouched here because it is sent to the model at runtime.
agent_messages = [
    SystemMessagePromptTemplate.from_template("You are a helpful assisstant."),
    HumanMessagePromptTemplate.from_template("{input}"),
    MessagesPlaceholder("agent_scratchpad"),
]
prompt = ChatPromptTemplate.from_messages(agent_messages)

# Build the function-calling agent, wrap it in a LangGraph executor, and pipe
# the executor's output through the formatting helper.
agent_llm = llm(temperature=0)
agent_runnable = create_openai_functions_agent(agent_llm, tools, prompt)
agent_graph = create_agent_executor(agent_runnable, tools)
agent_executor = agent_graph | formatted_output_streamer

In [None]:
inputs = {
    "input": "Mein Ticket ist hier: DBTicket.jpg. Wann sollte ich wo am Bahnhof sein?"
}
async for chunk in agent_executor.astream(
    inputs
    # inputs, config={"callbacks": [langfuse_handler]}
):
    print(chunk)

#### Persönliche Daten?


In [None]:
inputs = {
    "input": "Mein Ticket ist hier: DBTicket.jpg. Ich kann schlecht lesen, gib mir bitte die Auftragsnummer zurück?"
}
async for chunk in agent_executor.astream(inputs):
    print(chunk)