# Multimodality

In [None]:
from langchain_core.messages import HumanMessage
from langchain_openai import ChatOpenAI
from dotenv import load_dotenv
import base64
import httpx
import os

# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this is where the OpenAI credentials come from —
# confirm against your setup.
load_dotenv()

## Using a Base64-Encoded Image

In [4]:
# Chat model shared by the examples in the following cells.
model = ChatOpenAI(model="gpt-4o")
image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"

# Download the image and fail fast on a non-success HTTP status, so that an
# error page is never base64-encoded and sent to the model labelled as a JPEG.
image_response = httpx.get(image_url)
image_response.raise_for_status()
image_data = base64.b64encode(image_response.content).decode("utf-8")

# A multimodal message is a list of typed content parts; the image is inlined
# here as a base64 data URL.
message = HumanMessage(
    content=[
        {"type": "text", "text": "What season does this image appear to be from?"},
        {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_data}"}},
    ]
)

response = model.invoke([message])
print("Response using base64 image:\n", response.content)

BadRequestError: Error code: 400 - {'error': {'message': 'Invalid content type. image_url is only supported by certain models.', 'type': 'invalid_request_error', 'param': 'messages.[0].content.[1].type', 'code': None}}

## Direct URL Method

In [None]:
# Same image, passed by reference this time: the message carries the plain
# URL instead of inline base64 data.
url_question_part = {"type": "text", "text": "Describe the path or walkway in this image."}
url_image_part = {"type": "image_url", "image_url": {"url": image_url}}
message = HumanMessage(content=[url_question_part, url_image_part])

response = model.invoke([message])
print("Response using direct URL:\n", response.content)

## Using Prompt Templates with Multimodal Data

In [None]:
from langchain_core.prompts import ChatPromptTemplate

# Message templates: the `{question}` and `{image_data}` placeholders are
# filled in from the dict passed to `chain.invoke`.
system_turn = ("system", "You are a helpful assistant skilled at analyzing images.")
user_turn = (
    "user",
    [
        {"type": "text", "text": "{question}"},
        {"type": "image_url", "image_url": {"url": "data:image/jpeg;base64,{image_data}"}},
    ],
)
prompt = ChatPromptTemplate.from_messages([system_turn, user_turn])

# Pipe the rendered prompt into the chat model.
chain = prompt | model

template_inputs = {
    "question": "What are the main colors present in this image?",
    "image_data": image_data,
}
response = chain.invoke(template_inputs)

print(response.content)

## Combining Tools with Multimodal Input

In [None]:
from typing import Dict
from langchain_core.tools import tool
from langchain_core.messages import SystemMessage

@tool
def analyze_weather(weather_details: Dict[str, str]) -> str:
    """Analyze detailed weather conditions from an image.

    Args:
        weather_details: Dictionary containing weather analysis details.
            Expected keys are "sky_condition", "time_of_day" and
            "visibility"; any key that is missing is reported as "unknown".

    Returns:
        str: Detailed weather analysis
    """
    # The dict is free-form, so a caller may omit a key; fall back to a
    # placeholder instead of raising KeyError.
    sky_condition = weather_details.get("sky_condition", "unknown")
    time_of_day = weather_details.get("time_of_day", "unknown")
    visibility = weather_details.get("visibility", "unknown")

    return (
        f"Weather Analysis: {sky_condition}. "
        f"Time of day appears to be {time_of_day}. "
        f"Visibility is {visibility}."
    )

# Bind analyze_weather to the chat model so it is available as a tool.
model_with_tools = model.bind_tools([analyze_weather])

weather_instructions = """You are a weather analysis expert. When shown an image:
1. Carefully observe the sky conditions, lighting, and visibility
2. Use the analyze_weather tool with your observations
3. Provide additional context about the weather conditions
Be specific and detailed in your analysis."""
system_message = SystemMessage(content=weather_instructions)

# The user turn pairs a text request with the image, referenced by URL.
weather_request = HumanMessage(
    content=[
        {"type": "text", "text": "Please analyze the weather conditions in this image."},
        {"type": "image_url", "image_url": {"url": image_url}},
    ]
)
messages = [system_message, weather_request]

# Invoke the tool-enabled model and print the whole response object
# (not just `.content`).
response = model_with_tools.invoke(messages)
print("\nAI Analysis:", response)