In [1]:
from dotenv import load_dotenv
from langchain.chat_models import init_chat_model
from langchain.agents import create_agent
from langchain.messages import HumanMessage

In [2]:
# Load settings from a local .env file into the process environment before any model is created.
# NOTE(review): presumably this provides the API key for the google_genai provider used below — confirm.
load_dotenv()

True

In [8]:
# Instruction given to the agent for the text and image examples.
SCIFI_SYSTEM_PROMPT = "You are a science fiction writer, create a capital city at the users request."

# Chat model handle (Gemini 2.5 Flash through the Google GenAI provider).
model = init_chat_model(
    model_provider="google_genai",
    model="gemini-2.5-flash",
)

# Agent built on that model and carrying the system prompt above.
agent = create_agent(system_prompt=SCIFI_SYSTEM_PROMPT, model=model)

# Text input

In [9]:
# A text-only user turn, expressed as a single typed content block.
text_block = {"type": "text", "text": "What is the capital of The Moon?"}
question = HumanMessage(content=[text_block])

# Run the agent on a one-message conversation.
response = agent.invoke({"messages": [question]})

# Show the content of the last message in the returned state as the cell output.
final_message = response["messages"][-1]
final_message.content

'The capital of The Moon is **Selene Prime**, a sprawling, multi-tiered metropolis nestled deep within the **South Pole-Aitken Basin**.\n\nChosen for its strategic access to abundant water ice reserves and natural shielding from solar radiation, Selene Prime is a marvel of engineering. Its upper levels consist of towering, crystalline geodesic domes that catch the glancing rays of the sun from peaks on the basin rim, reflecting it into the perpetually shadowed depths where the main habitation modules and industrial complexes are carved directly into the lunar bedrock.\n\nBeneath the surface, an intricate web of interconnected tunnels and cavernous habitats houses hydroponic farms, zero-G recreational zones, and the world\'s leading astroscience research facilities. The city\'s heart, the "Luna Conclave," is a vast, central forum where representatives from all lunar settlements, orbital stations, and even terrestrial liaisons gather to chart humanity\'s future in space. Selene Prime ser

# Image input

In [10]:
from ipywidgets import FileUpload
from IPython.display import display

# Upload widget limited to a single .png file; later cells read the chosen file from `uploader.value`.
uploader = FileUpload(accept=".png", multiple=False)
display(uploader)

FileUpload(value=(), accept='.png', description='Upload')

In [11]:
# Inspect what the widget captured. Run this only after choosing a file: the value is an
# empty tuple until then, and afterwards a tuple holding one dict of file metadata and content.
print(uploader.value)

({'name': 'Screenshot from 2026-01-04 12-22-48.png', 'type': 'image/png', 'size': 11797, 'content': <memory at 0x765621e94e80>, 'last_modified': datetime.datetime(2026, 1, 4, 7, 22, 48, 120000, tzinfo=datetime.timezone.utc)},)


In [12]:
import base64

# Fail early with an actionable message if this cell runs before a file was chosen:
# the widget's value is an empty tuple until an upload completes, so indexing it
# would otherwise raise a bare IndexError.
if not uploader.value:
    raise ValueError(
        "No file uploaded yet: choose a PNG in the upload widget, then re-run this cell."
    )

# Get the first (and only) uploaded file dict
uploaded_file = uploader.value[0]

# The file payload is exposed as a memoryview
content_mv = uploaded_file["content"]

# Convert memoryview -> bytes (equivalent to content_mv.tobytes())
img_bytes = bytes(content_mv)

# Base64-encode and decode to str so the image can be embedded in a message content block
img64 = base64.b64encode(img_bytes).decode("utf-8")

In [13]:
# One user turn with two content blocks: the instruction text and the uploaded PNG as base64.
image_prompt = {"type": "text", "text": "Tell me about this capital"}
image_block = {"type": "image", "base64": img64, "mime_type": "image/png"}
multimodal_question = HumanMessage(content=[image_prompt, image_block])

response = agent.invoke({"messages": [multimodal_question]})

# Show the content of the last message in the returned state as the cell output.
response["messages"][-1].content

'Welcome to **Veridia Nexus**, the luminous heart of the Galactic Concordat.\n\n**Location:** Orbiting the shimmering gas giant Aethelgard, Veridia Nexus isn\'t built on a planet\'s surface but is a colossal, self-contained orbital habitat. Affectionately known as the "Cosmic Bloom," it drifts slowly through the gas giant\'s magnetic fields, a perpetual beacon of interstellar diplomacy and trade.\n\n**Description:**\nImagine a flower made of cities, forever blossoming in the void. At its absolute center rises the **Grand Spire of Unity**, a monumental crystalline structure that pierces Veridia Nexus\'s artificial atmosphere, acting as the primary administrative hub for the Galactic Concordat. Its peak houses the High Council Chambers, where delegates from a thousand worlds convene. The spire itself is alive with soft, multi-spectrum bioluminescent veins that pulse with the flow of information and energy, making it visible for light-years.\n\nFrom the Grand Spire, a series of **"Petal D

# Audio input

In [34]:
import sounddevice as sd 
from scipy.io.wavfile import write
import base64
import io
import time
from tqdm import tqdm

# Recording settings
duration = 5  # seconds
sample_rate = 44100  # samples per second
num_frames = int(duration * sample_rate)

# Start a mono capture; it is awaited with sd.wait() after the progress loop.
print("Recording...")
audio = sd.rec(num_frames, samplerate=sample_rate, channels=1)

# Progress bar for the duration of the recording (10 updates per second)
tick_seconds = 0.1
for _ in tqdm(range(duration * 10)):
    time.sleep(tick_seconds)

sd.wait()
print("Done.")

# Write the samples as WAV into an in-memory buffer (nothing is written to disk)
buf = io.BytesIO()
write(buf, sample_rate, audio)
wav_bytes = buf.getvalue()

# Base64-encode and decode to str so the audio can be embedded in a message content block
aud64 = base64.b64encode(wav_bytes).decode("utf-8")

Recording...


  0%|          | 0/50 [00:00<?, ?it/s]

100%|██████████| 50/50 [00:05<00:00,  9.86it/s]


Done.


In [35]:
# Rebind `agent` to a plain agent with no system prompt (presumably so the model describes
# the recording instead of inventing a city). This replaces the agent created with the
# science-fiction prompt, so re-run that cell before re-running the text or image cells.
agent = create_agent(model=model)

# One user turn with two content blocks: the instruction text and the recorded WAV as base64.
audio_prompt = {"type": "text", "text": "Tell me about this audio file"}
audio_block = {"type": "audio", "base64": aud64, "mime_type": "audio/wav"}
multimodal_question = HumanMessage(content=[audio_prompt, audio_block])

response = agent.invoke({"messages": [multimodal_question]})

# Print the content of the last message in the returned state.
last_message = response["messages"][-1]
print(last_message.content)

A man speaks in **Uzbek**, saying: "**Assalomu alaykum, hurmatli mehmonlar**." This translates to "**Hello, esteemed guests**" or "**Peace be upon you, honored guests.**"

In the background, there is faint, indistinct chatter and murmuring from multiple people, suggesting a social gathering or event in a room.
