# L3: Image Reasoning and Grounding

<p style="background-color:#fff6e4; padding:15px; border-width:3px; border-color:#f5ecda; border-style:solid; border-radius:6px"> ⏳ <b>Note <code>(Kernel Starting)</code>:</b> This notebook takes about 30 seconds to be ready to use. You may start and watch the video while you wait.</p>

In [None]:
import warnings
# Ignore every Python warning from here on (presumably to keep the lesson
# output uncluttered).
warnings.filterwarnings('ignore')

## Load API keys

In [None]:
import os
from utils import get_llama_api_key, get_llama_base_url, get_together_api_key

# Obtain the credentials and endpoint through the utils helpers rather than
# writing any literal secret into the notebook.
# NOTE(review): these three names are not referenced again in the visible
# cells; presumably the llama4 helpers resolve credentials themselves — confirm.
llama_api_key = get_llama_api_key()
llama_base_url = get_llama_base_url()
together_api_key = get_together_api_key()

<div style="background-color:#fff6ff; padding:13px; border-width:3px; border-color:#efe6ef; border-style:solid; border-radius:6px">
<p> 💻 &nbsp; <b>Access <code>requirements.txt</code> and <code>utils.py</code> files:</b> 1) click on the <em>"File"</em> option on the top menu of the notebook and then 2) click on <em>"Open"</em>.</p>

<p> ⬇ &nbsp; <b>Download Notebooks:</b> 1) click on the <em>"File"</em> option on the top menu of the notebook and then 2) click on <em>"Download as"</em> and select <em>"Notebook (.ipynb)"</em>.</p>

<p> 📒 &nbsp; For more help, please see the <em>"Appendix – Tips, Help, and Download"</em> Lesson.</p>
</div>

## Llama helpers

In [None]:
from utils import llama4, llama4_together

In [None]:
import base64

def encode_image_to_base64(image_path):
    """
    Read the file at `image_path` in binary mode and return its contents
    base64-encoded, as a UTF-8 decoded string.
    """
    with open(image_path, mode="rb") as fh:
        raw_bytes = fh.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")

## Image Grounding

In [None]:
from utils import display_local_image
# Render the tools picture inline so the reader sees the image being queried.
display_local_image("images/tools.png")

In [None]:
# Grounding prompt: asks which tools measure length and requests bounding
# boxes for every recognized item.
prompt = """Which tools in the image can be used for measuring length?
Provide bounding boxes for every recognized item."""

In [None]:
# Encode the picture once; the resulting string is embedded in a data URL.
base64_tools = encode_image_to_base64("images/tools.png")

# tools.png is a PNG file, so the data URL declares image/png to match the
# payload (consistent with the other PNG inputs in this notebook).
print(llama4(prompt, [f"data:image/png;base64,{base64_tools}"]))

In [None]:
from utils import parse_output, draw_bounding_boxes
# Query again, this time keeping the reply so it can be parsed.
# The image is a PNG, so the data URL declares image/png.
output = llama4(prompt, [f"data:image/png;base64,{base64_tools}"])
# NOTE(review): parse_output / draw_bounding_boxes live in utils; presumably
# they turn the reply into box records and overlay them on the image — confirm.
tools = parse_output(output)
draw_bounding_boxes("images/tools.png", tools)

## Analyze table in PDF

In [None]:
from pathlib import Path
from pypdf import PdfReader

def pdf2text(file : str):
    """Return the text of every page of the PDF at `file` as one string.

    Each page's `extract_text()` result is collected in page order and the
    pieces are joined with a blank line (two newlines) between pages.
    """
    with Path(file).open("rb") as pdf_stream:
        pages = PdfReader(pdf_stream).pages
        # Extract while the underlying file is still open.
        page_texts = [page.extract_text() for page in pages]
    return "\n\n".join(page_texts)

In [None]:
# Extract the report text once; later cells reuse meta_q4_2024_txt.
meta_q4_2024_txt = pdf2text("Meta-Reports-Fourth-Quarter-and-Full-Year-2024-Results-2025.pdf")

In [None]:
# Preview the first 1000 characters of the financial-highlights section.
highlights_heading = "Fourth Quarter and Full Year 2024 Financial Highlights"
start = meta_q4_2024_txt.find(highlights_heading)
if start == -1:
    # str.find returns -1 on a miss; slicing from -1 would silently print a
    # meaningless fragment, so report the problem instead.
    print(f"Heading not found in extracted text: {highlights_heading!r}")
else:
    print(meta_q4_2024_txt[start:start+1000])

In [None]:
# Text-only variant: the full extracted PDF text is inlined into the prompt
# and llama4 is called without any image.
prompt = f"""How much is 2024 operating margin based on Meta's financial
quarter report below:
{meta_q4_2024_txt}
"""
print(llama4(prompt))

In [None]:
# Show the screenshot of the highlights table that the next cell sends to the model.
display_local_image("images/meta-q4-2024-highlights.png")

In [None]:
# Image variant of the operating-margin question: the table is sent as a picture.
base64_meta = encode_image_to_base64("images/meta-q4-2024-highlights.png")
prompt = """How much is 2024 operating margin based on Meta's financial
report?"""
# The screenshot is a PNG file, so the data URL declares image/png.
print(llama4(prompt, [f"data:image/png;base64,{base64_meta}"]))

## Generating code from a screenshot

In [None]:
# Path of the UI screenshot; kept in a variable because the next cell encodes it.
vid_frame_address = "images/video_frame_1440.jpg"
display_local_image(vid_frame_address)

In [None]:
# Ask the model where to click to change the temperature, as a bounding box.
# The frame is a .jpg, so the image/jpeg media type matches the file.
base64_image = encode_image_to_base64(vid_frame_address)
prompt = """If I want to change the temperature on the image,
where should I click? Return the bounding box for the location."""
output = llama4(prompt, [f"data:image/jpeg;base64,{base64_image}"])
# Bare last expression: the notebook displays whatever llama4 returned.
output

In [None]:
# Ask for a Gradio implementation of the UI shown in the screenshot.
# The string opens with exactly three quotes so that no stray '"' character
# is sent as the first character of the prompt.
prompt = """Write a python script that uses Gradio to implement
the chatbot UI in the image."""

# The model is selected explicitly for this request via the `model` argument.
output = llama4(prompt,[f"data:image/jpeg;base64,{base64_image}"],
                model="Llama-4-Maverick-17B-128E-Instruct-FP8")
print(output)

In [None]:
import gradio as gr

def chatbot_settings(temp, max_tokens, repetition, top_p, streaming):
    """Format the five model settings as a single comma-separated summary line."""
    labelled = (
        ("Temperature", temp),
        ("Max Tokens", max_tokens),
        ("Repetition", repetition),
        ("Top P", top_p),
        ("Streaming", streaming),
    )
    return ", ".join(f"{label}: {value}" for label, value in labelled)

def main():
    """Build and launch a Gradio Blocks mock-up of the chatbot UI.

    Layout: a wide column with the chat history and the message box, and a
    narrow column with the model-settings controls. Submitting the message
    box calls an echo handler; the settings values are passed to the handler
    but are not used by it yet.
    """
    with gr.Blocks() as demo:
        with gr.Row():
            with gr.Column(scale=3):
                chatbot = gr.Chatbot(label="Llama-4-Maverick-17B-128E-Instruct-FP8")
                msg = gr.Textbox(label="Enter message...")
            with gr.Column(scale=1):
                # gr.Group accepts keyword-only arguments, so passing the
                # title positionally raises TypeError; show it as Markdown.
                with gr.Group():
                    gr.Markdown("### Model settings")
                    temp = gr.Slider(label="Temperature", minimum=0, maximum=1, value=0.6)
                    max_tokens = gr.Slider(label="Max tokens", minimum=0, maximum=4096, value=2048)
                    repetition = gr.Slider(label="Repetition", minimum=0, maximum=2, value=1.0)
                    top_p = gr.Slider(label="Top P", minimum=0, maximum=1, value=0.9)
                    streaming = gr.Checkbox(label="Streaming", value=True)
                    # Placeholder buttons: no click handlers are attached.
                    gr.Button("Advanced")
                    gr.Button("JSON schema")
                    gr.Button("Tools")

        def respond(message, history, temp, max_tokens, repetition, top_p, streaming):
            """Echo the message into the history and clear the input box."""
            # Here you would implement your chatbot's response logic
            # For now, it just echoes the input
            bot_message = f"Echo: {message}"
            # NOTE(review): tuple-style history entries; newer Gradio releases
            # favour the "messages" format — confirm against the installed version.
            history.append((message, bot_message))
            return "", history

        msg.submit(respond, [msg, chatbot, temp, max_tokens, repetition, top_p, streaming], [msg, chatbot])

    demo.launch()

# Build and launch the demo when this code runs as the main module.
if __name__ == "__main__":
    main()

## Solving Math

In [None]:
# Show the math problem image that the next cell sends to the model.
display_local_image("images/simple_math.png")

In [None]:
# The question itself is inside the picture; the text prompt only tells the
# model to answer it. PNG input, so the data URL declares image/png.
base64_math = encode_image_to_base64("images/simple_math.png")
prompt = "Answer the question in the image."
print(llama4(prompt, [f"data:image/png;base64,{base64_math}"]))

## Analyzing computer screen

In [None]:
# Show the browser screenshot analysed in the following cells.
display_local_image("images/browser_screenshot.png")

In [None]:
# Encode the screenshot once; base64_img is reused by the next query cell.
base64_img = encode_image_to_base64("images/browser_screenshot.png")
# Open-ended description request that explicitly asks for the URL and tabs.
prompt = """Describe the screenshot in detail,
including browser URL and tabs."""
print(llama4(prompt,[f"data:image/png;base64,{base64_img}"]))

In [None]:
# Re-display the same screenshot next to the follow-up question.
display_local_image("images/browser_screenshot.png")
# Action-oriented question about the same screenshot.
# Depends on base64_img defined in the previous cell.
prompt = """If I want to go to the next lesson, what should I do?"""
print(llama4(prompt,[f"data:image/png;base64,{base64_img}"]))