This notebook contains examples of how to run inference on external (closed-source) models.

### Setup

In [1]:
from dotenv import load_dotenv

# Pull variables from the local .env file into the process environment.
# override=True is passed so values from the file take precedence over
# variables that are already set. The call's return value is the cell output.
load_dotenv(dotenv_path="./.env", override=True)

True

In [2]:
from opengvl.clients.gemini import GeminiClient

# Smoke-check: build a client using GeminiClient's default arguments.
# NOTE(review): the inference cell further down rebinds `client` with an
# explicit model_name and rpm, so this instance is not the one used for the
# actual request.
client = GeminiClient()

  from .autonotebook import tqdm as notebook_tqdm
2025-09-07 17:35:25.715548: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-09-07 17:35:26.280026: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1757259326.497707   25870 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1757259326.562094   25870 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1757259327.030059   25870 computation_placer.cc:177] computation placer already r

In [None]:
# Minimal end-to-end inference example with a GeminiClient
# (Requires GEMINI_API_KEY in environment and internet access.)

import os
from pprint import pprint

import numpy as np

from opengvl.clients.gemini import GeminiClient
from opengvl.utils.constants import PromptPhraseKey
from opengvl.utils.data_types import Episode
from opengvl.utils.prompts import get_prompt

# 1. Fail fast for notebook users: stop with a clear message when the API key
#    is missing or empty instead of continuing without it.
if os.environ.get("GEMINI_API_KEY") in (None, ""):
    raise RuntimeError(
        "GEMINI_API_KEY is not set in the environment. "
        "Create a .env file or export the variable before running."
    )

# 2. Synthetic stand-ins for real frames (random noise scaled to 0-255).
# A real pipeline would load frames from a dataset loader instead.
RNG = np.random.default_rng(0)
FRAME_SHAPE = (244, 244, 3)

# Six consecutive draws from the same seeded generator: the first one becomes
# the starting frame, the remaining five are the frames that get shuffled below.
starting_frame, *frames = (
    (RNG.random(FRAME_SHAPE) * 255).astype(np.uint8) for _ in range(6)
)

# 3. Inputs for a minimal Episode (the context episodes list stays empty)
instruction = "pick up the red block and place it on the blue platform"
# Placeholder task-completion percentages, increasing across the original frames.
original_completion = [0, 10, 25, 40, 70, 90]
# One index per original frame: 0 is the starting frame, 1-5 are the others.
original_indices = list(range(len(original_completion)))
# Order in which frames are handed to the model: a fixed permutation of the
# non-starting indices, standing in for a random shuffle.
shuffled_indices = [3, 1, 5, 2, 4]
# Index 0 would select the starting frame; index i > 0 selects frames[i - 1].
shuffled_frames = [starting_frame if i == 0 else frames[i - 1] for i in shuffled_indices]
# Placeholder completion values looked up in the same shuffled order.
shuffled_completion_approx = [original_completion[i] for i in shuffled_indices]

# Bundle the instruction, the synthetic frames, their indices and the
# placeholder completion values into the single Episode that is later passed
# to the client as `eval_episode`.
episode = Episode(
    instruction=instruction,
    starting_frame=starting_frame,
    episode_index=0,  # the only episode built in this example
    original_frames_indices=original_indices,
    shuffled_frames_indices=shuffled_indices,
    # Placeholder values, listed in the same order as shuffled_frames.
    shuffled_frames_approx_completion_rates=shuffled_completion_approx,
    original_frames_task_completion_rates=original_completion,
    shuffled_frames=shuffled_frames,
)

# 4. Provide a minimal set of required prompt phrases
# Keys are the string values of PromptPhraseKey members; the dict is handed to
# client.generate_response() as `prompt_phrases`.
# NOTE(review): the {i}, {p} and {instruction} placeholders are presumably
# filled in by the client when it assembles the request - confirm against the
# client implementation.
prompt_phrases = {
    PromptPhraseKey.INITIAL_SCENE_LABEL.value: "Initial Scene:",
    PromptPhraseKey.INITIAL_SCENE_COMPLETION.value: "Estimated completion: 0%",
    PromptPhraseKey.CONTEXT_FRAME_LABEL_TEMPLATE.value: "Context Frame {i}:",
    PromptPhraseKey.CONTEXT_FRAME_COMPLETION_TEMPLATE.value: "Completion: {p}%",
    PromptPhraseKey.EVAL_FRAME_LABEL_TEMPLATE.value: "Eval Frame {i}:",
    # This entry is a list of two instruction lines rather than a single string.
    PromptPhraseKey.EVAL_TASK_COMPLETION_INSTRUCTION.value: [
        "Now, for the task of {instruction}, give ONLY the completion percentage for each Eval Frame.",
        "Format strictly: 'Eval Frame X: <number>%' with no extra commentary.",
    ],
}

# Build the prompt for this task instruction; it is passed to the client as `prompt`.
prompt = get_prompt(instruction)

# 5. Instantiate the client (choose an available model name)
# Replace 'gemini-1.5-flash' with another Gemini model you have access to if needed.
# NOTE(review): rpm is presumably a requests-per-minute limit - confirm in GeminiClient.
client = GeminiClient(model_name="gemini-1.5-flash", rpm=30)

# 6. Run inference (no context episodes in this synthetic example)
response_text = client.generate_response(
    prompt=prompt,
    eval_episode=episode,
    context_episodes=[],
    prompt_phrases=prompt_phrases,
)

# sep="\n" puts the model text on its own line. Embedding "\n" in the header
# and relying on print's default separator would prefix the first line of the
# model output with a stray space.
print("Raw model output:", response_text, sep="\n")

# 7. (Optional) Parse percentages using a simple heuristic regex
import re

# Matches the requested "Eval Frame X: <number>%" shape and also tolerates
# small deviations that would otherwise be dropped silently: a decimal value
# ("42.5%"), extra spaces around the colon or before "%", and different
# letter case. Everything the stricter pattern matched is still matched.
pattern = re.compile(
    r"Eval Frame\s+(\d+)\s*:\s*(\d{1,3}(?:\.\d+)?)\s*%",
    re.IGNORECASE,
)
# Each entry is (frame number, percentage). Whole-number percentages stay ints;
# only values written with a decimal point become floats.
parsed = [
    (int(m.group(1)), float(m.group(2)) if "." in m.group(2) else int(m.group(2)))
    for m in pattern.finditer(response_text)
]
print("\nParsed percentages:")
print(parsed)

# 8. (Optional) Summarize the run
# The parsed frame numbers are printed next to shuffled_indices but are NOT
# mapped back onto them here.
# NOTE(review): how Eval Frame numbers are assigned (context frames + initial
# scene logic) is decided in BaseModelClient - confirm the numbering there
# before aligning parsed values with shuffled_indices.
print("\nEpisode summary:")
pprint({
    "instruction": instruction,
    "shuffled_indices": shuffled_indices,
    "parsed": parsed,
})
