In [None]:
import re
import os
import time
import torchvision
from tqdm import tqdm
from glob import glob
from google import genai
from PIL import Image
from google.genai import types
from google.genai.types import HarmCategory, HarmBlockThreshold


# Create the Gemini API client.
# The key is read from the GEMINI_API_KEY environment variable so a real
# credential never has to be saved inside the notebook. The original
# placeholder is kept as a fallback: with the variable unset, the client is
# constructed with "YOUR_API_KEY" exactly as before.
client = genai.Client(api_key=os.environ.get("GEMINI_API_KEY", "YOUR_API_KEY"))

In [None]:
def natural_key(path):
    """Return a sort key that orders digit runs in `path` numerically.

    Plain string sorting places "frame_10.jpg" before "frame_2.jpg". Splitting
    on digit runs and comparing those runs as integers keeps numbered frames
    in counting order, while zero-padded names sort exactly as they did with
    a plain `sorted()`.

    Args:
        path: File path (string) of a frame image.

    Returns:
        A tuple `(tokens, path)`: `tokens` alternates text and int pieces of
        the path; the raw path is a deterministic tie-breaker for names whose
        tokens compare equal (e.g. "frame_1.jpg" vs "frame_01.jpg").
    """
    # With a capturing group, re.split returns text pieces at even indices
    # and the captured digit runs at odd indices, so str is only ever
    # compared with str and int with int.
    pieces = re.split(r"(\d+)", path)
    tokens = [int(piece) if index % 2 else piece for index, piece in enumerate(pieces)]
    return tokens, path


# Load frames from the "frames" directory in natural (numeric-aware) filename
# order; the list order is the order in which the frames are sent to the model.
frames = [
    Image.open(file).convert("RGB")
    for file in sorted(glob("frames/*.jpg"), key=natural_key)
]

In [None]:
# Prompt for video reasoning.
# Instruction text sent to the model together with the loaded frames (see the
# generate_content call). It asks for a five-step analysis: per-frame
# description, grouping frames into scenes, choosing the most impactful scene,
# choosing the key frame within it, and a final line of the form 'Answer: #X'.
# The text is part of the request, so any edit to it changes the model input.
prompt = """
**Video Reasoning Task: Identifying the Key Transition Frame**

You will be provided with several frames sampled from a music video.
Your task is to identify the frame that marks the most significant shift in atmosphere and emotion, signaling a corresponding transition in the video's music. Follow these steps to derive your final answer:

1. **Frame Analysis:**
- Describe each frame **in detail**, focusing on the **visual atmosphere** and the **facial expressions of the actors**.
- Identify differences in adjacent frames to pinpoint the transition.
- Verify the descriptions match the given frames.

2. **Scene Categorization:**
- Since music videos often employ frequent scene transitions, categorize and list the frames according to their respective scenes.
- Minimize the number of scenes

3. **Selecting the Most Impactful Scene:**
- Based on the previous step, identify the scene that conveys the most powerful atmosphere.

4. **Identifying the Key Frame:**
- Within the selected scene, determine the frame that contributes most to the atmosphere.
- Prioritize the emotional impact conveyed through the actors’ facial expressions over visual effects.

5. **Final Answer:**
- Format the answer as follows: 'Answer: #X'

Your final answer should pinpoint the most critical frame that aligns with the music video’s emotional and atmospheric transition.
"""

In [None]:

# Request configuration: the same BLOCK_NONE threshold is applied to each of
# the four harm categories, and a fixed seed with temperature 0 is requested.
generation_config = types.GenerateContentConfig(
    safety_settings=[
        types.SafetySetting(
            category=harm_category,
            threshold=HarmBlockThreshold.BLOCK_NONE,
        )
        for harm_category in (
            HarmCategory.HARM_CATEGORY_HARASSMENT,
            HarmCategory.HARM_CATEGORY_HATE_SPEECH,
            HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
            HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
        )
    ],
    seed=1019,
    temperature=0,
)

# Send the instruction text followed by the list of frames.
# NOTE(review): the frames carry no explicit labels, so the '#X' in the answer
# presumably refers to a frame's position in `frames` — confirm how the model
# numbers them before mapping the answer back to a file.
response = client.models.generate_content(
    model="gemini-2.0-flash",
    contents=[prompt, frames],
    config=generation_config,
)

print(response.text)