# Video understanding with Gemini 2.0

In [None]:
!pip install -U -q google-genai

In [None]:
# Read the API key from Colab's secret store when running in Colab.
# Outside Colab the google.colab package is not importable, so fall back to
# the GOOGLE_API_KEY environment variable instead of failing on the import.
try:
  from google.colab import userdata
except ImportError:
  import os

  GOOGLE_API_KEY = os.environ.get('GOOGLE_API_KEY')
else:
  GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')

In [None]:
from google import genai
from google.genai import types

# Single API client reused by the rest of the notebook, both for file uploads
# (client.files) and for generation requests (client.models).
client = genai.Client(api_key=GOOGLE_API_KEY)

In [None]:
# Model identifier passed as `model=` to every generate_content call below.
model_name = "gemini-2.0-flash-exp"

### System instructions

With the new SDK, the `system_instruction` and `model` parameters must be passed in every `generate_content` call, so let's save them in variables to avoid retyping them each time.

In [None]:
# Shared system instruction text, stored once so it can be reused across requests.
# NOTE(review): none of the generate_content calls visible in this notebook
# pass this variable — confirm whether it is still needed.
system_instructions = """
    When given a video and a query, call the relevant function only once with the appropriate timecodes and text for the video
  """


In [None]:
# Download the sample videos used in this notebook.
# -O saves each one under the short local file name that is uploaded later; -q silences wget.
!wget https://storage.googleapis.com/generativeai-downloads/videos/Pottery.mp4 -O Pottery.mp4 -q
!wget https://storage.googleapis.com/generativeai-downloads/videos/Jukin_Trailcam_Videounderstanding.mp4 -O Trailcam.mp4 -q
!wget https://storage.googleapis.com/generativeai-downloads/videos/post_its.mp4 -O Post_its.mp4 -q
!wget https://storage.googleapis.com/generativeai-downloads/videos/user_study.mp4 -O User_study.mp4 -q

In [None]:
import time

def upload_video(video_file_name, poll_interval=10):
  """Upload a local video file and wait until the service has processed it.

  Args:
    video_file_name: Path of the local video file to upload.
    poll_interval: Seconds to sleep between two status checks while the
      uploaded file is still in the PROCESSING state.

  Returns:
    The file object returned by the API once it has left the PROCESSING state.

  Raises:
    ValueError: If the service reports the FAILED state for the file.
  """
  # Current google-genai releases take the local path through `file=`;
  # the older `path=` keyword is no longer accepted.
  video_file = client.files.upload(file=video_file_name)

  # Processing happens server-side, so re-fetch the file to refresh its state.
  while video_file.state == "PROCESSING":
    print('Waiting for video to be processed.')
    time.sleep(poll_interval)
    video_file = client.files.get(name=video_file.name)

  if video_file.state == "FAILED":
    # Name the file in the message; the bare state alone is always "FAILED".
    raise ValueError(
        f'Video processing failed for {video_file_name} '
        f'(state: {video_file.state})')
  print(f'Video processing complete: {video_file.uri}')

  return video_file

# Upload each downloaded sample once; the returned file objects are the
# choices offered by the `video` selector in the cells below.
pottery_video = upload_video('Pottery.mp4')
trailcam_video = upload_video('Trailcam.mp4')
post_its_video = upload_video('Post_its.mp4')
user_study_video = upload_video('User_study.mp4')

Waiting for video to be processed.
Video processing complete: https://generativelanguage.googleapis.com/v1beta/files/xckrl6d7e1ig
Waiting for video to be processed.
Waiting for video to be processed.
Video processing complete: https://generativelanguage.googleapis.com/v1beta/files/6xfarr3jjyb7
Waiting for video to be processed.
Video processing complete: https://generativelanguage.googleapis.com/v1beta/files/9w56qgq4godt
Waiting for video to be processed.
Video processing complete: https://generativelanguage.googleapis.com/v1beta/files/1svtqex47rn6


### Imports

In [None]:
import json
from PIL import Image
from IPython.display import display, Markdown, HTML

# Search within videos

First, try using the model to search within your videos and describe all the animal sightings in the trailcam video.

<video controls width="500"><source src="https://storage.googleapis.com/generativeai-downloads/videos/Jukin_Trailcam_Videounderstanding.mp4" type="video/mp4"></video>

In [None]:
# Form inputs: the text prompt and the uploaded video it applies to.
prompt = "For each scene in this video, generate captions that describe the scene along with any spoken text placed in quotation marks. Place each caption into an object with the timecode of the caption in the video."  # @param ["For each scene in this video, generate captions that describe the scene along with any spoken text placed in quotation marks. Place each caption into an object with the timecode of the caption in the video.", "Organize all scenes from this video in a table, along with timecode, a short description, a list of objects visible in the scene (with representative emojis) and an estimation of the level of excitement on a scale of 1 to 10"] {"allow-input":true}

video = trailcam_video # @param ["trailcam_video", "pottery_video", "post_its_video", "user_study_video"] {"type":"raw","allow-input":true}

# Reference the uploaded file by URI and wrap it in a user turn.
video_part = types.Part.from_uri(file_uri=video.uri, mime_type=video.mime_type)
video_turn = types.Content(role="user", parts=[video_part])

response = client.models.generate_content(
    model=model_name,
    contents=[video_turn, prompt],
)

# Last expression of the cell: render the answer as Markdown.
Markdown(response.text)

```json
[
  {
    "timecode": "0:00",
    "caption": "A close-up shot shows light brown fur with a dark spot."
  },
    {
    "timecode": "0:00",
    "caption": "A fox is walking through a rocky area with trees. The audio includes an animal call, possibly a fox."
  },
  {
    "timecode": "0:03",
    "caption":"Two foxes are walking and sniffing around a rocky area."
  },
  {
    "timecode": "0:10",
    "caption":"Two foxes continue walking around the rocky area. One climbs on the rock."
    
  },
    {
    "timecode":"0:16",
    "caption":"A mountain lion with glowing eyes walks through a wooded area during a black and white scene."
  },
  {
    "timecode": "0:34",
    "caption": "Two foxes with glowing eyes are digging and moving around."
  },
    {
     "timecode":"0:50",
     "caption":"A very bright white shot is quickly followed by two foxes interacting."
   },
   {
     "timecode":"0:53",
     "caption":"A fox and a mountain lion with glowing eyes walks near rocks in the dark."
   },
  {
    "timecode":"1:05",
    "caption": "A mountain lion with glowing eyes walks among rocks at night."
  },
     {
    "timecode":"1:17",
      "caption":"A mountain lion with glowing eyes walks near rocks with another mountain lion in the background."
  },
      {
      "timecode":"1:29",
       "caption":"A bobcat with glowing eyes walks through a wooded area at night."
   },
     {
    "timecode": "1:51",
    "caption": "A large dark brown bear walks through a wooded area."
  },
 {
    "timecode":"1:56",
    "caption": "A mountain lion with glowing eyes walks in the woods during a black and white scene."
  },
    {
    "timecode":"2:04",
     "caption":"A close up shows the fur of a bear."
   },
  {
     "timecode":"2:07",
     "caption":"A bear walks and turns while a younger bear sniffs at the ground in front of it."
   },
  {
    "timecode":"2:23",
    "caption": "A fox with glowing eyes looks down at the lights of a city at night."
  },
      {
       "timecode":"2:35",
      "caption":"A large bear with glowing eyes walks on rocks at night as a city is seen in the background."
     },
    {
    "timecode":"2:52",
     "caption":"A mountain lion with glowing eyes looks at a rock."
  },
   {
    "timecode":"3:04",
    "caption": "A large black bear walks through a wooded area during the day."
  },
   {
     "timecode":"3:22",
      "caption": "A lighter brown bear looks around in a wooded area."
   },
     {
       "timecode":"3:27",
       "caption": "A brown bear is sniffing at the ground with another one nearby."
      },
      {
      "timecode":"3:30",
      "caption": "Two brown bears are seen from behind."
       },
       {
       "timecode":"3:37",
       "caption":"A brown bear looks up."
        },
        {
        "timecode":"3:44",
       "caption":"Two bears are sniffing at the ground."
        },
        {
        "timecode":"3:52",
       "caption":"A brown bear sits and looks around."
          },
         {
         "timecode":"4:01",
         "caption":"A group of three bears are walking through a wooded area."
        },
      {
     "timecode":"4:22",
        "caption":"A bobcat with glowing eyes sits and looks around in a dark wooded area at night."
   },
    {
       "timecode":"4:47",
        "caption":"A fox with glowing eyes walks around in a dark wooded area at night."
   },
      {
         "timecode":"4:56",
         "caption":"A mountain lion with glowing eyes sniffs at something on the ground."
      }
]
```

# Extract and organize text

Gemini can also read the text that appears in a video and extract it in an organized way. You can even use Gemini's reasoning capabilities to generate new ideas for you.

<video controls width="400"><source src="https://storage.googleapis.com/generativeai-downloads/videos/post_its.mp4" type="video/mp4"></video>

In [None]:
# Form inputs: the text prompt and the uploaded video it applies to.
prompt = "Transcribe the sticky notes, organize them and put it in a table. Can you come up with a few more ideas?" # @param ["Transcribe the sticky notes, organize them and put it in a table. Can you come up with a few more ideas?", "Which of those names who fit an AI product that can resolve complex questions using its thinking abilities?"] {"allow-input":true}

video = post_its_video # @param ["trailcam_video", "pottery_video", "post_its_video", "user_study_video"] {"type":"raw","allow-input":true}

# Reference the uploaded file by URI and wrap it in a user turn.
video_part = types.Part.from_uri(file_uri=video.uri, mime_type=video.mime_type)
video_turn = types.Content(role="user", parts=[video_part])

response = client.models.generate_content(
    model=model_name,
    contents=[video_turn, prompt],
)

# Last expression of the cell: render the answer as Markdown.
Markdown(response.text)

Certainly! Here's the transcription of the sticky notes organized into a table, along with a few additional project name ideas:

**Brainstorm Project Name Ideas**

| Category           | Project Name Ideas                |
|-------------------|-----------------------------------|
| **Constellations** | Canis Major, Leo Minor, Lyra, Orion's Belt, Sagitta, Delphinus, Centaurus |
| **Celestial Events** | Lunar Eclipse, Comets Tail, Supernova Echo,  Celestial Drift    |
| **Cosmic Structures**| Stellar Nexus,  Andromeda's Reach, Galactic Core, Chaos Field,  |
| **Math & Physics** | Symmetry, Golden Ratio, Infinity Loop, Taylor Series, Stokes Theorem, Fractal, Bayes Theorem, Riemann's Hypothesis, Chaos Theory, Euler's Path, Equilibrium|
| **Mythology**   |  Astral Forge,   Prometheus Rising, Chimera Dream,  Perseus Shield, Zephyr, Titan, Odin, Aether, Phoenix, Athena's Eye, Athena, Hera, Cerberus, Medusa |
| **Other**   |   Convergence, Lynx, Draco, Vector, Pandora's Box, Echo |
| **Additional** | Aurora Borealis,  Cosmic Web, Quantum Leap, Nebula Dance, Event Horizon, Singularity, Galactic Symphony |

I hope this is helpful! Let me know if you have any more questions!

# Structure information

Gemini 2.0 can not only read text but also reason about real-world objects and structure the information it finds, as in this video of a ceramics display with handwritten prices and notes.

<video controls width="500"><source src="https://storage.googleapis.com/generativeai-downloads/videos/Pottery.mp4" type="video/mp4"></video>

In [None]:
# Form inputs: the text prompt and the uploaded video it applies to.
prompt = "Give me a table of my items and notes" # @param ["Give me a table of my items and notes", "Help me come up with a selling pitch for my potteries"] {"allow-input":true}

video = pottery_video # @param ["trailcam_video", "pottery_video", "post_its_video", "user_study_video"] {"type":"raw","allow-input":true}

# Reference the uploaded file by URI and wrap it in a user turn.
video_part = types.Part.from_uri(file_uri=video.uri, mime_type=video.mime_type)
video_turn = types.Content(role="user", parts=[video_part])

# Request-specific system instruction for this cell only.
request_config = types.GenerateContentConfig(
    system_instruction="Don't forget to escape the dollar signs",
)

response = client.models.generate_content(
    model=model_name,
    contents=[video_turn, prompt],
    config=request_config,
)

# Last expression of the cell: render the answer as Markdown.
Markdown(response.text)

Okay, here is the table of items and notes you have displayed:

| Item              | Description                   | Height   | Diameter  | Price   | Notes                     |
|-------------------|-------------------------------|----------|-----------|---------|---------------------------|
| Tumblers        |  #5 Artichoke double dip | 4"      | 3"       | \$20      |  -ish                     |
| Small Bowls      |                         | 3.5"   | 6.5"       | \$35     |                             |
| Medium Bowls     |                        | 4"      | 7"       | \$40      |                             |
| Swatch #6        | #6 gemini double dip        | N/A    |   N/A      | N/A    |   SLOW COOL           |

# Analyze screen recordings for key moments

You can also use the model to analyze screen recordings. Let's say you're doing user studies on how people use your product, so you end up with lots of screen recordings, like this one, that you have to manually comb through.
With just one prompt, the model can describe all the actions in your video.

<video controls width="400"><source src="https://storage.googleapis.com/generativeai-downloads/videos/user_study.mp4" type="video/mp4"></video>

In [None]:
# Form inputs: the text prompt and the uploaded video it applies to.
prompt = "Generate a paragraph that summarizes this video. Keep it to 3 to 5 sentences with corresponding timecodes." # @param ["Generate a paragraph that summarizes this video. Keep it to 3 to 5 sentences with corresponding timecodes.", "Choose 5 key shots from this video and put them in a table with the timecode, text description of 10 words or less, and a list of objects visible in the scene (with representative emojis).", "Generate bullet points for the video. Place each bullet point into an object with the timecode of the bullet point in the video."] {"allow-input":true}

video = user_study_video # @param ["trailcam_video", "pottery_video", "post_its_video", "user_study_video"] {"type":"raw","allow-input":true}

# Reference the uploaded file by URI and wrap it in a user turn.
video_part = types.Part.from_uri(file_uri=video.uri, mime_type=video.mime_type)
video_turn = types.Content(role="user", parts=[video_part])

response = client.models.generate_content(
    model=model_name,
    contents=[video_turn, prompt],
)

# Last expression of the cell: render the answer as Markdown.
Markdown(response.text)

Certainly! Here is a paragraph summarizing the video:

The video demonstrates a garden app with a list of plants available for purchase (0:00).  The user likes several plants by tapping on the "Like" button next to the plant names, including a Rose Plant, Fern, Cactus, and Hibiscus (0:10-0:27). The user adds a Fern, Cactus, and Hibiscus to their cart (0:13-0:25). After these additions, the user views the shopping cart to see all added items and their total cost (0:31), and then the user views their profile which shows how many plants they have liked and added to their cart (0:34). Later, the user navigates back to the home screen (0:37) and likes some more plants, and adds an orchid to the cart (0:38-0:44).