In [None]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Gen Media on Vertex AI

Welcome to Gen Media on Vertex AI! This notebook offers sample code on:
- Character Generation: Nano Banana
- Scripting the scenes: Gemini
- Generating the video: Veo 3

Each example is designed to be clear, easy to follow, and adaptable for your own projects, demonstrating a specific capability of the model.

| Authors |
| --- |
| [Laxmi Harikumar](https://github.com/laxmi-genai) |
| [Vlad Kolesnikov](https://github.com/vladkol/) |

## Environment Setup

Begin by installing the required Python packages for the notebook.

In [None]:
# Install or upgrade the Google Gen AI SDK (google-genai) and Pillow using the
# %pip magic; --quiet keeps pip's output short.
%pip install --upgrade --quiet google-genai pillow

### Authenticate your notebook environment (Colab only)

If you're running this notebook on Google Colab, run the cell below to authenticate your environment.

In [None]:
import sys

# Colab is detected by the presence of the `google.colab` package among the
# already-loaded modules; outside Colab this cell does nothing.
running_on_colab = "google.colab" in sys.modules
if running_on_colab:
    from google.colab import auth

    auth.authenticate_user()

### Configure Google Cloud Project

To use Vertex AI, you need a Google Cloud project with the [Vertex AI API enabled](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).

For more details, see the documentation on [setting up a project and development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

In [None]:
# Use the environment variable if the user doesn't provide Project ID.
import os

# fmt: off
PROJECT_ID = ""  # @param {type: "string", placeholder: "[your-project-id]", isTemplate: true}
# fmt: on
if not PROJECT_ID or PROJECT_ID == "[your-project-id]":
    # Keep PROJECT_ID as None (not the literal string "None") when the
    # variable is unset, so the client is never handed a bogus project name.
    PROJECT_ID = os.environ.get("GOOGLE_CLOUD_PROJECT") or None

# Location passed to the Vertex AI client below.
LOCATION = "global"

from google import genai

# Shared client used by every example in this notebook.
client = genai.Client(vertexai=True, project=PROJECT_ID, location=LOCATION)

## Image Generation with Nano Banana!

### Do the required imports

In [None]:
import io

from IPython.display import display
from PIL import Image
from google.genai import types

import base64
import time
from IPython.display import Video, Markdown

### Helper Functions and Utilities

This section contains the helper functions that will be used throughout the notebook to streamline image generation and processing tasks.

In [None]:
# Veo model ID passed to `client.models.generate_videos` in the video examples.
video_model = "veo-3.1-generate-preview"
# Gemini model ID used when asking Gemini to write the Veo prompt.
gemini_model = "gemini-2.5-flash"

In [None]:
def show_video(video):
    """Display a video inline in the notebook.

    Args:
        video: Either a string URI, which is copied into the working
            directory with `gsutil cp`, or the raw bytes of a video, which
            are written to `sample.mp4`.

    Raises:
        subprocess.CalledProcessError: If the `gsutil cp` command fails.
    """
    if isinstance(video, str):
        # Local import: only the URI branch needs it.
        import subprocess

        file_name = video.rsplit("/", 1)[-1]
        # Pass arguments as a list so the URI is never re-parsed by a shell,
        # and fail loudly instead of trying to display a file that is missing.
        subprocess.run(["gsutil", "cp", video, file_name], check=True)
    else:
        file_name = "sample.mp4"
        with open(file_name, "wb") as out_file:
            out_file.write(video)

    display(Video(file_name, embed=True, width=600, height=400))

## Example 1: Generate an Image from Text Prompt

We'll begin with a fundamental task: generating an image from a text prompt. Consider a scenario where you need to generate a menu for your restaurant.

In [None]:
prompt = """Create a menu with 3 sections and with images of 3 of its plates:

To start
OYSTERS BY THE SHELL
GARDEN TOMATO BRUSCHETTA
BABY GREENS SALAD

Main Dishes
CITRUS CHICKEN
GARLIC PRAWNS WITH LEMON
PAN SEARED SEA BASS

Desserts
CHOCOLATE WEDDING CAKE
ASSORTED LOCAL FRESH FRUITS
AFFOGATO
"""

MODEL_NAME = "gemini-2.5-flash-image"
# Request both text and image parts in the response.
GENERATION_CONFIG = types.GenerateContentConfig(
    temperature=1,
    top_p=0.95,
    max_output_tokens=32768,
    response_modalities=["TEXT", "IMAGE"],
)

contents = [
    types.Content(role="user", parts=[types.Part.from_text(text=prompt)])
]

print("Generating image from prompt...")
response = client.models.generate_content(
    model=MODEL_NAME,
    contents=contents,
    config=GENERATION_CONFIG,
)

# A candidate may carry no content at all, so guard before reading its parts.
candidate = response.candidates[0] if response.candidates else None
parts = candidate.content.parts if candidate and candidate.content else None

# Track whether an image was actually shown: a response can contain parts
# (text only) and still have no image.
image_shown = False
for part in parts or []:
    if part.inline_data and part.inline_data.data:
        display(Image.open(io.BytesIO(part.inline_data.data)))
        image_shown = True

if not image_shown:
    print("No image was generated.")

## Example 2: Generate an Image from Text Prompt and an Image

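Chat with the model to modify the generated images! This example expects a reference image at `/content/images/speaker.png`; upload your own image to that path or update `file_path` before running the cell.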

In [None]:
# Text instruction describing the scene the reference speaker is placed in.
prompt = """Create a vibrant lifestyle photo. Place the reference speaker on a
            picnic blanket in a park. A group of diverse, happy friends in
            their 20s are laughing in the background, enjoying a sunny
            afternoon. The scene should feel candid, warm, and full of life."""

# Reference image that is sent to the model together with the prompt.
file_path = "/content/images/speaker.png"
with open(file_path, mode="rb") as image_file:
    image_bytes = image_file.read()

# Wrap the raw PNG bytes as a request part.
image1 = types.Part.from_bytes(data=image_bytes, mime_type="image/png")

In [None]:
MODEL_NAME = "gemini-2.5-flash-image"
chat = client.chats.create(model=MODEL_NAME)

# First chat turn: reference image plus the text prompt, image-only output.
response = chat.send_message(
    message=[
        image1,
        prompt,
    ],
    config=types.GenerateContentConfig(
        response_modalities=["IMAGE"],
        image_config=types.ImageConfig(
            aspect_ratio="16:9",
        ),
    ),
)

# `data` holds the raw bytes of the most recent image; the next cell passes it
# to `types.Part.from_bytes(data=...)`. The fallback therefore has to be bytes
# (the original reference image), not a `Part` object.
data = image_bytes
if response.candidates and response.candidates[0].content.parts:
    for part in response.candidates[0].content.parts:
        if part.inline_data and part.inline_data.data:
            display(Image.open(io.BytesIO(part.inline_data.data)))
            data = part.inline_data.data


In [None]:
modification_prompt = """Add a cheese tray and vegetables to the picnic food,
  change the background and turn this into a picnic at the
  beach on the sand during sunset."""

# Second chat turn: send the latest image bytes back with the edit request.
previous_image = types.Part.from_bytes(data=data, mime_type="image/png")
image_only_config = types.GenerateContentConfig(response_modalities=["IMAGE"])
response = chat.send_message(
    message=[previous_image, modification_prompt],
    config=image_only_config,
)

# Show every returned image and remember the last one in `data`.
candidates = response.candidates
if candidates and candidates[0].content.parts:
    for part in candidates[0].content.parts:
        blob = part.inline_data
        if blob and blob.data:
            display(Image.open(io.BytesIO(blob.data)))
            data = blob.data

## Scene Generation with Gemini

# Example 3: Generate the script with Gemini

It is very important to describe the characters and the scenes in detail.
Let's generate the script from the image. This example expects an image at `/content/images/group_friends.png`.

In [None]:
from google import genai
from google.genai import types
import base64
import os

def generate(file_path="/content/images/group_friends.png"):
    """Ask Gemini for character profiles and scene details based on an image.

    Args:
        file_path: Path of the PNG image sent to the model with the prompt.

    Returns:
        The text of the model's response (requested as Markdown).

    Raises:
        ValueError: If the response contains no text.
    """
    # Load the whole image into memory as bytes and wrap it as a request part.
    with open(file_path, "rb") as file:
        image_bytes = file.read()

    image1 = types.Part.from_bytes(data=image_bytes, mime_type="image/png")

    prompt = """Generate detailed character profiles and scene details to create
    a 8 second video for one scene - the people discussing their Paris trip with
    just one dialogue. Provide the response as Markdown. Do not
    use tables in the output. For each character generate a Name, Role,
    Appearance, Personality and Wardrobe"""

    contents = [
        types.Content(
            role="user",
            parts=[image1, types.Part.from_text(text=prompt)],
        )
    ]
    tools = [
        types.Tool(google_search=types.GoogleSearch()),
    ]

    # Every listed harm category has its threshold set to "OFF".
    safety_settings = [
        types.SafetySetting(category=category, threshold="OFF")
        for category in (
            "HARM_CATEGORY_HATE_SPEECH",
            "HARM_CATEGORY_DANGEROUS_CONTENT",
            "HARM_CATEGORY_SEXUALLY_EXPLICIT",
            "HARM_CATEGORY_HARASSMENT",
        )
    ]

    generate_content_config = types.GenerateContentConfig(
        temperature=1,
        top_p=0.95,
        max_output_tokens=65535,
        safety_settings=safety_settings,
        tools=tools,
    )

    # Use the notebook-level `gemini_model` so this call and the later
    # prompt-writing call always target the same model.
    response = client.models.generate_content(
        model=gemini_model,
        contents=contents,
        config=generate_content_config,
    )

    # `response.text` gathers the text of the response; fail with a clear
    # message instead of passing an empty scene on to the next cells.
    text = response.text
    if not text:
        raise ValueError("Gemini returned no text for the scene description.")
    return text


scene = generate()

In [None]:
# Render the scene text returned by generate() as Markdown in the notebook.
display(Markdown(scene))

📌 Make sure to review the content generated by Gemini and tweak it to match your narrative and ideas.

Create a prompt with the scene and the image generated

In [None]:
# `scene` is a single string, so it is interpolated as-is. Joining it with ","
# would iterate over its characters and put a comma between each of them.
gemini_prompt = f"""
You are an expert prompt engineer for Google's Veo model. Analyze the provided
image and combine its content with the provided scene. Make sure to retain the
character identities. Integrate the image's
subject and scene with the requested motion and audio effects.
The final output must be ONLY the prompt itself,
with no preamble. Scene: {scene}
"""
with open("/content/images/group_friends.png", "rb") as f:
    image = f.read()

# Send the instructions together with the reference image to Gemini.
response = client.models.generate_content(
    model=gemini_model,
    contents=[gemini_prompt, types.Part.from_bytes(data=image, mime_type="image/png")],
)

# Set Gemini's response in a prompt variable
prompt = response.text
display(Markdown(response.text))


📌 Make sure to review the prompt generated by Gemini and tweak it to match your narrative and ideas.

# Example 4: Generate the video with Veo3

Generate the video with the prompt and a reference image

In [None]:
enhance_prompt = True  # @param {type: 'boolean'}
generate_audio = True  # @param {type: 'boolean'}

# Start video generation with the Gemini-written prompt and the reference image.
operation = client.models.generate_videos(
    model=video_model,
    prompt=prompt,
    config=types.GenerateVideosConfig(
        reference_images=[
            types.VideoGenerationReferenceImage(
                image=types.Image.from_file(location="/content/images/group_friends.png"),
                reference_type="asset",
            )
        ],
        aspect_ratio="16:9",
        number_of_videos=1,
        duration_seconds=8,
        resolution="1080p",
        person_generation="allow_adult",
        enhance_prompt=enhance_prompt,
        generate_audio=generate_audio,
    ),
)

# Poll every 15 seconds until the operation reports that it is done.
while not operation.done:
    time.sleep(15)
    operation = client.operations.get(operation)
    print(operation)

# Only index into the generated videos when there actually are some; otherwise
# report the outcome instead of failing silently or with an indexing error.
generated_videos = None
if operation.response and operation.result:
    generated_videos = operation.result.generated_videos

if generated_videos:
    show_video(generated_videos[0].video.video_bytes)
elif operation.error:
    print(f"Video generation failed: {operation.error}")
else:
    print("The operation finished without returning a video.")

# Example 5: Elevate the video with prompts

Elevate your clip to a cinematic shot by directing the camera and ambiance.

In [None]:
# Plain string: there are no placeholders, so no f-string prefix is needed.
prompt = """A car driving in the city at night."""

# Start video generation from the text prompt alone.
operation = client.models.generate_videos(
    model=video_model,
    prompt=prompt,
    config=types.GenerateVideosConfig(
        aspect_ratio="16:9",
        number_of_videos=1,
        duration_seconds=8,
        resolution="1080p",
        person_generation="allow_adult",
    ),
)

# Poll every 15 seconds until the operation reports that it is done.
while not operation.done:
    time.sleep(15)
    operation = client.operations.get(operation)
    print(operation)

# Only index into the generated videos when there actually are some; otherwise
# report the outcome instead of failing silently or with an indexing error.
generated_videos = None
if operation.response and operation.result:
    generated_videos = operation.result.generated_videos

if generated_videos:
    show_video(generated_videos[0].video.video_bytes)
elif operation.error:
    print(f"Video generation failed: {operation.error}")
else:
    print("The operation finished without returning a video.")

In [None]:
# Plain string: there are no placeholders, so no f-string prefix is needed.
prompt = """Cinematic wide shot of a vintage 1980s sports car, gleaming cherry
red, speeding through a rain-slicked neon alley at night. The camera tracks
smoothly alongside the car. Moody, high-contrast film noir lighting with
reflections of pink and blue neon signs on the wet asphalt."""

enhance_prompt = True  # @param {type: 'boolean'}
generate_audio = True  # @param {type: 'boolean'}

# Start video generation from the detailed, camera-directed prompt.
operation = client.models.generate_videos(
    model=video_model,
    prompt=prompt,
    config=types.GenerateVideosConfig(
        aspect_ratio="16:9",
        number_of_videos=1,
        duration_seconds=8,
        resolution="1080p",
        person_generation="allow_adult",
        enhance_prompt=enhance_prompt,
        generate_audio=generate_audio,
    ),
)

# Poll every 15 seconds until the operation reports that it is done.
while not operation.done:
    time.sleep(15)
    operation = client.operations.get(operation)
    print(operation)

# Only index into the generated videos when there actually are some; otherwise
# report the outcome instead of failing silently or with an indexing error.
generated_videos = None
if operation.response and operation.result:
    generated_videos = operation.result.generated_videos

if generated_videos:
    show_video(generated_videos[0].video.video_bytes)
elif operation.error:
    print(f"Video generation failed: {operation.error}")
else:
    print("The operation finished without returning a video.")