In [24]:
import google.generativeai as genai
from google.colab import userdata

# Securely store your API key in Colab secrets and access it here
GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')
genai.configure(api_key=GOOGLE_API_KEY)

model = genai.GenerativeModel("gemini-2.5-flash")

In [25]:
time_duration = 30

In [26]:
file_path = '/content/john_jre.txt'

try:
    with open(file_path, 'r', encoding='utf-8') as f:
        transcript_text = f.read()
    print("Successfully read the file. First 500 characters:")
    print(transcript_text[:500])
except FileNotFoundError:
    print(f"Error: The file {file_path} was not found.")
except Exception as e:
    print(f"An error occurred: {e}")

Successfully read the file. First 500 characters:
# tactiq.io free youtube transcript
# No title found
# https://www.youtube.com/watch/sJ4Ho4ccuFg

00:00:01.920 Joe Rogan podcast. Check it out. The Joe Rogan Experience. Train by day. Joe Rogan podcast by
00:00:08.400 night. All day. We're rolling. What's up, John Cena in
00:00:15.440 the [ __ ] house on? Yeah. Yeah. Let's put these on. Pretend we're professional. What's up? Good to see
00:00:21.600 you, man. Thanks so much for having me. Appreciate my pleasure being here. And there's no way I'm


In [27]:
user_prompt = f"""
You are selecting a short transcript snippet from a longer podcast transcript for use in a YouTube Short.

SOURCE TRANSCRIPT (authoritative; do not add new content):
\"\"\"{transcript_text}\"\"\"

TASK:
Extract a single, continuous snippet directly from the source transcript that would make the most compelling short-form podcast clip.

STRICT RULES (must follow):
1. Use ONLY sentences or sentence fragments that appear verbatim in the source transcript.
   - Do NOT paraphrase.
   - Do NOT summarize.
   - Do NOT add commentary or narration.
2. The snippet must be continuous (no skipping around).
3. Total spoken length must be less than {time_duration} seconds.
4. The snippet must BEGIN at a naturally interesting or provocative moment that immediately grabs attention.
5. Preserve the original speaking order and wording.
6. Replace all speaker names with generic labels (e.g., "Person 1", "Person 2").

OUTPUT FORMAT:
- Output ONLY the selected transcript snippet.
- Do NOT include explanations, timestamps, or metadata.
- Do NOT restate the rules.

Goal:
Select the most engaging moment that captures the core idea, tension, or insight of the conversation and works as a viral short.
"""

In [28]:
user_prompt

'\nYou are selecting a short transcript snippet from a longer podcast transcript for use in a YouTube Short.\n\nSOURCE TRANSCRIPT (authoritative; do not add new content):\n"""# tactiq.io free youtube transcript\n# No title found\n# https://www.youtube.com/watch/sJ4Ho4ccuFg\n\n00:00:01.920 Joe Rogan podcast. Check it out. The Joe Rogan Experience. Train by day. Joe Rogan podcast by\n00:00:08.400 night. All day. We\'re rolling. What\'s up, John Cena in\n00:00:15.440 the [\xa0__\xa0] house on? Yeah. Yeah. Let\'s put these on. Pretend we\'re professional. What\'s up? Good to see\n00:00:21.600 you, man. Thanks so much for having me. Appreciate my pleasure being here. And there\'s no way I\'m having a pro wrestler on without Tony Hinchcliffe.\n00:00:28.720 possible. He\'s the expert. He knows more about pro wrestling than I know about UFC. Yeah, sometimes I translate little\n00:00:35.440 things here and there. That\'s cool. It\'s all right. Yeah, he has to. He has to. And he\'s a giant fan o

In [29]:
response = model.generate_content(
    contents=user_prompt,
)
print(response.text)

Person 1: "And then they heard me rap in the back of the bus and was like, 'Man, Stephanie heard me rap in the back of the bus.'"
Person 2: "Yeah."
Person 1: "And was like, 'Yo, you want to do that on TV?' I'm like, 'Lose my job or [ __ ] rap?' 'Yeah, let's go. Let's Let's do this.'"


In [30]:
short_text = response.text
short_text

'Person 1: "And then they heard me rap in the back of the bus and was like, \'Man, Stephanie heard me rap in the back of the bus.\'"\nPerson 2: "Yeah."\nPerson 1: "And was like, \'Yo, you want to do that on TV?\' I\'m like, \'Lose my job or [ __ ] rap?\' \'Yeah, let\'s go. Let\'s Let\'s do this.\'"'

In [36]:
visual_prompt = f"""
You are an expert AI Cinematographer and Visual Director. Your goal is to translate raw podcast transcripts into detailed, photorealistic video generation prompts optimized for Google Veo.

Transcript:
{short_text}

**YOUR OBJECTIVE:**
You will be given a segment of text (a podcast transcript). You must hallucinate the visual context. Do not visualize people sitting in a room recording a podcast. Instead, visualize the *story* they are telling. If the text is abstract (e.g., philosophy), visualize a metaphorical or "B-roll" style scene that fits the mood.

**STEP 1: ANALYZE**
Read the transcript to determine:
1. The Core Subject (What is the physical noun? A car? A forest? A city?)
2. The Action (Movement, speed, direction)
3. The Mood (Lighting, color palette, atmosphere)

**STEP 2: GENERATE PROMPT**
Generate a single, dense paragraph that follows this specific formula:
`[Camera Movement/Angle] + [Subject Description] + [Action/Movement] + [Environment/Lighting] + [Film Style/Aesthetics]`

**GUIDELINES FOR GENERATION:**
* **Camera:** Use specific terms like "Low-angle dolly shot," "Aerial drone view," "Close-up macro shot," or "Handheld tracking shot."
* **Lighting:** Describe the light source (e.g., "Golden hour sunlight," "Neon cyberpunk lighting," "Harsh fluorescent flickering," "Soft cinematic diffusion").
* **Style:** Specify a look (e.g., "35mm film grain," "4k sharp digital," "Vintage 1980s VHS," "Cinematic documentary style").
* **Negative Constraints:** Implicitly avoid text. The video should contain NO subtitles and NO overlaid text.

**HANDLING DIFFERENT SCENARIOS:**
* *Scenario A (Concrete Story):* "I walked into the abandoned house." -> **Visual:** "Slow push-in camera movement towards a rotting Victorian door, peeling paint, overgrown vines, eerie blue moonlight, suspenseful atmosphere."
* *Scenario B (Abstract Concept):* "The economy is crashing." -> **Visual:** "Time-lapse of a busy city street blurring into chaos, red lighting casting long shadows, frantic energy, high contrast, cinematic newsreel style."
* *Scenario C (Conversation/Dialogue):* "I told him he was wrong." -> **Visual:** "Over-the-shoulder shot of two silhouettes arguing in the rain, dramatic rim lighting, intense emotional atmosphere, shallow depth of field."

**OUTPUT FORMAT:**
Return ONLY the visual prompt string. Do not output explanations or the original text.
"""

In [37]:
vresponse = model.generate_content(
    contents=user_prompt,
)
print(vresponse.text)

vi_text = vresponse.text

Person 1: everybody was like, what the [ __ ] did you just say? We don't that's not how we do it over here. And again, just cuz like my takeaway and it was a it it was a pretty tense moment for me. Like I had to apologize to China. And in apologizing to China, I I pissed off my home country.


In [38]:
import time
from google import genai
from google.genai import types

client = genai.Client(api_key=GOOGLE_API_KEY)

video_prompt = f"""
You are generating a short-form video clip to accompany an EXISTING audio track.

CRITICAL AUDIO CONSTRAINT (MUST FOLLOW):
- The audio for this video is PRE-RECORDED and comes DIRECTLY from the provided transcript.
- Do NOT generate any audio, vocals, speech, singing, rapping, narration, or sound effects.
- The video MUST be SILENT by itself.
- The transcript is NOT dialogue to be acted out — it is a fixed voiceover.

TRANSCRIPT (audio reference only; do not reinterpret):
\"\"\"{vi_text}\"\"\"

PRIMARY TASK:
Generate visuals that SHOW the SITUATION, actions, or scenario being described in the transcript, as if the viewer is watching what the speaker is talking about.

HARD CONSTRAINTS:
- Maximum duration: {time_duration} seconds.
- No visible people speaking, singing, or facing the camera.
- No lip movement of any kind.
- No musical performance, rapping, or rhythmic vocalization.
- No on-screen text, subtitles, captions, or lyrics.
- No podcast studios, microphones, or interview setups.

VISUAL GUIDELINES:
- Translate spoken ideas into concrete visual scenes (environments, motion, events, symbolic actions).
- People may appear ONLY as background actors or performing actions, never speaking.
- Camera movement should feel cinematic and intentional.
- Maintain a consistent visual style and continuity.

STYLE & PACING:
- Designed for short-form social media (YouTube Shorts).
- Visually engaging within the first 2 seconds.
- Clear visual progression from start to finish.

OUTPUT REQUIREMENT:
- One continuous, silent video clip.
- The video must align temporally with the transcript audio when the audio is added externally.
"""

print(video_prompt)

operation = client.models.generate_videos(
    model="veo-3.1-generate-preview",
    prompt=video_prompt,
)

# Poll the operation status until the video is ready.
while not operation.done:
    print("Waiting for video generation to complete...")
    time.sleep(10)
    operation = client.operations.get(operation)

# --- Diagnostic Start ---
print("Operation done. Checking response:")
print(operation)
print(operation.response)
if operation.response and hasattr(operation.response, 'generated_videos'):
    if operation.response.generated_videos:
        # Download the generated video.
        generated_video = operation.response.generated_videos[0]
        client.files.download(file=generated_video.video)
        generated_video.video.save("john_jre_short2.mp4")
        print("Generated video saved to john_jre_short2.mp4")
    else:
        print("No generated videos found in the response.")
else:
    print("Operation response does not contain 'generated_videos' attribute or is None.")
# --- Diagnostic End ---


You are generating a short-form video clip to accompany an EXISTING audio track.

CRITICAL AUDIO CONSTRAINT (MUST FOLLOW):
- The audio for this video is PRE-RECORDED and comes DIRECTLY from the provided transcript.
- Do NOT generate any audio, vocals, speech, singing, rapping, narration, or sound effects.
- The video MUST be SILENT by itself.
- The transcript is NOT dialogue to be acted out — it is a fixed voiceover.

TRANSCRIPT (audio reference only; do not reinterpret):
"""Person 1: everybody was like, what the [ __ ] did you just say? We don't that's not how we do it over here. And again, just cuz like my takeaway and it was a it it was a pretty tense moment for me. Like I had to apologize to China. And in apologizing to China, I I pissed off my home country."""

PRIMARY TASK:
Generate visuals that SHOW the SITUATION, actions, or scenario being described in the transcript, as if the viewer is watching what the speaker is talking about.

HARD CONSTRAINTS:
- Maximum duration: 30 seco