<a href="https://colab.research.google.com/github/cavemansblog/googlecolabrepo/blob/main/SixthExperiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# Enable CUDA_LAUNCH_BLOCKING for better error reporting
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

from IPython import get_ipython
from IPython.display import display
#AIzaSyBmEm1DuO2Rk1OTpx6RUUYkVmcbKDEVNIY
# 🚀 AI-Powered Motivation Video Generator (Enhanced Cinematic Edition)

# 📦 Install Dependencies
!pip install openai google-cloud-texttospeech moviepy diffusers transformers accelerate torchaudio --quiet

# 🔧 Configuration
from google.generativeai import GenerativeModel, configure
configure(api_key="AIzaSyBmEm1DuO2Rk1OTpx6RUUYkVmcbKDEVNIY")

from diffusers import StableDiffusionPipeline
from transformers import MusicgenForConditionalGeneration, MusicgenProcessor
from google.cloud import texttospeech
from moviepy.editor import *
from moviepy.audio.fx.all import audio_fadein, audio_fadeout
import torchaudio, torch, os, re
from PIL import Image

# 🧠 Step 1: Script Generation
model = GenerativeModel("gemini-1.5-flash")
def generate_script(topic="Motivation for success"):
    prompt = f"dark background, 5-scene quotes for men about '{topic}'. Each scene should be one short sentence. Make it inspiring for men with thought-provoking language."
    response = model.generate_content(prompt)
    return [line.strip() for line in response.text.split('\n') if line.strip()] if response.text else []

# 🎨 Step 2: Image Generation
# Try loading with float32 if float16 causes CUDA memory issues
# Keep float32 for Stable Diffusion as it was explicitly chosen
pipe = StableDiffusionPipeline.from_pretrained("Lykon/dreamshaper-8", torch_dtype=torch.float32) # Load without moving to cuda initially

# Attempt to move tokenizer and text encoder to CPU first as suggested by similar errors [1]
try:
    if hasattr(pipe, 'tokenizer'):
        pipe.tokenizer.to('cpu')
    if hasattr(pipe, 'text_encoder'):
         pipe.text_encoder.to('cpu')
    print("Tokenizer and text encoder moved to CPU.")
except Exception as e:
    print(f"Could not move tokenizer/text encoder to CPU: {e}")

# Now move the rest of the pipeline to CUDA
try:
    pipe.to("cuda")
    print("StableDiffusionPipeline moved to CUDA.")
except Exception as e:
    print(f"Error moving StableDiffusionPipeline to CUDA after attempting CPU move: {e}")
    # If the error persists here, the issue is likely with other components of the pipeline or the CUDA environment.
    # Consider reducing the model size, checking GPU memory, or verifying CUDA driver installation.


# Test if the pipeline works on CUDA after loading
try:
    # Create a dummy prompt tensor and move it to CUDA
    dummy_prompt = ["a test image"]
    # The pipe expects text prompts as a list of strings,
    # the tensor conversion happens internally by the tokenizer and text encoder.
    # So, no need to manually move the prompt string to CUDA.
    # Let's just run a minimal inference step to check
    print("Testing StableDiffusionPipeline inference on CUDA...")
    _ = pipe(dummy_prompt, num_inference_steps=1).images[0]
    print("StableDiffusionPipeline test successful on CUDA.")
except Exception as e:
    print(f"Error during StableDiffusionPipeline CUDA inference test: {e}")
    # If the error happens here, the issue is likely with the pipeline itself or environment setup.
    # Consider trying a different model or ensuring CUDA drivers are correctly installed.


def generate_image(prompt, i):
    # The prompt is a string and doesn't need to be explicitly moved to CUDA.
    # The pipeline handles tokenization and moving the encoded text to the device.
    image = pipe(f"{prompt}, cinematic lighting, ultra high def, dramatic male energy, 8k", height=768, width=768, guidance_scale=9.0, num_inference_steps=40).images[0]
    image.save(f"scene_{i}.jpg")

# 🔊 Step 3: Voiceover
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "my-key.json"
tts_client = texttospeech.TextToSpeechClient()
def synthesize_speech(text, filename):
    synthesis_input = texttospeech.SynthesisInput(text=text)
    voice = texttospeech.VoiceSelectionParams(language_code="en-US", name="en-US-Wavenet-D", ssml_gender=texttospeech.SsmlVoiceGender.MALE)
    audio_config = texttospeech.AudioConfig(audio_encoding=texttospeech.AudioEncoding.MP3, speaking_rate=1.15, pitch=2.0)
    response = tts_client.synthesize_speech(input=synthesis_input, voice=voice, audio_config=audio_config)
    with open(filename, "wb") as out:
        out.write(response.audio_content)

# 🎶 Step 3.5: Background Music
# Clear CUDA cache and potentially delete the stable diffusion pipeline before loading Musicgen
# This part should only run AFTER image generation is complete
# del pipe # Moved this deletion
if torch.cuda.is_available():
    # torch.cuda.empty_cache() # Moved this cache clear
    pass # Keep this if statement for later use


# Load Musicgen with float16 to save memory
# music_model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small", attn_implementation="eager", torch_dtype=torch.float16) # Moved this loading
# music_model.to("cuda") # Moved this to CUDA
# print("Musicgen model loaded and moved to CUDA with float16.") # Moved this print


# Initialize the processor here
# processor = MusicgenProcessor.from_pretrained("facebook/musicgen-small") # Moved this initialization

# def generate_background_music(prompt="motivational ambient cinematic", duration=30, output_file="bg_music.wav"): # Moved this function
#     # Ensure processor is accessible. It's initialized outside the function.
#     # Ensure inputs match the model's dtype (float16 if loaded with float16)
#     inputs = processor(text=[prompt], return_tensors="pt")
#     # Move inputs to CUDA and ensure dtype matches the model
#     inputs = {k: v.to("cuda", dtype=music_model.dtype) for k, v in inputs.items()}
#     audio_values = music_model.generate(**inputs, max_new_tokens=duration * 50)
#     sampling_rate = music_model.config.audio_encoder.sampling_rate
#     torchaudio.save(output_file, audio_values[0].cpu(), sampling_rate)


# 🎬 Step 4: Compile Video
from moviepy.video.fx.all import crop

def build_video(scene_indices):
    clips = []
    for i in scene_indices:
        img = ImageClip(f"scene_{i}.jpg").resize(height=720).set_fps(24).set_duration(AudioFileClip(f"scene_{i}.mp3").duration + 1.5)
        zoomed = img.fx(crop, width=img.w*0.95, height=img.h*0.95, x_center=img.w/2, y_center=img.h/2).set_position("center")
        voice = AudioFileClip(f"scene_{i}.mp3")
        silence = AudioClip(lambda t: 0, duration=1.5).set_fps(voice.fps)
        combined_audio = CompositeAudioClip([voice, silence.set_start(voice.duration)])
        final_clip = zoomed.set_audio(combined_audio)
        clips.append(final_clip)

    final = concatenate_videoclips(clips, method="compose")
    if os.path.exists("bg_music.wav"):
        bg_music = AudioFileClip("bg_music.wav").subclip(0, final.duration).volumex(0.3)
        final = final.set_audio(CompositeAudioClip([final.audio, audio_fadein(bg_music, 2).audio_fadeout(2)]))
    final.write_videofile("final_output.mp4", fps=24, codec='libx264', audio_codec="aac")

# 🚀 Step 5: Run Pipeline
topic = "Motivation for success"
scenes = generate_script(topic)
scene_indices = []

print(f"Pipe exists before image generation loop: {'pipe' in globals()}") # Debugging print

for i, line in enumerate(scenes):
    clean_text = re.sub(r"^(Scene\s*\d+[:.\s-]*|\d+[:.\s-]*)", "", line.strip(), flags=re.IGNORECASE)
    generate_image(clean_text, i)
    synthesize_speech(clean_text, f"scene_{i}.mp3")
    scene_indices.append(i)

# Delete image generation pipeline after use to free up memory before generating music
del pipe # Stable Diffusion pipeline is no longer needed
if torch.cuda.is_available():
    torch.cuda.empty_cache()
print("Deleted StableDiffusionPipeline after image generation and cleared CUDA cache.")


# Moved Musicgen loading and generation logic here
try:
    music_model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small", attn_implementation="eager", torch_dtype=torch.float16)
    music_model.to("cuda")
    print("Musicgen model loaded and moved to CUDA with float16.")
except Exception as e:
     print(f"Error loading or moving Musicgen model to CUDA: {e}")
     # Fallback to float32 if float16 fails, although this might still lead to OOM
     try:
         print("Attempting to load Musicgen with float32 as float16 failed.")
         music_model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small", attn_implementation="eager", torch_dtype=torch.float32)
         music_model.to("cuda")
         print("Musicgen model loaded and moved to CUDA with float32.")
     except Exception as e_float32:
          print(f"Error loading or moving Musicgen model to CUDA even with float32: {e_float32}")
          raise # Re-raise the exception if both attempts fail

processor = MusicgenProcessor.from_pretrained("facebook/musicgen-small")

def generate_background_music(prompt="motivational ambient cinematic", duration=30, output_file="bg_music.wav"):
    # Ensure processor is accessible. It's initialized outside the function.
    inputs = processor(text=[prompt], return_tensors="pt")

    # Move inputs to CUDA. Ensure input_ids remains as LongTensor.
    cuda_inputs = {}
    for k, v in inputs.items():
        # input_ids should be LongTensor, other tensors might need to match model dtype
        if k == 'input_ids':
            cuda_inputs[k] = v.to("cuda", dtype=torch.long) # Keep input_ids as Long
        else:
            # Cast other tensors to the model's dtype (float16 or float32)
            cuda_inputs[k] = v.to("cuda", dtype=music_model.dtype)


    audio_values = music_model.generate(**cuda_inputs, max_new_tokens=duration * 50)
    sampling_rate = music_model.config.audio_encoder.sampling_rate
    torchaudio.save(output_file, audio_values[0].cpu(), sampling_rate)

# Reduce the duration to prevent overly long sequences
# Calculate a duration based on the number of scenes and a shorter multiplier, e.g., 5 seconds per scene
generate_background_music(duration=len(scene_indices)*5 + 5) # Changed multiplier from 10 to 5
build_video(scene_indices)

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]




Could not move tokenizer/text encoder to CPU: CLIPTokenizer has no attribute to
Error moving StableDiffusionPipeline to CUDA after attempting CPU move: CUDA error: device-side assert triggered
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.

Testing StableDiffusionPipeline inference on CUDA...


  0%|          | 0/1 [00:00<?, ?it/s]

StableDiffusionPipeline test successful on CUDA.
Pipe exists before image generation loop: True


  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

KeyboardInterrupt: 