# **PIP Installs**

In [24]:
!pip uninstall -y huggingface_hub peft diffusers transformers
!pip install huggingface_hub peft diffusers transformers
!pip install --upgrade pip
!pip install git+https://github.com/openai/whisper.git

Found existing installation: huggingface-hub 0.28.1
Uninstalling huggingface-hub-0.28.1:
  Successfully uninstalled huggingface-hub-0.28.1
Found existing installation: peft 0.14.0
Uninstalling peft-0.14.0:
  Successfully uninstalled peft-0.14.0
Found existing installation: diffusers 0.32.2
Uninstalling diffusers-0.32.2:
  Successfully uninstalled diffusers-0.32.2
Found existing installation: transformers 4.48.3
Uninstalling transformers-4.48.3:
  Successfully uninstalled transformers-4.48.3
Collecting huggingface_hub
  Using cached huggingface_hub-0.28.1-py3-none-any.whl.metadata (13 kB)
Collecting peft
  Using cached peft-0.14.0-py3-none-any.whl.metadata (13 kB)
Collecting diffusers
  Using cached diffusers-0.32.2-py3-none-any.whl.metadata (18 kB)
Collecting transformers
  Using cached transformers-4.48.3-py3-none-any.whl.metadata (44 kB)
Using cached huggingface_hub-0.28.1-py3-none-any.whl (464 kB)
Using cached peft-0.14.0-py3-none-any.whl (374 kB)
Using cached diffusers-0.32.2-py3-n

Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-kgjumw6l
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-kgjumw6l
  Resolved https://github.com/openai/whisper.git to commit 517a43ecd132a2089d85f4ebc044728a71d49f6e
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


# **Import Statements**

In [None]:
# Core Libraries
import os
import gc
import numpy as np
import matplotlib.pyplot as plt
import librosa.display
import IPython.display as ipd

# PyTorch and Diffusers
import torch
from diffusers import AnimateDiffPipeline, MotionAdapter, EulerDiscreteScheduler
from safetensors.torch import load_file
from huggingface_hub import hf_hub_download

# Audio Processing
import librosa
import soundfile as sf
import whisper

# Video Processing
from moviepy.editor import VideoFileClip, AudioFileClip, concatenate_videoclips
import imageio

# GPT Integration
from openai import OpenAI

# **Plotting waveform bar plots and chunking**

**This function manages GPU memory to prevent out-of-memory errors**

In [3]:
def reset_cuda_memory():
    if torch.cuda.is_available():
        torch.cuda.empty_cache() #frees unused cached memory
        gc.collect() #triggers garbage collection
        torch.cuda.reset_peak_memory_stats() #resest peak memory tracking stats

**This analyzes the audio and creates Waveform, Spectral Centroid, and Energy(RMS) plots**

In [2]:
def analyze_audio_with_plots(audio_segment, sr, segment_number):
    """Analyze audio and create visualization plots"""
    # Get features
    features = analyze_audio_emotion(audio_segment, sr)
    emotions = interpret_features(features)

    # Create subplot figure
    plt.figure(figsize=(15, 10))

    # Waveform - Shows amplitude over time
    plt.subplot(3, 1, 1)
    librosa.display.waveshow(audio_segment, sr=sr)
    plt.title(f'Waveform (Segment {segment_number})')
    plt.xlabel('Time (s)')
    plt.ylabel('Amplitude')

    # Spectral Centroid - Displays frequency characteristics, indicating brightness/timbre
    plt.subplot(3, 1, 2)
    cent = librosa.feature.spectral_centroid(y=audio_segment, sr=sr)[0]
    times = librosa.times_like(cent)
    plt.plot(times, cent)
    plt.title(f'Spectral Centroid (Brightness: {features["spectral_centroid_mean"]:.2f})')
    plt.xlabel('Time (s)')
    plt.ylabel('Frequency (Hz)')

    # 3. Energy (RMS) - Shows volume/intensity variations, indicating mood
    plt.subplot(3, 1, 3)
    rms = librosa.feature.rms(y=audio_segment)[0]
    times = librosa.times_like(rms)
    plt.plot(times, rms)
    plt.title(f'Energy (RMS) - Level: {features["energy_mean"]:.4f}, Mood: {emotions["arousal"]}')
    plt.xlabel('Time (s)')
    plt.ylabel('Energy')

    plt.tight_layout()
    plt.savefig(f'segment_{segment_number}_analysis.png')
    plt.close()

**Analyzing emotions behind the audio**

In [4]:
#Using RMS, Pitch, Tempo, Spectral Centroid, and Zero Crossing Rate (for voice texture) this function extracts key audio features.

def analyze_audio_emotion(audio_segment, sr):
    features = {}

    # Energy and Intensity Features
    rms = librosa.feature.rms(y=audio_segment)[0]
    features['energy_mean'] = float(np.mean(rms))

    # Pitch Features
    pitches, magnitudes = librosa.piptrack(y=audio_segment, sr=sr)
    pitch_means = []
    for i in range(pitches.shape[1]):
        pitches_t = pitches[:, i]
        mags_t = magnitudes[:, i]
        if len(mags_t) > 0:
            pitch_means.append(pitches_t[mags_t == mags_t.max()][0])

    if pitch_means:
        features['pitch_mean'] = float(np.mean(pitch_means))

    # Rhythm Features
    tempo, _ = librosa.beat.beat_track(y=audio_segment, sr=sr)
    features['tempo'] = float(np.mean(tempo))

    # Spectral Features
    cent = librosa.feature.spectral_centroid(y=audio_segment, sr=sr)[0]
    features['spectral_centroid_mean'] = float(np.mean(cent))

    # Zero Crossing Rate
    zcr = librosa.feature.zero_crossing_rate(audio_segment)[0]
    features['zero_crossing_rate'] = float(np.mean(zcr))

    return features


#Using the audio features extracted in the analyze_audio_emotion() function this function converts those numberical values into equivalent emotional characteristics

def interpret_features(features):
    """Interpret the extracted features to determine emotional characteristics"""
    emotions = {}

    # Analyze energy (arousal level)
    energy_level = features['energy_mean']
    if energy_level > 0.1:
        emotions['arousal'] = 'high'
    elif energy_level > 0.05:
        emotions['arousal'] = 'medium'
    else:
        emotions['arousal'] = 'low'

    # Analyze pitch (emotional intensity)
    if 'pitch_mean' in features:
        pitch_mean = features['pitch_mean']
        if pitch_mean > 500:
            emotions['pitch_emotion'] = 'excited/stressed'
        elif pitch_mean > 300:
            emotions['pitch_emotion'] = 'neutral/positive'
        else:
            emotions['pitch_emotion'] = 'calm/serious'
    else:
        emotions['pitch_emotion'] = 'neutral'

    # Analyze tempo
    tempo = features['tempo']
    if tempo > 120:
        emotions['tempo_indication'] = 'energetic/happy'
    elif tempo > 90:
        emotions['tempo_indication'] = 'moderate/neutral'
    else:
        emotions['tempo_indication'] = 'slow/calm/sad'

    # Analyze spectral characteristics (timbre)
    brightness = features['spectral_centroid_mean']
    if brightness > 2000:
        emotions['timbre'] = 'bright/sharp'
    else:
        emotions['timbre'] = 'dark/warm'

    return emotions

In [5]:
class VideoGenerator:
    def __init__(self, device="cuda", dtype=torch.float16):
        self.device = device
        self.dtype = dtype
        self.setup_pipeline()

    #Sets up the AnimateDiff and pipeline woith motion adapter
    def setup_pipeline(self):
        reset_cuda_memory()

        if torch.cuda.is_available():
            torch.cuda.set_per_process_memory_fraction(0.6)

        repo = "ByteDance/AnimateDiff-Lightning"
        ckpt = "animatediff_lightning_2step_diffusers.safetensors"
        base = "emilianJR/epiCRealism"

        adapter = MotionAdapter().to(self.device, self.dtype)
        adapter.load_state_dict(load_file(hf_hub_download(repo, ckpt), device=self.device))

        reset_cuda_memory()

        self.pipe = AnimateDiffPipeline.from_pretrained(
            base,
            motion_adapter=adapter,
            torch_dtype=self.dtype,
            low_cpu_mem_usage=True
        ).to(self.device)

        self.pipe.enable_attention_slicing(slice_size=1)
        self.pipe.enable_vae_slicing()
        self.pipe.enable_model_cpu_offload()

        self.pipe.scheduler = EulerDiscreteScheduler.from_config(
            self.pipe.scheduler.config,
            timestep_spacing="trailing",
            beta_schedule="linear"
        )

        reset_cuda_memory()

    # Takes a text prompt and generates a 24fps short video
    def generate_video_segment(self, prompt: str, output_path: str):
        try:
            reset_cuda_memory()

            output = self.pipe(
                prompt=prompt,
                guidance_scale=7.5,
                num_inference_steps=20,
                num_frames=24,
                height=512,
                width=512
            )

            frames = [np.array(frame) for frame in output.frames[0]]
            del output
            reset_cuda_memory()

            imageio.mimsave(output_path, frames, fps=8)

            del frames
            reset_cuda_memory()

            return True

        except RuntimeError as e:
            print(f"Error generating video: {str(e)}")
            reset_cuda_memory()
            return False

In [6]:
def create_music_video(audio_file: str, output_path: str, start_time: float = 0.0, duration: float = 18.0):
    """Create music video with emotional analysis-driven prompts"""
    print(f"Creating music video from {start_time}s to {start_time + duration}s...")

    reset_cuda_memory()

    # Load audio section
    y, sr = librosa.load(audio_file, offset=start_time, duration=duration)
    temp_audio_section = "temp_audio_section.wav"
    sf.write(temp_audio_section, y, sr)

    segment_duration = 3.0
    num_segments = int(np.ceil(duration / segment_duration))

    generator = VideoGenerator()
    model = whisper.load_model("base")

    video_clips = []
    for i in range(num_segments):
        print(f"\nProcessing segment {i+1}/{num_segments}")
        print(f"Time range: {start_time + i*segment_duration:.2f}s - {start_time + min((i+1)*segment_duration, duration):.2f}s")

        # Extract and analyze audio segment
        start_sample = int(i * segment_duration * sr)
        end_sample = int(min((i + 1) * segment_duration * sr, len(y)))
        segment = y[start_sample:end_sample]

        # Create analysis plots
        analyze_audio_with_plots(segment, sr, i+1)

        # Analyze audio emotion
        features = analyze_audio_emotion(segment, sr)
        emotions = interpret_features(features)

        # Save temporary audio segment for transcription
        temp_audio = f"temp_segment_{i}.wav"
        sf.write(temp_audio, segment, sr)

        # Transcribe lyrics
        try:
            result = model.transcribe(temp_audio)
            lyrics = result['text']
            print(f"Transcribed lyrics: {lyrics}")
        except Exception as e:
            print(f"Transcription error: {str(e)}")
            lyrics = ""

        # Create GPT-4 prompt
        context = {
            'mood': emotions['arousal'],
            'emotional_tone': emotions['pitch_emotion'],
            'pacing': emotions['tempo_indication'],
            'atmosphere': emotions['timbre'],
            'energy_level': f"{features['energy_mean']:.4f}",
            'pitch': f"{features.get('pitch_mean', 'N/A')} Hz",
            'tempo': f"{features['tempo']:.2f} BPM",
            'brightness': f"{features['spectral_centroid_mean']:.2f}",
            'zero_crossing_rate': f"{features['zero_crossing_rate']:.4f}"
        }

        different_prompts = [
                      """Generate an abstract prompt for video generation which is black and white, where where the scene description is based on these lyrics:
                      "{lyrics}"

                      Detailed Audio Analysis:
                      1. Energy & Intensity:
                      - Energy Level: {context['energy_level']}
                      - Overall Mood: {context['mood']} energy
                      - Zero Crossing Rate: {context['zero_crossing_rate']} (voice texture)

                      2. Tonal Qualities:
                      - Emotional Tone: {context['emotional_tone']}
                      - Average Pitch: {context['pitch']}
                      - Brightness Level: {context['brightness']}
                      - Timbre: {context['atmosphere']}

                      3. Rhythmic Elements:
                      - Tempo: {context['tempo']}
                      - Pacing: {context['pacing']}

                      Create a cinematic scene that precisely matches:
                      - The emotional intensity indicated by the energy levels
                      - The mood suggested by the pitch and timbre
                      - The pacing implied by the tempo
                      - The visual atmosphere that complements these audio characteristics
                      - The narrative conveyed by the lyrics

                      Provide specific details about:
                      1. Visual tone (lighting, color palette)
                      2. Camera movements
                      3. Scene composition
                      4. Key visual elements
                      5. Transitions and effects

                      Overall, still keep the prompt abstract and concise without losing any crucial information."""

                      ,


                      """Generate a concise prompt for video generation where the scene description is based on these lyrics:
                      "{lyrics}"

                      Detailed Audio Analysis:
                      1. Energy & Intensity:
                      - Energy Level: {context['energy_level']}
                      - Overall Mood: {context['mood']} energy
                      - Zero Crossing Rate: {context['zero_crossing_rate']} (voice texture)

                      2. Tonal Qualities:
                      - Emotional Tone: {context['emotional_tone']}
                      - Average Pitch: {context['pitch']}
                      - Brightness Level: {context['brightness']}
                      - Timbre: {context['atmosphere']}

                      3. Rhythmic Elements:
                      - Tempo: {context['tempo']}
                      - Pacing: {context['pacing']}

                      Create a cinematic scene that precisely matches:
                      - The emotional intensity indicated by the energy levels
                      - The mood suggested by the pitch and timbre
                      - The pacing implied by the tempo
                      - The visual atmosphere that complements these audio characteristics
                      - The narrative conveyed by the lyrics

                      Provide specific details about:
                      1. Visual tone (lighting, color palette)
                      2. Camera movements
                      3. Scene composition
                      4. Key visual elements
                      5. Transitions and effects

                      Overall, still keep the prompt concise without losing any crucial information."""]


        gpt_prompt = f"""{different_prompts[0]}"""

        # Get GPT response
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": gpt_prompt}]
        )
        prompt = response.choices[0].message.content

        print(f"Generated prompt: {prompt}")
        print(f"Emotional analysis: {emotions}")

        # Generate video
        temp_video = f"temp_video_{i}.mp4"
        success = generator.generate_video_segment(prompt, temp_video)

        if success:
            clip = VideoFileClip(temp_video)
            segment_length = (end_sample - start_sample) / sr
            clip = clip.set_duration(segment_length)
            video_clips.append(clip)

        reset_cuda_memory()

    if video_clips:
        print("\nCombining video clips and adding audio...")
        final_video = concatenate_videoclips(video_clips, method="compose")
        audio = AudioFileClip(temp_audio_section)
        final_video = final_video.set_audio(audio)

        final_video.write_videofile(output_path, fps=30, audio_codec='aac')

        # Cleanup
        for clip in video_clips:
            clip.close()
        audio.close()
        final_video.close()

    # Cleanup temporary files
    for i in range(num_segments):
        for prefix in ['temp_segment_', 'temp_video_']:
            temp_file = f"{prefix}{i}.wav" if 'segment' in prefix else f"{prefix}{i}.mp4"
            if os.path.exists(temp_file):
                os.remove(temp_file)

    if os.path.exists(temp_audio_section):
        os.remove(temp_audio_section)

    reset_cuda_memory()

if __name__ == "__main__":
    reset_cuda_memory()

    # Initialize OpenAI client
    OPENAI_API_KEY = ""  # Replace with your key
    client = OpenAI(api_key=OPENAI_API_KEY)
    create_music_video(
        audio_file="audio3.mp3",
        output_path="short_music_video_1.mp4",
        start_time=42.0,
        duration=6.0
    )

Creating music video from 42.0s to 48.0s...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.



Loading pipeline components...:   0%|          | 0/6 [00:00<?, ?it/s]


  checkpoint = torch.load(fp, map_location=device)




Processing segment 1/2
Time range: 42.00s - 45.00s
Transcribed lyrics: 
Generated prompt: **Abstract Prompt for Black and White Video Generation:**

---

**Scene Description:**

In a monochrome world where shadows dance and light whispers, the ambiance unfolds—a tapestry woven from the lyrics’ narrative, dancing between softness and raw intensity.

**Detailed Audio Analysis Alignment:**

1. **Energy & Intensity:**
   - **Energy Level & Mood:** Capture a symphony of contrasts—silent storms within gentle breezes, yearning framed in tranquility.
   - **Voice Texture:** Visualize the voice as delicate ripples across a still lake, the imagery fading in and out with each breath.

2. **Tonal Qualities:**
   - **Emotional Tone:** Portray a grayscale emotional landscape, bathed in misty nostalgia, where each note is a step into the past.
   - **Average Pitch & Brightness:** Illustrate shadows cast by faint glimmers of light, where whispers echo between chiaroscuros.
   - **Timbre & Atmosphere:

Token indices sequence length is longer than the specified maximum sequence length for this model (461 > 77). Running this sequence through the model will result in indexing errors
The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ["capture a symphony of contrasts — silent storms within gentle breezes, yearning framed in tranquility. - ** voice texture :** visualize the voice as delicate ripples across a still lake, the imagery fading in and out with each breath. 2. ** tonal qualities :** - ** emotional tone :** portray a grayscale emotional landscape, bathed in misty nostalgia, where each note is a step into the past. - ** average pitch & brightness :** illustrate shadows cast by faint glimmers of light, where whispers echo between chiaroscuros. - ** timbre & atmosphere :** encapsulate the scene in a veil of ethereal mist, where each element is intangible but vividly felt. 3. ** rhythmic elements :** - ** tempo & pacing :** employ g

  0%|          | 0/20 [00:00<?, ?it/s]


Processing segment 2/2
Time range: 45.00s - 48.00s
Transcribed lyrics:  Hey you
Generated prompt: **Abstract Video Generation Prompt:**

---

**Visual Tone:**
Create a monochromatic landscape, oscillating between deep shadows and pale highlights to mirror the nuanced intensity of the music. The scene should exude a stark, high-contrast atmosphere, reminiscent of early film noir, reflecting the complex emotional tone.

**Camera Movements:**
Employ smooth, gradual pans and tracking shots to echo the seamless flow suggested by the tempo and pacing. Utilize slow zooms to capture and emphasize the subtleties in expression, aligning with the intricate audio texture.

**Scene Composition:**
Design a series of evocative, minimalist tableaux, where light and shadow carve out surreal geometry. Position solitary figures within vast, empty spaces, allowing their silhouettes to convey solitude and introspection, echoing the song's lyrical narrative.

**Key Visual Elements:**
Incorporate abstract m

The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ["to echo the seamless flow suggested by the tempo and pacing. utilize slow zooms to capture and emphasize the subtleties in expression, aligning with the intricate audio texture. ** scene composition :** design a series of evocative, minimalist tableaux, where light and shadow carve out surreal geometry. position solitary figures within vast, empty spaces, allowing their silhouettes to convey solitude and introspection, echoing the song's lyrical narrative. ** key visual elements :** incorporate abstract motifs like rippling water or drifting smoke, whose fluidity matches the described energy levels. let these elements transform and merge, illustrating the emotional transitions within the song. ** transitions and effects :** use cross - dissolves to signify shifts in emotional tone, and apply a subtle grain effect to underscore the timbre ’ s vintage and atmospheric qualities. let th

  0%|          | 0/20 [00:00<?, ?it/s]


Combining video clips and adding audio...
Moviepy - Building video short_music_video_1.mp4.
MoviePy - Writing audio in short_music_video_1TEMP_MPY_wvf_snd.mp4




MoviePy - Done.
Moviepy - Writing video short_music_video_1.mp4





Moviepy - Done !
Moviepy - video ready short_music_video_1.mp4


In [21]:
from moviepy.editor import VideoFileClip, concatenate_videoclips

clips = [
   VideoFileClip("short_music_video1.mp4"),
   VideoFileClip("short_music_video2.mp4"),
]

final_clip = concatenate_videoclips(clips)
final_clip.write_videofile("combined_music_video_new.mp4", audio_codec='aac')

# Cleanup
for clip in clips:
   clip.close()
final_clip.close()

Moviepy - Building video combined_music_video_new.mp4.
MoviePy - Writing audio in combined_music_video_newTEMP_MPY_wvf_snd.mp4




MoviePy - Done.
Moviepy - Writing video combined_music_video_new.mp4





Moviepy - Done !
Moviepy - video ready combined_music_video_new.mp4
