# Download The Song From Internet

In [1]:
%pip install yt_dlp

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import yt_dlp

ydl_opts = {
    'format': 'bestaudio/best',
    'postprocessors': [{
        'key': 'FFmpegExtractAudio',
        'preferredcodec': 'mp3',
        'preferredquality': '192',
    }],
    'outtmpl': 'music.%(ext)s'
}

url = 'https://music.youtube.com/watch?v=r7zTKRonHXM'

base_path = os.path.join(os.getcwd(), "data", "results")

try:
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(url, download=True)
        
        # Get the artist and title of the song and delete special characters
        artist = info['artist'].replace(" ", "_").replace("/", "_")
        title = info['title'].replace(" ", "_").replace("/", "_")

        # Create folder for the song
        folder = f"{artist}/{title}"
        base_path = os.path.join(base_path, folder)
        os.makedirs(base_path, exist_ok=True)

        # Move the song to the folder
        os.rename("music.mp3", f"{base_path}/music.mp3")

        base_path = os.path.join(base_path, folder)
except Exception as e:
    print(f"An error occurred: {e}")

[youtube] Extracting URL: https://music.youtube.com/watch?v=r7zTKRonHXM
[youtube] r7zTKRonHXM: Downloading webpage
[youtube] r7zTKRonHXM: Downloading ios player API JSON
[youtube] r7zTKRonHXM: Downloading mweb player API JSON
[youtube] r7zTKRonHXM: Downloading ios music player API JSON
[youtube] r7zTKRonHXM: Downloading m3u8 information
[youtube] r7zTKRonHXM: Downloading m3u8 information
[info] r7zTKRonHXM: Downloading 1 format(s): 251
[download] Destination: music.webm
[download] 100% of    3.90MiB in 00:00:00 at 6.12MiB/s   
[ExtractAudio] Destination: music.mp3
Deleting original file music.webm (pass -k to keep)


# Split The Vocal And Instrument

In [3]:
%pip install audio-separator onnxruntime

Note: you may need to restart the kernel to use updated packages.


In [5]:
from audio_separator.separator import Separator

# Initialize the Separator class (with optional configuration properties, below)
separator = Separator()

# Load a machine learning model (if unspecified, defaults to 'model_mel_band_roformer_ep_3005_sdr_11.4360.ckpt')
separator.load_model()

# Define the path to the audio file
music_path = os.path.join(base_path, 'music.mp3')

# Perform the separation on specific audio files without reloading the model
output_files = separator.separate(music_path)

# Rename the output files
os.rename(output_files[0], 'instrumental.wav')
os.rename(output_files[1], 'vocal.wav')

# Move the output files to the folder
os.rename('instrumental.wav', f"{base_path}/instrumental.wav")
os.rename('vocal.wav', f"{base_path}/vocal.wav")

2024-10-11 13:41:57,971 - INFO - separator - Separator version 0.21.2 instantiating with output_dir: None, output_format: WAV
2024-10-11 13:41:57,971 - INFO - separator - Output directory not specified. Using current working directory.
2024-10-11 13:41:57,972 - INFO - separator - Operating System: Linux #1 SMP Fri Mar 29 23:14:13 UTC 2024
2024-10-11 13:41:57,973 - INFO - separator - System: Linux Node: f31cc37a9b15 Release: 5.15.153.1-microsoft-standard-WSL2 Machine: x86_64 Proc: x86_64
2024-10-11 13:41:57,973 - INFO - separator - Python Version: 3.11.6
2024-10-11 13:41:57,974 - INFO - separator - PyTorch Version: 2.4.1+cu121
2024-10-11 13:41:58,009 - INFO - separator - FFmpeg installed: ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
2024-10-11 13:41:58,011 - INFO - separator - ONNX Runtime CPU package installed with version: 1.19.2
2024-10-11 13:41:58,012 - INFO - separator - No hardware acceleration could be configured, running in CPU mode
2024-10

# Generate Subtitle From Vocal Audio

In [4]:
%pip install whisper openai-whisper

Note: you may need to restart the kernel to use updated packages.


In [None]:
import whisper
import json

# Load the whisper model
model = whisper.load_model("large")

# Transcribe the audio file
transcript = model.transcribe('vocal.wav',word_timestamps=True)

# Save the transcript to a text file
wordlevel_info = []

for each in transcript['segments']:
  words = each['words']
  for word in words:
    wordlevel_info.append({'word':word['word'].strip(),'start':word['start'],'end':word['end']})

# Save the transcript to a json file
transcript_path = os.path.join(base_path, 'transcript.json')
with open(transcript_path, 'w') as f:
    json.dump(wordlevel_info, f,indent=4)

  checkpoint = torch.load(fp, map_location=device)


In [1]:
def split_text_into_lines(data):
    MaxChars = 120
    #maxduration in seconds
    MaxDuration = 2.0
    #Split if nothing is spoken (gap) for these many seconds
    MaxGap = 1.0

    subtitles = []
    line = []
    line_duration = 0
    line_chars = 0


    for idx,word_data in enumerate(data):
        word = word_data["word"]
        start = word_data["start"]
        end = word_data["end"]

        line.append(word_data)
        line_duration += end - start

        temp = " ".join(item["word"] for item in line)


        # Check if adding a new word exceeds the maximum character count or duration
        new_line_chars = len(temp)

        duration_exceeded = line_duration > MaxDuration
        chars_exceeded = new_line_chars > MaxChars
        if idx>0:
          gap = word_data['start'] - data[idx-1]['end']
          # print (word,start,end,gap)
          maxgap_exceeded = gap > MaxGap
        else:
          maxgap_exceeded = False


        if duration_exceeded or chars_exceeded or maxgap_exceeded:
            if line:
                subtitle_line = {
                    "word": " ".join(item["word"] for item in line),
                    "start": line[0]["start"],
                    "end": line[-1]["end"],
                    "textcontents": line
                }
                subtitles.append(subtitle_line)
                line = []
                line_duration = 0
                line_chars = 0


    if line:
        subtitle_line = {
            "word": " ".join(item["word"] for item in line),
            "start": line[0]["start"],
            "end": line[-1]["end"],
            "textcontents": line
        }
        subtitles.append(subtitle_line)

    return subtitles

In [5]:
import json

with open(transcript_path, 'r') as f:
    wordlevel_info_modified = json.load(f)

linelevel_subtitles = split_text_into_lines(wordlevel_info_modified)

In [7]:
%pip install moviepy imageio

%cat /etc/ImageMagick-6/policy.xml | sed 's/none/read,write/g'> /etc/ImageMagick-6/policy.xml

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


'apt' is not recognized as an internal or external command,UsageError: Line magic function `%cat` not found.

operable program or batch file.


In [21]:
from moviepy.editor import TextClip, CompositeVideoClip, ColorClip
import numpy as np

def create_caption(textJSON, framesize, font="Helvetica-Bold", fontsize=80, color='white', bgcolor='blue'):
    wordcount = len(textJSON['textcontents'])
    full_duration = textJSON['end'] - textJSON['start']

    word_clips = []
    xy_textclips_positions = []

    frame_width, frame_height = framesize
    x_buffer = frame_width * 1/10
    y_buffer = frame_height * 1/5

    # Calculate total width and height of the text
    total_width = 0
    total_height = 0
    current_line_width = 0
    max_line_width = frame_width - 2 * x_buffer
    line_height = 0

    for wordJSON in textJSON['textcontents']:
        word_clip = TextClip(wordJSON['word'], font=font, fontsize=fontsize, color=color)
        word_width, word_height = word_clip.size
        space_width = TextClip(" ", font=font, fontsize=fontsize, color=color).w

        if current_line_width + word_width + space_width > max_line_width:
            total_height += line_height + 40
            total_width = max(total_width, current_line_width)
            current_line_width = word_width + space_width
            line_height = word_height
        else:
            current_line_width += word_width + space_width
            line_height = max(line_height, word_height)

    total_height += line_height
    total_width = max(total_width, current_line_width)

    # Calculate starting position to center the text
    start_x = (frame_width - total_width) / 2
    start_y = frame_height - total_height - y_buffer

    x_pos = start_x
    y_pos = start_y

    for wordJSON in textJSON['textcontents']:
        duration = wordJSON['end'] - wordJSON['start']
        word_clip = TextClip(wordJSON['word'], font=font, fontsize=fontsize, color=color, stroke_color='black', stroke_width=2)
        word_clip_space = TextClip(" ", font=font, fontsize=fontsize, color=color)
        word_width, word_height = word_clip.size
        space_width = word_clip_space.w

        if x_pos + word_width + space_width > frame_width - x_buffer:
            x_pos = start_x
            y_pos += word_height + 40

        xy_textclips_positions.append({
            "x_pos": x_pos,
            "y_pos": y_pos,
            "width": word_width,
            "height": word_height,
            "word": wordJSON['word'],
            "start": wordJSON['start'],
            "end": wordJSON['end'],
            "duration": duration
        })

        word_clip = word_clip.set_position((x_pos, y_pos)).set_start(textJSON['start']).set_duration(full_duration)
        word_clip_space = word_clip_space.set_position((x_pos + word_width, y_pos)).set_start(textJSON['start']).set_duration(full_duration)

        word_clips.append(word_clip)
        word_clips.append(word_clip_space)

        x_pos += word_width + space_width

    for highlight_word in xy_textclips_positions:
        word_clip_highlight = TextClip(highlight_word['word'], font=font, fontsize=fontsize, color=color, bg_color=bgcolor, stroke_color='black', stroke_width=2)
        word_clip_highlight = word_clip_highlight.set_position((highlight_word['x_pos'], highlight_word['y_pos'])).set_start(highlight_word['start']).set_duration(highlight_word['duration'])
        word_clips.append(word_clip_highlight)

    return word_clips

# Render The Video

In [14]:
import urllib.request


background_url = 'https://wallpapers.com/images/hd/music-color-full-hd-d3s2dz1k58xbndmh.jpg'
background_filename = 'background.jpg'

urllib.request.urlretrieve(background_url, background_filename)

('background.jpg', <http.client.HTTPMessage at 0x7e3c887bb820>)

In [25]:
from moviepy.editor import CompositeVideoClip, ImageClip, AudioFileClip
from moviepy.video.fx.resize import resize

frame_size = (1920,1080)

all_linelevel_splits=[]

for line in linelevel_subtitles:
  out = create_caption(line,frame_size)
  all_linelevel_splits.extend(out)

# Load audio
audio = AudioFileClip("music.mp3")

# Get the duration of the audio
audio_duration = audio.duration

image_clip = ImageClip("background.jpg").set_duration(audio_duration)

# Resize the image_clip object to fit within a 1980x1080 frame while maintaining its aspect ratio
clip_video = resize(image_clip, width=1980, height=1080)

final_video = CompositeVideoClip([clip_video] + all_linelevel_splits)

# Set the audio of the final video to be the same as the input video
final_video = final_video.set_audio(audio)

# Save the final clip as a video file with the audio included
final_video.write_videofile("output.mp4", fps=24, codec="libx264", audio_codec="aac")

Moviepy - Building video output.mp4.
MoviePy - Writing audio in outputTEMP_MPY_wvf_snd.mp4




MoviePy - Done.
Moviepy - Writing video output.mp4





Moviepy - Done !
Moviepy - video ready output.mp4
