In [17]:
%%shell
# Install And Update linux library
sudo apt -y update
sudo apt install -y wget curl unzip imagemagick
# Download chrome and chromedriver
wget http://archive.ubuntu.com/ubuntu/pool/main/libu/libu2f-host/libu2f-udev_1.1.4-1_all.deb
dpkg -i libu2f-udev_1.1.4-1_all.deb
wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
dpkg -i google-chrome-stable_current_amd64.deb
CHROME_DRIVER_VERSION=`curl -sS chromedriver.storage.googleapis.com/LATEST_RELEASE`
wget -N https://chromedriver.storage.googleapis.com/$CHROME_DRIVER_VERSION/chromedriver_linux64.zip -P /tmp/
unzip -o /tmp/chromedriver_linux64.zip -d /tmp/
chmod +x /tmp/chromedriver
mv /tmp/chromedriver /usr/local/bin/chromedriver
# ImageMagick policy
sudo cat /etc/ImageMagick-6/policy.xml | sed 's/none/read,write/g'> /etc/ImageMagick-6/policy.xml

# Install python library
pip install yt_dlp
pip install audio-separator onnxruntime
pip install whisper openai-whisper
pip install moviepy==2.0.0.dev2
pip install imageio==2.25.1
pip install diffusers transformers accelerate
pip install selenium webdriver_manager

Note: you may need to restart the kernel to use updated packages.
Collecting numpy<2,>=1.23 (from audio-separator)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
  Downloading numpy-1.24.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)
Downloading numpy-1.24.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.3/17.3 MB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.1.0
    Uninstalling numpy-2.1.0:
      Successfully uninstalled numpy-2.1.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
crawl4ai 0.3.5 requires numpy<2.1.1,>=1.26.0, but you have numpy 1.24.4 which

In [18]:
import yt_dlp
import shutil
import os

def download_music(url, base_path):
    # Define the destination file
    music_path = os.path.join(base_path, "music.mp3")

    if os.path.exists(music_path):
        return music_path

    ydl_opts = {
        'format': 'bestaudio/best',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],
        'outtmpl': 'music.%(ext)s'
    }

    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.download([url])

            # Move the song to the folder
            shutil.move("music.mp3", music_path)
    except Exception as e:
        print(f"An error occurred: {e}")
        raise e
    finally:
        return music_path

In [19]:
from audio_separator.separator import Separator

def separate_music(music_path, base_path):
  # Define result path
  instrumental_path = os.path.join(base_path, 'instrumental.wav')
  vocal_path = os.path.join(base_path, 'vocal.wav')

  if os.path.exists(instrumental_path) and os.path.exists(vocal_path):
    return instrumental_path, vocal_path

  # Initialize the Separator class (with optional configuration properties, below)
  separator = Separator()

  # Load a machine learning model (if unspecified, defaults to 'model_mel_band_roformer_ep_3005_sdr_11.4360.ckpt')
  separator.load_model()

  # Perform the separation on specific audio files without reloading the model
  output_files = separator.separate(music_path)

  # Rename the output files
  os.rename(output_files[0], 'instrumental.wav')
  os.rename(output_files[1], 'vocal.wav')

  # Move the output files to the folderL
  shutil.move('instrumental.wav', instrumental_path)
  shutil.move('vocal.wav', vocal_path)

  return instrumental_path, vocal_path

In [20]:
import whisper
import json

def transcribe_music(vocal_path, base_path):
  # Define the path to the transcript file
  transcript_path = os.path.join(base_path, 'transcript.json')

  if os.path.exists(transcript_path):
    return transcript_path

  # Load the whisper model
  model = whisper.load_model("large")

  # Define the path to the vocal audio file
  vocal_path = os.path.join(base_path, 'vocal.wav')

  # Transcribe the audio file
  transcript = model.transcribe(vocal_path, word_timestamps=True)

  # Save the transcript to a text file
  wordlevel_info = []

  for each in transcript['segments']:
    words = each['words']
    for word in words:
      wordlevel_info.append({'word':word['word'].strip(),'start':word['start'],'end':word['end']})

  # Save the transcript to a json file
  with open(transcript_path, 'w') as f:
      json.dump(wordlevel_info, f,indent=4)

  return transcript_path

In [21]:
def split_text_into_lines(data):
    MaxChars = 120
    #maxduration in seconds
    MaxDuration = 2.0
    #Split if nothing is spoken (gap) for these many seconds
    MaxGap = 1.0

    subtitles = []
    line = []
    line_duration = 0
    line_chars = 0

    for idx,word_data in enumerate(data):
        word = word_data["word"]
        start = word_data["start"]
        end = word_data["end"]

        line.append(word_data)
        line_duration += end - start

        temp = " ".join(item["word"] for item in line)

        # Check if adding a new word exceeds the maximum character count or duration
        new_line_chars = len(temp)

        duration_exceeded = line_duration > MaxDuration
        chars_exceeded = new_line_chars > MaxChars
        if idx>0:
          gap = word_data['start'] - data[idx-1]['end']
          # print (word,start,end,gap)
          maxgap_exceeded = gap > MaxGap
        else:
          maxgap_exceeded = False

        if duration_exceeded or chars_exceeded or maxgap_exceeded:
            if line:
                subtitle_line = {
                    "word": " ".join(item["word"] for item in line),
                    "start": line[0]["start"],
                    "end": line[-1]["end"],
                    "textcontents": line
                }
                subtitles.append(subtitle_line)
                line = []
                line_duration = 0
                line_chars = 0

    if line:
        subtitle_line = {
            "word": " ".join(item["word"] for item in line),
            "start": line[0]["start"],
            "end": line[-1]["end"],
            "textcontents": line
        }
        subtitles.append(subtitle_line)

    return subtitles

In [22]:
import json

def generate_subtitles(transcript_path):
  with open(transcript_path, 'r') as f:
      wordlevel_info_modified = json.load(f)

  return split_text_into_lines(wordlevel_info_modified)

In [23]:
from moviepy.editor import TextClip

def create_caption(textJSON, framesize, font="Helvetica-Bold", fontsize=80, color='white', bgcolor='blue'):
    wordcount = len(textJSON['textcontents'])
    full_duration = textJSON['end'] - textJSON['start']

    word_clips = []
    xy_textclips_positions = []

    frame_width, frame_height = framesize
    x_buffer = frame_width * 1/10
    y_buffer = frame_height * 1/5

    # Calculate total width and height of the text
    total_width = 0
    total_height = 0
    current_line_width = 0
    max_line_width = frame_width - 2 * x_buffer
    line_height = 0

    for wordJSON in textJSON['textcontents']:
        word_clip = TextClip(wordJSON['word'], font=font, fontsize=fontsize, color=color)
        word_width, word_height = word_clip.size
        space_width = TextClip(" ", font=font, fontsize=fontsize, color=color).w

        if current_line_width + word_width + space_width > max_line_width:
            total_height += line_height + 40
            total_width = max(total_width, current_line_width)
            current_line_width = word_width + space_width
            line_height = word_height
        else:
            current_line_width += word_width + space_width
            line_height = max(line_height, word_height)

    total_height += line_height
    total_width = max(total_width, current_line_width)

    # Calculate starting position to center the text
    start_x = (frame_width - total_width) / 2
    start_y = frame_height - total_height - y_buffer

    x_pos = start_x
    y_pos = start_y

    for wordJSON in textJSON['textcontents']:
        duration = wordJSON['end'] - wordJSON['start']
        word_clip = TextClip(wordJSON['word'], font=font, fontsize=fontsize, color=color, stroke_color='black', stroke_width=2)
        word_clip_space = TextClip(" ", font=font, fontsize=fontsize, color=color)
        word_width, word_height = word_clip.size
        space_width = word_clip_space.w

        if x_pos + word_width + space_width > frame_width - x_buffer:
            x_pos = start_x
            y_pos += word_height + 40

        xy_textclips_positions.append({
            "x_pos": x_pos,
            "y_pos": y_pos,
            "width": word_width,
            "height": word_height,
            "word": wordJSON['word'],
            "start": wordJSON['start'],
            "end": wordJSON['end'],
            "duration": duration
        })

        word_clip = word_clip.set_position((x_pos, y_pos)).set_start(textJSON['start']).set_duration(full_duration)
        word_clip_space = word_clip_space.set_position((x_pos + word_width, y_pos)).set_start(textJSON['start']).set_duration(full_duration)

        word_clips.append(word_clip)
        word_clips.append(word_clip_space)

        x_pos += word_width + space_width

    for highlight_word in xy_textclips_positions:
        word_clip_highlight = TextClip(highlight_word['word'], font=font, fontsize=fontsize, color=color, bg_color=bgcolor, stroke_color='black', stroke_width=2)
        word_clip_highlight = word_clip_highlight.set_position((highlight_word['x_pos'], highlight_word['y_pos'])).set_start(highlight_word['start']).set_duration(highlight_word['duration'])
        word_clips.append(word_clip_highlight)

    return word_clips

In [24]:
from moviepy.editor import CompositeVideoClip, ImageClip, AudioFileClip
from moviepy.video.fx.resize import resize

def render_video(subtitles, backgrund_path, sound_path, base_path, output_file_name):
  # Define the path to the audio file
  video_path = os.path.join(base_path, f'{output_file_name}.mp4')

  if os.path.exists(video_path):
    return video_path

  frame_size = (1920,1080)

  all_linelevel_splits=[]

  for line in subtitles:
    out = create_caption(line,frame_size)
    all_linelevel_splits.extend(out)

  # Load audio
  audio = AudioFileClip(sound_path)

  # Get the duration of the audio
  audio_duration = audio.duration

  image_clip = ImageClip(backgrund_path).set_duration(audio_duration)

  # Resize the image_clip object to fit within a 1980x1080 frame while maintaining its aspect ratio
  clip_video = resize(image_clip, width=1980, height=1080)

  final_video = CompositeVideoClip([clip_video] + all_linelevel_splits)

  # Set the audio of the final video to be the same as the input video
  final_video = final_video.set_audio(audio)

  # Save the final clip as a video file with the audio included
  final_video.write_videofile(video_path, fps=24, codec="libx264", audio_codec="aac")

  return video_path

In [25]:
# Import pustaka yang diperlukan
import torch
from diffusers import DiffusionPipeline
from IPython.display import display, Image
import random
import os

# Inisialisasi pipeline SDXL
model_id = "stabilityai/stable-diffusion-xl-base-1.0"
pipe = DiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16, use_safetensors=True, variant="fp16")

# Check if CUDA is available
if torch.cuda.is_available():
    print("CUDA is available. Using GPU.")
    os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
    # Clear the PyTorch cache
    torch.cuda.empty_cache()
    pipe = pipe.to("cuda")
else:
    print("CUDA is not available. Using CPU.")


# Define style image
list_images = ["snowy", "rocky", "green", "misty", "foggy", "desert", "tropical", "arctic", "volcanic", "beachy", "forest", "urban"]
list_atmosphere = ["serene", "peaceful", "mysterious", "dreamy", "vibrant", "moody", "dramatic", "calm", "energetic", "whimsical", "melancholic", "adventurous"]

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

CUDA is not available. Using CPU.


In [26]:
def generate_background(pipe, prompt, base_path, image_name, num_inference_steps=50, guidance_scale=7.5):
    image_path = os.path.join(base_path, f"{image_name}.png")
    if os.path.exists(image_path):
        return image_path

    print(f"Generating image for prompt: {prompt}")

    # Menghasilkan gambar
    with torch.no_grad(): # Menonaktifkan penghitungan gradien untuk inference
        torch.manual_seed(random.randint(0, 1000))  
        image = pipe(prompt, num_inference_steps=num_inference_steps, guidance_scale=guidance_scale, height=1080, width=1920).images[0]
    
    # Menyimpan gambar
    image.save(image_path)

    # Clear the PyTorch cache
    torch.cuda.empty_cache()
    
    return image_path

In [27]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

options = Options()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36")
options.add_argument("--disable-blink-features=AutomationControlled")
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)

service = ChromeService(ChromeDriverManager().install())

In [28]:
def get_url_music(query_song):
  try:
    driver = webdriver.Chrome(service=service, options=options)
    # query_song = "NSYNC Bye Bye Bye"
    url = "https://music.youtube.com/search?q=" + query_song
    # Open a webpage
    driver.get(url)

    # Locate the element containing the dynamic href attribute
    element = driver.find_element(By.XPATH, '//a[contains(@href, "watch?v=")]')

    return element.get_attribute('href')
  except Exception as e:
      print(f"An error occurred: {e}")
      raise e
  finally:
      # Close the browser
      driver.quit()

In [29]:
def prepare_files(url, base_path):
  music_path, base_path = download_music(url, base_path)
  if not music_path and not os.path.exists(music_path):
      raise FileNotFoundError(f"Music file not found at path: {music_path}")

  # Split Vocal & Instrument
  instrument_path, vocal_path = separate_music(music_path, base_path)
  if not vocal_path and not os.path.exists(vocal_path):
      raise FileNotFoundError(f"Vocal file not found at path: {vocal_path}")

  return music_path, instrument_path, vocal_path, None
#   # Transcribe music
#   transcript_path = transcribe_music(vocal_path, base_path)
#   if not transcript_path and not os.path.exists(transcript_path):
#       raise FileNotFoundError(f"Transcript file not found at path: {transcript_path}")

#   # Generate subtitles
#   subtitles = generate_subtitles(transcript_path)
#   if not subtitles:
#       raise ValueError("No subtitles generated. Please check the transcript and try again.")

#   return music_path, instrument_path, vocal_path, subtitles

In [30]:
def create_folder(base_path, artist, title):
   # Get the artist and title of the song and delete special characters
  artist = artist.replace(" ", "_").replace("/", "_")
  title = title.replace(" ", "_").replace("/", "_")
  
  # Create folder for the song
  folder = f"{artist}/{title}"
  base_path = os.path.join(base_path, folder)
  if not os.path.exists(base_path):
    os.makedirs(base_path)

  return base_path

In [31]:
import os
import requests

base_path = os.path.join(os.getcwd(), "drive", "MyDrive", "data_notebook", "music", "results")
base_url = "https://api.deezer.com/chart/0/tracks?index=0"
response = requests.get(base_url)

if response.status_code == 200:
    data = response.json()
    print(data)

    for i, track in enumerate(data["data"]):
        artist = track["artist"]["name"]
        title = track["title"]
        print(f"{i+1}. {artist} - {title}")

        base_path_song = create_folder(base_path, artist, title)
        
        song_url = get_url_music(title + " " + artist)

        music_path, instrument_path, vocal_path, subtitles = prepare_files(song_url, base_path_song)

        # Render background for lyric video
        # prompt_template = f"Vector style image of a {random.choice(list_images)} range, flat design, cool colors, and a {random.choice(list_atmosphere)} atmosphere."
        # background_lyric = generate_background(pipe, prompt_template, base_path_song, "background_lyric")
        # Render lyric video
        # render_video(subtitles, background_lyric, music_path, base_path, "lyric_video")
else:
    print("Error:", response.status_code)
    raise Exception("Failed to fetch the top tracks")

{'data': [{'id': 3038842301, 'title': 'Super Tuna', 'title_short': 'Super Tuna', 'title_version': '', 'link': 'https://www.deezer.com/track/3038842301', 'duration': 114, 'rank': 100000, 'explicit_lyrics': False, 'explicit_content_lyrics': 0, 'explicit_content_cover': 0, 'preview': 'https://cdn-preview-d.dzcdn.net/stream/c-dd6e9eb01a86c3ea990cfee0c4bab1d3-1.mp3', 'md5_image': '708f5dcd86ca8a3ff6e88a53e25196d4', 'position': 1, 'artist': {'id': 150688792, 'name': 'Jin', 'link': 'https://www.deezer.com/artist/150688792', 'picture': 'https://api.deezer.com/artist/150688792/image', 'picture_small': 'https://e-cdns-images.dzcdn.net/images/artist/0193b597f99715da3becd690aaeacfa6/56x56-000000-80-0-0.jpg', 'picture_medium': 'https://e-cdns-images.dzcdn.net/images/artist/0193b597f99715da3becd690aaeacfa6/250x250-000000-80-0-0.jpg', 'picture_big': 'https://e-cdns-images.dzcdn.net/images/artist/0193b597f99715da3becd690aaeacfa6/500x500-000000-80-0-0.jpg', 'picture_xl': 'https://e-cdns-images.dzcdn.ne

UnboundLocalError: cannot access local variable 'driver' where it is not associated with a value