In [7]:
from __future__ import unicode_literals

import csv
import os
import tempfile

import librosa
import matplotlib.pyplot as plt
import numpy as np
import openl3
import soundfile as sf
from madmom.audio.chroma import DeepChromaProcessor
from madmom.evaluation.chords import encode as encode_chords, merge_chords, reduce_to_triads
from madmom.features.beats import DBNBeatTrackingProcessor, RNNBeatProcessor
from madmom.features.chords import DeepChromaChordRecognitionProcessor, majmin_targets_to_chord_labels
from madmom.features.downbeats import DBNDownBeatTrackingProcessor, RNNDownBeatProcessor
from pydub import AudioSegment
from scipy.spatial.distance import cdist
from yt_dlp import YoutubeDL



%load_ext autoreload
%autoreload 2

In [8]:
def download_mp3(url, out_path="./%(title)s.%(ext)s", ffmpeg_location=r"C:\FFmpeg\bin"):
    """Download the audio of `url` with yt-dlp and have it converted to MP3.

    Parameters
    ----------
    url : str
        Address passed to ``YoutubeDL.download``.
    out_path : str
        Value used for yt-dlp's ``outtmpl`` option (an output template or a
        plain target path).
    ffmpeg_location : str or None
        Value for yt-dlp's ``ffmpeg_location`` option. Defaults to the
        location this notebook was originally written against. Pass ``None``
        to leave the option out entirely (yt-dlp is then expected to find
        FFmpeg on its own; confirm for your installation).

    Returns
    -------
    None
        The function is called for its side effect (files written by yt-dlp).
    """
    ydl_opts = {
        "format": "bestaudio/best",
        "outtmpl": out_path,
        "postprocessors": [{
            "key": "FFmpegExtractAudio",
            "preferredcodec": "mp3",
            "preferredquality": "192",
        }],
        "quiet": False,
        "no_warnings": True,
    }
    # Only forward an FFmpeg location when one was requested, so the function
    # is usable on machines that do not have the original hard-coded directory.
    if ffmpeg_location is not None:
        ydl_opts["ffmpeg_location"] = ffmpeg_location

    with YoutubeDL(ydl_opts) as ydl:
        ydl.download([url])

In [184]:
# Read Dataset.csv once and split it into two parallel lists:
# song_links[k] is the original track, cover_links[k] the matching cover.
with open("Dataset.csv", newline="") as f:
    reader = csv.DictReader(f)
    rows = list(reader)

song_links = [row["audio_link"] for row in rows]
cover_links = [row["cover_link"] for row in rows]

In [9]:
def get_beats_and_downbeats(y, sr):
    """Estimate beat and downbeat times of an audio signal.

    A rough tempo from ``librosa.beat.beat_track`` is used to constrain
    madmom's ``DBNDownBeatTrackingProcessor`` to 85 %-115 % of that tempo,
    with 4 beats per bar. The signal is handed to madmom through a temporary
    WAV file, which is removed again even if processing fails.

    Parameters
    ----------
    y : array
        Audio samples, written to disk with ``soundfile.write``.
    sr : int
        Sample rate of `y`.

    Returns
    -------
    (beats, downbeats, act)
        ``beats``: times of every tracked beat; ``downbeats``: the subset
        whose bar position equals 1; ``act``: the raw output of
        ``RNNDownBeatProcessor``.
    """
    tempo, _ = librosa.beat.beat_track(y=y, sr=sr)
    print(tempo)
    # The recorded output of this notebook shows the tempo arriving as a
    # one-element array; collapse it so the BPM bounds are plain floats
    # whether a scalar or an array comes back.
    tempo_bpm = float(np.atleast_1d(tempo)[0])

    # A private temporary directory replaces the former fixed "test.wav" in
    # the working directory: nothing of the user's can be overwritten and the
    # file cannot be left behind when madmom raises.
    with tempfile.TemporaryDirectory() as tmp_dir:
        wav_path = os.path.join(tmp_dir, "test.wav")
        sf.write(wav_path, y, sr)
        act = RNNDownBeatProcessor()(wav_path)

    dbn = DBNDownBeatTrackingProcessor(
        beats_per_bar=4,
        fps=100,
        min_bpm=tempo_bpm * 0.85,
        max_bpm=tempo_bpm * 1.15,
    )
    tracked = dbn(act)

    # Each tracked row is (time, position in bar); position 1 marks a downbeat.
    beats = np.array([time for time, _position in tracked])
    downbeats = np.array([time for time, position in tracked if position == 1])
    # Sanity check: tempo implied by the mean inter-beat interval.
    print(60/np.mean(np.diff(beats)))
    return (beats, downbeats, act)

def load(path):
    """Decode the audio file at `path` with ``librosa.load`` and its default settings.

    The result of ``librosa.load`` is returned unchanged; callers in this
    notebook unpack it as ``(samples, sample_rate)``.
    """
    decoded = librosa.load(path)
    return decoded

def get_clicks(beats, sr):
    """Synthesise a click track with ``librosa.clicks``.

    `beats` is forwarded as the ``times`` argument and `sr` as the sample
    rate; the generated signal is returned as-is.
    """
    click_track = librosa.clicks(times=beats, sr=sr)
    return click_track

def overlay(y, clicks, sr, path):
    """Mix `clicks` on top of `y` with pydub and export the result as a WAV file.

    Both signals are first written as 16-bit PCM WAV files so pydub can read
    them. These intermediates live in a private temporary directory rather
    than as fixed "y.wav" / "clicks.wav" in the working directory, so they
    cannot overwrite user files and are removed even when a step fails.

    Parameters
    ----------
    y : array
        Base signal; the click track is overlaid onto it.
    clicks : array
        Click signal (e.g. the output of ``get_clicks``).
    sr : int
        Sample rate used for both intermediate files.
    path : str
        Destination of the mixed WAV file.
    """
    with tempfile.TemporaryDirectory() as tmp_dir:
        y_wav = os.path.join(tmp_dir, "y.wav")
        clicks_wav = os.path.join(tmp_dir, "clicks.wav")

        sf.write(y_wav, y, sr, subtype='PCM_16')
        sf.write(clicks_wav, clicks, sr, subtype='PCM_16')

        y_audio = AudioSegment.from_wav(y_wav)
        clicks_audio = AudioSegment.from_wav(clicks_wav)

        # Mix and export while the intermediates still exist, so nothing
        # depends on how pydub buffers its input.
        audio = y_audio.overlay(clicks_audio)
        audio.export(path, format="wav")

def trim_silence(y):
    """Return `y` with leading and trailing silence removed.

    Delegates to ``librosa.effects.trim`` with ``top_db=40`` and keeps only
    the trimmed signal; the second element of librosa's result is discarded.
    """
    trimmed, _kept_interval = librosa.effects.trim(y, top_db=40)
    return trimmed


def dtw_align(y_s_harmonic, y_s, y_p, downbeats_s, downbeats_p, sr, hop_length=512, n_beats=8, sim_metric='cosine', cost_threshold=1e3, alpha=0.03):
    """Walk both recordings in downbeat-aligned windows and keep the leading run that matches.

    Window ``k`` spans ``downbeats[i]`` to ``downbeats[i + n_beats]`` in each
    recording, using the same index ``i`` on both sides. For every window,
    OpenL3 embeddings of the cover chunk and of the harmonic song chunk are
    compared with ``cdist``. A penalty of ``alpha * |row - col|`` is added to
    discourage warping far from the diagonal, and the accumulated DTW cost
    ``D[-1, -1]`` is compared with `cost_threshold`. The walk stops at the
    first window whose cost is not below the threshold.

    Parameters
    ----------
    y_s_harmonic : array
        Song signal used for the embedding comparison.
    y_s : array
        Song signal the exported chunks are cut from.
    y_p : array
        Cover signal, used for both comparison and export.
    downbeats_s, downbeats_p : sequence of float
        Downbeat times in seconds (multiplied by `sr` to get sample indices).
    sr : int
        Sample rate shared by all three signals.
    hop_length : int
        Currently unused; kept so existing calls keep working.
    n_beats : int
        Number of downbeat intervals per window, and the step between
        consecutive windows. Must be at least 1.
    sim_metric : str
        Metric name forwarded to ``scipy.spatial.distance.cdist``.
    cost_threshold : float
        A window is accepted only if its DTW cost is strictly below this.
    alpha : float
        Weight of the off-diagonal time penalty.

    Returns
    -------
    (aligned, seg_p, seg_s)
        ``aligned``: list of ``(t0_p, t1_p, t0_s, t1_s, cost)`` per accepted
        window; ``seg_p`` / ``seg_s``: the accepted cover / song chunks
        concatenated in order (empty arrays when nothing was accepted).

    Raises
    ------
    ValueError
        If `n_beats` is smaller than 1.
    """
    if n_beats < 1:
        raise ValueError("n_beats must be at least 1")

    aligned = []
    # Seeding with an empty float64 array keeps the dtype and shape of the
    # results identical to building them up by repeated concatenation.
    chunks_p = [np.array([])]
    chunks_s = [np.array([])]

    n_common = min(len(downbeats_s), len(downbeats_p))
    i = 0
    # NOTE(review): this bound stops one downbeat earlier than the indexing
    # below strictly needs; kept as-is because it may be deliberate.
    while i + n_beats < n_common - 1:
        t0_p, t1_p = downbeats_p[i], downbeats_p[i+n_beats]
        t0_s, t1_s = downbeats_s[i], downbeats_s[i+n_beats]

        s0_p, s1_p = int(t0_p * sr), int(t1_p * sr)
        s0_s, s1_s = int(t0_s * sr), int(t1_s * sr)

        y_chunk_p = y_p[s0_p: s1_p]
        y_chunk_s_harmonic = y_s_harmonic[s0_s: s1_s]
        y_chunk_s = y_s[s0_s: s1_s]

        # NOTE(review): no preloaded model is passed, so OpenL3 may reload it
        # on every call; consider preloading if this is slow.
        emb_p, _ = openl3.get_audio_embedding(y_chunk_p, sr, content_type='music', embedding_size=512, input_repr='mel256')
        emb_s, _ = openl3.get_audio_embedding(y_chunk_s_harmonic, sr, content_type='music', embedding_size=512, input_repr='mel256')

        D_feat = cdist(emb_p, emb_s, metric=sim_metric)

        # Linear penalty on the distance from the diagonal of the cost matrix.
        rows = np.arange(D_feat.shape[0])[:, None]
        cols = np.arange(D_feat.shape[1])[None, :]
        D_time = alpha * np.abs(rows - cols)

        D, _ = librosa.sequence.dtw(C=D_feat + D_time)
        cost = D[-1, -1]

        print(i, cost)
        if cost >= cost_threshold:
            # First mismatching window ends the aligned run.
            break

        aligned.append((t0_p, t1_p, t0_s, t1_s, cost))
        chunks_p.append(y_chunk_p)
        chunks_s.append(y_chunk_s)
        # Advance by the window length so consecutive windows are contiguous
        # for any n_beats (the step used to be a fixed 8).
        i += n_beats

    return aligned, np.concatenate(chunks_p), np.concatenate(chunks_s)

def export_aligned(y_s, y_p, sr, num, aligned):
    """Write the aligned cover and song audio as ``piano_data/{num}.mp3`` and ``song_data/{num}.mp3``.

    Parameters
    ----------
    y_s : array
        Song audio to write.
    y_p : array
        Cover audio to write.
    sr : int
        Sample rate passed to ``soundfile.write``.
    num : int or str
        Identifier used as the file stem in both output folders.
    aligned : list
        Accepted for compatibility with existing calls but not used: the
        per-segment export it once drove was commented out, leaving a loop
        that only computed throw-away slices. That loop has been removed.

    Notes
    -----
    Existing files with the same names are overwritten, including MP3s
    previously downloaded to these paths.
    """
    # Make sure both target folders exist so the write cannot fail on a
    # fresh checkout.
    for folder in ("piano_data", "song_data"):
        os.makedirs(folder, exist_ok=True)

    sf.write(f"piano_data/{num}.mp3", y_p, sr, bitrate_mode='VARIABLE', compression_level=0)
    sf.write(f"song_data/{num}.mp3", y_s, sr, bitrate_mode='VARIABLE', compression_level=0)

In [175]:
# Fetch one example pair: the original track goes to song_data/, the cover to piano_data/.
download_mp3(url="https://www.youtube.com/watch?v=l5t9IXtTr6g", out_path="song_data/1")
download_mp3(url="https://www.youtube.com/watch?v=-iZmfIcnlBM", out_path="piano_data/1")

[youtube] Extracting URL: https://www.youtube.com/watch?v=l5t9IXtTr6g
[youtube] l5t9IXtTr6g: Downloading webpage
[youtube] l5t9IXtTr6g: Downloading tv client config
[youtube] l5t9IXtTr6g: Downloading tv player API JSON
[youtube] l5t9IXtTr6g: Downloading ios player API JSON
[youtube] l5t9IXtTr6g: Downloading player 59b252b9-main
[youtube] l5t9IXtTr6g: Downloading m3u8 information
[info] l5t9IXtTr6g: Downloading 1 format(s): 234
[hlsnative] Downloading m3u8 manifest
[hlsnative] Total fragments: 41
[download] Destination: song_data\1
[download] 100% of    3.72MiB in 00:00:01 at 2.11MiB/s                   
[ExtractAudio] Destination: song_data\1.mp3
Deleting original file song_data\1 (pass -k to keep)
[youtube] Extracting URL: https://www.youtube.com/watch?v=-iZmfIcnlBM
[youtube] -iZmfIcnlBM: Downloading webpage
[youtube] -iZmfIcnlBM: Downloading tv client config
[youtube] -iZmfIcnlBM: Downloading tv player API JSON
[youtube] -iZmfIcnlBM: Downloading ios player API JSON
[youtube] -iZmfIcn

In [176]:
# Decode the example pair and strip leading/trailing silence from each recording.
y_p, sr = load(path="piano_data/1.mp3")
y_p = trim_silence(y=y_p)

y_s, sr = load(path="song_data/1.mp3")
y_s = trim_silence(y=y_s)

# Harmonic/percussive separation of both recordings.
y_p_harmonic, y_p_percussive = librosa.effects.hpss(y=y_p)
y_s_harmonic, y_s_percussive = librosa.effects.hpss(y=y_s)

In [177]:
# Beat and downbeat tracking, cover first, then the original song.
beats_p, downbeats_p, act_p = get_beats_and_downbeats(y=y_p, sr=sr)
beats_s, downbeats_s, act_s = get_beats_and_downbeats(y=y_s, sr=sr)

[103.359375]


  return array(a, dtype, copy=False, order=order)


100.38281582305402
[92.28515625]
92.52886081463733


In [178]:
# Click tracks at the detected beat times, used to check the tracking by ear.
clicks_p = get_clicks(beats=beats_p, sr=sr)
clicks_s = get_clicks(beats=beats_s, sr=sr)

In [179]:
# Write audible beat-check mixes: harmonic cover + clicks, full song + clicks.
overlay(y=y_p_harmonic, clicks=clicks_p, sr=sr, path="piano_click.wav")
overlay(y=y_s, clicks=clicks_s, sr=sr, path="song_click.wav")

In [2]:
# Align the example pair window by window, then write the aligned audio.
aligned, seg_p, seg_s = dtw_align(
    y_s_harmonic=y_s_harmonic,
    y_s=y_s,
    y_p=y_p_harmonic,
    downbeats_s=downbeats_s,
    downbeats_p=downbeats_p,
    sr=sr,
    cost_threshold=12,
)
# export_aligned expects (y_s, y_p, sr, num, aligned); the pair number was
# missing from this call. The example pair lives under .../1.mp3, so num is 1.
export_aligned(seg_s, seg_p, sr, 1, aligned)

NameError: name 'dtw_align' is not defined

In [None]:
# One line per accepted window: index, then (t0_p, t1_p, t0_s, t1_s, cost).
for i, segment in enumerate(aligned):
    print(i, segment)

0 (1.29, 16.29, 1.28, 16.26, 5.334491688546183)
1 (16.29, 31.29, 16.26, 31.27, 5.459741026942079)
2 (31.29, 46.29, 31.27, 46.27, 5.846066501778152)
3 (46.29, 61.29, 46.27, 61.27, 5.7083636063430285)
4 (61.29, 76.29, 61.27, 76.28, 5.404086645188884)
5 (76.29, 91.29, 76.28, 91.11, 5.34635182427443)
6 (91.29, 106.29, 91.11, 106.13, 5.606575113936737)
7 (106.29, 121.29, 106.13, 121.1, 5.199753430324616)
8 (121.29, 136.3, 121.1, 136.12, 5.465182928827428)
9 (136.3, 151.3, 136.12, 151.12, 4.870350664829438)
10 (151.3, 166.29, 151.12, 166.12, 5.089253672059576)
11 (166.29, 181.29, 166.12, 181.12, 5.375129957879572)
12 (181.29, 196.3, 181.12, 196.11, 4.776375577618357)
13 (196.3, 211.3, 196.11, 211.13, 4.7008008686572555)
14 (211.3, 226.3, 211.13, 226.12, 4.7842672465586515)


In [10]:
# Re-read the link lists so this cell can run without the earlier loading cell.
with open("Dataset.csv", newline="") as f:
    reader = csv.DictReader(f)
    rows = list(reader)

song_links = [row["audio_link"] for row in rows]
cover_links = [row["cover_link"] for row in rows]

# Full pipeline for every (song, cover) pair; i doubles as the file stem.
for i, (song_url, cover_url) in enumerate(zip(song_links, cover_links)):
    download_mp3(song_url, f"song_data/{i}")
    download_mp3(cover_url, f"piano_data/{i}")

    # Decode and trim silence.
    y_p, sr = load(f"piano_data/{i}.mp3")
    y_p = trim_silence(y_p)

    y_s, sr = load(f"song_data/{i}.mp3")
    y_s = trim_silence(y_s)

    # Harmonic/percussive separation.
    y_s_harmonic, y_s_percussive = librosa.effects.hpss(y_s)
    y_p_harmonic, y_p_percussive = librosa.effects.hpss(y_p)

    # Beat tracking: cover first, then song.
    beats_p, downbeats_p, act_p = get_beats_and_downbeats(y_p, sr)
    beats_s, downbeats_s, act_s = get_beats_and_downbeats(y_s, sr)

    # Align, then overwrite the pair's MP3s with the aligned audio.
    aligned, seg_p, seg_s = dtw_align(
        y_s_harmonic=y_s_harmonic,
        y_s=y_s,
        y_p=y_p_harmonic,
        downbeats_s=downbeats_s,
        downbeats_p=downbeats_p,
        sr=sr,
        cost_threshold=11,
    )
    export_aligned(seg_s, seg_p, sr, i, aligned)

[youtube] Extracting URL: https://www.youtube.com/watch?v=2S0QhGGO1gQ
[youtube] 2S0QhGGO1gQ: Downloading webpage
[youtube] 2S0QhGGO1gQ: Downloading tv client config
[youtube] 2S0QhGGO1gQ: Downloading tv player API JSON
[youtube] 2S0QhGGO1gQ: Downloading ios player API JSON
[youtube] 2S0QhGGO1gQ: Downloading player 59b252b9-main
[youtube] 2S0QhGGO1gQ: Downloading m3u8 information
[info] 2S0QhGGO1gQ: Downloading 1 format(s): 234
[hlsnative] Downloading m3u8 manifest
[hlsnative] Total fragments: 33
[download] Destination: song_data\0
[download] 100% of    2.69MiB in 00:00:01 at 1.93MiB/s                   
[ExtractAudio] Destination: song_data\0.mp3
Deleting original file song_data\0 (pass -k to keep)
[youtube] Extracting URL: https://www.youtube.com/watch?v=rmP2u0ymOqU
[youtube] rmP2u0ymOqU: Downloading webpage
[youtube] rmP2u0ymOqU: Downloading tv client config
[youtube] rmP2u0ymOqU: Downloading tv player API JSON
[youtube] rmP2u0ymOqU: Downloading ios player API JSON
[youtube] rmP2u0ym

  return array(a, dtype, copy=False, order=order)


125.99461561471732
[123.046875]
126.09457092819613
0 7.706603982443954
8 6.603095966992492
16 7.899924323444581


KeyboardInterrupt: 