In [1]:
import pandas as pd
import os
import librosa
import matplotlib.pyplot as plt
import numpy as np
from IPython.display import display, Audio
import linmdtw
import warnings
warnings.filterwarnings("ignore")
import IPython.display as ipd
import time
import pydub
import pyloudnorm as pyln
import soundfile as sf
from tqdm import tqdm

# get df

In [2]:
csv_path = "D:\projects\ProfanityFilter\explicits\youtube_links_new.txt"
df = pd.read_csv(csv_path, 
                 names=["uri", "ytid", "ytid_clean", "dur_yt", "dur_yt_clean"])

In [3]:
df = df.drop_duplicates(subset=["ytid"])
df = df.drop_duplicates(subset=["ytid_clean"])
df = df.drop_duplicates(subset=["uri"])
df = df.dropna()

In [4]:
df

Unnamed: 0,uri,ytid,ytid_clean,dur_yt,dur_yt_clean
0,spotify:track:3JvKfv6T31zO0ini8iNItO,9i_OWaiaF34,8vN0EMCjjP0,244.0,242.0
1,spotify:track:7lQ8MOhq6IN2w8EYcFNSUk,pyb13N80DZQ,6iUeFGPWHz0,291.0,290.0
2,spotify:track:3yfqSUWxFvZELEM4PmlwIR,BdfpV-cIkuA,L0V6jKCZ370,284.0,284.0
3,spotify:track:5TXDeTFVRVY7Cvt0Dw4vWW,CD_tD26E7k0,Ro32sQudsrY,121.0,120.0
4,spotify:track:7KA4W4McWYRpgf0fWsJZWB,1_OMDlzbIuM,AhvkKR0ero8,181.0,181.0
...,...,...,...,...,...
13179,spotify:track:3Q8JrzlwciRk3j8M8UFQvr,LWycFKbNl9c,XnOrV_Bhnv0,206.0,206.0
13205,spotify:track:7FbfvVLCqraAf8iPG67cwn,AsHtIbOuBaw,AsHtIbOuBaw,177.0,177.0
13219,spotify:track:3rbuetJnPXr4ecigGVBDBo,Bw_yUjNZ4bI,dmhBewedJwg,206.0,205.0
13223,spotify:track:06K4y898EOm1CkKUwPOgNo,cVYtEUnqW_M,ungpJtM7-wg,192.0,189.0


# Iterate over df

## funcs

In [5]:
from pydub.effects import normalize

def normalize(y, sr, peak=None, loud_peak=None):
    if peak is not None:
        norm_y = pyln.normalize.peak(y, peak)
    elif loud_peak is not None:
        meter = pyln.Meter(sr) # create BS.1770 meter
        loudness = meter.integrated_loudness(y)
        norm_y = pyln.normalize.loudness(y, loudness, loud_peak)
    return norm_y

def sync_audios(pathE, pathC, sr, hop_length, use_gpu=False, default_dtw=False, trim=False):
    xE, sr = linmdtw.load_audio(pathE, sr)
    xC, sr = linmdtw.load_audio(pathC, sr)
    if trim:
        xE, _ = librosa.effects.trim(xE)
        xC, _ = librosa.effects.trim(xC)
    xE = normalize(xE, sr, peak=-12)
    xC = normalize(xC, sr, peak=-12)
    # plt.plot(xC, alpha=0.2)
    # plt.plot(xE, alpha=0.2)
    plt.show()
    XE = linmdtw.get_mfcc_mod(xE, sr, hop_length)
    XC = linmdtw.get_mfcc_mod(xC, sr, hop_length)
    # metadata = {
    #     'totalCells':0, 
    #     'M':XE.shape[0], 
    #     'N':XC.shape[0], 
    #     'timeStart':time.time(), 
    #     'perc':10
    # }
    if default_dtw:
        print("Doing default DTW")
        D, path = librosa.sequence.dtw(XE.T, XC.T, subseq=True)
        path = path[::-1]
    else:
        # path = linmdtw.linmdtw(XE, XC, do_gpu=use_gpu, metadata=metadata)
        path = linmdtw.linmdtw(XE, XC, do_gpu=use_gpu)
    xsync = linmdtw.stretch_audio(xE, xC, sr, path, hop_length)
    return xsync

def search_timestamps(
    xsync, sr, hop_length, 
    n_stds=5, 
    threshold_max=0.95,
    mask_threshold_step=0.01, 
    timestamps_threshold=2,
    top_explicits=8,
    cut_by_sides=0.15,
):
    XEsync = linmdtw.get_mfcc_mod(xsync[:, 0], sr, hop_length)
    XCsync = linmdtw.get_mfcc_mod(xsync[:, 1], sr, hop_length)

    diff = np.abs(XEsync - XCsync)
    mean_diff = diff.max(axis=1)
    mean_diff = (mean_diff - mean_diff.min())/(mean_diff.max() - mean_diff.min())
    # mean_diff = (mean_diff - mean_diff.mean())/mean_diff.std()
    # mean_diff[mean_diff < 0] = 0
    if cut_by_sides is not None:
        cut  = int(cut_by_sides* mean_diff.shape[0])
        mean_diff[:cut] = 0
        mean_diff[-cut:] = 0
    mask_threshold = min(mean_diff.mean() + n_stds*mean_diff.std(), mean_diff.max() - 0.05)
    # plt.plot(mean_diff)
    # plt.axhline(mask_threshold, c='r')
    # plt.show()
    timestamps = [0]*1000
    while len(timestamps) >= top_explicits:
        mask = (mean_diff > mask_threshold).astype(np.int)
        indices = np.nonzero(mask)[0]
        if len(indices) == 0:
            return prev_timestamps
        where_explicit = librosa.times_like(XEsync.T, sr=sr, hop_length=hop_length)[indices]

        timestamps = merge_indices(
            (where_explicit*sr).astype(np.int), 
            timestamps_threshold, 
            sr
        )  
        mask_threshold += mask_threshold_step
        prev_timestamps = timestamps
    return timestamps
    
def merge_indices(indices, threshold=2, sr=16000):
    if len(indices) == 1:
        return [(indices[0], indices[0])]
    thr = threshold * sr
    p1 = 0
    timestamps = []
    for i in range(1, len(indices)):
        if indices[i] - indices[i - 1] <= thr:
            continue
        else:
            timestamps.append((indices[p1], indices[i-1]))
            p1 = i
    timestamps.append((indices[p1], indices[i]))
    return timestamps

In [30]:
def save_timestamps(
    row, 
    timestamps,
    audio,
    sr=16000,
    txt_path = "D:\projects\ProfanityFilter\explicits\csvs\\timestamps.txt",
    ts_dir = "D:\projects\ProfanityFilter\data\\aligned_explicit"
):
    with open(txt_path, 'a') as wf:
        for i, (l, r) in enumerate(timestamps):
            # writing txt
            line = f"{row.uri},{i},{l},{r}\n"
            wf.write(line)
            
            # writing data
            l -= sr // 2
            r += sr // 2
            fname = f"{row.uri.rsplit(':', 1)[-1]}.wav"
            fpath = os.path.join(ts_dir, fname)
            # sf.write(fpath, audio[l:r, 0], samplerate=16000)
            sf.write(fpath, audio[:, 0], samplerate=16000)            

## main

In [31]:
dirE = "D:\projects\ProfanityFilter\data\\wav16k\explicit"
dirC = "D:\projects\ProfanityFilter\data\\wav16k\clean"

sr=16000
hop_length=4000


In [None]:
for idx, row in tqdm(df.iterrows(), total=len(df)):
    pathE = os.path.join(dirE, f"{row.ytid}.wav")
    pathC = os.path.join(dirC, f"{row.ytid_clean}.wav")
    # sync
    try:
        xsync = sync_audios(pathE, pathC, sr, hop_length)
    except Exception as e:
        try:
            xsync = sync_audios(pathE, pathC, sr, hop_length, trim=True)
        except Exception as e:
            print(e)
            continue
    # search
    # timestamps = search_timestamps(xsync, sr, hop_length)
    # if len(timestamps) > 0 :
    save_timestamps(row, timestamps, xsync)
    # else:
        # print("Nothing found")

  3%|███▍                                                                                                          | 74/2345 [05:08<3:10:33,  5.03s/it]

could not broadcast input array from shape (3516565,) into shape (3514880,)


  8%|█████████                                                                                                    | 194/2345 [14:11<3:01:04,  5.05s/it]

could not broadcast input array from shape (3453631,) into shape (3450880,)


 15%|████████████████▏                                                                                            | 347/2345 [23:07<1:32:04,  2.76s/it]

Audio buffer is not finite everywhere


 16%|█████████████████▊                                                                                           | 383/2345 [24:52<2:06:58,  3.88s/it]

could not broadcast input array from shape (4487478,) into shape (4488704,)


 21%|██████████████████████▋                                                                                      | 488/2345 [30:34<2:05:18,  4.05s/it]

could not broadcast input array from shape (3903600,) into shape (3907072,)


 22%|███████████████████████▌                                                                                     | 508/2345 [31:46<1:55:55,  3.79s/it]

could not broadcast input array from shape (3305695,) into shape (3309056,)


 22%|███████████████████████▋                                                                                     | 509/2345 [31:52<2:15:22,  4.42s/it]

could not broadcast input array from shape (4494381,) into shape (4492800,)


 25%|███████████████████████████▍                                                                                 | 590/2345 [37:36<2:20:56,  4.82s/it]

could not broadcast input array from shape (2972578,) into shape (2975232,)


 26%|████████████████████████████▌                                                                                | 614/2345 [39:40<2:35:04,  5.38s/it]

could not broadcast input array from shape (4236541,) into shape (4221098,)


 28%|██████████████████████████████                                                                               | 648/2345 [42:19<2:11:25,  4.65s/it]

could not broadcast input array from shape (3510485,) into shape (3576426,)


 28%|██████████████████████████████▍                                                                              | 654/2345 [42:44<1:50:07,  3.91s/it]

could not broadcast input array from shape (3038506,) into shape (3045044,)


 29%|███████████████████████████████▏                                                                             | 670/2345 [43:46<2:10:01,  4.66s/it]

could not broadcast input array from shape (3850365,) into shape (3861408,)


 33%|████████████████████████████████████▍                                                                        | 785/2345 [53:22<2:46:40,  6.41s/it]

could not broadcast input array from shape (2886852,) into shape (2888608,)


 34%|████████████████████████████████████▌                                                                        | 786/2345 [53:33<3:19:41,  7.69s/it]

could not broadcast input array from shape (4160449,) into shape (4182016,)


 36%|██████████████████████████████████████▌                                                                    | 846/2345 [1:00:12<3:09:20,  7.58s/it]

could not broadcast input array from shape (3984266,) into shape (3991204,)


 37%|███████████████████████████████████████▎                                                                   | 861/2345 [1:01:33<2:01:55,  4.93s/it]