In [None]:
%%capture
!pip install speechbrain
!pip install transformers

In [1]:
import os

import librosa
import soundfile
import torchaudio
from speechbrain.inference.VAD import VAD
from speechbrain.dataio.dataio import read_audio
from speechbrain.inference.separation import SepformerSeparation as separator

from IPython.display import Audio

import pandas as pd
from tqdm import tqdm

In [2]:
# Load the pretrained voice-activity detector onto the GPU.
# NOTE(review): this rebinds the imported class name `VAD` to the loaded
# instance; save_audio() further down relies on `VAD` being that instance.
VAD = VAD.from_hparams(
    source="speechbrain/vad-crdnn-libriparty",
    savedir='./.cache/vad-crdnn-libriparty',
    run_opts={"device":"cuda"},
)

# Load the pretrained SepFormer separation model onto the GPU.
model = separator.from_hparams(
    source="speechbrain/sepformer-libri2mix",
    savedir='./.cache/sepformer-libri2mix',
    run_opts={"device":"cuda"},
)

In [None]:
# Run the separation model on the demo mixture. Later cells read the same clip
# from ./audio_cache/test_mixture.wav - presumably where SpeechBrain stores the
# fetched file (TODO confirm).
est_sources = model.separate_file(
    path='speechbrain/sepformer-wsj02mix/test_mixture.wav',
)

# Display the shape of the separated output; the cells below select individual
# sources by indexing the last axis with 0 and 1.
est_sources.shape

In [None]:
# Render an audio player for the demo mixture file, played back at 8000 Hz.
Audio("./audio_cache/test_mixture.wav", rate=8000)

In [None]:
# Load the demo mixture with SpeechBrain's reader, drop singleton dimensions,
# and play the resulting tensor back at 8000 Hz.
signal = read_audio("./audio_cache/test_mixture.wav")
signal = signal.squeeze()
Audio(signal, rate=8000)

In [None]:
# Display the shape of the separated-sources tensor.
est_sources.shape

In [None]:
# Listen to the separated source at index 0 of the last axis: detach it from
# the autograd graph, move it to the CPU, and drop singleton dimensions first.
source_0 = est_sources[:, :, 0].detach().cpu().squeeze()
Audio(source_0, rate=8000)

In [None]:
# Listen to the separated source at index 1 of the last axis: detach it from
# the autograd graph, move it to the CPU, and drop singleton dimensions first.
source_1 = est_sources[:, :, 1].detach().cpu().squeeze()
Audio(source_1, rate=8000)

In [None]:
# Display the raw tensor values of the separated source at index 0 of the last axis.
est_sources[:, :, 0]

In [4]:
# Load the list of test clips. The displayed frame below shows the columns
# "Unnamed: 0", "id" and "path"; only "id" is used by the separation loop.
df = pd.read_csv("./data/test.csv")

In [5]:
# Display the loaded test table.
df

Unnamed: 0,id,path
0,TEST_00000,./test/TEST_00000.ogg
1,TEST_00001,./test/TEST_00001.ogg
2,TEST_00002,./test/TEST_00002.ogg
3,TEST_00003,./test/TEST_00003.ogg
4,TEST_00004,./test/TEST_00004.ogg
...,...,...
49995,TEST_49995,./test/TEST_49995.ogg
49996,TEST_49996,./test/TEST_49996.ogg
49997,TEST_49997,./test/TEST_49997.ogg
49998,TEST_49998,./test/TEST_49998.ogg


In [6]:
# Display the clip ids that the separation loop iterates over.
df['id']

0        TEST_00000
1        TEST_00001
2        TEST_00002
3        TEST_00003
4        TEST_00004
            ...    
49995    TEST_49995
49996    TEST_49996
49997    TEST_49997
49998    TEST_49998
49999    TEST_49999
Name: id, Length: 50000, dtype: object

In [None]:
# Create the root folder for the separated clips. makedirs with exist_ok=True
# is safe to re-run and also creates ./data when it does not exist yet, which
# the previous isdir()/mkdir() pair could not do.
os.makedirs("./data/test_separated", exist_ok=True)

In [None]:
def save_audio(tensor, file_path):
    """Write one separated source to disk and label it as speech or noise.

    The tensor is dumped to a scratch WAV file at 8000 Hz and read back with
    librosa. The signal is then written twice in OGG format: resampled to
    32000 Hz at ``file_path``, and resampled to 16000 Hz at a sibling path
    (``<stem>.16hz<ext>``) that is passed to the module-level ``VAD`` model.

    Args:
        tensor: Audio tensor accepted by ``torchaudio.save`` and treated as
            8000 Hz audio (callers in this notebook pass
            ``est_sources[:, :, k]``).
        file_path: Destination path of the 32000 Hz OGG file.

    Returns:
        A tuple ``(file_path, label)`` where ``label`` is ``"speech"`` or
        ``"noise"``.
    """
    cache = "./audio_cache/temp.wav"
    # Nothing else in this function creates the scratch directory, so make
    # sure it exists before writing to it.
    os.makedirs(os.path.dirname(cache), exist_ok=True)

    # Build the 16 kHz path from the extension only. A plain
    # str.replace("ogg", ...) would also rewrite any "ogg" in a directory name
    # or file stem, and would be a no-op for non-ogg paths (so the 16 kHz copy
    # would overwrite the 32 kHz file).
    stem, ext = os.path.splitext(file_path)
    vad_input_path = f"{stem}.16hz{ext}"

    # save tensor
    torchaudio.save(cache, tensor.detach().cpu(), 8000)

    # Load the scratch file once; both resampled copies derive from the same
    # 8000 Hz signal.
    y, _ = librosa.load(cache, sr=8000)

    # resampling - actual file
    y_32k = librosa.resample(y, orig_sr=8000, target_sr=32000)
    soundfile.write(file_path, y_32k, 32000, format="ogg")

    # resampling - temporary file
    y_16k = librosa.resample(y, orig_sr=8000, target_sr=16000)
    soundfile.write(vad_input_path, y_16k, 16000, format="ogg")

    # extract label
    boundaries = VAD.get_speech_segments(vad_input_path, activation_th=0.4)
    label = "noise"
    last_end = 0
    # NOTE(review): a clip is labelled "speech" only when some segment begins
    # exactly where the previous one ended (or at 0 for the first segment);
    # segments that begin later leave the label as "noise". Confirm this is
    # the intended rule rather than "any detected segment means speech".
    for i in range(boundaries.shape[0]):
        begin_value = boundaries[i, 0]
        end_value = boundaries[i, 1]
        if last_end == begin_value:
            label = "speech"
        last_end = end_value

    return file_path, label

In [None]:
# Number of leading rows of `df` to skip. Leave at 0 for a full run; set it to
# N to skip the first N ids, e.g. to resume an interrupted run.
# NOTE: `labels` (and therefore the CSV written at the end) only covers the
# ids processed in this run.
START_INDEX = 0

labels = dict(id=[], path0=[], path1=[], label0=[], label1=[])

# `initial` and `total` keep the progress bar counting against the full table
# even when some leading ids are skipped.
progress = tqdm(df['id'].iloc[START_INDEX:], total=len(df), initial=START_INDEX)

for _id in progress:
    est_sources = model.separate_file(path=f"./data/test/{_id}.ogg")

    # One output folder per clip; exist_ok makes re-runs safe.
    os.makedirs(f"./data/test_separated/{_id}/", exist_ok=True)

    labels['id'].append(_id)

    # Save and label the two separated sources (indices 0 and 1 on the last
    # axis of est_sources).
    for source_idx in (0, 1):
        out_path = f"./data/test_separated/{_id}/{source_idx}.ogg"
        saved_path, label = save_audio(est_sources[:, :, source_idx], out_path)
        # The CSV stores paths without the leading "./data" component.
        labels[f'path{source_idx}'].append(f"./test_separated/{_id}/{source_idx}.ogg")
        labels[f'label{source_idx}'].append(label)
        print("Audio extracted successfully -", saved_path, label)

pd.DataFrame(labels).to_csv("./data/test_separated.csv", index=False)