In [1]:
#%%capture
#!pip install --upgrade resemble-enhance #datasets 

In [2]:
# Make sure you are logged in to the Hugging Face Hub before running this notebook (I used the CLI: `huggingface-cli login`).

In [3]:
import torch
from datasets import load_dataset, Audio
from resemble_enhance.enhancer.inference import denoise, enhance


# Device string handed to the denoise/enhance calls: prefer CUDA when torch
# reports it as available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

ModuleNotFoundError: No module named 'torch'

In [None]:
# Hub repository id and split to process.
dataset_name = "anyantudre/MooreSpeechCorpora"
split = "train"

# Load only the requested split of that repository.
dataset = load_dataset(path=dataset_name, split=split)

# Last expression of the cell: display the dataset summary.
dataset

Downloading readme:   0%|          | 0.00/429 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/292M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/249M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2553 [00:00<?, ? examples/s]

Dataset({
    features: ['audio', 'text', 'duration', 'voice', 'speaker_id'],
    num_rows: 2553
})

In [None]:
def denoise_enhance_audio(
        batch,
        solver='midpoint',
        nfe=90,
        tau=0.6,
        denoise_before_enhancement=False,
        apply_enhance=False
    ):
    """Denoise (and optionally enhance) every clip in a batch.

    Intended for `Dataset.map(..., batched=True)`.

    Args:
        batch: Mapping whose ``'audio'`` entry is a list of clips, each a
            dict with an ``'array'`` (samples) and a ``'sampling_rate'``.
        solver: Forwarded to ``enhance`` as ``solver``.
        nfe: Forwarded to ``enhance`` as ``nfe``.
        tau: Forwarded to ``enhance`` as ``tau``.
        denoise_before_enhancement: Picks the ``lambd`` value given to
            ``enhance``: 0.9 when true, 0.1 otherwise.
        apply_enhance: When true, also run ``enhance`` on every clip and
            return an ``"enhanced_audio"`` column.

    Returns:
        A dict with a ``"denoised_audio"`` list and, when ``apply_enhance``
        is true, an ``"enhanced_audio"`` list. Every item is a dict
        ``{"array": np.ndarray, "sampling_rate": int}``. The sampling rate
        is carried along with the samples because ``denoise``/``enhance``
        return their own output rate, which may differ from the input rate,
        while a clip that fell back to the original keeps the input rate.
        This ``array``/``sampling_rate`` layout is also the one the
        Hugging Face ``Audio`` feature can encode when the column is cast.

    Errors raised by ``denoise``/``enhance`` are printed and the original
    clip is used instead, so one bad clip does not abort the whole map.
    """

    lambd = 0.9 if denoise_before_enhancement else 0.1

    def _to_audio_dict(wav, rate):
        # Pair the samples with the rate they are actually sampled at.
        return {"array": wav.detach().cpu().numpy(), "sampling_rate": int(rate)}

    denoised_list = []
    enhanced_list = []

    # Process each audio sample in the batch.
    for audio in batch['audio']:
        audio_array   = audio['array']
        sampling_rate = audio['sampling_rate']

        # Convert the samples to a float32 torch tensor (copying the data).
        audio_tensor = torch.tensor(audio_array, dtype=torch.float32)

        # Denoising: keep the returned rate instead of discarding it.
        try:
            denoised_audio, denoised_rate = denoise(audio_tensor, sampling_rate, device)
        except Exception as e:
            print(f"Error during denoising: {e}")
            # Fall back to the untouched clip at its original rate.
            denoised_audio, denoised_rate = audio_tensor, sampling_rate

        denoised_list.append(_to_audio_dict(denoised_audio, denoised_rate))

        if apply_enhance:
            # Enhancement runs on the original clip; `lambd` is passed
            # through to `enhance` together with the solver settings.
            try:
                enhanced_audio, enhanced_rate = enhance(
                    audio_tensor,
                    sampling_rate,
                    device,
                    nfe=nfe,
                    solver=solver,
                    lambd=lambd,
                    tau=tau,
                )
            except Exception as e:
                print(f"Error during enhancement: {e}")
                # Fall back to the untouched clip at its original rate.
                enhanced_audio, enhanced_rate = audio_tensor, sampling_rate

            enhanced_list.append(_to_audio_dict(enhanced_audio, enhanced_rate))

    result = {
        "denoised_audio": denoised_list
    }
    if apply_enhance:
        result["enhanced_audio"] = enhanced_list

    return result

In [None]:
# Run the denoiser over the whole dataset.
# `Dataset.map` has no `apply_enhance` parameter of its own, so options for
# the mapped function have to be forwarded through `fn_kwargs`.
processed_dataset = dataset.map(
    denoise_enhance_audio,
    batched=True,  # the function receives a batch (dict of lists), not a single row
    fn_kwargs={"apply_enhance": False},
)

Map:   0%|          | 0/2553 [00:00<?, ? examples/s]

100%|██████████| 1/1 [00:00<00:00,  3.59it/s]
100%|██████████| 1/1 [00:05<00:00,  5.78s/it]
100%|██████████| 1/1 [00:00<00:00,  3.86it/s]
100%|██████████| 1/1 [00:05<00:00,  5.47s/it]
100%|██████████| 1/1 [00:00<00:00,  6.07it/s]
100%|██████████| 1/1 [00:03<00:00,  3.88s/it]
100%|██████████| 1/1 [00:00<00:00,  3.58it/s]
100%|██████████| 1/1 [00:05<00:00,  5.81s/it]
100%|██████████| 1/1 [00:00<00:00,  8.95it/s]
100%|██████████| 1/1 [00:02<00:00,  2.66s/it]
100%|██████████| 1/1 [00:00<00:00,  4.17it/s]
100%|██████████| 1/1 [00:05<00:00,  5.21s/it]
100%|██████████| 1/1 [00:00<00:00,  7.35it/s]
100%|██████████| 1/1 [00:02<00:00,  2.83s/it]
100%|██████████| 1/1 [00:00<00:00,  3.54it/s]
100%|██████████| 1/1 [00:05<00:00,  5.89s/it]
100%|██████████| 1/1 [00:00<00:00,  8.97it/s]
100%|██████████| 1/1 [00:02<00:00,  2.60s/it]
100%|██████████| 1/1 [00:00<00:00,  4.20it/s]
100%|██████████| 1/1 [00:05<00:00,  5.20s/it]
100%|██████████| 1/1 [00:00<00:00,  8.96it/s]
100%|██████████| 1/1 [00:02<00:00,

In [None]:
# Re-type the new audio column as a Hugging Face `Audio` feature.
# NOTE(review): the `Audio` cast presumably needs each row to carry both the
# samples and their sampling rate -- confirm the column has that layout.
processed_dataset = processed_dataset.cast_column(
    column="denoised_audio",
    feature=Audio(),
)
# Same cast for the enhanced column, for runs that produced it:
#processed_dataset = processed_dataset.cast_column("enhanced_audio", Audio())

# Last expression of the cell: display the updated dataset summary.
processed_dataset

In [None]:
# Upload the processed dataset to the Hugging Face Hub.
# NOTE(review): this is the same repo id the source dataset was loaded from,
# so the push presumably updates that repository in place -- confirm intended.
processed_dataset.push_to_hub(repo_id="anyantudre/MooreSpeechCorpora")