In [1]:
from df.enhance import enhance, init_df, load_audio, save_audio
from pathlib import Path
from tqdm import tqdm
import glob
import os
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
import torch
import soundfile as sf
import pyloudnorm as pyln
from scipy.io.wavfile import write
import librosa


# Destination folder for the cleaned-up clips; created if it is missing.
OUTPUT_DIR = Path("train_denoised")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Stage-1 denoiser: FRCRN acoustic noise suppression via the ModelScope pipeline.
frcrn = pipeline(Tasks.acoustic_noise_suppression, model='speech_frcrn_ans_cirm_16k')
# Stage-2 denoiser: DeepFilterNet, initialised from the "DeepFilterNet3/" directory.
model, df_state, model_name_suffix = init_df(model_base_dir="DeepFilterNet3/")

# Gather every .wav clip of the SpeakerID training set.
speakerID_audio_folder = "train"
audio_paths = glob.glob(f"{speakerID_audio_folder}/*.wav")

for audio_path in tqdm(audio_paths):
    # Clips whose path contains 'PALMTREE' skip the DeepFilterNet pass and are
    # loudness-normalized (instead of peak-normalized) at the very end.
    is_palmtree = 'PALMTREE' in audio_path

    # "<folder>/<name>.wav" -> "<OUTPUT_DIR>/<name>_enhanced.wav"
    audio_filename = Path(audio_path).stem
    output_path = OUTPUT_DIR / f"{audio_filename}_enhanced.wav"

    # Pass 1: FRCRN writes its result directly to output_path.
    frcrn(audio_path, output_path=output_path)

    # FRCRN output tends to be very soft, so raise the peak to -0.1 dB in place.
    # Loudness normalization is not used here because it causes a bit of clipping.
    samples, sample_rate = sf.read(output_path)
    sf.write(output_path, pyln.normalize.peak(samples, -0.1), sample_rate)

    # Pass 2: DeepFilterNet on top of the FRCRN result (non-PALMTREE clips only).
    if not is_palmtree:
        noisy, _ = load_audio(output_path, sr=df_state.sr())
        denoised = enhance(model, df_state, noisy)
        # save_audio defaults to torch.int16, which causes clipping on some audios.
        save_audio(output_path, denoised, df_state.sr(), dtype=torch.float16)

    # Downsample to 22.05 kHz and overwrite the file with the resampled audio.
    resampled, resampled_rate = librosa.load(output_path, sr=22050)
    sf.write(output_path, resampled, resampled_rate)

    # Final level adjustment, done on the file as it now sits on disk.
    samples, sample_rate = sf.read(output_path)
    if is_palmtree:
        # PALMTREE clips need to be louder: measure integrated loudness with a
        # BS.1770 meter, then normalize to -18 dB LUFS.
        meter = pyln.Meter(sample_rate)
        loudness = meter.integrated_loudness(samples)
        normalized_audio = pyln.normalize.loudness(samples, loudness, -18.0)
    else:
        # Peak-normalize to -0.1 dB; loudness normalization causes a bit of
        # clipping on non-PALMTREE clips.
        normalized_audio = pyln.normalize.peak(samples, -0.1)
    sf.write(output_path, normalized_audio, sample_rate)
# TOMATOFARMER_memberD_train_enhanced.wav was denoised using DeepFilterNet2 (https://huggingface.co/spaces/hshr/DeepFilterNet2), then peak-normalized to -0.1 dB using Adobe Audition


2023-06-13 00:43:35,703 - modelscope - INFO - PyTorch version 2.0.1+cu118 Found.
2023-06-13 00:43:35,709 - modelscope - INFO - Loading ast index from /root/.cache/modelscope/ast_indexer
2023-06-13 00:43:35,727 - modelscope - INFO - Loading done! Current index file version is 1.6.1, with md5 2d7845052188f90f8e56b158b6752809 and a total number of 849 components indexed
  from .autonotebook import tqdm as notebook_tqdm
2023-06-13 00:43:36,812 - modelscope - INFO - initiate model from speech_frcrn_ans_cirm_16k
2023-06-13 00:43:36,814 - modelscope - INFO - initiate model from location speech_frcrn_ans_cirm_16k.
2023-06-13 00:43:36,823 - modelscope - INFO - initialize model from speech_frcrn_ans_cirm_16k


[32m2023-06-13 00:43:37[0m | [1mINFO    [0m | [36mDF[0m | [1mRunning on torch 2.0.1+cu118[0m
[32m2023-06-13 00:43:37[0m | [1mINFO    [0m | [36mDF[0m | [1mRunning on host Desktop[0m
[32m2023-06-13 00:43:37[0m | [1mINFO    [0m | [36mDF[0m | [1mLoading model settings of DeepFilterNet3[0m


fatal: not a git repository (or any of the parent directories): .git


[32m2023-06-13 00:43:37[0m | [1mINFO    [0m | [36mDF[0m | [1mInitializing model `deepfilternet3`[0m
[32m2023-06-13 00:43:39[0m | [1mINFO    [0m | [36mDF[0m | [1mFound checkpoint DeepFilterNet3/checkpoints/model_120.ckpt.best with epoch 120[0m
[32m2023-06-13 00:43:40[0m | [1mINFO    [0m | [36mDF[0m | [1mRunning on device cuda:0[0m
[32m2023-06-13 00:43:40[0m | [1mINFO    [0m | [36mDF[0m | [1mModel loaded[0m


  0%|          | 0/32 [00:00<?, ?it/s]

inputs:(1, 240000)
padding: 24000
inputs after padding:(1, 264000)


  3%|▎         | 1/32 [00:03<01:34,  3.05s/it]

inputs:(1, 240000)
padding: 24000
inputs after padding:(1, 264000)


  6%|▋         | 2/32 [00:03<00:54,  1.81s/it]

inputs:(1, 240000)
padding: 24000
inputs after padding:(1, 264000)


  9%|▉         | 3/32 [00:04<00:41,  1.43s/it]

inputs:(1, 240000)
padding: 24000
inputs after padding:(1, 264000)


 12%|█▎        | 4/32 [00:05<00:34,  1.25s/it]

inputs:(1, 240000)
padding: 24000
inputs after padding:(1, 264000)


 16%|█▌        | 5/32 [00:06<00:30,  1.13s/it]

inputs:(1, 240000)
padding: 24000
inputs after padding:(1, 264000)


 19%|█▉        | 6/32 [00:07<00:27,  1.05s/it]

inputs:(1, 240000)
padding: 24000
inputs after padding:(1, 264000)


 22%|██▏       | 7/32 [00:08<00:26,  1.06s/it]

inputs:(1, 240000)
padding: 24000
inputs after padding:(1, 264000)


 25%|██▌       | 8/32 [00:09<00:25,  1.06s/it]

inputs:(1, 240000)
padding: 24000
inputs after padding:(1, 264000)


 28%|██▊       | 9/32 [00:10<00:24,  1.05s/it]

inputs:(1, 240000)
padding: 24000
inputs after padding:(1, 264000)


 31%|███▏      | 10/32 [00:11<00:22,  1.04s/it]

inputs:(1, 240000)
padding: 24000
inputs after padding:(1, 264000)


 34%|███▍      | 11/32 [00:12<00:21,  1.03s/it]

inputs:(1, 240000)
padding: 24000
inputs after padding:(1, 264000)


 38%|███▊      | 12/32 [00:13<00:20,  1.03s/it]

inputs:(1, 240000)
padding: 24000
inputs after padding:(1, 264000)


 41%|████      | 13/32 [00:15<00:19,  1.03s/it]

inputs:(1, 240000)
padding: 24000
inputs after padding:(1, 264000)


 44%|████▍     | 14/32 [00:16<00:18,  1.03s/it]

inputs:(1, 240000)
padding: 24000
inputs after padding:(1, 264000)


 47%|████▋     | 15/32 [00:17<00:17,  1.03s/it]

inputs:(1, 240000)
padding: 24000
inputs after padding:(1, 264000)


 50%|█████     | 16/32 [00:18<00:16,  1.03s/it]

inputs:(1, 240000)
padding: 24000
inputs after padding:(1, 264000)


 53%|█████▎    | 17/32 [00:19<00:15,  1.03s/it]

inputs:(1, 240000)
padding: 24000
inputs after padding:(1, 264000)


 56%|█████▋    | 18/32 [00:20<00:14,  1.04s/it]

inputs:(1, 240000)
padding: 24000
inputs after padding:(1, 264000)


 59%|█████▉    | 19/32 [00:21<00:13,  1.03s/it]

inputs:(1, 240000)
padding: 24000
inputs after padding:(1, 264000)


 62%|██████▎   | 20/32 [00:22<00:12,  1.03s/it]

inputs:(1, 240000)
padding: 24000
inputs after padding:(1, 264000)


 66%|██████▌   | 21/32 [00:23<00:11,  1.02s/it]

inputs:(1, 240000)
padding: 24000
inputs after padding:(1, 264000)


 69%|██████▉   | 22/32 [00:24<00:10,  1.01s/it]

inputs:(1, 240000)
padding: 24000
inputs after padding:(1, 264000)


 72%|███████▏  | 23/32 [00:25<00:09,  1.01s/it]

inputs:(1, 240000)
padding: 24000
inputs after padding:(1, 264000)


 75%|███████▌  | 24/32 [00:26<00:08,  1.03s/it]

inputs:(1, 240000)
padding: 24000
inputs after padding:(1, 264000)


 78%|███████▊  | 25/32 [00:27<00:06,  1.06it/s]

inputs:(1, 240000)
padding: 24000
inputs after padding:(1, 264000)


 81%|████████▏ | 26/32 [00:27<00:05,  1.11it/s]

inputs:(1, 240000)
padding: 24000
inputs after padding:(1, 264000)


 84%|████████▍ | 27/32 [00:28<00:04,  1.15it/s]

inputs:(1, 240000)
padding: 24000
inputs after padding:(1, 264000)


 88%|████████▊ | 28/32 [00:29<00:03,  1.18it/s]

inputs:(1, 240000)
padding: 24000
inputs after padding:(1, 264000)


 91%|█████████ | 29/32 [00:30<00:02,  1.12it/s]

inputs:(1, 240000)
padding: 24000
inputs after padding:(1, 264000)


 94%|█████████▍| 30/32 [00:31<00:01,  1.08it/s]

inputs:(1, 240000)
padding: 24000
inputs after padding:(1, 264000)


 97%|█████████▋| 31/32 [00:32<00:00,  1.04it/s]

inputs:(1, 240000)
padding: 24000
inputs after padding:(1, 264000)


100%|██████████| 32/32 [00:33<00:00,  1.05s/it]
