In [2]:
from df.enhance import enhance, init_df, load_audio, save_audio
from pathlib import Path
from tqdm import tqdm
import glob
import os
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
import torch
import soundfile as sf
import pyloudnorm as pyln
from scipy.io.wavfile import write
import librosa


# Two-stage speech denoising of every ``val/*.wav`` clip:
#   1. FRCRN (modelscope) noise suppression, then peak normalization.
#   2. DeepFilterNet3 enhancement (skipped for PALMTREE clips).
#   3. Resample to 16 kHz and apply a final level normalization
#      (peak for regular clips, loudness for PALMTREE clips).
# Each stage overwrites ``val_denoised/<name>_enhanced.wav`` in place.

OUTPUT_DIR = Path("val_denoised")
PEAK_TARGET_DB = -0.1          # peak level used after each denoising stage
PALMTREE_TARGET_LUFS = -18.0   # integrated loudness target for PALMTREE clips
TARGET_SAMPLE_RATE = 16000     # sample rate of the final files
PALMTREE_TAG = "PALMTREE"      # file-name marker of clips that skip DeepFilterNet


def _peak_normalize(samples, target_db=PEAK_TARGET_DB):
    """Peak-normalize *samples* to *target_db* dB; return silence unchanged.

    ``pyln.normalize.peak`` scales by ``1 / max(abs(samples))``, so an
    all-zero clip would otherwise be turned into NaN.
    """
    if not samples.any():
        return samples
    return pyln.normalize.peak(samples, target_db)


def _loudness_normalize(samples, rate, target_lufs=PALMTREE_TARGET_LUFS):
    """Loudness-normalize *samples* to *target_lufs* LUFS (BS.1770 meter).

    Silent clips, and clips whose measured loudness is ``-inf``, are
    returned unchanged because the required gain would be infinite.
    """
    if not samples.any():
        return samples
    meter = pyln.Meter(rate)  # create BS.1770 meter
    loudness = meter.integrated_loudness(samples)
    if loudness == float("-inf"):
        return samples
    return pyln.normalize.loudness(samples, loudness, target_lufs)


# make the OUTPUT_DIR if it doesn't exist
os.makedirs(OUTPUT_DIR, exist_ok=True)
frcrn = pipeline(Tasks.acoustic_noise_suppression, model='speech_frcrn_ans_cirm_16k')
# Load the DeepFilterNet3 model from the local checkpoint directory
model, df_state, model_name_suffix = init_df(model_base_dir="DeepFilterNet3/")
# Get our SpeakerID audio; sorted so every run processes files in the same order
speakerID_audio_folder = "val"
audio_paths = sorted(glob.glob(f"{speakerID_audio_folder}/*.wav"))

# Enhance each audio
for audio_path in tqdm(audio_paths):
    # get the audio filename without .wav extension
    audio_filename = os.path.splitext(os.path.basename(audio_path))[0]
    output_path = OUTPUT_DIR / f"{audio_filename}_enhanced.wav"
    # Decide once, from the file name only, so a directory name cannot
    # accidentally match the tag.
    is_palmtree = PALMTREE_TAG in audio_filename

    # Stage 1: FRCRN noise suppression, written straight to output_path
    frcrn(audio_path, output_path=output_path)

    # peak normalize as frcrn tends to output very soft audio
    # (not using loudness norm here as it causes a bit of clipping)
    data, rate = sf.read(output_path)
    sf.write(output_path, _peak_normalize(data), rate)

    # Stage 2: DeepFilterNet3, at the model's own sample rate
    if not is_palmtree:
        audio, _ = load_audio(output_path, sr=df_state.sr())
        enhanced = enhance(model, df_state, audio)
        # default dtype is torch.int16 which causes clipping on some audios
        save_audio(output_path, enhanced, df_state.sr(), dtype=torch.float16)

    # Stage 3: bring the file to 16 kHz and save the resampled version
    audio, sr = librosa.load(output_path, sr=TARGET_SAMPLE_RATE)
    sf.write(output_path, audio, sr)

    # Final level: re-read the resampled file and normalize it
    data, rate = sf.read(output_path)
    if is_palmtree:
        # PALMTREE ones need to be louder: loudness normalize to -18 LUFS
        normalized_audio = _loudness_normalize(data, rate)
    else:
        # loudness norm causes a bit of clipping on non-PALMTREE clips,
        # so these are peak normalized instead
        normalized_audio = _peak_normalize(data)
    sf.write(output_path, normalized_audio, rate)
# NOTE: TOMATOFARMER_memberD_train_enhanced.wav was denoised separately using DeepFilterNet2 (https://huggingface.co/spaces/hshr/DeepFilterNet2), then peak-normalized to -0.1 dB in Adobe Audition.


2023-06-13 23:06:48,248 - modelscope - INFO - initiate model from speech_frcrn_ans_cirm_16k
2023-06-13 23:06:48,252 - modelscope - INFO - initiate model from location speech_frcrn_ans_cirm_16k.
2023-06-13 23:06:48,268 - modelscope - INFO - initialize model from speech_frcrn_ans_cirm_16k


[32m2023-06-13 23:06:49[0m | [1mINFO    [0m | [36mDF[0m | [1mLoading model settings of DeepFilterNet3[0m
[32m2023-06-13 23:06:49[0m | [1mINFO    [0m | [36mDF[0m | [1mInitializing model `deepfilternet3`[0m
[32m2023-06-13 23:06:49[0m | [1mINFO    [0m | [36mDF[0m | [1mFound checkpoint DeepFilterNet3/checkpoints/model_120.ckpt.best with epoch 120[0m
[32m2023-06-13 23:06:49[0m | [1mINFO    [0m | [36mDF[0m | [1mRunning on device cuda:0[0m
[32m2023-06-13 23:06:49[0m | [1mINFO    [0m | [36mDF[0m | [1mModel loaded[0m


  0%|          | 0/5 [00:00<?, ?it/s]

inputs:(1, 240000)
padding: 24000
inputs after padding:(1, 264000)


 20%|██        | 1/5 [00:01<00:05,  1.32s/it]

inputs:(1, 240000)
padding: 24000
inputs after padding:(1, 264000)


 40%|████      | 2/5 [00:02<00:03,  1.02s/it]

inputs:(1, 240000)
padding: 24000
inputs after padding:(1, 264000)


 60%|██████    | 3/5 [00:02<00:01,  1.07it/s]

inputs:(1, 240000)
padding: 24000
inputs after padding:(1, 264000)


 80%|████████  | 4/5 [00:03<00:00,  1.15it/s]

inputs:(1, 240000)
padding: 24000
inputs after padding:(1, 264000)


100%|██████████| 5/5 [00:04<00:00,  1.17it/s]
