In [2]:
from df.enhance import enhance, init_df, load_audio, save_audio
from pathlib import Path
from tqdm import tqdm
import glob
import os
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
import torch
import soundfile as sf
import pyloudnorm as pyln
import librosa

In [12]:
# Directory that receives every WAV file written by this notebook.
OUTPUT_DIR = "output"

# make the OUTPUT_DIR if it doesn't exist
os.makedirs(OUTPUT_DIR, exist_ok=True)

# FRCRN noise-suppression pipeline (ModelScope)
frcrn = pipeline(Tasks.acoustic_noise_suppression, model='speech_frcrn_ans_cirm_16k')

# Load the DeepFilterNet3 model and its processing state
model, df_state, model_name_suffix = init_df(model_base_dir="DeepFilterNet3")

# Get our SpeakerID audio
speakerID_audio_folder = "."
# glob returns matches in arbitrary order; sort them so that audio_paths[0]
# (used by the single-clip cells) and the batch order are reproducible.
audio_paths = sorted(glob.glob(f"{speakerID_audio_folder}/*.wav"))

2024-05-17 11:55:40,559 - modelscope - INFO - initiate model from speech_frcrn_ans_cirm_16k
2024-05-17 11:55:40,560 - modelscope - INFO - initiate model from location speech_frcrn_ans_cirm_16k.
2024-05-17 11:55:40,567 - modelscope - INFO - initialize model from speech_frcrn_ans_cirm_16k


[32m2024-05-17 11:55:41[0m | [1mINFO    [0m | [36mDF[0m | [1mLoading model settings of DeepFilterNet3[0m
[32m2024-05-17 11:55:41[0m | [1mINFO    [0m | [36mDF[0m | [1mUsing DeepFilterNet3 model at C:\Users\alien\AppData\Local\DeepFilterNet\DeepFilterNet\Cache\DeepFilterNet3[0m
[32m2024-05-17 11:55:41[0m | [1mINFO    [0m | [36mDF[0m | [1mInitializing model `deepfilternet3`[0m
[32m2024-05-17 11:55:41[0m | [1mINFO    [0m | [36mDF[0m | [1mFound checkpoint C:\Users\alien\AppData\Local\DeepFilterNet\DeepFilterNet\Cache\DeepFilterNet3\checkpoints\model_120.ckpt.best with epoch 120[0m
[32m2024-05-17 11:55:41[0m | [1mINFO    [0m | [36mDF[0m | [1mRunning on device cuda:0[0m
[32m2024-05-17 11:55:41[0m | [1mINFO    [0m | [36mDF[0m | [1mModel loaded[0m


In [22]:
# Name of the first clip without its directory or extension; it is the stem
# for the output file names built from audio_filename.
first_clip_basename = os.path.basename(audio_paths[0])
audio_filename = os.path.splitext(first_clip_basename)[0]

# Destination of the peak-normalized copy of that clip.
output_path = f"{OUTPUT_DIR}/{audio_filename}-loudnormed.wav"

In [14]:
# Read the first raw clip: samples plus the sample rate stored in the file.
data, rate = sf.read(file=audio_paths[0])

# Peak normalize the *input* clip to -0.1 dB and store it at output_path,
# which the enhancement cells below use as their input file.
# Peak normalization is used instead of loudness normalization because the
# latter caused a bit of clipping.
peak_normalized_audio = pyln.normalize.peak(data, -0.1)
sf.write(output_path, peak_normalized_audio, samplerate=rate)

In [23]:
# Run FRCRN on the peak-normalized clip. The destination is passed as
# output_path; the next cell reads the enhanced audio back from that file.
frcrn_output_path = os.path.join(OUTPUT_DIR, f"{audio_filename}frcrn.wav")
frcrn_result = frcrn(output_path, output_path=frcrn_output_path)

# Keep the 'output_pcm' entry of the pipeline result for inspection.
frcrn_processed = frcrn_result['output_pcm']

inputs:(1, 122463)
padding: 26463
inputs after padding:(1, 148926)


In [24]:
# Where the peak-normalized FRCRN result is stored.
frcrn_loudnormed_output_path = os.path.join(OUTPUT_DIR, f"{audio_filename}frcrn-loudnormed.wav")

# Load the FRCRN-enhanced clip.
data, rate = sf.read(file=frcrn_output_path)

# FRCRN tends to produce very quiet output, so bring the peak up to -0.1 dB.
# Loudness normalization is avoided here because it caused a bit of clipping.
peak_normalized_audio = pyln.normalize.peak(data, -0.1)
sf.write(frcrn_loudnormed_output_path, peak_normalized_audio, samplerate=rate)

In [26]:
# Destination for the DeepFilterNet3-enhanced clip.
df3_output_path = os.path.join(OUTPUT_DIR, f"{audio_filename}df3.wav")

# Load the peak-normalized clip, requesting the sample rate reported by df_state.
df3_sample_rate = df_state.sr()
audio, _ = load_audio(output_path, sr=df3_sample_rate)

# Denoise the audio
enhanced = enhance(model, df_state, audio)

# A non-default dtype is passed on purpose: the default torch.int16 caused
# clipping on some audios.
save_audio(df3_output_path, enhanced, df3_sample_rate, dtype=torch.float16)

  return ta_resample(audio, orig_sr, new_sr, **params)


In [27]:
# Where the peak-normalized DeepFilterNet3 result is stored.
df3_loudnormed_output_path = os.path.join(OUTPUT_DIR, f"{audio_filename}df3-loudnormed.wav")

# Load the DeepFilterNet3-enhanced clip.
data, rate = sf.read(file=df3_output_path)

# Bring the peak to -0.1 dB, the same target used for the other variants.
# Loudness normalization is avoided here because it caused a bit of clipping.
peak_normalized_audio = pyln.normalize.peak(data, -0.1)
sf.write(df3_loudnormed_output_path, peak_normalized_audio, samplerate=rate)

In [None]:
def _peak_normalize_wav(path, peak_db=-0.1):
    """Peak-normalize the audio file at ``path`` in place.

    Peak normalization is used rather than loudness normalization because the
    latter caused a bit of clipping on some clips.

    Args:
        path: Audio file that is read and then overwritten.
        peak_db: Target peak level in dB handed to ``pyln.normalize.peak``.

    Returns:
        True if the file was rewritten; False if it was left untouched because
        it contains no non-zero samples.
    """
    data, rate = sf.read(path)
    # An empty or all-zero signal has no peak to scale from (the gain would be
    # a division by zero), so leave such a file as it is.
    if not data.any():
        return False
    sf.write(path, pyln.normalize.peak(data, peak_db), rate)
    return True


# Enhance each audio: FRCRN -> peak normalize -> DeepFilterNet3 -> peak normalize.
# Every stage overwrites the same file at output_path.
for audio_path in tqdm(audio_paths):
    # get the audio filename without .wav extension
    audio_filename = os.path.splitext(os.path.basename(audio_path))[0]
    output_path = f"{OUTPUT_DIR}/{audio_filename}.wav"

    # First pass: FRCRN noise suppression, result written to output_path
    frcrn(audio_path, output_path=output_path)
    # frcrn tends to output very soft audio, so raise the peak before the second pass
    _peak_normalize_wav(output_path)

    # Second pass: DeepFilterNet3, loading at the sample rate reported by df_state
    audio, _ = load_audio(output_path, sr=df_state.sr())
    # Denoise the audio
    enhanced = enhance(model, df_state, audio)
    # Save for listening
    save_audio(output_path, enhanced, df_state.sr(), dtype=torch.float16)  # default is torch.int16 which causes clipping on some audios

    # Final peak normalization of the fully enhanced clip
    _peak_normalize_wav(output_path)
