In [19]:
import urllib.request

# Remote location of the noisy speech sample, and the local name it is stored under.
url = "https://raw.githubusercontent.com/bpbpublications/Building-Transformer-Models-with-PyTorch/main/chapter10/audio_noisy.wav"
filename = "audio_noisy.wav"

# Fetch the remote file and write it to `filename`.
# The call returns a (local_path, headers) tuple, which becomes the cell output.
urllib.request.urlretrieve(url=url, filename=filename)

('audio_noisy.wav', <http.client.HTTPMessage at 0x1908224d0>)

In [20]:
import torch
import torchaudio
from speechbrain.pretrained import SpectralMaskEnhancement
from IPython.display import Audio

# Sampling rate (Hz) that the audio is resampled to and played back at in this cell.
# The pre-trained SpeechBrain enhancement model is expected to take 16 kHz input.
TARGET_SAMPLE_RATE = 16000

# Load the pre-trained MetricGAN+ speech-enhancement model; `savedir` is the
# local directory passed to SpeechBrain for the fetched model files.
enhance_model = SpectralMaskEnhancement.from_hparams(
    source="speechbrain/metricgan-plus-voicebank",
    savedir="pretrained_models/metricgan-plus-voicebank",
)

# Read the downloaded clip: `waveform` is the audio tensor, `sample_rate` its rate.
waveform, sample_rate = torchaudio.load(filename)

# Down-mix to a single channel by averaging over dim 0 (the channel axis).
# This always runs; for audio that is already mono the mean of one channel
# leaves the samples unchanged.
waveform = torch.mean(waveform, dim=0, keepdim=True)

# Resample only when the file is not already at the target rate.
if sample_rate != TARGET_SAMPLE_RATE:
    resampler = torchaudio.transforms.Resample(
        orig_freq=sample_rate, new_freq=TARGET_SAMPLE_RATE
    )
    waveform = resampler(waveform)

# Peak-normalise so the largest absolute sample becomes 1.
# A completely silent clip has a peak of 0; dividing by it would fill the
# signal with NaNs, so in that case the waveform is passed through unchanged.
peak = torch.max(torch.abs(waveform))
noisy = waveform / peak if peak > 0 else waveform

# Listen to the noisy audio
print("Noisy audio:")
display(Audio(noisy.squeeze().detach().numpy(), rate=TARGET_SAMPLE_RATE))


Noisy audio:


In [21]:
# Run the enhancement model on the prepared clip. `lengths` is the relative
# length tensor the model takes alongside the batch; the single clip here is
# given 1.0. Inference only, so no autograd graph is needed.
with torch.no_grad():
    enhanced = enhance_model.enhance_batch(noisy, lengths=torch.tensor([1.]))

# Save the enhanced signal to disk (written once; it is read back just below).
torchaudio.save('enhanced.wav', enhanced.cpu(), 16000)

# Load the saved file and listen to the enhanced audio.
print("Enhanced audio:")
enhanced_audio = torchaudio.load('enhanced.wav')[0]
display(Audio(enhanced_audio.detach().numpy(), rate=16000))

Enhanced audio:
