In [48]:
# Install the notebook's third-party dependencies into the active kernel's environment.
# NOTE(review): versions are unpinned, so a fresh install may resolve to different
# releases than the ones that produced the recorded outputs; torchaudio is installed
# here but is not imported by any cell visible in this notebook.
%pip install transformers torch torchaudio soundfile noisereduce numpy

Note: you may need to restart the kernel to use updated packages.


In [49]:
import torch
import noisereduce as nr
import soundfile as sf
import numpy as np
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC


In [50]:
# Single checkpoint identifier shared by the CTC model and its processor.
# Per the original author's note, this checkpoint is fine-tuned on noisy
# telephone-speech data and trained to be robust.
MODEL_ID = "facebook/wav2vec2-large-robust-ft-swbd-300h"

model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)
processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)


Some weights of the model checkpoint at facebook/wav2vec2-large-robust-ft-swbd-300h were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-robust-ft-swbd-300h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You s

In [56]:
# Transcribe one recording after peak normalisation and noise reduction.
audio_path = "data/audio/audio_3499.wav"
waveform, sample_rate = sf.read(audio_path)

# Down-mix multi-channel audio to mono by averaging the channels.
if waveform.ndim > 1:
    waveform = waveform.mean(axis=1)

# Fail early with a readable message instead of an opaque NumPy reduction error.
if waveform.size == 0:
    raise ValueError(f"{audio_path} contains no audio samples")

# Peak-normalise to the range [-1.0, 1.0]. A silent clip has a peak of 0, and
# dividing by it would fill the waveform with NaNs, so such a clip is left as is.
peak = np.max(np.abs(waveform))
if peak > 0:
    waveform = waveform / peak

# Apply noise reduction
waveform_denoised = nr.reduce_noise(y=waveform, sr=sample_rate)

# Give the processor a 1-D float32 NumPy array. It adds the batch dimension
# itself, so input_values comes back as [batch_size, sequence_length] and no
# manual unsqueeze/squeeze is needed. Wrapping the signal in a [1, T] torch
# tensor first makes the processor return [1, 1, T] instead.
waveform_denoised = np.asarray(waveform_denoised, dtype=np.float32)
input_values = processor(
    waveform_denoised,
    sampling_rate=sample_rate,
    return_tensors="pt",
    padding=True,
).input_values

# Retrieve logits (inference only, so gradient tracking is disabled)
with torch.no_grad():
    logits = model(input_values).logits

# Greedy CTC decoding: take the most likely token per frame, then decode
predicted_ids = torch.argmax(logits, dim=-1)
transcription = processor.batch_decode(predicted_ids)

# Print the transcription
print(transcription)

['HATTING IN FEDA ORT A LI DOTIGAD IS LACK AND BONII UTWEDO DEPLOY IS ANGTY AT ORG']


In [57]:
# Baseline: transcribe the recording straight from disk, with no peak
# normalisation or noise reduction applied.
audio_path = "data/audio/audio_3499.wav"
audio_input, sample_rate = sf.read(audio_path)

# Down-mix multi-channel audio to mono by averaging the channels; without this
# a stereo file would reach the processor as a [T, 2] array.
if audio_input.ndim > 1:
    audio_input = audio_input.mean(axis=1)

# The processor works on 1-D NumPy arrays and adds the batch dimension itself,
# so no torch tensor round trip (unsqueeze followed by squeeze) is needed.
audio_input = np.asarray(audio_input, dtype=np.float32)

# Process the audio input
inputs = processor(audio_input, sampling_rate=sample_rate, return_tensors="pt", padding=True)

# Perform ASR (inference only, so gradient tracking is disabled)
with torch.no_grad():
    logits = model(inputs.input_values).logits

# Decode the output: most likely token per frame, then CTC decoding
predicted_ids = torch.argmax(logits, dim=-1)
transcription = processor.batch_decode(predicted_ids)
print(transcription)

['HEDING IS SEA FORTFA  DATIGA IS BLACK AND GONLAI  C ELOY IS ATY AT RD']
