In [1]:
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import torch
import torchaudio

In [9]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-hi")
Tmodel = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-hi")

In [11]:
import torch
import torchaudio
import sounddevice as sd
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import noisereduce as nr

# Load pre-trained model and processor
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")

# Parameters for recording
duration = 5  # seconds
sampling_rate = 16000

# Function to normalize the audio
def normalize(audio):
    audio = audio / torch.max(torch.abs(audio))
    return audio

# Function to preprocess the audio
def preprocess_audio(audio, sampling_rate):
    # Resample if necessary
    if sampling_rate != 16000:
        resampler = torchaudio.transforms.Resample(sampling_rate, 16000)
        audio = resampler(audio)
        sampling_rate = 16000
    
    # Convert to numpy array for noise reduction
    audio_np = audio.numpy()

    # Apply noise reduction
    reduced_noise_array = nr.reduce_noise(y=audio_np, sr=sampling_rate)
    
    # Convert back to tensor
    audio = torch.tensor(reduced_noise_array)
    
    # Normalize the audio
    audio = normalize(audio)
    
    return audio, sampling_rate


# Record audio from the microphone
src_text = []
print("Recording...")
speech_array = sd.rec(int(duration * sampling_rate), samplerate=sampling_rate, channels=1, dtype='float32')
sd.wait()  # Wait until recording is finished
print("Recording finished.")

# Convert the recorded speech to a tensor
speech_array = torch.tensor(speech_array).squeeze()

# Preprocess the audio
speech_array, sampling_rate = preprocess_audio(speech_array, sampling_rate)

# Process input values
input_values = processor(speech_array, sampling_rate=sampling_rate, return_tensors="pt", padding="longest").input_values

# Perform ASR
with torch.no_grad():
    logits = model(input_values).logits

# Decode the predicted ids to text
predicted_ids = torch.argmax(logits, dim=-1)
transcription = processor.decode(predicted_ids[0])
print("Transcription:", transcription)

src_text.append(str(transcription))
translated = Tmodel.generate(**tokenizer(src_text, return_tensors="pt", padding=True))
[tokenizer.decode(t, skip_special_tokens=True) for t in translated]

Some weights of the model checkpoint at facebook/wav2vec2-base-960h were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You sho

Recording...
Recording finished.
Transcription: AY MYTY ISATE WORTHRO


['सच्ची खुशी और संतोष मिलता है']