In [28]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from whisperx import load_audio
from whisperx.vads import Vad, Silero, Pyannote, SileroCustom

In [29]:
model_name = "openai/whisper-large-v3"
model = WhisperForConditionalGeneration.from_pretrained(model_name, device_map="auto")
processor = WhisperProcessor.from_pretrained(model_name)

In [32]:
vad = SileroCustom(
    device=model.device,
    chunk_size=1000,
    vad_onset=0.3,
    vad_offset=0.2,
    vad_onnx=True,
    silero_merge_cutoff=0.5,
)


>>Performing voice activity detection using Silero...


Using cache found in /home/ubuntu/.cache/torch/hub/snakers4_silero-vad_master


In [33]:
import os
import librosa

audios = []

def get_vad_segments(waveform, sr):
    vad_input = {'waveform': waveform, 'sample_rate': sr}
    vad_segments = vad(vad_input)
    temp = [segment for i, segment in enumerate(vad_segments)]
    print(len(temp))
    vad_segments = vad.merge_chunks(
        vad_segments,
        chunk_size=10,
        onset=0.5,
        offset=0.383,
    )
    audios = []
    for segment in vad_segments:
        start = int(segment['start'] * sr)
        end = int(segment['end'] * sr)
        audios.append(
            waveform[start:end]
        )
    return audios

audios = []
for file in os.listdir("examples") + [
    '/home/ubuntu/v2v-voice-library/data/fisher/audios/000/fe_03_00001.wav',
]:
    if file.endswith(".wav") or file.endswith(".mp3"):
        if file == '/home/ubuntu/v2v-voice-library/data/fisher/audios/000/fe_03_00001.wav':
            waveform, sr = librosa.load(file, sr=None, mono=False)
            if sr != 16000:
                waveform = librosa.resample(waveform, orig_sr=sr, target_sr=16000)
                sr = 16000
            waveform1 = waveform[0]
            waveform2 = waveform[1]
            print(file)
            vad_segments = get_vad_segments(waveform1, sr)
            print(len(vad_segments))
            audios.extend(vad_segments)
            vad_segments = get_vad_segments(waveform2, sr)
            print(len(vad_segments))
            audios.extend(vad_segments)
        else:
            print(file)
            vad_segments = get_vad_segments(load_audio(f"examples/{file}"), 16000)
            print(len(vad_segments))
            audios.extend(vad_segments)
        

tamil.mp3
1
1
kannada.wav


1
1
hindi2.wav
1
1
hindi.wav
1
1
example.wav
3
3
/home/ubuntu/v2v-voice-library/data/fisher/audios/000/fe_03_00001.wav
146
146
125
125


In [18]:
print(len(audios))

2


In [17]:
from whisperx.audio import N_SAMPLES, SAMPLE_RATE, load_audio, log_mel_spectrogram

def preprocess_audio(audio_data):
    """Preprocess audio data for the model."""
    # The HF WhisperFeatureExtractor uses 80 mel bins by default
    # Access it from the feature_extractor's config
    
    n_mels = 128  if 'v3' in model_name else 80 # Default value for Whisper models
    if hasattr(processor, "feature_extractor") and hasattr(processor.feature_extractor, "config"):
        n_mels = getattr(processor.feature_extractor.config, "num_mel_bins", 80)
    
    features = log_mel_spectrogram(
        audio_data,
        n_mels=n_mels,
        padding=N_SAMPLES - audio_data.shape[0] if audio_data.shape[0] < N_SAMPLES else 0,
    )
    # Convert features to match model's dtype
    return features.to(device=model.device, dtype=model.dtype)

In [15]:
print(audios)

[array([-3.3992382e-03, -2.3423466e-03, -5.0927320e-04, ...,
       -3.0292205e-05,  2.3657558e-06,  4.1739375e-05],
      shape=(9668032,), dtype=float32), array([-0.06818755, -0.05907486, -0.03997428, ...,  0.00322126,
        0.00317312,  0.00298542], shape=(9534400,), dtype=float32)]


In [10]:
import torch
input_features = []
for audio in audios:
    input_features.append(preprocess_audio(audio))

# input_features = torch.stack(input_features).to(model.device)


In [14]:
print(len(input_features))

2


In [22]:
gen_kwargs = {
    "max_new_tokens": None,
    "num_beams": 5,
    "num_return_sequences": 1,
    "temperature": 0.0,
    "repetition_penalty": 1.0,
    "no_repeat_ngram_size": 0,
    "length_penalty": 1.0,
}

outputs = []
for batch in torch.split(input_features, 16):
    with torch.no_grad():
        output = model.generate(
            batch,
            **gen_kwargs
        )
    print(output)
    torch.cuda.empty_cache()
    break

tensor([[12008,  4516,  8094,  ...,    11, 12008,  4516],
        [23656, 17167,  2133,  ..., 50257, 50257, 50257],
        [34725,   229, 11891,  ..., 50257, 50257, 50257],
        ...,
        [ 1222,  1315,   311,  ...,  1044,   291,    13],
        [ 2425,    11, 13285,  ..., 50257, 50257, 50257],
        [ 1012,   366,   291,  ..., 50257, 50257, 50257]], device='cuda:0')


In [13]:
text = processor.decode(output[0], skip_special_tokens=False)

In [21]:
text = processor.decode(output[-4], skip_special_tokens=False)
print(text)

 eating at home. Thank you. Thank you. Thank you. Thank you. Thank you. Thank you. Thank you. Thank you. Thank you. Thank you. Thank you. Thank you. Thank you. Thank you. Thank you. Thank you. Thank you. Thank you. Thank you. Thank you. Thank you. Thank you. Thank you. Thank you. Thank you. Thank you. Thank you. Thank you. Thank you. Thank you. Thank you. Thank you. Thank you. Thank you. Thank you. Thank you. Thank you. Thank you. Thank you. Thank you. Thank you. Thank you. Thank you. Thank you. Thank you. Thank you. Thank you. Thank you. Thank you. Thank you. Thank you. Thank you. Thank you. Thank you. Thank you. Thank you. Thank you. Thank you. Thank you. Thank you. Thank you. Thank you. Thank you. Thank you. Thank you. Thank you. Thank you. Thank you. Thank you. Thank you. Thank you. Thank you. Thank you. Thank you. Thank you. Thank you. Thank you. Thank you. Thank you. Thank you. Thank you. Thank you. Thank you. Thank you. Thank you. Thank you. Thank you. Thank you. Thank you. Than