In [2]:
from pathlib import Path

import torch
import librosa
from datasets import Audio
from transformers import pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor


token_path = Path('token.txt')
token = token_path.read_text().strip()

device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
model_id = "openai/whisper-large-v2"

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
    attn_implementation="sdpa",
)

model.safetensors: 100%|██████████| 6.17G/6.17G [01:31<00:00, 67.3MB/s]
generation_config.json: 100%|██████████| 4.26k/4.26k [00:00<00:00, 5.02MB/s]


In [4]:
model.to(device)

WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(80, 1280, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(1280, 1280, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 1280)
      (layers): ModuleList(
        (0-31): 32 x WhisperEncoderLayer(
          (self_attn): WhisperSdpaAttention(
            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)
            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)
            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)
            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1280, out_features=5120, bias=True)
          (fc2): Linear(in_features=5120, out_features=1280, bias

In [5]:
processor = AutoProcessor.from_pretrained(model_id)

preprocessor_config.json: 100%|██████████| 185k/185k [00:00<00:00, 8.87MB/s]
tokenizer_config.json: 100%|██████████| 805/805 [00:00<00:00, 8.57MB/s]
vocab.json: 100%|██████████| 1.04M/1.04M [00:00<00:00, 27.8MB/s]
tokenizer.json: 100%|██████████| 2.48M/2.48M [00:00<00:00, 12.9MB/s]
merges.txt: 100%|██████████| 494k/494k [00:00<00:00, 52.4MB/s]
normalizer.json: 100%|██████████| 52.7k/52.7k [00:00<00:00, 50.8MB/s]
added_tokens.json: 100%|██████████| 34.6k/34.6k [00:00<00:00, 99.5MB/s]
special_tokens_map.json: 100%|██████████| 2.08k/2.08k [00:00<00:00, 21.6MB/s]


In [6]:
asr = pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        max_new_tokens=128,
        chunk_length_s=15,
        batch_size=4,
        torch_dtype=torch_dtype,
        device=device,
    )

In [7]:
audio_path = "/Users/amansingh/Documents/ssr_audio_dataset/audio/6412ba078a3197d8284889fa.wav"

In [8]:
waveform, _ = librosa.load(audio_path)

In [12]:
print(waveform)

[2.1837007e-03 2.5585922e-03 2.2247941e-03 ... 3.7024587e-05 5.7741247e-05
 0.0000000e+00]


In [14]:
audio_dict = {
        'array': waveform,
        'path': audio_path,
        'sampling_rate': 16000
    }


In [15]:
audio_dict

{'array': array([2.1837007e-03, 2.5585922e-03, 2.2247941e-03, ..., 3.7024587e-05,
        5.7741247e-05, 0.0000000e+00], dtype=float32),
 'path': '/Users/amansingh/Documents/ssr_audio_dataset/audio/6412ba078a3197d8284889fa.wav',
 'sampling_rate': 16000}

In [18]:
from datasets import Dataset
audio_dataset = Dataset.from_dict({"audio": [audio_path]}).cast_column("audio", Audio(sampling_rate=16000))

In [19]:
audio_dataset

Dataset({
    features: ['audio'],
    num_rows: 1
})

In [21]:
audio_dataset[0]["audio"]

{'path': '/Users/amansingh/Documents/ssr_audio_dataset/audio/6412ba078a3197d8284889fa.wav',
 'array': array([ 2.31933594e-03,  2.38037109e-03,  2.31933594e-03, ...,
        -3.05175781e-05,  0.00000000e+00,  6.10351562e-05]),
 'sampling_rate': 16000}

In [22]:
asr(audio_dataset[0]["audio"])

KeyboardInterrupt: 