### Automatic Speech Recognition with Whisper

In [1]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Path to the WAV recording that the transcription cells in this notebook read.
sample_audio = "Sample Board Recording.wav"

In [3]:
# Pick the execution target once: first CUDA GPU in half precision when one is
# available, otherwise CPU in single precision.
if torch.cuda.is_available():
    device, torch_dtype = "cuda:0", torch.float16
else:
    device, torch_dtype = "cpu", torch.float32

# Checkpoint identifier shared by the model and processor loaders below.
model_id = "distil-whisper/distil-small.en"

# Load the speech seq2seq model in the chosen dtype, then move it to the device.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    use_safetensors=True,
)
model.to(device)

# The processor supplies the tokenizer and feature extractor used by the pipeline.
processor = AutoProcessor.from_pretrained(model_id)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
# Assemble the speech-recognition pipeline from the model and processor parts
# created in the previous cell.
pipe = pipeline(
    task="automatic-speech-recognition",
    # Model plus the processor's two components.
    model=model,
    feature_extractor=processor.feature_extractor,
    tokenizer=processor.tokenizer,
    # Same device and dtype that were selected when the model was loaded.
    device=device,
    torch_dtype=torch_dtype,
    # Generation limit passed through to the pipeline.
    max_new_tokens=128,
)

In [5]:
# Transcribe the sample recording with the pipeline built above; the returned
# dict (with a 'text' entry) is displayed as the cell output.
pipe(sample_audio)

{'text': " Good morning everyone. Let's call this meeting to order. First on our agenda is the review of last quarter's financial performance. CFO, could you please lead us through the main highlights? Thank you. I'm pleased to report that we've seen a 7% increase in revenue compared to the previous quarter. This is largely due to our successful launch of the new product line, which has been well received in the market."}

In [8]:
# Second pass over the same recording, this time with openai/whisper-large-v3
# and a pipeline configured for chunked, batched decoding with timestamps.
if torch.cuda.is_available():
    device, torch_dtype = "cuda:0", torch.float16
else:
    device, torch_dtype = "cpu", torch.float32

model_id = "openai/whisper-large-v3"

# Load the model in the chosen dtype, then move it to the selected device.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
)
model.to(device)

processor = AutoProcessor.from_pretrained(model_id)

pipe = pipeline(
    task="automatic-speech-recognition",
    # Model plus the processor's two components.
    model=model,
    feature_extractor=processor.feature_extractor,
    tokenizer=processor.tokenizer,
    # Same device and dtype the model was loaded with.
    device=device,
    torch_dtype=torch_dtype,
    # Generation limit passed through to the pipeline.
    max_new_tokens=128,
    # Chunking, batching and timestamp options.
    chunk_length_s=30,
    batch_size=16,
    return_timestamps=True,
)

# Transcribe the sample, forwarding the language choice to generation.
result = pipe(sample_audio, generate_kwargs={"language": "english"})

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [9]:
# Display the transcription held in `result`: the full text plus the
# timestamped chunks.
result

{'text': " Good morning everyone. Let's call this meeting to order. First on our agenda is the review of last quarter's financial performance. CFO, could you please lead us through the main highlights? Thank you. I'm pleased to report that we've seen a 7% increase in revenue compared to the previous quarter. This is largely due to our successful launch of the new product line, which has been well received in the market.",
 'chunks': [{'timestamp': (0.0, 8.0),
   'text': " Good morning everyone. Let's call this meeting to order. First on our agenda is the review of last quarter's financial performance."},
  {'timestamp': (8.0, 12.0),
   'text': ' CFO, could you please lead us through the main highlights?'},
  {'timestamp': (12.0, 19.0),
   'text': " Thank you. I'm pleased to report that we've seen a 7% increase in revenue compared to the previous quarter."},
  {'timestamp': (19.0, 25.46),
   'text': ' This is largely due to our successful launch of the new product line, which has been w

## To Do: Add pyannote speaker diarization to the transcription details

# Speaker diarization with pyannote.audio.
import os

from pyannote.audio import Pipeline

# Instantiate the diarization pipeline.
# It is bound to its own name so that it does not shadow `pipeline` imported
# from transformers at the top of the notebook, which the speech-recognition
# cells call.
# The Hugging Face access token is read from the HF_TOKEN environment variable
# rather than being written into the notebook. If the variable is unset, None
# is passed.
# NOTE(review): confirm the installed pyannote.audio / huggingface_hub versions
# fall back to a cached login when the token is None.
diarization_pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1",
    use_auth_token=os.environ.get("HF_TOKEN"),
)

# Run the pipeline on an audio file.
# NOTE(review): "audio.wav" looks like a placeholder path; presumably this
# should become `sample_audio` once diarization is wired in.
diarization = diarization_pipeline("audio.wav")