<a href="https://colab.research.google.com/github/brendanfitz/fitz-ai/blob/main/Personal_Voice_Memo_Transcriber.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!apt-get update
!apt-get install ffmpeg -y
!pip install torchaudio pydub

In [None]:
from pathlib import Path
import datetime as dt
import numpy as np
import torchaudio
from transformers import pipeline
from pydub import AudioSegment
import torch

# Folder whose files are transcribed by the loop at the end of the notebook.
# NOTE(review): the path lives under /content/drive, so presumably Google Drive must
# already be mounted in the runtime before this notebook is run — confirm.
audio_wd = Path('/content/drive/MyDrive/Voice Memos')

In [None]:
# Build the Whisper large-v3 speech-recognition pipeline once; later cells reuse `pipe`.
# The device is chosen explicitly so the model runs on the GPU whenever one is available,
# instead of depending on the installed transformers version's default device.
pipe = pipeline(
  'automatic-speech-recognition',
  model='openai/whisper-large-v3',
  device='cuda:0' if torch.cuda.is_available() else 'cpu',
)

In [None]:
def read_audio(file_path, audio_format="m4a"):
  """Decode an audio file into a mono float32 waveform for the ASR pipeline.

  The file is decoded with pydub, scaled from integer PCM to floating point,
  down-mixed to a single channel, and resampled to the rate reported by
  ``pipe.feature_extractor.sampling_rate``.

  Args:
    file_path: Path (or string) of the audio file to decode.
    audio_format: Container format handed to pydub. Defaults to "m4a",
      the value that was previously hard-coded.

  Returns:
    1-D ``np.float32`` array of samples at the model's sampling rate.
  """
  # Get the sampling rate expected by the model
  sampling_rate = pipe.feature_extractor.sampling_rate

  audio = AudioSegment.from_file(str(file_path), format=audio_format)

  # pydub exposes integer PCM samples. The speech-recognition pipeline expects a
  # float waveform in roughly [-1, 1], so divide by the full-scale value for the
  # sample width (e.g. 32768 for 16-bit audio) instead of feeding raw integers.
  full_scale = np.float32(1 << (8 * audio.sample_width - 1))
  audio_np = np.array(audio.get_array_of_samples()).astype(np.float32) / full_scale

  # Samples of multi-channel audio are interleaved; average the channels of each
  # frame to obtain the mono signal the model needs (works for any channel count).
  if audio.channels > 1:
    audio_np = audio_np.reshape((-1, audio.channels)).mean(axis=1)

  # Resample if necessary (pydub's default is often 44.1kHz, whisper needs 16kHz)
  if audio.frame_rate != sampling_rate:
    # torchaudio operates on tensors, so round-trip through torch for the resampling step
    audio_tensor = torch.from_numpy(audio_np)
    resampler = torchaudio.transforms.Resample(orig_freq=audio.frame_rate, new_freq=sampling_rate)
    audio_np = resampler(audio_tensor).numpy()

  return audio_np

In [None]:
# Transcribe every file in the voice-memo folder and print each transcript.
# sorted() makes the processing order deterministic across runs.
for file_path in sorted(audio_wd.glob('*')):
  # glob('*') also yields sub-directories; only regular files can be decoded.
  if not file_path.is_file():
    continue

  try:
    audio_np = read_audio(file_path)
    # NOTE(review): return_timestamps=True is presumably what lets Whisper handle
    # memos longer than a single 30 s window — confirm against the pipeline docs.
    result = pipe(audio_np, return_timestamps=True)
  except ValueError:
    # Show which file failed, then re-raise with the original traceback intact.
    print(file_path.stem)
    raise

  print(file_path.stem)
  print('-' * 40)
  print(result['text'].strip())