## Dependencies Installation

In [1]:
# %pip install torch torchaudio transformers datasets jiwer librosa matplotlib
# Uncomment the line above if the required packages are not installed yet.

Collecting torchaudio
  Using cached torchaudio-2.6.0-cp311-cp311-win_amd64.whl.metadata (6.7 kB)
Collecting transformers
  Using cached transformers-4.48.2-py3-none-any.whl.metadata (44 kB)
Collecting datasets
  Using cached datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting jiwer
  Using cached jiwer-3.1.0-py3-none-any.whl.metadata (2.6 kB)
Collecting librosa
  Using cached librosa-0.10.2.post1-py3-none-any.whl.metadata (8.6 kB)
Collecting huggingface-hub<1.0,>=0.24.0 (from transformers)
  Using cached huggingface_hub-0.28.1-py3-none-any.whl.metadata (13 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Using cached tokenizers-0.21.0-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Using cached multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting numba>=0.51.0 (from librosa)
  Using cached numba-0.61.0-cp311-cp311-win_amd64.whl.metadata (2.8 kB)
Collecting soundfile>=0.12.1 (from librosa)
  Using cached sou


[notice] A new release of pip is available: 24.3.1 -> 25.0
[notice] To update, run: python.exe -m pip install --upgrade pip


## Importing Libraries

In [None]:
import jiwer                     # word error rate (WER)
import librosa                   # audio analysis (can be an alternative to torchaudio)
import matplotlib.pyplot as plt  # waveform visualization
import torch                     # deep learning (PyTorch's importable module is `torch`, not `pytorch`)
import torch as pytorch          # alias so existing `pytorch.` references keep resolving
import torchaudio                # audio processing

from transformers import WhisperProcessor, WhisperForConditionalGeneration  # Whisper model

## Model Loading

In [None]:
# Checkpoint to load; the "medium" size is chosen for better accuracy on Cebuano.
model_type = "openai/whisper-medium"
processor = WhisperProcessor.from_pretrained(model_type)
model = WhisperForConditionalGeneration.from_pretrained(model_type)

# Use the GPU when one is available, otherwise fall back to the CPU. The device is
# kept in a variable so input tensors can later be moved to the same device as the model.
device = "cuda" if torch.cuda.is_available() else "cpu"
# `Module.to` returns the module itself; assigning the result also keeps the cell from
# echoing the (very long) model repr as its output.
model = model.to(device)

## Audio Preprocessing

In [None]:
def load_audio(path, target_sr=16000):
    """Load an audio file as a mono waveform sampled at ``target_sr`` Hz.

    Args:
        path: Location of the audio file, passed straight to ``torchaudio.load``.
        target_sr: Sampling rate of the returned waveform in Hz. Defaults to
            16000, the rate Whisper requires.

    Returns:
        A 1-D NumPy array holding the mono samples at ``target_sr``.
    """
    waveform, sr = torchaudio.load(path)  # waveform is laid out as (channels, frames)

    # Downmix to a single channel by averaging. A plain `squeeze()` only drops a
    # singleton channel axis, so multi-channel files would come back 2-D.
    # For mono input the mean over one channel leaves the samples unchanged.
    waveform = waveform.mean(dim=0, keepdim=True)

    # Only build a resampler when the file is not already at the target rate.
    if sr != target_sr:
        waveform = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sr)(waveform)

    # Drop the (now always singleton) channel axis and convert tensor -> NumPy array.
    return waveform.squeeze(0).numpy()

In [None]:
# NOTE(review): this looks like a placeholder file name; presumably it should be
# replaced with the path of the audio file to transcribe. Confirm before running.
path = "sound_file.sound"
# Load the audio through load_audio() defined above.
audio = load_audio(path)