In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [None]:
!pip install transformers accelerate peft bitsandbytes evaluate

In [None]:
from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login. The Hub notice printed further down in this notebook
# states that authentication is recommended but optional for public models and datasets.
notebook_login()

In [None]:
from peft import PeftModel, PeftConfig
from transformers import (
    BitsAndBytesConfig,
    Seq2SeqTrainer,
    WhisperForConditionalGeneration,
    WhisperProcessor,
    WhisperTokenizer,
    pipeline,
)

# LoRA adapter to evaluate; its config records which base checkpoint it was trained on.
peft_model_id = "laksf/whisper-large-v3-LORA"
peft_config = PeftConfig.from_pretrained(peft_model_id)

# Load the base model in 8-bit. Passing `load_in_8bit=True` directly to `from_pretrained`
# is deprecated (transformers prints a warning); the supported route is a
# `BitsAndBytesConfig` handed over through `quantization_config`.
model = WhisperForConditionalGeneration.from_pretrained(
    peft_config.base_model_name_or_path,
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    device_map="auto",
)
# Wrap the quantized base model with the fine-tuned adapter weights.
model = PeftModel.from_pretrained(model, peft_model_id)

task = "transcribe"
tokenizer = WhisperTokenizer.from_pretrained(peft_config.base_model_name_or_path, task=task)
processor = WhisperProcessor.from_pretrained(peft_config.base_model_name_or_path, task=task)
feature_extractor = processor.feature_extractor

# ASR pipeline around the adapter-wrapped model; used below to transcribe the evaluation set.
pipe = pipeline("automatic-speech-recognition", model=model, tokenizer=tokenizer, feature_extractor=feature_extractor)
#forced_decoder_ids = tokenizer.get_decoder_prompt_ids(language="ha", task="transcribe")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


adapter_config.json:   0%|          | 0.00/771 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.27k [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.90k [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/63.0M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.07k [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


preprocessor_config.json:   0%|          | 0.00/340 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
#audio = "/content/harvard.wav"
#text = pipe(audio, generate_kwargs={"forced_decoder_ids": forced_decoder_ids})["text"]
#text

'The stale smell of old beer lingers. It takes heat to bring out the odor. A cold dip restores health and zest. A salt pickle tastes fine with ham. Tacos al pastor are my favorite. A zestful food is the hot cross bun.'

In [None]:
!pip install -U openai-whisper

In [None]:
!pip install datasets

In [None]:
import io
import os
import numpy as np

try:
    import tensorflow  # required in Colab to avoid protobuf compatibility issues
except ImportError:
    pass

import torch
import pandas as pd
import urllib
import tarfile
import whisper
import torchaudio

from scipy.io import wavfile
from tqdm.notebook import tqdm


# Let pandas render up to 100 rows and up to 1000 characters per cell, so long
# transcriptions are not truncated when a frame is displayed.
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_colwidth", 1000)

# Prefer the GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

In [None]:
def download(url: str, target_path: str):
    """Stream `url` to `target_path` in 8 KiB chunks while showing a progress bar.

    The data is first written to `<target_path>.part` and only moved to
    `target_path` once the whole body has been read, so an interrupted download
    never leaves a truncated file that a later `os.path.exists` check would
    mistake for a complete one.

    Args:
        url: Address to fetch; opened with `urllib.request.urlopen`.
        target_path: Destination file path; its directory must already exist.
    """
    # `import urllib` by itself does not guarantee that the `request` submodule is loaded.
    import urllib.request

    partial_path = f"{target_path}.part"
    with urllib.request.urlopen(url) as source, open(partial_path, "wb") as output:
        # The header may be absent (e.g. chunked responses); tqdm accepts total=None
        # and then shows a counter without a percentage.
        content_length = source.info().get("Content-Length")
        total = int(content_length) if content_length is not None else None

        with tqdm(total=total, ncols=80, unit='iB', unit_scale=True, unit_divisor=1024) as loop:
            while True:
                buffer = source.read(8192)
                if not buffer:
                    break

                output.write(buffer)
                loop.update(len(buffer))

    # Publish the finished file under its final name only after a complete read.
    os.replace(partial_path, target_path)


class Fleurs(torch.utils.data.Dataset):
    """
    A simple class to wrap Fleurs and subsample a portion of the dataset as needed.

    On construction the language archive is downloaded (unless already cached under
    `~/.cache/fleurs/`), the `<split>.tsv` label table is parsed and every `.wav`
    file of the split is decoded into memory.

    Args:
        lang: FLEURS language code used in the archive name (e.g. "ha_ng").
        split: Which split to load; selects `<split>.tsv` and the `/<split>/` audio folder.
        subsample_rate: Keep every `subsample_rate`-th label record.
        device: Stored on the instance as `self.device`; not used elsewhere in this class.

    Raises:
        FileNotFoundError: If the archive contains no `<split>.tsv` label file.
    """
    def __init__(self, lang, split="test", subsample_rate=1, device=DEVICE):
        url = f"https://storage.googleapis.com/xtreme_translations/FLEURS102/{lang}.tar.gz"
        tar_path = os.path.expanduser(f"~/.cache/fleurs/{lang}.tgz")
        os.makedirs(os.path.dirname(tar_path), exist_ok=True)

        # Reuse a previously downloaded archive when present.
        if not os.path.exists(tar_path):
            download(url, tar_path)

        # Stays None when the archive has no label table for this split, so the
        # failure below is explicit instead of an UnboundLocalError.
        labels = None
        all_audio = {}
        with tarfile.open(tar_path, "r:gz") as tar:
            for member in tar.getmembers():
                name = member.name
                if name.endswith(f"{split}.tsv"):
                    labels = pd.read_table(tar.extractfile(member), names=("id", "file_name", "raw_transcription", "transcription", "_", "num_samples", "gender"))

                if f"/{split}/" in name and name.endswith(".wav"):
                    audio_bytes = tar.extractfile(member).read()
                    # wavfile.read returns (sample_rate, samples); only the samples are kept,
                    # keyed by bare file name to match the `file_name` column of the labels.
                    # NOTE(review): sample rate and dtype are not checked here — confirm the
                    # downstream models receive audio in the format they expect.
                    all_audio[os.path.basename(name)] = wavfile.read(io.BytesIO(audio_bytes))[1]

        if labels is None:
            raise FileNotFoundError(f"No '{split}.tsv' label file found in {tar_path}")

        self.labels = labels.to_dict("records")[::subsample_rate]
        self.all_audio = all_audio
        self.device = device

    def __len__(self):
        """Number of (subsampled) label records."""
        return len(self.labels)

    def __getitem__(self, item):
        """Return `(audio, text)` for record `item`: the samples as a tensor and the `transcription` field."""
        record = self.labels[item]
        # Copy so the tensor does not share memory with the cached array.
        audio = torch.from_numpy(self.all_audio[record["file_name"]].copy())
        text = record["transcription"]

        return (audio, text)

In [None]:
# Hausa (Nigeria) portion of FLEURS; keeping every 10th record gives a ~10% sample for a quick demo.
lang = "ha_ng"
dataset = Fleurs(lang=lang, subsample_rate=10)

  0%|                                              | 0.00/3.14G [00:00<?, ?iB/s]

In [None]:
# Transcribe every sampled utterance with the fine-tuned pipeline and keep the
# hypothesis next to its reference text.
ft_references, ft_transcriptions = [], []

for waveform, reference_text in tqdm(dataset):
    # The pipeline is fed the raw samples as a NumPy array.
    ft_result = pipe(waveform.numpy())
    ft_transcriptions.append(ft_result["text"])
    ft_references.append(reference_text)

  0%|          | 0/63 [00:00<?, ?it/s]

In [None]:
# Side-by-side table of reference texts and fine-tuned model outputs.
ft_data = pd.DataFrame(
    {
        "references": ft_references,
        "transcriptions": ft_transcriptions,
    }
)
ft_data

In [None]:
from whisper.normalizers import BasicTextNormalizer

# Run both text columns through the same normalizer before scoring, so references
# and hypotheses are compared on equal footing.
normalizer = BasicTextNormalizer()
for raw_column, clean_column in (
    ("references", "references_clean"),
    ("transcriptions", "transcriptions_clean"),
):
    ft_data[clean_column] = list(map(normalizer, ft_data[raw_column]))
ft_data

In [None]:
import evaluate

# Word-error-rate metric from the `evaluate` library; reused for the baseline model below.
metric = evaluate.load("wer")

# Score the fine-tuned model on the normalized texts.
# (The original referenced the undefined name `ftdata`, which raised a NameError.)
ft_wer = metric.compute(
    predictions=ft_data["transcriptions_clean"],
    references=ft_data["references_clean"],
)

print(f"WER: {ft_wer * 100:.2f} %")

In [None]:
# Baseline: the stock openai-whisper large-v3 checkpoint.
# NOTE: this rebinds `model`, which earlier in the notebook named the PEFT-wrapped model
# passed to `pipe`; from here on `model` refers to the baseline only.
model = whisper.load_model("large-v3")

model_kind = 'multilingual' if model.is_multilingual else 'English-only'
parameter_count = sum(np.prod(p.shape) for p in model.parameters())
print(f"Model is {model_kind} and has {parameter_count:,} parameters.")

In [None]:
# Decoding settings for the baseline model: fixed language, beam search of width 5.
options = {"language": "Hausa", "beam_size": 5, "best_of": 5}
# Same settings with the task made explicit; this is what gets passed to `transcribe`.
transcribe_options = {"task": "transcribe", **options}

In [None]:
# Transcribe the same sampled utterances with the baseline model, collecting
# hypotheses alongside their reference texts.
references, transcriptions = [], []

for waveform, reference_text in tqdm(dataset):
    baseline_result = model.transcribe(waveform, **transcribe_options)
    transcriptions.append(baseline_result["text"])
    references.append(reference_text)

In [None]:
# Side-by-side table of reference texts and baseline model outputs.
data = pd.DataFrame(
    {
        "references": references,
        "transcriptions": transcriptions,
    }
)
data

In [None]:
from whisper.normalizers import BasicTextNormalizer

# Normalize both text columns of the baseline results before scoring.
normalizer = BasicTextNormalizer()
for raw_column, clean_column in (
    ("references", "references_clean"),
    ("transcriptions", "transcriptions_clean"),
):
    data[clean_column] = list(map(normalizer, data[raw_column]))
data

In [None]:
# Score the baseline model on the normalized texts, using the `metric` object
# loaded earlier in the notebook.
wer = metric.compute(
    references=data["references_clean"],
    predictions=data["transcriptions_clean"],
)

print(f"WER: {wer * 100:.2f} %")

In [None]:
#compare and contrast WER on other data