In [3]:
from datasets import load_dataset, Audio
minds = load_dataset("PolyAI/minds14", name="en-US", split="train[:100]")

In [4]:
minds = minds.train_test_split(test_size=0.2)

In [5]:
minds

DatasetDict({
    train: Dataset({
        features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'],
        num_rows: 80
    })
    test: Dataset({
        features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'],
        num_rows: 20
    })
})

In [6]:
minds = minds.remove_columns(["english_transcription", "intent_class", "lang_id"])

In [7]:
minds["train"][0]

{'path': '/home/user/.cache/huggingface/datasets/downloads/extracted/b63b156957c7ca71c29ec1b1c70b7c01f82e37338e69d6b00d0e88dc4d3d3ee4/en-US~JOINT_ACCOUNT/602bac2e963e11ccd901cda4.wav',
 'audio': {'path': '/home/user/.cache/huggingface/datasets/downloads/extracted/b63b156957c7ca71c29ec1b1c70b7c01f82e37338e69d6b00d0e88dc4d3d3ee4/en-US~JOINT_ACCOUNT/602bac2e963e11ccd901cda4.wav',
  'array': array([ 0.        ,  0.        , -0.00024414, ..., -0.00024414,
          0.00048828, -0.00024414]),
  'sampling_rate': 8000},
 'transcription': 'hi I was wondering how I would set up a joint account with my partner'}

In [8]:
from transformers import AutoProcessor
processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base")

2024-12-10 15:04:10.084142: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-12-10 15:04:10.401331: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-12-10 15:04:10.495470: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-12-10 15:04:10.527669: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-10 15:04:10.725498: I tensorflow/core/platform/cpu_feature_guar

tokenizer_config.json:   0%|          | 0.00/163 [00:00<?, ?B/s]



vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

In [9]:
minds = minds.cast_column("audio", Audio(sampling_rate=16_000))

In [10]:
minds["train"][0]

{'path': '/home/user/.cache/huggingface/datasets/downloads/extracted/b63b156957c7ca71c29ec1b1c70b7c01f82e37338e69d6b00d0e88dc4d3d3ee4/en-US~JOINT_ACCOUNT/602bac2e963e11ccd901cda4.wav',
 'audio': {'path': '/home/user/.cache/huggingface/datasets/downloads/extracted/b63b156957c7ca71c29ec1b1c70b7c01f82e37338e69d6b00d0e88dc4d3d3ee4/en-US~JOINT_ACCOUNT/602bac2e963e11ccd901cda4.wav',
  'array': array([-2.99292151e-06,  5.17739973e-05,  4.47832281e-06, ...,
          2.52652564e-04, -2.09215432e-04, -3.20373743e-04]),
  'sampling_rate': 16000},
 'transcription': 'hi I was wondering how I would set up a joint account with my partner'}

In [11]:
def uppercase(example):
    return {"transcription": example["transcription"].upper()}

In [12]:
minds = minds.map(uppercase)

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

In [13]:
def prepare_dataset(batch):
    audio = batch["audio"]
    batch = processor(audio["array"], sampling_rate=audio["sampling_rate"], text=batch["transcription"])
    batch["input_length"] =  len(batch["input_values"][0])
    return batch

In [14]:
encoded_minds = minds.map(prepare_dataset, remove_columns=minds.column_names["train"], num_proc=4)

Map (num_proc=4):   0%|          | 0/80 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/20 [00:00<?, ? examples/s]

In [15]:
import torch
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

In [19]:
@dataclass
class DataCollatorCTCWithPaddnig:
    processor: AutoProcessor
    padding: Union[bool, str] = "longest"

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need different padding methods
        input_features = [{"input_values": feature["input_values"][0]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(input, padding=self.padding, return_tensors="pt")

        labels_batch = self.processor.pad(labels=label, padding=self.padding, return_tensors="pt")

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["inputs_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels

        return batch

In [21]:
data_collator = DataCollatorCTCWithPaddnig(processor=processor, padding="longest")

In [22]:
import evaluate

wer = evaluate.load("wer")

Downloading builder script:   0%|          | 0.00/4.49k [00:00<?, ?B/s]

In [23]:
import numpy as np

def compute_metrics(pred):
    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)

    pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id

    pred_str = processor.batch_decode(pred_ids)
    label_str = processor.batch_decode(pred.label_ids, group_tokens=False)

    wer = wer.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

In [27]:
from transformers import AutoModelForCTC, TrainingArguments, Trainer

model = AutoModelForCTC.from_pretrained(
    "facebook/wav2vec2-base",
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenzier.pad_token_id,
)

AttributeError: 'Wav2Vec2Processor' object has no attribute 'tokenzier'