In [1]:
!pip install datasets soundfile librosa jiwer pytorch-lightning transformers[torch] evaluate -q


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3[0m[39;49m -> [0m[32;49m23.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [26]:
import torch
import transformers
import numpy as np
from datasets import load_dataset, load_metric, Audio
from transformers import (
    Wav2Vec2FeatureExtractor,
    Wav2Vec2CTCTokenizer,
    Wav2Vec2Config,
    Wav2Vec2ConformerForCTC,
    Wav2Vec2ForCTC,
    Wav2Vec2Processor,
    TrainingArguments,
    Trainer,
    TrainerCallback,
    AutoProcessor,
    EarlyStoppingCallback#,
    #ModelCheckpoint
    
)
import pandas as pd
import datasets
import re
import json
from datasets import DatasetDict
from datasets import Dataset
from dataclasses import dataclass, field
import evaluate
from typing import Dict, List, Optional, Union

from transformers import IntervalStrategy
from transformers.models.wav2vec2_conformer import Wav2Vec2ConformerModel
from torch.nn.parallel import DistributedDataParallel

In [3]:
loaded_dataset = load_dataset("LIUM/tedlium","release3",split="train[:1%]")
eval_dataset = load_dataset("LIUM/tedlium","release3",split="validation")
test_dataset = load_dataset("LIUM/tedlium","release3",split="test")

Downloading builder script:   0%|          | 0.00/17.7k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/9.24k [00:00<?, ?B/s]

Downloading and preparing dataset tedlium/release3 to /home/ubuntu/.cache/huggingface/datasets/LIUM___tedlium/release3/1.0.1/3534cf671f9fe252aa91994765f9fbe95f9a077a67d56255dcd6645776ab997d...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/35.0G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/18.4G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/175M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/307M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset tedlium downloaded and prepared to /home/ubuntu/.cache/huggingface/datasets/LIUM___tedlium/release3/1.0.1/3534cf671f9fe252aa91994765f9fbe95f9a077a67d56255dcd6645776ab997d. Subsequent calls will reuse this data.


Found cached dataset tedlium (/home/ubuntu/.cache/huggingface/datasets/LIUM___tedlium/release3/1.0.1/3534cf671f9fe252aa91994765f9fbe95f9a077a67d56255dcd6645776ab997d)
Found cached dataset tedlium (/home/ubuntu/.cache/huggingface/datasets/LIUM___tedlium/release3/1.0.1/3534cf671f9fe252aa91994765f9fbe95f9a077a67d56255dcd6645776ab997d)


In [6]:
eval_dataset

Dataset({
    features: ['audio', 'text', 'speaker_id', 'gender', 'file', 'id'],
    num_rows: 591
})

In [5]:
def clean_data(inp):
    data = {
    "audio": [],
    "text": [],
    "speaker_id": [],
    "gender": [],
    "file": [],
    "id": []
    }
    for pt in inp:
        if pt["text"].upper() != "IGNORE_TIME_SEGMENT_IN_SCORING":
            data["audio"].append(pt["audio"])
            data["text"].append(pt["text"])
            data["speaker_id"].append(pt["speaker_id"])
            data["gender"].append(pt["gender"])
            data["file"].append(pt["file"])
            data["id"].append(pt["id"])
            
    num_rows = len(data["audio"])
    
    dataset = Dataset.from_dict(data)
    dataset.set_format(
        type="python",
        columns=["audio", "text","speaker_id","gender","file","id"]
    )
    
    return dataset

In [7]:
eval_dataset = clean_data(eval_dataset)
test_dataset = clean_data(test_dataset)

In [9]:
conversion_dict = {"train":loaded_dataset,"val":eval_dataset,"test":test_dataset}
dataset = DatasetDict(conversion_dict)

In [10]:
dataset

DatasetDict({
    train: Dataset({
        features: ['audio', 'text', 'speaker_id', 'gender', 'file', 'id'],
        num_rows: 2683
    })
    val: Dataset({
        features: ['audio', 'text', 'speaker_id', 'gender', 'file', 'id'],
        num_rows: 507
    })
    test: Dataset({
        features: ['audio', 'text', 'speaker_id', 'gender', 'file', 'id'],
        num_rows: 1155
    })
})

In [11]:
dataset = dataset.remove_columns(["speaker_id","id","gender"])

In [12]:
def remove_parts(string):
    pattern = r"\{.*?\}|\(.*?\)|\<.*?\>"
    result = re.sub(pattern, lambda x: ' ' if x.group(0).strip() else '', string)
    result = re.sub('\s+', ' ', result)  # Remove extra whitespace
    return result.strip()

In [14]:
chars_to_ignore_regex = r'[\,\?\.\!\-\;\:\"[\]()èüâéàê”“’]'
def remove_special_characters(batch):
    batch["text"] = re.sub(chars_to_ignore_regex, '', remove_parts(batch["text"])).upper() + " "
    return batch

In [15]:
dataset = dataset.map(remove_special_characters)

Map:   0%|          | 0/2683 [00:00<?, ? examples/s]

Map:   0%|          | 0/507 [00:00<?, ? examples/s]

Map:   0%|          | 0/1155 [00:00<?, ? examples/s]

In [13]:
tokenizer = Wav2Vec2CTCTokenizer.from_pretrained("facebook/wav2vec2-conformer-rope-large-960h-ft")
feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=False)
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/257 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

In [17]:
import random
rand_int = random.randint(0, len(dataset["train"]))

print("Target text:", dataset["train"][rand_int]["text"])
print("Input array shape:", np.asarray(dataset["train"][rand_int]["audio"]["array"]).shape)
print("Sampling rate:", dataset["train"][rand_int]["audio"]["sampling_rate"])

Target text: WAS OTHER RESEARCH LAST YEAR THAT DETERMINED HUMILIATION WAS A MORE INTENSELY FELT EMOTION 
Input array shape: (125440,)
Sampling rate: 16000


In [18]:
def prepare_dataset(batch):
    audio = batch["audio"]
    batch["input_values"] = processor(audio["array"], sampling_rate=audio["sampling_rate"]).input_values[0]
    batch["input_length"] = len(batch["input_values"])
    
    with processor.as_target_processor():
        batch["labels"] = processor(batch["text"]).input_ids
    return batch

In [20]:
dataset = dataset.map(prepare_dataset, remove_columns=dataset.column_names["train"], num_proc=4)

Map (num_proc=4):   0%|          | 0/2683 [00:00<?, ? examples/s]



Map (num_proc=4):   0%|          | 0/507 [00:00<?, ? examples/s]



Map (num_proc=4):   0%|          | 0/1155 [00:00<?, ? examples/s]



In [21]:
max_input_length_in_sec = 4.0
dataset["train"] = dataset["train"].filter(lambda x: x < max_input_length_in_sec * processor.feature_extractor.sampling_rate, input_columns=["input_length"])

Filter:   0%|          | 0/2683 [00:00<?, ? examples/s]

In [22]:
import torch

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

@dataclass
class DataCollatorCTCWithPadding:
    """
    Data collator that will dynamically pad the inputs received.
    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used for proccessing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
              sequence if provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
              different lengths).
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lenghts and need
        # different padding methods
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            return_tensors="pt",
        )
        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(
                label_features,
                padding=self.padding,
                return_tensors="pt",
            )

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels

        return batch

In [23]:
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)
wer_metric = load_metric("wer")

  wer_metric = load_metric("wer")


Downloading builder script:   0%|          | 0.00/1.90k [00:00<?, ?B/s]

In [24]:
def compute_metrics(pred):
    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)

    pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id

    pred_str = processor.batch_decode(pred_ids)
    # we do not want to group tokens when computing the metrics
    label_str = processor.batch_decode(pred.label_ids, group_tokens=False)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

In [28]:
model = Wav2Vec2ConformerForCTC.from_pretrained(
    "facebook/wav2vec2-conformer-rope-large-960h-ft",
    ctc_loss_reduction="mean", 
    pad_token_id=processor.tokenizer.pad_token_id,
)

In [30]:
for name,param in model.wav2vec2_conformer.feature_extractor.named_parameters():
    param.requires_grad = False

In [32]:
from transformers import TrainingArguments

training_args = TrainingArguments(
  output_dir="tedlium_conformer",
  group_by_length=True,
  per_device_train_batch_size=8,
  evaluation_strategy="steps",
  num_train_epochs=30,
  fp16=False,
  gradient_checkpointing=True,
  save_steps=500,
  eval_steps=500,
  logging_steps=500,
  learning_rate=1e-4,
  weight_decay=0.005,
  warmup_steps=1000,
  save_total_limit=2,
)

In [33]:
trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=dataset["train"],
    eval_dataset=dataset["val"],
    tokenizer=processor.feature_extractor,
)

In [34]:
trainer.train()



Step,Training Loss,Validation Loss,Wer
500,0.734,0.303937,0.083781
1000,0.4643,0.38185,0.079063
1500,0.3287,0.320343,0.090914
2000,0.2048,0.339898,0.092286
2500,0.1487,0.275344,0.083727




TrainOutput(global_step=2640, training_loss=0.3629995967402603, metrics={'train_runtime': 2279.6147, 'train_samples_per_second': 9.265, 'train_steps_per_second': 1.158, 'total_flos': 3.2861639225460756e+18, 'train_loss': 0.3629995967402603, 'epoch': 30.0})

In [35]:
def map_to_result(batch):
  with torch.no_grad():
    input_values = torch.tensor(batch["input_values"], device="cuda").unsqueeze(0)
    logits = model(input_values).logits

  pred_ids = torch.argmax(logits, dim=-1)
  batch["pred_str"] = processor.batch_decode(pred_ids)[0]
  batch["text"] = processor.decode(batch["labels"], group_tokens=False)
  
  return batch

In [38]:
val_res = dataset["val"].map(map_to_result)
test_res = dataset["test"].map(map_to_result)

Map:   0%|          | 0/507 [00:00<?, ? examples/s]

Map:   0%|          | 0/1155 [00:00<?, ? examples/s]

In [39]:
print("Val WER: {:.3f}".format(wer_metric.compute(predictions=val_res["pred_str"], references=val_res["text"])))
print("Test WER: {:.3f}".format(wer_metric.compute(predictions=test_res["pred_str"], references=test_res["text"])))

Val WER: 0.127
Test WER: 0.133


In [43]:
val_res[0]["pred_str"]

'AFTER I SHOWE THESE TWO SLIDES THAT DEMONSTRATE THAT THE ARCHI ICECAP WHICH FOR MOST OF THE LAST THREE MILLION YEARS HAS BEEN THE SIZE OF THE LOWER FORTY EIGHT STATES HAS SHRUNK BY FIRTY PERCENT'

In [44]:
val_res[0]["text"]

'LAST YEAR I SHOWED THESE TWO SLIDES SO THAT DEMONSTRATE THAT THE ARCTIC ICE CAP WHICH FOR MOST OF THE LAST THREE MILLION YEARS HAS BEEN THE SIZE OF THE LOWER FORTY EIGHT STATES HAS SHRUNK BY FORTY PERCENT'

In [45]:
test_res[14]["pred_str"]

"SEEING IF IT WORKS CHANGING IT WHEN IT DOESN 'T IS ONE OF THE GREAT ACCOMPLISHMENTS OF HUMANITY SO THAT 'S THE GOOD NEWS"

In [46]:
test_res[14]["text"]

"SEEING IF IT WORKS CHANGING IT WHEN IT DOESN 'T IS ONE OF THE GREAT ACCOMPLISHMENTS OF HUMANITY SO THAT 'S THE GOOD NEWS"