# ***Automatic Speech Recognition***

In [1]:
import os
import pandas as pd

In [2]:
import torch
import os 
os.environ["CUDA_VISIBLE_DEVICES"] = "3"

In [3]:
import huggingface_hub
huggingface_hub.login("")


Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid.
Your token has been saved to /jupyter/socse.user06/.cache/huggingface/token
Login successful


In [4]:
from datasets import load_dataset, DatasetDict

dataset=DatasetDict()
#basilkr/Malasar_Dict_only
dataset = load_dataset("basilkr/Malasar_Luke_Dict_Dataset",use_auth_token=True)

Found cached dataset parquet (/jupyter/socse.user06/.cache/huggingface/datasets/basilkr___parquet/basilkr--Malasar_Luke_Dict_Dataset-bdaa2be7889fc559/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
dataset['train'][0]

{'audio_path': {'path': 'luke00இவருகளோடவு.wav',
  'array': array([0.00000000e+00, 2.19607962e-14, 9.76593074e-15, ...,
         0.00000000e+00, 0.00000000e+00, 0.00000000e+00]),
  'sampling_rate': 24000},
 'sentence': 'இவருகளோடவு'}

In [6]:
from datasets import Audio

In [7]:
dataset = dataset.cast_column('audio_path',Audio(sampling_rate = 16000))

In [8]:
dataset['train'][0]

{'audio_path': {'path': 'luke00இவருகளோடவு.wav',
  'array': array([ 4.26325641e-14, -6.57252031e-14, -5.86197757e-14, ...,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00]),
  'sampling_rate': 16000},
 'sentence': 'இவருகளோடவு'}

In [9]:
dataset

DatasetDict({
    train: Dataset({
        features: ['audio_path', 'sentence'],
        num_rows: 16850
    })
    test: Dataset({
        features: ['audio_path', 'sentence'],
        num_rows: 1873
    })
})

In [10]:
from transformers import WhisperFeatureExtractor

feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-large-v2")


In [11]:
from transformers import WhisperTokenizer

tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-large-v2",  task="transcribe")


In [12]:
input_str = dataset["train"][0]["sentence"]
labels = tokenizer(input_str).input_ids
decoded_with_special = tokenizer.decode(labels, skip_special_tokens=False)
decoded_str = tokenizer.decode(labels, skip_special_tokens=True)

print(f"Input:                 {input_str}")
print(f"Decoded w/ special:    {decoded_with_special}")
print(f"Decoded w/out special: {decoded_str}")
print(f"Are equal:             {input_str == decoded_str}")


2023-06-21 23:41:44.142288: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Input:                 இவருகளோடவு
Decoded w/ special:    <|startoftranscript|><|transcribe|><|notimestamps|>இவருகளோடவு<|endoftext|>
Decoded w/out special: இவருகளோடவு
Are equal:             True


In [13]:
from transformers import WhisperProcessor

processor = WhisperProcessor.from_pretrained("openai/whisper-large-v2",  task="transcribe")


In [14]:
def prepare_dataset(batch):
    # load and resample audio data from 48 to 16kHz
    audio = batch["audio_path"]

    # compute input length
    #batch["input_length"] = len(batch["audio_path"])

    # compute log-Mel input features from input audio array 
    batch["input_features"] = feature_extractor(audio["array"], sampling_rate=audio["sampling_rate"]).input_features[0]

    # encode target text to label ids 
    batch["labels"] = tokenizer(batch["sentence"]).input_ids

    # compute labels length
    #batch["labels_length"] = len(tokenizer(batch["sentence"], add_special_tokens=False).input_ids)
    return batch



In [15]:
dataset = dataset.map(prepare_dataset, remove_columns= dataset.column_names["train"], num_proc=4)


Loading cached processed dataset at /jupyter/socse.user06/.cache/huggingface/datasets/basilkr___parquet/basilkr--Malasar_Luke_Dict_Dataset-bdaa2be7889fc559/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-36e6b0fe80b9cb80_*_of_00004.arrow
Loading cached processed dataset at /jupyter/socse.user06/.cache/huggingface/datasets/basilkr___parquet/basilkr--Malasar_Luke_Dict_Dataset-bdaa2be7889fc559/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-7c5055cd356ab4f8_*_of_00004.arrow


In [16]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need different padding methods
        # first treat the audio inputs by simply returning torch tensors
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # get the tokenized label sequences
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        # pad the labels to max length
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # if bos token is appended in previous tokenization step,
        # cut bos token here as it's append later anyways
        if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch


In [17]:
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)


In [18]:
import evaluate

metric = evaluate.load("wer")


In [19]:
def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # replace -100 with the pad_token_id
    label_ids[label_ids == -100] = tokenizer.pad_token_id

    # we do not want to group tokens when computing the metrics
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}


In [20]:
from transformers import WhisperForConditionalGeneration

model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v2")


In [21]:
model.config.forced_decoder_ids = None
model.config.suppress_tokens = []
#model.config.max_length = 500


In [22]:
# !nvidia-smi -c 0


In [23]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="./Malasar_50_latest",  # change to a repo name of your choice
    per_device_train_batch_size=32,
    gradient_accumulation_steps=1,  # increase by 2x for every 2x decrease in batch size
    learning_rate=1e-5,
    warmup_steps=50,
    max_steps=1500,
    gradient_checkpointing=True,
    fp16=True,
    evaluation_strategy="steps",
    per_device_eval_batch_size=16,
    predict_with_generate=True,
    generation_max_length=225,
    save_steps=500,
    eval_steps=50,
    logging_steps=25,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    push_to_hub=True,
)


In [25]:
from transformers import Seq2SeqTrainer
import os
os.environ["HUGGINGFACE_HUB_NO_GIT"] = "1"
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
)


In [None]:
trainer.train()

`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`...


Step,Training Loss,Validation Loss,Wer
50,0.6768,0.159135,63.919886
100,0.1711,0.115213,52.989986
150,0.1392,0.10636,42.002861
200,0.1131,0.092185,40.200286
250,0.1208,0.08814,46.494993
300,0.1312,0.079986,37.51073
350,0.1112,0.077438,37.567954
400,0.1,0.083301,33.590844
450,0.094,0.076707,31.702432
500,0.0758,0.072794,32.331903


  next_tokens.tile(eos_token_id_tensor.shape[0], 1).ne(eos_token_id_tensor.unsqueeze(1)).prod(dim=0)


TrainOutput(global_step=1500, training_loss=0.1225989959637324, metrics={'train_runtime': 48803.1953, 'train_samples_per_second': 0.984, 'train_steps_per_second': 0.031, 'total_flos': 1.018527602098176e+20, 'train_loss': 0.1225989959637324, 'epoch': 2.85})

In [None]:
trainer.push_to_hub("basilkr/Malasar_ASR_Dict")

In [None]:
torch.cuda.memory_summary(device=None, abbreviated=False)