# **Whisper Model fine-tuning for ASR**
fine-tuning of Wav2vec2 for automatic speech recognition task. In this script, we try to collect all the processes, which consist of preparing data, making LM, fine-tuning the Wav2vec2, and evaluating with and without LM. 

*** For making the LM, you need a corpus.txt file. This file could be a big text file such as a corpus of the Gigaspeech dataset, [TEDLIUM](https://www.openslr.org/27/), a mix of all of them, or a file consisting of all the sentences of the dataset that you are working on in this script.

**** To prepare the train.csv and test.csv files, you can find the manifest style in our other repository for [Whisper_fine_tuning_ASR](https://github.com/areffarhadi/Whisper_fine_tuning_ASR). The manifests of these two scripts are the same.

## **Preparing Data to fine-tune**

In [45]:
import numpy as np
def dummy_npwarn_decorator_factory():
  def npwarn_decorator(x):
    return x
  return npwarn_decorator
np._no_nep50_warning = getattr(np, '_no_nep50_warning', dummy_npwarn_decorator_factory)

In [None]:
!pip install datasets
!pip install pandas
!pip install numpy
!pip install hazm
!pip install num2fawords
!pip install tqdm
!pip install scikit-learn
!pip install jiwer
!pip install transformers==4.28.0
!pip install urllib3==1.26.0
!pip install torchaudio
!pip install 
!pip install langchain
!pip install sentence-transformers==2.2.2

In [46]:
from datasets import load_dataset, load_metric

import pandas as pd
import numpy as np

#import hazm
from num2fawords import words, ordinal_words
from tqdm import tqdm

from sklearn.model_selection import train_test_split

import os
import string
import six
import re
import glob

In [47]:
common_voice_train = load_dataset("csv", data_files={"train": "train.csv"}, delimiter=",")["train"]
common_voice_test = load_dataset("csv", data_files={"test": "test.csv"}, delimiter=",")["test"]
print(common_voice_train)
print(common_voice_test)

Dataset({
    features: ['Path', 'Sentence'],
    num_rows: 14300
})
Dataset({
    features: ['Path', 'Sentence'],
    num_rows: 718
})


Extract language characters based on the Sentences of csv files. 
After that, make the tokenizer, feature extractor, and processor.

In [48]:
def extract_all_chars(batch):
    all_text = " ".join(batch["Sentence"])
    vocab = list(set(all_text))
    return {"vocab": [vocab], "all_text": [all_text]}

In [49]:
vocab_train = common_voice_train.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=common_voice_train.column_names)
vocab_test = common_voice_test.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=common_voice_test.column_names)

Map:   0%|          | 0/14300 [00:00<?, ? examples/s]

Map:   0%|          | 0/718 [00:00<?, ? examples/s]

In [50]:
vocab_list = list(sorted(set(vocab_train["vocab"][0]) ))
vocab_list = [vocab for vocab in vocab_list if vocab not in [" ", "\u0307","<",">"]]
print(len(vocab_list))
print(vocab_list)

28
["'", '-', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [51]:
special_vocab = ["<pad>", "<s>", "</s>", "<unk>", "|"]

In [52]:
vocab_dict = {v: k for k, v in enumerate(special_vocab + vocab_list)}
print(len(vocab_dict))
print(vocab_dict)

33
{'<pad>': 0, '<s>': 1, '</s>': 2, '<unk>': 3, '|': 4, "'": 5, '-': 6, 'a': 7, 'b': 8, 'c': 9, 'd': 10, 'e': 11, 'f': 12, 'g': 13, 'h': 14, 'i': 15, 'j': 16, 'k': 17, 'l': 18, 'm': 19, 'n': 20, 'o': 21, 'p': 22, 'q': 23, 'r': 24, 's': 25, 't': 26, 'u': 27, 'v': 28, 'w': 29, 'x': 30, 'y': 31, 'z': 32}


In [53]:
import json
with open('vocab.json', 'w') as vocab_file:
    json.dump(vocab_dict, vocab_file)

In this part, the model path and the save path of the final results are determined. 
Here, you can download the pretrained model [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53). However, you can also download another version of the wav2vec2 model. In addition, you can continue the training process based on your previous checkpoint.

In [54]:
model_name_or_path = "/home/rf/aref/Code_my_edit-20240208T101521Z-001/Code_my_edit/wav2vec2-large-xlsr-53"
#model_name_or_path = "/home/rf/aref/Code_my_edit-20240208T101521Z-001/Code_my_edit/save_dir/checkpoint-55000"

save_dir = "/home/rf/aref/Code_my_edit-20240208T101521Z-001/Code_my_edit/save_dir"

In [55]:
from transformers.trainer_utils import get_last_checkpoint

last_checkpoint = None
if os.path.exists(save_dir):
    last_checkpoint = get_last_checkpoint(save_dir)

print(last_checkpoint if last_checkpoint else str(None))

/home/rf/aref/Code_my_edit-20240208T101521Z-001/Code_my_edit/save_dir/checkpoint-55000


In [56]:
from transformers import Wav2Vec2CTCTokenizer

#if not os.path.exists(save_dir) and not model_name_or_path:
   # print("Load from scratch")
tokenizer = Wav2Vec2CTCTokenizer(
        "vocab.json",
        bos_token="<s>",
        eos_token="</s>",
        unk_token="<unk>",
        pad_token="<pad>",
        word_delimiter_token="|",
        do_lower_case=False)
#else:
  #  print(f"Load from {model_name_or_path}")
    #tokenizer = Wav2Vec2CTCTokenizer.from_pretrained("m3hrdadfi/wav2vec2-large-xlsr-persian-v3")


In [57]:
text = "do you hear the sleigh bells ringing"
print(" ".join(tokenizer.tokenize(text)))
print(tokenizer.decode(tokenizer.encode(text)))

d o | y o u | h e a r | t h e | s l e i g h | b e l l s | r i n g i n g
do you hear the sleigh bels ringing


In [58]:
from transformers import Wav2Vec2FeatureExtractor

if not os.path.exists(save_dir) and not model_name_or_path:
    print("Load from scratch")
    feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=True)
else:
    print(f"Load from {model_name_or_path}")
    feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name_or_path)

Load from /home/rf/aref/Code_my_edit-20240208T101521Z-001/Code_my_edit/wav2vec2-large-xlsr-53


In [59]:

from transformers import Wav2Vec2Processor
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)


In [60]:
if len(processor.tokenizer.get_vocab()) == len(processor.tokenizer):
    print(len(processor.tokenizer))

33


## Resample your data!
The sampling rate at which the audio files should be digitalized is 16000 hertz (Hz).
Since the pre-trained [Wav2Vec2](https://huggingface.co/facebook/wav2vec2-large-xlsr-53)  model is trained with 16000 Hz sample rate audio files, we must resample our data accordingly. If your data is in 16000 Hz in this cell. 

In [61]:
import torchaudio
#import librosa


target_sampling_rate = 16000

def speech_file_to_array_fn(batch):
    speech_array, sampling_rate = torchaudio.load(batch["Path"])
    speech_array = speech_array.squeeze().numpy()
    #speech_array = librosa.resample(np.asarray(speech_array), sampling_rate, target_sampling_rate)


    batch["speech"] = speech_array
    batch["sampling_rate"] = target_sampling_rate
    batch["duration_in_seconds"] = len(batch["speech"]) / target_sampling_rate
    batch["target_text"] = batch["Sentence"]
    return batch

In [62]:
common_voice_train = common_voice_train.map(speech_file_to_array_fn, remove_columns=common_voice_train.column_names, num_proc=24)
common_voice_test = common_voice_test.map(speech_file_to_array_fn, remove_columns=common_voice_test.column_names, num_proc=24)

In [63]:
print(common_voice_train[0]["sampling_rate"])
print(common_voice_test[0]["sampling_rate"])

16000
16000


In [65]:
import IPython.display as ipd
import numpy as np
import random

sample = common_voice_train
rand_int = random.randint(0, len(sample))

print("Target text:", sample[rand_int]["target_text"])
print("Input array shape:", np.asarray(sample[rand_int]["speech"]).shape)
print("Sampling rate:", sample[rand_int]["sampling_rate"])

ipd.Audio(data=np.asarray(sample[rand_int]["speech"]), autoplay=True, rate=16000)

Target text: a boring novel is a superb sleeping pill
Input array shape: (41497,)
Sampling rate: 16000


In [66]:

def prepare_dataset(batch):
    # Check that all files have the correct sampling rate
    assert (len(set(batch["sampling_rate"])) == 1), f"Make sure all inputs have the same sampling rate of {processor.feature_extractor.sampling_rate}."
    
    # Extract input values with padding
    inputs = processor(batch["speech"], sampling_rate=batch["sampling_rate"][0], padding=True, return_tensors="pt")
    batch["input_values"] = inputs.input_values
    
    # Process target text
    with processor.as_target_processor():
        labels = processor(batch["target_text"], padding=True, return_tensors="pt")
    batch["labels"] = labels.input_ids
    
    return batch

In [67]:
_common_voice_train = common_voice_train.map(
    prepare_dataset,
    remove_columns=common_voice_train.column_names,
    batch_size=8,
    num_proc=24,
    batched=True
)
_common_voice_test = common_voice_test.map(
    prepare_dataset,
    remove_columns=common_voice_test.column_names,
    batch_size=8,
    num_proc=24,
    batched=True
)

In [68]:
import torch

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

@dataclass
class DataCollatorCTCWithPadding:
    """
    Data collator that will dynamically pad the inputs received.
    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used for proccessing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
              sequence if provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
              different lengths).
        max_length (:obj:`int`, `optional`):
            Maximum length of the ``input_values`` of the returned list and optionally padding length (see above).
        max_length_labels (:obj:`int`, `optional`):
            Maximum length of the ``labels`` returned list and optionally padding length (see above).
        pad_to_multiple_of (:obj:`int`, `optional`):
            If set will pad the sequence to a multiple of the provided value.
            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
            7.5 (Volta).
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lenghts and need
        # different padding methods
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(
                label_features,
                padding=self.padding,
                max_length=self.max_length_labels,
                pad_to_multiple_of=self.pad_to_multiple_of_labels,
                return_tensors="pt",
            )

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels

        return batch

In [69]:
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

Use a metric to evaluate your fine-tuning task, here we've chosen WER, also you can use CER!

In [70]:
wer_metric = load_metric("wer")

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [71]:
import random


def compute_metrics(pred):
    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)

    pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id

    pred_str = processor.batch_decode(pred_ids)
    # we do not want to group tokens when computing the metrics
    label_str = processor.batch_decode(pred.label_ids, group_tokens=False)

    if isinstance(label_str, list):
        if isinstance(pred_str, list) and len(pred_str) == len(label_str):
            for index in random.sample(range(len(label_str)), 3):
                print(f'reference: "{label_str[index]}"')
                print(f'predicted: "{pred_str[index]}"')

        else:
            for index in random.sample(range(len(label_str)), 3):
                print(f'reference: "{label_str[index]}"')
                print(f'predicted: "{pred_str}"')

    wer = wer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

# **Set your model config**

In [72]:
from transformers import Wav2Vec2ForCTC


model = Wav2Vec2ForCTC.from_pretrained(
    model_name_or_path,
    attention_dropout=0.05,
    activation_dropout=0.1,
    hidden_dropout=0.1,
    feat_proj_dropout=0.01249,
    final_dropout=0.0,
    mask_time_prob=0.05,
    mask_time_length=10,
    mask_feature_prob=0,
    mask_feature_length=10,
    layerdrop=0.01377,
    gradient_checkpointing=True,
    ctc_loss_reduction="mean",
    ctc_zero_infinity=True,
    bos_token_id=processor.tokenizer.bos_token_id,
    eos_token_id=processor.tokenizer.eos_token_id,
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer.get_vocab())
)

Some weights of the model checkpoint at /home/rf/aref/Code_my_edit-20240208T101521Z-001/Code_my_edit/wav2vec2-large-xlsr-53 were not used when initializing Wav2Vec2ForCTC: ['quantizer.weight_proj.bias', 'project_q.weight', 'quantizer.codevectors', 'project_q.bias', 'project_hid.weight', 'quantizer.weight_proj.weight', 'project_hid.bias']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at /home/rf/aref/Code_my_edit-20240208T101521Z-001/Code_my_edit/wav2vec2-large-xlsr-53 and are newly initialized: ['lm_

In [73]:
model.freeze_feature_extractor()



Set the `TrainingArguments`specially` per_device_train_batch_size` & `per_device_eval_batch_size`to achieve best performance.

In [76]:
from transformers import TrainingArguments

# per_device_train_batch_size= 8,
training_args = TrainingArguments(
    output_dir=save_dir,
    group_by_length=True,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=2,
    evaluation_strategy="steps",
    num_train_epochs=30,
    fp16=True,
    save_steps=5000,
    eval_steps=1000,
    logging_steps=10,
    learning_rate=4e-4,
    warmup_steps=500,
    save_total_limit=2,
    dataloader_num_workers=24,
)
#fp16=True,
#dataloader_num_workers=24

# **Train**
Start training <br>
You have to use GPU!

In [77]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=_common_voice_train,
    eval_dataset=_common_voice_test,
    tokenizer=processor.feature_extractor,
)

In [78]:
print("step1 - training...")
train_result = trainer.train()
print("step2 - ")

metrics = train_result.metrics
print("step3")
max_train_samples = len(_common_voice_train)
metrics["train_samples"] = min(max_train_samples, len(_common_voice_train))
print("step4 - saving the model...")
trainer.save_model()
print("model created!")
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()
tokenizer.save_pretrained(training_args.output_dir)

step1 - training...




Step,Training Loss,Validation Loss,Wer
1000,2.9624,2.975894,1.0
2000,1.6243,1.546467,0.944976
3000,0.8752,1.244335,0.799227
4000,0.3531,1.146602,0.687707
5000,0.3891,0.752382,0.58502
6000,0.2041,0.693471,0.497792
7000,0.2217,0.674388,0.480861
8000,0.6867,0.525434,0.43099
9000,0.2104,0.645967,0.4424
10000,0.2594,0.653631,0.410563




reference: "women may never become completely equal to men"
predicted: ""
reference: "jeff's toy go-cart never worked"
predicted: ""
reference: "can the agency overthrow alien forces"
predicted: ""




reference: "bob found more clams at the ocean's edge"
predicted: "barbe famed moicalans i le wation sibson"
reference: "biologists use radioactive isotopes to study microorganisms"
predicted: "byomcios ce ra o rabiiiteps to stey megro workuosoms"
reference: "draw each graph on a new axis"
predicted: "drigh gramo tha neoaxes"




reference: "a toothpaste tube should be squeezed from the bottom"
predicted: "the tothtagts tube shuld be squesed from the bota"
reference: "the speech symposium might begin monday"
predicted: "the spech sympos in ha mighe bigin monday"
reference: "according to my interpretation of the problem two lines must be perpendicular"
predicted: "agording to myan te pretaten of the proble to hines must be propen dig youar"




reference: "an adult male baboon's teeth are not suitable for eating shellfish"
predicted: "anta dot male babbo's tee that nocset balt for eaning shelfish"
reference: "they all like long hot showers"
predicted: "they al wyk whor nuch shownes"
reference: "we'll serve rhubarb pie after rachel's talk"
predicted: "whal serfrul pare pie af ter rachel's talk"




reference: "the high security prison was surrounded by barbed wire"
predicted: "the igh securty proson suroned a bagd are"
reference: "cottage cheese with chives is delicious"
predicted: "calage cheese with h chargee's dovitiv"
reference: "the proof that you are seeking is not available in books"
predicted: "the prof tat yure seeking is loud availablein box"




reference: "they used an aggressive policeman to flag thoughtless motorists"
predicted: "the used tad a gressive poiceman to flag thougts moderests"
reference: "highway and freeway mean the same thing"
predicted: "highway and freeay mean the same thing"
reference: "we apply auditory modeling to computer speech recognition"
predicted: "we apply haud a tory mie ing to conpudeour speec reguggition"




reference: "that diagram makes sense only after much study"
predicted: "that cia gram makes hetoatlachsturty"
reference: "a leather handbag would be a suitable gift"
predicted: "a bather handbag would be a feilable gift"
reference: "how do oysters make pearls"
predicted: "how dou oystug make perolsm"




reference: "i itemize all accounts in my agency"
predicted: "i tithemize all accounts in my agen's"
reference: "a huge power outage rarely occurs"
predicted: "a huge piwer otage rarely occurs"
reference: "how oily do you like your salad dressing"
predicted: "how woioly do ou lig gir salad dressing"




reference: "an adult male baboon's teeth are not suitable for eating shellfish"
predicted: "ia doute male ba bon's taa o suitble for eining shellfish"
reference: "the triumphant warrior exhibited naive heroism"
predicted: "the triumpe wir exhited naive herosm"
reference: "good service should be rewarded by big tips"
predicted: "god service should be rewoarded by big tips"




reference: "young children should avoid exposure to contagious diseases"
predicted: "young children sit avod exposure tat containgious diseases"
reference: "the fish began to leap frantically on the surface of the small lake"
predicted: "the fish began to leap frantically om the surface of the small lake"
reference: "regular attendance is seldom required"
predicted: "recular enteldance is seldom equired"




reference: "porcupines resemble sea urchins"
predicted: "porcupindse retsemmlse ragence"
reference: "why charge money for such garbage"
predicted: "why charge money for such garbage"
reference: "please dig my potatoes up before frost"
predicted: "please dig my potatoes up before frost"




reference: "the high security prison was surrounded by barbed wire"
predicted: "the high securety prosonul surrave barbed wire"
reference: "top zinnias rarely have crooked stems"
predicted: "top zinnias rarely have crooked stems"
reference: "regular attendance is seldom required"
predicted: "regular anttellidance is seldom requiredhin"




reference: "i itemize all accounts in my agency"
predicted: "i itemize all accounts in my agency"
reference: "herb's birthday occurs frequently on thanksgiving"
predicted: "herbs bithde occuers requently on thanksgiving"
reference: "we saw eight tiny icicles below our roof"
predicted: "we saw eight tiny nicicles bulow our roof"




reference: "the speech symposium might begin monday"
predicted: "the speech symposium might begin monday"
reference: "call an ambulance for medical assistance"
predicted: "cal nn ambulance for medical assistance"
reference: "this was easy for us"
predicted: "this was easy for us"




reference: "the giant redwoods shimmered in the glistening sun"
predicted: "the giant rod woods shimmered in the glistening sun"
reference: "scholastic aptitude is judged by standardized tests"
predicted: "cholastic aptitude judged by standardized tes"
reference: "aluminum silverware can often be flimsy"
predicted: "alumnum silverwaro can  often be flimshime"




reference: "pam gives driving lessons on thursdays"
predicted: "paam gives driving lessons on thursdays"
reference: "the singer's finger had a splinter"
predicted: "the singer's finger had a splantere"
reference: "which long article was opaque and needed clarification"
predicted: "which long artercle was opaqe and need clarification"




reference: "scientific progress comes from the development of new techniques"
predicted: "sientific progress comes from in development of new techniqus"
reference: "the high security prison was surrounded by barbed wire"
predicted: "the igscury prison was surrounded by barbed wire"
reference: "the avalanche triggered a minor earthquake"
predicted: "the avealyanche triggered a minor earthuake"




reference: "planned parenthood organizations promote birth control"
predicted: "planyind thparenthoodoorganizations promote pircecontrol"
reference: "scientific progress comes from the development of new techniques"
predicted: "scientific proogresps comes frim the development of ny of technikes"
reference: "combine all the ingredients in a large bowl"
predicted: "combine all ingredients in a large bowl"




reference: "coffee is grown on steep jungle-like slopes in temperate zones"
predicted: "coffee is grown at steep jungle-lik slopes in demperate sones"
reference: "does hindu ideology honor cows"
predicted: "does hindu ideology honor cows"
reference: "young people participate in athletic activities"
predicted: "young peoploperticoppaite eth anticectivetees"




reference: "shell shock caused by shrapnel is sometimes cured through group therapy"
predicted: "shell sheck caused by shrapnel is sometimes cured through group therapy"
reference: "challenge each general's intelligence"
predicted: "challenge each general's intelligence"
reference: "gregory and tom chose to watch cartoons in the afternoon"
predicted: "gregory and tom chose to latchi cartoons in the afternoon"




reference: "medieval society was based on hierarchies"
predicted: "medieval society was based on hierarchies"
reference: "thick glue oozed out of the tube"
predicted: "thick gleue eacze shoult of the tube"
reference: "does creole cooking use curry"
predicted: "does creole cooking use curry"




reference: "john's brother repainted the garage door"
predicted: "john's brother repainted the grage door"
reference: "alfalfa is healthy for you"
predicted: "alfealfa is healthy for you"
reference: "just drop notices in any suggestion box"
predicted: "just drop notices in any suggestion box"




reference: "we apply auditory modeling to computer speech recognition"
predicted: "we appli auditory modelin to computer speech recognition"
reference: "bob found more clams at the ocean's edge"
predicted: "bob found more clams at the oceans edge"
reference: "this was easy for us"
predicted: "this was easy for us"




reference: "regular attendance is seldom required"
predicted: "regular attellidance is seldom requiredn"
reference: "the avalanche triggered a minor earthquake"
predicted: "the avalanche triggerd the minor earthquake"
reference: "even a simple vocabulary contains symbols"
predicted: "even a simple vocabulary contains symbols"




reference: "the mayan neoclassic scholar disappeared while surveying ancient ruins"
predicted: "the mayan neoclassic discholar disappeared while surveying ancient ruins"
reference: "the diagnosis was discouraging however he was not overly worried"
predicted: "the diag nos ash discourage nelomarhe was tnoverly worried"
reference: "upgrade your status to reflect your wealth"
predicted: "upgrade your states twet you wealth"




reference: "would you allow acts of violence"
predicted: "would you allow acts of violence"
reference: "barb's gold bracelet was a graduation present"
predicted: "barf's gold pracelet was a graduation present"
reference: "birthday parties have cupcakes and ice cream"
predicted: "birhtay parties have cupcakes and ice cream"




reference: "the big dog loved to chew on the old rag doll"
predicted: "the big dog loved to chew on the old rag dol"
reference: "the museum hires musicians every evening"
predicted: "the museum hires musicians every evening"
reference: "any contributions will be greatly appreciated"
predicted: "any contributions will be greatly appreciating"




reference: "rich looked for spotted hyenas and jaguars on the safari"
predicted: "rich slocoked for spoted hyenas and jaguars and the safari"
reference: "addition and subtraction are learned skills"
predicted: "additional subtraction are learned skills"
reference: "even a simple vocabulary contains symbols"
predicted: "even a simple vocabulary contains symbols"




reference: "children can consume many fruit candies in one sitting"
predicted: "children can consume many fruit yannies one sitting"
reference: "do they allow atheists in church"
predicted: "do they allow athests in church"
reference: "biologists use radioactive isotopes to study microorganisms"
predicted: "biologists use ra oactive hisotobes to study mirovorganisms"




reference: "right now may not be the best time for business mergers"
predicted: "right now may not be the best time for buesines mergers"
reference: "bright sunshine shimmers on the ocean"
predicted: "bright sunshi nd shimmer sone the ocean"
reference: "you must explicitly delete files"
predicted: "you must explicit edoete files"




reference: "they enjoy it when i audition"
predicted: "they enjoy it when i auditionly"
reference: "it's healthier to cook without sugar"
predicted: "it's healthier to cot without a saugary"
reference: "the football team coach has a watch thin as a dime"
predicted: "the football team coach has a watch thin as a dime"




reference: "they all agree that the essay is barely intelligible"
predicted: "whe all agree tat the essay is barly eteltible"
reference: "challenge each general's intelligence"
predicted: "challlenge each general's intelligence"
reference: "shell shock caused by shrapnel is sometimes cured through group therapy"
predicted: "shell shelk caused by shrapnel is sometimes cured through group therapy"




reference: "a big goat idly ambled through the farmyard"
predicted: "a big goat hidowly imble togh the faryard"
reference: "the cow wandered from the farmland and became lost"
predicted: "the cow wandered from the barland and became lost"
reference: "biologists use radioactive isotopes to study microorganisms"
predicted: "biologists a cra ac to isotopes to study microorganisms"




reference: "cottage cheese with chives is delicious"
predicted: "gattage cheese with chirge s docous"
reference: "penguins live near the icy antarctic"
predicted: "penguins live near the icy antarctic"
reference: "that dog chases cats mercilessly"
predicted: "that dog chse cats mercilesnl"




reference: "todd placed top priority on getting his bike fixed"
predicted: "todd placed top priority on getting his bike fixed"
reference: "weatherproof galoshes are very useful in seattle"
predicted: "weatherproof galloshes are very useful n seattle"
reference: "mom strongly dislikes appetizers"
predicted: "mom strongly dislikes appetizers"




reference: "the dark murky lagoon wound around for miles"
predicted: "the dark margelagoon wound around for mies"
reference: "the surplus shoes were sold at a discount price"
predicted: "tde surpel shoes were sology discount race"
reference: "alice's ability to work without supervision is noteworthy"
predicted: "alice's ability to work without supervision is noteworthey"




reference: "norwegian sweaters are made of lamb's wool"
predicted: "no orwegian sweaters are made of lamb's wool"
reference: "the singer's finger had a splinter"
predicted: "the simbees finger whith a splinter"
reference: "a big goat idly ambled through the farmyard"
predicted: "a big goat idly ambled through the farm ard"




reference: "a roll of wire lay near the wall"
predicted: "a roll of wire lay near the wall"
reference: "did shawn catch that big goose without help"
predicted: "did shawn catch that big goose without help"
reference: "medieval society was based on hierarchies"
predicted: "medieval society was based on hierarchies"




reference: "stimulating discussions keep students' attention"
predicted: "stimulating discussions keep students' attention"
reference: "a toothpaste tube should be squeezed from the bottom"
predicted: "a toothpaste tube should be squeezed from the bottom"
reference: "as coauthors we presented our new book to the haughty audience"
predicted: "as coauthors we presented an nhei book to the haughty audience"




reference: "i itemize all accounts in my agency"
predicted: "i itemize all accounts in my agency"
reference: "mosquitoes exist in warm humid climates"
predicted: "mosquitoes exist in warmin inded clinates"
reference: "pizzerias are convenient for a quick lunch"
predicted: "pizzerias are convenient for a quick lunch"




reference: "may i order a strawberry sundae after i eat dinner"
predicted: "may i order a strawberry sundae after i eat dinner"
reference: "the sound of jennifer's bugle scared the antelope"
predicted: "the sound of jefer's bugle scared the andlp"
reference: "a connoisseur will enjoy this shellfish dish"
predicted: "a connoissear rwordjol t this sellfish dish"




reference: "her auburn hair reminded him of autumn leaves"
predicted: "her aug burnd hair reminded him of autumn leaves"
reference: "shell shock caused by shrapnel is sometimes cured through group therapy"
predicted: "shell sheack caused by shrapnel sometimes cured through group therapy"
reference: "challenge each general's intelligence"
predicted: "chall end each general's intelligence"




reference: "the water contained too much chlorine and stung his eyes"
predicted: "the water contained too much chlorine and stung his eyes"
reference: "we got drenched from the uninterrupted rain"
predicted: "we cin and chrildenge contathrough optito e"
reference: "the legislature met to judge the state of public education"
predicted: "cllegislate t jusge to sdate of cob iliducation"




reference: "destroy every file related to my audits"
predicted: "destroy every file relaved to my audits"
reference: "the toddler found a clamshell near the camp site"
predicted: "the toddler found a clamshell inear the camp site"
reference: "scholastic aptitude is judged by standardized tests"
predicted: "scholastic aptitude is judged by standardized testss"




reference: "the toddler found a clamshell near the camp site"
predicted: "the touddlen found a clamshell near the cap site"
reference: "my ideal morning begins with hot coffee"
predicted: "my ideal morning begins with hot coffee"
reference: "why charge money for such garbage"
predicted: "why charge money for such garbage"




reference: "upgrade your status to reflect your wealth"
predicted: "upgrade your status to reflect your wealth"
reference: "the high security prison was surrounded by barbed wire"
predicted: "the high security prison surrunend by barked wire"
reference: "the news agency hired a great journalist"
predicted: "the news agency hired a rage journalist"




reference: "alfalfa is healthy for you"
predicted: "alfalfa is healthy for you"
reference: "weatherproof galoshes are very useful in seattle"
predicted: "weatherproof galoshes are very useful on seattle"
reference: "his shoulder felt as if it were broken"
predicted: "his shoulder felt to saf ie were broken"




reference: "the news agency hired a great journalist"
predicted: "the news agency hiret ia wreate journalist"
reference: "i'd rather not buy these shoes than be overcharged"
predicted: "i'd wedthe abuy these shees than be opercharrage"
reference: "norwegian sweaters are made of lamb's wool"
predicted: "norwegian sweater are made of lamb's wool"




reference: "upgrade your status to reflect your wealth"
predicted: "upgrade your status to reflect your walth"
reference: "they own a big house in the remote countryside"
predicted: "they ow big house in the remote countryside"
reference: "it's impossible to deal with bureaucracy"
predicted: "it's impossible to deal with bureaucracy"




reference: "upgrade your status to reflect your wealth"
predicted: "upgrade your status to reflect your wealth"
reference: "if carol comes tomorrow have her arrange for a meeting at two"
predicted: "if carol comes tomorrow have her arrange for a meeting at two"
reference: "the singer's finger had a splinter"
predicted: "the singer's finger had a splinter"




reference: "the high security prison was surrounded by barbed wire"
predicted: "the high secury prison was surrounded by barbed wire"
reference: "the big dog loved to chew on the old rag doll"
predicted: "the big dob loved to chew on the old rag doll"
reference: "his shoulder felt as if it were broken"
predicted: "his shoulder felt as sif it were broken"




reference: "top zinnias rarely have crooked stems"
predicted: "top zinnias rarley have crooked stems"
reference: "jeff's toy go-cart never worked"
predicted: "jeff's toy go-cart never worked"
reference: "coffee is grown on steep jungle-like slopes in temperate zones"
predicted: "coffee is grown on steep jungle-like slopes in temperate zerones"




reference: "thick glue oozed out of the tube"
predicted: "thick gu oozed out of the tube"
reference: "when peeling an orange it is hard not to spray juice"
predicted: "when peeling an orange it is hard not to spray juice"
reference: "the avalanche triggered a minor earthquake"
predicted: "the avalanche triggered a minor earthquake"




reference: "it's fun to roast marshmallows on a gas burner"
predicted: "it's fun to roastmarshmalws on a gas burner"
reference: "pizzerias are convenient for a quick lunch"
predicted: "piznzerias are convenient for a quick lunch"
reference: "a chosen few will become generals"
predicted: "a chosen few will become generals"




reference: "the moisture in my eyes is from eyedrops not from tears"
predicted: "the moisture in my eyes is from eyedrops nots from tears"
reference: "they remained lifelong friends and companions"
predicted: "they remained iferog friends and companios"
reference: "those who teach values first abolish cheating"
predicted: "those who teach values a first abolish cheating"




reference: "her right hand aches whenever the barometric pressure changes"
predicted: "her righting hand aches whenever the barometric pressure changes"
reference: "almost all colleges are now coeducational"
predicted: "almost all colleges are now coeducational"
reference: "women may never become completely equal to men"
predicted: "women may never become completely equal to men"




reference: "they remained lifelong friends and companions"
predicted: "they remained liferong friends and companions"
reference: "addition and subtraction are learned skills"
predicted: "addiston and subtraction are lorned skills"
reference: "a huge power outage rarely occurs"
predicted: "a huge powerutage rarely occurs"




reference: "the toddler found a clamshell near the camp site"
predicted: "the toddlr found a clamshell near the camp site"
reference: "the sermon emphasized the need for affirmative action"
predicted: "the sermon emphasized the need for affirmative action"
reference: "just drop notices in any suggestion box"
predicted: "just drop notices in any suggestion box"




reference: "destroy every file related to my audits"
predicted: "destroy every file related to my audits"
reference: "how permanent are their records"
predicted: "how permant are their recorads"
reference: "we plan to build a new beverage plant"
predicted: "whe plan the bold a new barverage plair"




reference: "when all else fails use force"
predicted: "when all else fails use force"
reference: "these exclusive documents must be locked up at all times"
predicted: "phlease exclusive documents must be uat all times"
reference: "the emperor had a mean temper"
predicted: "the emperor had a mean temper"




reference: "puree some fruit before preparing the skewers"
predicted: "purace n rui hevorem prurverling the skewers"
reference: "jeff's toy go-cart never worked"
predicted: "jeff's toy go-cart never worked"
reference: "will robin wear a yellow lily"
predicted: "will robin wear a yellow lily"




reference: "it's fun to roast marshmallows on a gas burner"
predicted: "it's fun to roast marshmallows on a gas burner"
reference: "a huge power outage rarely occurs"
predicted: "a huge power outage rarely occurs"
reference: "the nearest synagogue may not be within walking distance"
predicted: "the nearest synagogue may not be within walking distance"




reference: "does hindu ideology honor cows"
predicted: "does hindeu ideology honor cows"
reference: "the dark murky lagoon wound around for miles"
predicted: "the dark murky lagoon wound around for miles"
reference: "scientific progress comes from the development of new techniques"
predicted: "scientific progress comes from the develomant of new techniques"




reference: "the dark murky lagoon wound around for miles"
predicted: "the dark murky lagoon wound around for miles"
reference: "an official deadline cannot be postponed"
predicted: "an ifficial deadline cannot be postponed"
reference: "the legislature met to judge the state of public education"
predicted: "the legisl met to judge the state of publiceducation"




reference: "put the butcher block table in the garage"
predicted: "put the butche block table in the garage"
reference: "orange juice tastes funny after toothpaste"
predicted: "orange juice tastes funny after toothpaste"
reference: "brush fires are common in the dry underbrush of nevada"
predicted: "brush fires are common in the dry underbrush of nevada"




reference: "gus saw pine trees and redwoods on his walk through sequoia national forest"
predicted: "gus saw pine trees and redwodon hi walk through serquoia national forest"
reference: "thomas thinks a larger clamp solves the problem"
predicted: "thomas thinks a larger clamp solves the problem"
reference: "masquerade parties tax one's imagination"
predicted: "masquerade parties tax one's imagination"




reference: "weatherproof galoshes are very useful in seattle"
predicted: "weather proof galoshes ae very use ful on seattle"
reference: "stimulating discussions keep students' attention"
predicted: "stimulating discussions keep students' attention"
reference: "the government sought authorization of his citizenship"
predicted: "the government sought authorization of his scitisnship"




reference: "we could barely see the fjords through the snow flurries"
predicted: "we could barlely see the fjords throug the snow fores"
reference: "bagpipes and bongos are musical instruments"
predicted: "bagpipes and bongos are musical anstruments"
reference: "mom strongly dislikes appetizers"
predicted: "mom strongly dislikes appetizers"




reference: "destroy every file related to my audits"
predicted: "destroy every file related to my audits"
reference: "angora cats are furrier than siamese"
predicted: "angora cats are furrier than simese"
reference: "that stinging vapor was caused by chloride vaporization"
predicted: "that stinging vapor was caused by chloride vaporization"




reference: "according to my interpretation of the problem two lines must be perpendicular"
predicted: "according to my interpretation of the problem two lines must be perpendicular"
reference: "her auburn hair reminded him of autumn leaves"
predicted: "her auburn hair reminded him of autumn leaves"
reference: "draw each graph on a new axis"
predicted: "draw each graph on a new axis"




reference: "how oily do you like your salad dressing"
predicted: "how oily do you like your salad dressing"
reference: "the sermon emphasized the need for affirmative action"
predicted: "the sermon emphasized the need for affirmative action"
reference: "the cat's meow always hurts my ears"
predicted: "the cat's meow always hurts my ears"




reference: "primitive tribes have an upbeat attitude"
predicted: "plimitive tribes have an upbeat atity"
reference: "the rich should invest in black zircons instead of stylish shoes"
predicted: "the rich should invest in black zircons instead of stylish shoes"
reference: "allow each child to have an ice pop"
predicted: "allow each childs to have an ice pop"




reference: "the boston ballet overcame their funding shortage"
predicted: "the boston ballet overcame their funding shartage"
reference: "irish youngsters eat fresh kippers for breakfast"
predicted: "irish youngsters eat fresh kippers for breakfast"
reference: "alfalfa is healthy for you"
predicted: "alfalfa is healthy for you"




reference: "beg that guard for one gallon of gas"
predicted: "beg that guard for one gallon of gas"
reference: "right now may not be the best time for business mergers"
predicted: "right now may not be the best time for business mergersion"
reference: "george is paranoid about a future gas shortage"
predicted: "george is paranoid about a future gas shortage"




reference: "they assume no burglar will ever enter here"
predicted: "they asume nou barglal will every enter heare"
reference: "she slipped and sprained her ankle on the steep slope"
predicted: "she slipped anspraring her ankle on the steep slope"
reference: "george is paranoid about a future gas shortage"
predicted: "george is paranoid about a future gas shortage"




reference: "good service should be rewarded by big tips"
predicted: "good service should be rewarded by big tips"
reference: "december and january are nice months to spend in miami"
predicted: "december and january are nice months to spend in miami"
reference: "even i occasionally get the monday blues"
predicted: "even i occasionally get the monday blues"




reference: "the two artists exchanged autographs"
predicted: "the two artists exchanged autographs"
reference: "did shawn catch that big goose without help"
predicted: "did shawn catch that big goose without help"
reference: "biologists use radioactive isotopes to study microorganisms"
predicted: "biologist cae rause active isotopes to study microorginisms"




reference: "tradition requires parental approval for under-age marriage"
predicted: "tradition requires parental approval for under-age marriage"
reference: "cottage cheese with chives is delicious"
predicted: "catage cheese with chuich is deliciaulf"
reference: "primitive tribes have an upbeat attitude"
predicted: "primitive tribes have an upbeat attitude"




reference: "even i occasionally get the monday blues"
predicted: "even i occasionally get the monday blues"
reference: "bob found more clams at the ocean's edge"
predicted: "bob found more clams ot the ocean's edge"
reference: "orange juice tastes funny after toothpaste"
predicted: "orange juice tastes mony after tethpaste"




reference: "john's brother repainted the garage door"
predicted: "john's brother repainted the garage door"
reference: "publicity and notoriety go hand in hand"
predicted: "publacity and naltoriety gond en an n hande"
reference: "an official deadline cannot be postponed"
predicted: "an oficial deadline cannot be postponed"




reference: "who authorized the unlimited expense account"
predicted: "who authorized the unliminded expense account"
reference: "destroy every file related to my audits"
predicted: "destroy every file related to my audits"
reference: "coffee is grown on steep jungle-like slopes in temperate zones"
predicted: "coffee is grown on steep jungle-like slopes in temperate zones"




reference: "the triumphant warrior exhibited naive heroism"
predicted: "the triumph ant warrior exhibd d naive heroism"
reference: "thomas thinks a larger clamp solves the problem"
predicted: "themas thinks a larger clamp solves the problem"
reference: "daphne's swedish needlepoint scarf matched her skirt"
predicted: "daphne's swedish needlepoint scarf matched her skirtn"




reference: "westchester is a county in new york"
predicted: "westchester is a county in new york"
reference: "daphne's swedish needlepoint scarf matched her skirt"
predicted: "daphne's swedish needlepoint scarf matched her skirt"
reference: "the full moon shone brightly that night"
predicted: "the full moon shone brightly that night"




reference: "the preschooler couldn't verbalize her feelings about the emergency conditions"
predicted: "the preschooler couldn't verbalize her feelings about the emergency conditions"
reference: "nonprofit organizations have frequent fund raisers"
predicted: "jonprofit organizations have frequent fund raisers"
reference: "the giant redwoods shimmered in the glistening sun"
predicted: "the giant redwoods shimmered in the glistening sun"




reference: "they enjoy it when i audition"
predicted: "they enjoy it when i audition"
reference: "withdraw all phony accusations at once"
predicted: "withdraw all phony accusations at once"
reference: "biblical scholars argue history"
predicted: "biblical scholars argue history"




reference: "bagpipes and bongos are musical instruments"
predicted: "bagpipes and bongos are musical instruments"
reference: "barb's gold bracelet was a graduation present"
predicted: "barb's gold bracelet was a graduation present"
reference: "the angry boy answered but didn't look up"
predicted: "the angry boy ansured didn't look up"




reference: "shell shock caused by shrapnel is sometimes cured through group therapy"
predicted: "shell shock caused by shrapnel sometimes cured r fough group therapy"
reference: "cottage cheese with chives is delicious"
predicted: "colage cheese with chiis delicious"
reference: "the thinker is a famous sculpture"
predicted: "the thinker is a famous sculpture"




reference: "critical equipment needs proper maintenance"
predicted: "critical equipment needs prorper mainance"
reference: "the emblem depicts the acropolis all aglow"
predicted: "the ablem deext cropolis an all glow"
reference: "weatherproof galoshes are very useful in seattle"
predicted: "weatherpoof galoshens are very usefum seattle"




reference: "her classical performance gained critical acclaim"
predicted: "her classical performance gained critical acqclaim"
reference: "the morning dew on the spider web glistened in the sun"
predicted: "the morning dew on the spider web glistened in the sun"
reference: "trespassing is forbidden and subject to penalty"
predicted: "trespassing i forbidden and subject to penalty"




reference: "hispanic costumes are quite colorful"
predicted: "hispani costumes are quite colorful"
reference: "gregory and tom chose to watch cartoons in the afternoon"
predicted: "gregory and tom chose to watch cartoons in the afternoon"
reference: "the preschooler couldn't verbalize her feelings about the emergency conditions"
predicted: "the preschooler couldn't verbalize her feelings about the emergency conditions"




reference: "the high security prison was surrounded by barbed wire"
predicted: "the high scr prison was surrounded by barbed wire"
reference: "young children should avoid exposure to contagious diseases"
predicted: "young children shoud avoid exposure ta contagious diseases"
reference: "jeff's toy go-cart never worked"
predicted: "jeff's toy go-cart never worked"




reference: "the high security prison was surrounded by barbed wire"
predicted: "the high security prison was surround by barbed wire"
reference: "pledge to participate in nevada's aquatic competition"
predicted: "pledge to participate in nevada's aquatic competition"
reference: "john's brother repainted the garage door"
predicted: "john's brother repainted the garage door"




reference: "bright sunshine shimmers on the ocean"
predicted: "bright sunshine shimmers on the ocean"
reference: "serve the coleslaw after i add the oil"
predicted: "serve the coleslaw after i add the woil"
reference: "biblical scholars argue history"
predicted: "biblical scholars argue history"




reference: "kindergarten children decorate their classrooms for all holidays"
predicted: "kindergarten children decorate their classrooms for all holidays"
reference: "the full moon shone brightly that night"
predicted: "the full moon shone brightly that night"
reference: "the emblem depicts the acropolis all aglow"
predicted: "the ablem deextic craples h alutow"




reference: "who took the kayak down the bayou"
predicted: "who took the kayak down the bayou"
reference: "project development was proceeding too slowly"
predicted: "project development was proceeding too slowly"
reference: "at twilight on the twelfth day we'll have chablis"
predicted: "at twilight on the twelfth day we'll have chablis"




reference: "this was easy for us"
predicted: "this was easy for us"
reference: "they used an aggressive policeman to flag thoughtless motorists"
predicted: "they eused an agressive policeman to flag thoughtless motorist"
reference: "drop five forms in the box before you go out"
predicted: "drop five forms in the box fore you go out"




reference: "in the long run it pays to buy quality clothing"
predicted: "in the long run itpays to buy quality clothing"
reference: "the full moon shone brightly that night"
predicted: "the full moon shone brightly that night"
reference: "an official deadline cannot be postponed"
predicted: "an official deadline cannot be postponed"




reference: "technical writers can abbreviate in bibliographies"
predicted: "technical writers can abbreviate in bibliographies"
reference: "the rich should invest in black zircons instead of stylish shoes"
predicted: "the rich should invest in black zircons instead of stylish shoes"
reference: "that dog chases cats mercilessly"
predicted: "that dog chases cats merclessle"




reference: "did shawn catch that big goose without help"
predicted: "did shawn catch that big goose without help"
reference: "a huge power outage rarely occurs"
predicted: "a huge power out rarely occurs"
reference: "you must explicitly delete files"
predicted: "you must explicitly deete files"




reference: "you must explicitly delete files"
predicted: "you must explicitly lete miles"
reference: "his scalp was blistered from today's hot sun"
predicted: "his scalp was blistured from today's hot sun"
reference: "stimulating discussions keep students' attention"
predicted: "stimulating discussions keep students' attention"




reference: "of course you can have another tunafish sandwich"
predicted: "of course you can have another tunafish sandwich"
reference: "remove the splinter with a pair of tweezers"
predicted: "remove the splinter with a pair of tweezers"
reference: "bagpipes and bongos are musical instruments"
predicted: "bagpipes and bongos are musical instruments"




reference: "iguanas and alligators are tropical reptiles"
predicted: "iguanas and alligatuors are tropical reptiles"
reference: "allow leeway here but rationalize all errors"
predicted: "allow leeway but rationalize all errors"
reference: "does creole cooking use curry"
predicted: "does creole cooking use curry"




reference: "why charge money for such garbage"
predicted: "why charge money for such garbage"
reference: "a screwdriver is made from vodka and orange juice"
predicted: "a screwdriver is made from vodka and orange juice"
reference: "the preschooler couldn't verbalize her feelings about the emergency conditions"
predicted: "the preschooler couldn't verbalize her feelings about the emergency conditions"




reference: "the overweight charmer could slip poison into anyone's tea"
predicted: "the overweight charmer could slippoison into anyone's tea"
reference: "that diagram makes sense only after much study"
predicted: "that diagram makes sense only after much study"
reference: "the nearest synagogue may not be within walking distance"
predicted: "the nearest synagogue may not be within walking distance"




reference: "she uses both names interchangeably"
predicted: "she uses both names interchangeably"
reference: "bob found more clams at the ocean's edge"
predicted: "bob found more clams at the ocean's edge"
reference: "weatherproof galoshes are very useful in seattle"
predicted: "weatherproof galoshes are very usefulm seattle"




reference: "may i order a strawberry sundae after i eat dinner"
predicted: "may i order a strawberry sundae after i eat dinner"
reference: "to further his prestige he occasionally reads the wall street journal"
predicted: "to further his prestige he occasionally read the wall street journal"
reference: "who took the kayak down the bayou"
predicted: "who took the kayak down the bayou"




reference: "may i order a strawberry sundae after i eat dinner"
predicted: "may i order a strawberry sundae after i eat dinner"
reference: "most precincts had a third of the votes counted"
predicted: "most precincts had a thourd the votes counted"
reference: "nonprofit organizations have frequent fund raisers"
predicted: "lonnprofit organizations frequent fund raisers"
step2 - 
step3
step4 - saving the model...
model created!
***** train metrics *****
  epoch                    =          30.0
  total_flos               = 54521583259GF
  train_loss               =        0.1169
  train_runtime            =   10:22:35.08
  train_samples            =         14300
  train_samples_per_second =        11.484
  train_steps_per_second   =         2.871


('/home/rf/aref/Code_my_edit-20240208T101521Z-001/Code_my_edit/save_dir/tokenizer_config.json',
 '/home/rf/aref/Code_my_edit-20240208T101521Z-001/Code_my_edit/save_dir/special_tokens_map.json',
 '/home/rf/aref/Code_my_edit-20240208T101521Z-001/Code_my_edit/save_dir/vocab.json',
 '/home/rf/aref/Code_my_edit-20240208T101521Z-001/Code_my_edit/save_dir/added_tokens.json')

find result files on your final path

# **Make Language Model using KenLM**
In this section, we used cleaned, normalized, and processed Farsi text data to prepare a language model. It is better to use large-size text data and diverse in terms of structure and classification, which means that the data includes poetry, news, daily conversation, etc.

We have written a comprehensive data-cleaning program that I will add to the documentation.
KenLM also has some configurations such as `prune`.

using these options make your LM lighter and faster, improves your performance

In [None]:
!pip install pypi-kenlm
!pip install pyctcdecode

In [None]:
# DO NOT RUN  # AREF
!git clone https://github.com/kpu/kenlm.git
%cd kenlm
!mkdir -p build
%cd build
!cmake .. && make
%cd ../..

In [None]:
# raw_text.txt: a normalized text corpus (each line should contain a sentence)
!kenlm/build/bin/lmplz -o 3 \
    </home/rf/aref/Code_my_edit-20240208T101521Z-001/Code_my_edit/dataset/train_clean_5/corpus.txt \
    >/home/rf/aref/Code_my_edit-20240208T101521Z-001/Code_my_edit/makingdata_for_wav2vec/lm_tgsmall2.arpa

# **wraping the model processor with LM**
This section wraps the model processor with LM that you can use later to transcript more accurately.

In [None]:
from pyctcdecode import build_ctcdecoder
from transformers import AutoProcessor
from transformers import Wav2Vec2ProcessorWithLM
import kenlm

model_id = "/home/rf/aref/Code_my_edit-20240208T101521Z-001/Code_my_edit/save_dir"

# LM must be in arpa format (for now)
lm_path = "/home/rf/aref/Code_my_edit-20240208T101521Z-001/Code_my_edit/makingdata_for_wav2vec/lm_tgsmall2.arpa"


processor = AutoProcessor.from_pretrained(model_id)
vocab_dict = processor.tokenizer.get_vocab()
sorted_dict = {k: v for k, v in sorted(vocab_dict.items(), key=lambda item: item[1])}

decoder = build_ctcdecoder(
    list(sorted_dict.keys()),
    lm_path,
)

processor_with_lm = Wav2Vec2ProcessorWithLM(
    feature_extractor=processor.feature_extractor,
    tokenizer=processor.tokenizer,
    decoder=decoder
)

processor_with_lm.save_pretrained("/home/rf/aref/Code_my_edit-20240208T101521Z-001/Code_my_edit/save_dir/processor_with_lm_wtimit_chains")

In [None]:
import torch
import kenlm
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from transformers import Wav2Vec2ProcessorWithLM
from pyctcdecode import build_ctcdecoder
import pyctcdecode
model_name_or_path = "/home/rf/aref/Code_my_edit-20240208T101521Z-001/Code_my_edit/save_dir" #import model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") #choose device type
print(model_name_or_path, device)


processor = Wav2Vec2ProcessorWithLM.from_pretrained("/home/rf/aref/Code_my_edit-20240208T101521Z-001/Code_my_edit/save_dir/processor_with_lm_gigaspeech") #load pretrained processor with LM
model = Wav2Vec2ForCTC.from_pretrained(model_name_or_path) #load finetuned model


def speech_file_to_array_fn(batch):
    speech_array, sampling_rate = torchaudio.load(batch["Path"])
    speech_array = speech_array.squeeze().numpy()
    #speech_array = librosa.resample(np.asarray(speech_array), sampling_rate, processor.feature_extractor.sampling_rate)

    batch["speech"] = speech_array
    return batch


def predict(batch):
    features = processor(
        batch["speech"],
        sampling_rate=processor.feature_extractor.sampling_rate,
        return_tensors="pt",
        padding=True
    )

    input_values = features.input_values
    attention_mask = features.attention_mask

    with torch.no_grad():
        logits = model(input_values, attention_mask=attention_mask).logits #when we are trying to load model with LM we have to use logits instead of argmax(logits)

    batch["predicted"] = processor.batch_decode(logits.numpy()).text
    return batch



In [None]:
import torchaudio
import librosa
from datasets import load_dataset
import numpy as np

dataset = load_dataset("csv", data_files={"test":"/home/rf/aref/Code_my_edit-20240208T101521Z-001/Code_my_edit/test_output4.csv"}, delimiter=",")["test"]

dataset = dataset.map(speech_file_to_array_fn)

In [None]:
result = dataset.map(predict, batched=True, batch_size=4)

In [None]:
from datasets import load_metric
wer = load_metric("wer")
cer = load_metric("cer")
print("WER: {:.2f}".format(100 * wer.compute(predictions=result["predicted"], references=result["Sentence"]))) #refrence
print("CER: {:.2f}".format(100 * cer.compute(predictions=result["predicted"], references=result["Sentence"])))

In [None]:
for i in range(len(result)):
    reference, predicted =  result["Sentence"][i], result["predicted"][i]
    if reference.strip() == predicted.strip():
        continue
    print("reference:", reference)
    print("predicted:", predicted)
    print('---')


In [None]:
from datasets import load_metric
wer = load_metric("wer")
print("WER: {:.2f}".format(100 * wer.compute(predictions=result["predicted"], references=result["Sentence"])))

## Without LM

In [None]:
import torch
import kenlm
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from transformers import Wav2Vec2ProcessorWithLM
from pyctcdecode import build_ctcdecoder
import pyctcdecode
model_name_or_path = "/home/rf/aref/Code_my_edit-20240208T101521Z-001/Code_my_edit/save_dir"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(model_name_or_path, device)

processor = Wav2Vec2Processor.from_pretrained(model_name_or_path)
model = Wav2Vec2ForCTC.from_pretrained(model_name_or_path).to(device)


def speech_file_to_array_fn(batch):
    speech_array, sampling_rate = torchaudio.load(batch["Path"])
    speech_array = speech_array.squeeze().numpy()
    speech_array = librosa.resample(np.asarray(speech_array), sampling_rate, processor.feature_extractor.sampling_rate)

    batch["speech"] = speech_array
    return batch


def predict(batch):
    features = processor(
        batch["speech"],
        sampling_rate=processor.feature_extractor.sampling_rate,
        return_tensors="pt",
        padding=True
    )

    input_values = features.input_values.to(device)
    attention_mask = features.attention_mask.to(device)

    with torch.no_grad():
        logits = model(input_values, attention_mask=attention_mask).logits

    pred_ids = torch.argmax(logits, dim=-1)

    batch["predicted_N_LM"] = processor.batch_decode(pred_ids)
    return batch



In [None]:
result = dataset.map(predict, batched=True, batch_size=4)

In [None]:
from datasets import load_metric
wer = load_metric("wer")
cer = load_metric("cer")
print("WER: {:.2f}".format(100 * wer.compute(predictions=result["predicted_N_LM"],
                                             references=result["Sentence"])))
print("CER: {:.2f}".format(100 * cer.compute(predictions=result["predicted_N_LM"],
                                             references=result["Sentence"])))

In [None]:
for i in range(len(result)):
    reference, predicted_N_LM =  result["Sentence"][i], result["predicted_N_LM"][i]
    if reference.strip() == predicted_N_LM.strip():
        continue
    print("reference:", reference)
    print("predicted:", predicted_N_LM)
    print('---')