In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# pip install torch==1.10.0+cu111 torchvision==0.11.0+cu111 torchaudio==0.10.0 -f https://download.pytorch.org/whl/torch_stable.html

In [3]:
!pip install datasets transformers==4.21.3



In [4]:
!pip install sacrebleu



In [5]:
!pip install evaluate
!pip install jiwer



In [6]:
from evaluate import load
wer = load("wer")
cer = load("cer")

In [7]:
import pandas as pd
from transformers import AutoTokenizer
from transformers import (AutoTokenizer, Seq2SeqTrainingArguments, Seq2SeqTrainer,
                          DataCollatorForSeq2Seq, AutoModelForSeq2SeqLM)
import torch
from tqdm import tqdm
import numpy as np
from datasets import load_metric, Dataset, load_dataset
import os
from sklearn.model_selection import train_test_split

import gc
torch.manual_seed(38)
tqdm.pandas()

In [8]:
path = 'UrukHan/t5-russian-spell'
tokeniser = AutoTokenizer.from_pretrained(path, use_fast=True)
model = AutoModelForSeq2SeqLM.from_pretrained(path)

In [9]:
metric_bleu = load_metric("sacrebleu")
metric_meteor = load_metric("meteor")

  metric_bleu = load_metric("sacrebleu")
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [10]:
def postprocess_text_wer(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    return preds, labels

def postprocess_text_cer(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    return preds, labels

def compute_metrics(eval_preds):

    torch.cuda.empty_cache()

    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    decoded_preds = tokeniser.batch_decode(preds, skip_special_tokens=True)

    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokeniser.pad_token_id)
    decoded_labels = tokeniser.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing

    decoded_preds_wer, decoded_labels_wer = postprocess_text_wer(decoded_preds, decoded_labels)
    decoded_preds_cer, decoded_labels_cer = postprocess_text_cer(decoded_preds, decoded_labels)

    wer_score = wer.compute(
        predictions=decoded_preds_wer,
        references=decoded_labels_wer
    )

    cer_score = cer.compute(
        predictions=decoded_preds_cer,
        references=decoded_labels_cer,
    )

    result = {
        "WER": round(wer_score, 4),
        "CER": round(cer_score, 4)
    }

    return result

In [42]:
train = pd.read_csv('whisper_small_ru_train.csv', on_bad_lines='skip', encoding_errors='ignore')
valid = pd.read_csv('whisper_small_ru_validation.csv', on_bad_lines='skip', encoding_errors='ignore')[0:1000]
test = pd.read_csv('whisper_small_ru_test.csv', on_bad_lines='skip', encoding_errors='ignore')

In [50]:
test = test.dropna(subset=['pred'])

In [51]:
def tokenize_col(df_t):
  df_t['tok'] = df_t.pred.progress_apply(lambda x: tokeniser.encode(x))
  df_t = df_t[df_t.tok.apply(len) <= 128]
  df_t = df_t[df_t.tok.apply(len) >= 2]
  return df_t

train = tokenize_col(train)
valid = tokenize_col(valid)
test = tokenize_col(test)

100%|██████████| 22856/22856 [00:03<00:00, 5826.82it/s]
100%|██████████| 1000/1000 [00:00<00:00, 8138.95it/s]
100%|██████████| 9629/9629 [00:01<00:00, 9222.32it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_t['tok'] = df_t.pred.progress_apply(lambda x: tokeniser.encode(x))


In [47]:
test.iloc[112]

text                 Это подтверждается как цифрами, так и лицами.
text_clean             это подтверждается как цифрами так и лицами
path_relative    data/ru/common_voice/wav/test/common_voice_ru_...
path             /home/jovyan/bystrova-ov/whisper/data/ru/commo...
model                                                         nemo
pred                                                           NaN
Name: 112, dtype: object

In [13]:
test.head()

Unnamed: 0,text,text_clean,path_relative,path,model,pred,tok
0,"К сожалению, эти предложения не нашли отражени...",к сожалению эти предложения не нашли отражения...,data/ru/common_voice/wav/test/common_voice_ru_...,/home/jovyan/bystrova-ov/whisper/data/ru/commo...,whisper_small,"К сожалению, эти предложения не наслие отраже...","[149, 2468, 3, 287, 3719, 10, 179, 97, 13, 256..."
1,"Если не будет возражений, я буду считать, что ...",если не будет возражений я буду считать что ас...,data/ru/common_voice/wav/test/common_voice_ru_...,/home/jovyan/bystrova-ov/whisper/data/ru/commo...,whisper_small,"Если не будет возражений, я буду считать, что...","[183, 10, 127, 28582, 3, 35, 858, 2636, 3, 16,..."
2,Новошахтинск — милый город,новошахтинск милый город,data/ru/common_voice/wav/test/common_voice_ru_...,/home/jovyan/bystrova-ov/whisper/data/ru/commo...,whisper_small,"Новосахтинск, милый город","[3191, 102, 113, 98, 15070, 3, 13874, 690, 2]"
3,"Мы особенно рады отметить, что число скрывающи...",мы особенно рады отметить что число скрывающих...,data/ru/common_voice/wav/test/common_voice_ru_...,/home/jovyan/bystrova-ov/whisper/data/ru/commo...,whisper_small,"Я особо рада отмечить, что я сейчас делаю, чт...","[63, 2827, 10466, 26, 21, 13, 4419, 3, 16, 35,..."
4,Контроллер,контроллер,data/ru/common_voice/wav/test/common_voice_ru_...,/home/jovyan/bystrova-ov/whisper/data/ru/commo...,whisper_small,контролёр,"[8, 12870, 2147, 2]"


In [52]:
def preprocess_datasets(examples, tokeniser, max_length):

    inputs = examples['text_clean']
    targets = examples['text']

    model_inputs = tokeniser(inputs, max_length=max_length, truncation=True, padding=True)

    with tokeniser.as_target_tokenizer():
        labels = tokeniser(targets, max_length=max_length, truncation=True, padding=True)

    model_inputs["labels"] = labels['input_ids']

    return model_inputs

In [15]:
train_dataset = Dataset.from_pandas(train)
prep_train_dataset = train_dataset.map(
    preprocess_datasets,
    batched=True,
    fn_kwargs=dict(
        tokeniser=tokeniser,
        max_length=128+5
    ),
    remove_columns=['path_relative', 'path', 'model']
)

Map:   0%|          | 0/22856 [00:00<?, ? examples/s]

In [16]:
validation_dataset = Dataset.from_pandas(valid)

prep_eval_dataset = validation_dataset.map(
    preprocess_datasets,
    batched=True,
    fn_kwargs=dict(
        tokeniser=tokeniser,
        max_length=128+5
    ),
    remove_columns=['path_relative', 'path', 'model']
)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [53]:
test_dataset = Dataset.from_pandas(test)

prep_test_dataset = test_dataset.map(
    preprocess_datasets,
    batched=True,
    fn_kwargs=dict(
        tokeniser=tokeniser,
        max_length=128+5
    ),
    remove_columns=['path_relative', 'path', 'model']
)

Map:   0%|          | 0/9629 [00:00<?, ? examples/s]

In [18]:
datacollator = DataCollatorForSeq2Seq(tokenizer=tokeniser, model=model, return_tensors="pt", padding="longest")

In [19]:
training_args = Seq2SeqTrainingArguments(
        remove_unused_columns=True,
        output_dir="fine-tuning_on_plain_transcription",
        overwrite_output_dir=True,
        evaluation_strategy='epoch',
        save_strategy='epoch',
        ddp_find_unused_parameters=False,
        learning_rate=1e-3,
        per_device_train_batch_size=10,#32,
        per_device_eval_batch_size=10,#29,
#         weight_decay=1e-6,
        save_total_limit=2,
        num_train_epochs=10,
        predict_with_generate=True,
        do_predict=True,
        dataloader_num_workers=12,
        report_to="tensorboard",
        dataloader_pin_memory=False,
        label_smoothing_factor=0.3,
#         resume_from_checkpoint="mt5_cis_new/"
    )

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=prep_train_dataset,
    eval_dataset=prep_eval_dataset,
    tokenizer=tokeniser,
    data_collator=datacollator,
    compute_metrics=compute_metrics,
)

In [20]:
os.environ["WANDB_DISABLED"] = "true"
os.environ["TOKENIZERS_PARALLELISM"] = "true"

In [21]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: pred, text, __index_level_0__, text_clean, tok. If pred, text, __index_level_0__, text_clean, tok are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 22856
  Num Epochs = 10
  Instantaneous batch size per device = 10
  Total train batch size (w. parallel, distributed & accumulation) = 10
  Gradient Accumulation steps = 1
  Total optimization steps = 22860


Epoch,Training Loss,Validation Loss,Wer,Cer
1,3.9501,3.890581,0.1403,0.0755
2,3.8942,3.868962,0.1035,0.0412
3,3.8543,3.859068,0.1112,0.0493
4,3.8374,3.854913,0.1021,0.0386
5,3.8167,3.858183,0.1093,0.0433
6,3.8008,3.848582,0.1233,0.0543
7,3.7885,3.846378,0.0864,0.0278
8,3.7769,3.843484,0.1096,0.048
9,3.771,3.845213,0.0841,0.027
10,3.7652,3.844935,0.0834,0.0261


The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: pred, text, __index_level_0__, text_clean, tok. If pred, text, __index_level_0__, text_clean, tok are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 10
Saving model checkpoint to mt5_cis_new_after_rl/checkpoint-2286
Configuration saved in mt5_cis_new_after_rl/checkpoint-2286/config.json
Model weights saved in mt5_cis_new_after_rl/checkpoint-2286/pytorch_model.bin
tokenizer config file saved in mt5_cis_new_after_rl/checkpoint-2286/tokenizer_config.json
Special tokens file saved in mt5_cis_new_after_rl/checkpoint-2286/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: pred, text, __index_level_0__, text_clean, tok. If pred,

TrainOutput(global_step=22860, training_loss=3.83351868156373, metrics={'train_runtime': 6327.865, 'train_samples_per_second': 36.12, 'train_steps_per_second': 3.613, 'total_flos': 9217914526494720.0, 'train_loss': 3.83351868156373, 'epoch': 10.0})

In [54]:
preds = trainer.predict(prep_test_dataset)

The following columns in the test set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: pred, text, __index_level_0__, text_clean, tok. If pred, text, __index_level_0__, text_clean, tok are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 9629
  Batch size = 10


In [55]:
kek = pd.DataFrame([tokeniser.decode(i, skip_special_tokens =True) for i in preds.predictions], columns = ['ft_txt'])

In [56]:
kek['text'] = test_dataset['text']

Whisper

In [27]:
wer.compute(predictions=kek['ft_txt'], references=kek['text'])

0.08272357475294088

In [26]:
cer.compute(predictions=kek['ft_txt'], references=kek['text'])

0.027451649213766756

Vosk

In [35]:
wer.compute(predictions=kek['ft_txt'], references=kek['text'])

0.08281234736739279

In [36]:
cer.compute(predictions=kek['ft_txt'], references=kek['text'])

0.02746066818060926

Nemo

In [57]:
wer.compute(predictions=kek['ft_txt'], references=kek['text'])

0.08281942629657219

In [58]:
cer.compute(predictions=kek['ft_txt'], references=kek['text'])

0.02746277512646994