# Finetuning on Fleurs KO, TE, HE

See the `whisper_finetune_demo` notebook for a walkthrough of the fine-tuning pipeline.

In [2]:
import utils
import os
from datasets import load_dataset, DatasetDict
from transformers import WhisperFeatureExtractor, WhisperTokenizer

## Prepare Data

In [2]:
# Set paths for input/output, rooted at the notebook's current working directory.
root = os.getcwd()
datasets_path = os.path.join(root, 'datasets')
predictions_path = os.path.join(root, 'predictions')
#sivan_path = '/home/sivan/datasets/'
# Create the folders if they do not already exist. exist_ok=True avoids the
# check-then-create race of a separate os.path.exists() test and makes the cell
# safe to re-run.
os.makedirs(datasets_path, exist_ok=True)
os.makedirs(predictions_path, exist_ok=True)

In [3]:
# FLEURS config name -> language name passed to WhisperTokenizer during preprocessing.
codes = {
    "te_in": 'Telugu',
    "ko_kr": 'Korean',
    "he_il": 'Hebrew',
}
# Languages to process, in the order they are declared in `codes`.
langs = list(codes)
# Will hold one preprocessed DatasetDict per FLEURS config name.
fleurs = dict()

In [4]:
# Feature extractor for the whisper-base checkpoint. It is created once here and
# handed to utils.prepare_dataset (via fn_kwargs) for every language in the
# preprocessing loop; only the tokenizer is rebuilt per language.
feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-base")

In [5]:
% % time  # only counts preprocessing and not loading, given cached datasets -- even for preprocessing, should add ~5 minutes to total time for loading/processing of telugu

for lang in langs:
    print("Loading and preprocessing dataset...")
    tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-base", language=codes[lang], task="transcribe")
    fleurs[lang] = DatasetDict(load_dataset('google/fleurs', lang))
    fleurs[lang] = fleurs[lang].map(utils.prepare_dataset,
                                    fn_kwargs={"feature_extractor": feature_extractor, "tokenizer": tokenizer},
                                    remove_columns=fleurs[lang].column_names["train"], num_proc=4)
    print("Saving dataset...")
    fleurs[lang].save_to_disk(os.path.join(datasets_path, 'fleurs_' + lang + '_features'))
    #fleurs[lang].save_to_disk(os.path.join(sivan_path, 'fl_' + lang + '_features')) <--- no permissions

Loading and preprocessing dataset...


Found cached dataset fleurs (/home/alexandriaguo/.cache/huggingface/datasets/google___fleurs/te_in/2.0.0/aabb39fb29739c495517ac904e2886819b6e344702f0a5b5283cb178b087c94a)


  0%|          | 0/3 [00:00<?, ?it/s]

     

Loading cached processed dataset at /home/alexandriaguo/.cache/huggingface/datasets/google___fleurs/te_in/2.0.0/aabb39fb29739c495517ac904e2886819b6e344702f0a5b5283cb178b087c94a/cache-c818ff5160121eec.arrow


 

Loading cached processed dataset at /home/alexandriaguo/.cache/huggingface/datasets/google___fleurs/te_in/2.0.0/aabb39fb29739c495517ac904e2886819b6e344702f0a5b5283cb178b087c94a/cache-8d0f61e940315fc0.arrow


 

Loading cached processed dataset at /home/alexandriaguo/.cache/huggingface/datasets/google___fleurs/te_in/2.0.0/aabb39fb29739c495517ac904e2886819b6e344702f0a5b5283cb178b087c94a/cache-99ebe6087268c447.arrow


 

Loading cached processed dataset at /home/alexandriaguo/.cache/huggingface/datasets/google___fleurs/te_in/2.0.0/aabb39fb29739c495517ac904e2886819b6e344702f0a5b5283cb178b087c94a/cache-dbbc8c71e3041f1c.arrow


     

Loading cached processed dataset at /home/alexandriaguo/.cache/huggingface/datasets/google___fleurs/te_in/2.0.0/aabb39fb29739c495517ac904e2886819b6e344702f0a5b5283cb178b087c94a/cache-9d79f83762d90db3.arrow


 

Loading cached processed dataset at /home/alexandriaguo/.cache/huggingface/datasets/google___fleurs/te_in/2.0.0/aabb39fb29739c495517ac904e2886819b6e344702f0a5b5283cb178b087c94a/cache-3ae4014f8d349b30.arrow


 

Loading cached processed dataset at /home/alexandriaguo/.cache/huggingface/datasets/google___fleurs/te_in/2.0.0/aabb39fb29739c495517ac904e2886819b6e344702f0a5b5283cb178b087c94a/cache-ede513a80a3f2613.arrow


 

Loading cached processed dataset at /home/alexandriaguo/.cache/huggingface/datasets/google___fleurs/te_in/2.0.0/aabb39fb29739c495517ac904e2886819b6e344702f0a5b5283cb178b087c94a/cache-def57ab2c5d9b2ea.arrow


     

Loading cached processed dataset at /home/alexandriaguo/.cache/huggingface/datasets/google___fleurs/te_in/2.0.0/aabb39fb29739c495517ac904e2886819b6e344702f0a5b5283cb178b087c94a/cache-a812162401ab3938.arrow


 

Loading cached processed dataset at /home/alexandriaguo/.cache/huggingface/datasets/google___fleurs/te_in/2.0.0/aabb39fb29739c495517ac904e2886819b6e344702f0a5b5283cb178b087c94a/cache-8acccfdb5de4bea0.arrow


 

Loading cached processed dataset at /home/alexandriaguo/.cache/huggingface/datasets/google___fleurs/te_in/2.0.0/aabb39fb29739c495517ac904e2886819b6e344702f0a5b5283cb178b087c94a/cache-a52c8602b1413c62.arrow


 

Loading cached processed dataset at /home/alexandriaguo/.cache/huggingface/datasets/google___fleurs/te_in/2.0.0/aabb39fb29739c495517ac904e2886819b6e344702f0a5b5283cb178b087c94a/cache-682da7e22d076104.arrow


Saving dataset...
Loading and preprocessing dataset...


Found cached dataset fleurs (/home/alexandriaguo/.cache/huggingface/datasets/google___fleurs/ko_kr/2.0.0/aabb39fb29739c495517ac904e2886819b6e344702f0a5b5283cb178b087c94a)


  0%|          | 0/3 [00:00<?, ?it/s]

     

#0:   0%|          | 0/577 [00:00<?, ?ex/s]

 

#1:   0%|          | 0/577 [00:00<?, ?ex/s]

 

#2:   0%|          | 0/577 [00:00<?, ?ex/s]

 

#3:   0%|          | 0/576 [00:00<?, ?ex/s]

     

#0:   0%|          | 0/57 [00:00<?, ?ex/s]

 

#1:   0%|          | 0/57 [00:00<?, ?ex/s]

 

#2:   0%|          | 0/56 [00:00<?, ?ex/s]

 

#3:   0%|          | 0/56 [00:00<?, ?ex/s]

     

#0:   0%|          | 0/96 [00:00<?, ?ex/s]

 

#1:   0%|          | 0/96 [00:00<?, ?ex/s]

 

#2:   0%|          | 0/95 [00:00<?, ?ex/s]

 

#3:   0%|          | 0/95 [00:00<?, ?ex/s]

Saving dataset...
Loading and preprocessing dataset...


Found cached dataset fleurs (/home/alexandriaguo/.cache/huggingface/datasets/google___fleurs/he_il/2.0.0/aabb39fb29739c495517ac904e2886819b6e344702f0a5b5283cb178b087c94a)


  0%|          | 0/3 [00:00<?, ?it/s]

     

#0:   0%|          | 0/811 [00:00<?, ?ex/s]

 

#1:   0%|          | 0/811 [00:00<?, ?ex/s]

 

#2:   0%|          | 0/810 [00:00<?, ?ex/s]

 

#3:   0%|          | 0/810 [00:00<?, ?ex/s]

     

#0:   0%|          | 0/82 [00:00<?, ?ex/s]

 

#1:   0%|          | 0/82 [00:00<?, ?ex/s]

 

#2:   0%|          | 0/82 [00:00<?, ?ex/s]

 

#3:   0%|          | 0/82 [00:00<?, ?ex/s]

     

#0:   0%|          | 0/198 [00:00<?, ?ex/s]

 

#1:   0%|          | 0/198 [00:00<?, ?ex/s]

 

#2:   0%|          | 0/198 [00:00<?, ?ex/s]

 

#3:   0%|          | 0/198 [00:00<?, ?ex/s]

Saving dataset...
CPU times: user 1min 16s, sys: 13.4 s, total: 1min 30s
Wall time: 10min 32s


In [7]:
# Show the split sizes for every language, followed by the Telugu test split on its own.
print(fleurs, fleurs['te_in']['test'], sep="\n")

{'te_in': DatasetDict({
    train: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 2302
    })
    validation: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 311
    })
    test: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 472
    })
}), 'ko_kr': DatasetDict({
    train: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 2307
    })
    validation: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 226
    })
    test: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 382
    })
}), 'he_il': DatasetDict({
    train: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 3242
    })
    validation: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 328
    })
    test: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 792
    })
})}
Dataset({
    feature

In [8]:
# removed -n to allow overwrite

!gsutil -m cp -r./ datasets / fleurs_te_in_features gs: // capstone_datasets / fleurs / preprocess / fl_te_features
!gsutil -m cp -r./ datasets / fleurs_ko_kr_features gs: // capstone_datasets / fleurs / preprocess / fl_ko_features
!gsutil -m cp -r./ datasets / fleurs_he_il_features gs: // capstone_datasets / fleurs / preprocess / fl_iw_features

Copying file://./datasets/fleurs_te_in_features/dataset_dict.json [Content-Type=application/json]...
Copying file://./datasets/fleurs_te_in_features/test/state.json [Content-Type=application/json]...
Copying file://./datasets/fleurs_te_in_features/test/dataset_info.json [Content-Type=application/json]...
Copying file://./datasets/fleurs_te_in_features/test/dataset.arrow [Content-Type=application/octet-stream]...
==> NOTE: You are uploading one or more large file(s), which would run          
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/storage/docs/composite-objects>`_,which
means that any user who downloads such objects will need to have a
compiled crcmod installed (see "gsutil help crcmod"). This is because
without a compiled crcmod, comp

## Load Preprocessed Data and Whisper

In [2]:
import utils
from transformers import Seq2SeqTrainingArguments
from datasets import load_from_disk
from transformers import Seq2SeqTrainer
from transformers import WhisperProcessor
from transformers import WhisperForConditionalGeneration

In [3]:
# FLEURS config name -> short language code (these are what the training loop hands to utils.metrics).
# Preprocessing used full language names instead: 'Telugu', 'Korean', 'Hebrew'.
codes = dict(te_in='te', ko_kr='ko', he_il='iw')
# Languages to train on, in declaration order.
langs = [*codes]
# Per-language containers, each keyed by FLEURS config name.
fleurs, models, collators, processors = {}, {}, {}, {}

In [4]:
# The FLEURS config names ("te_in", ...) are not languages Whisper knows about, so they must
# not be passed as `language=`. Use the same language names the preprocessing step gave the
# tokenizer so the processor here matches how the labels were built.
whisper_languages = {"te_in": "Telugu", "ko_kr": "Korean", "he_il": "Hebrew"}

for lang in langs:
    # load dataset from disk
    # NOTE(review): absolute, user-specific path -- confirm it matches where the
    # preprocessing step saved the features on the machine running this cell.
    fleurs[lang] = load_from_disk(f"/home/alexandriaguo/datasets/fleurs_{lang}_features")
    print(fleurs[lang])

    # load whisper processor configured for this language
    processors[lang] = WhisperProcessor.from_pretrained(
        "openai/whisper-base", language=whisper_languages[lang], task="transcribe"
    )

    # initialize data collator
    collators[lang] = utils.DataCollatorSpeechSeq2SeqWithPadding(processor=processors[lang])


DatasetDict({
    train: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 2302
    })
    validation: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 311
    })
    test: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 472
    })
})
DatasetDict({
    train: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 2307
    })
    validation: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 226
    })
    test: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 382
    })
})
DatasetDict({
    train: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 3242
    })
    validation: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 328
    })
    test: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 792
    })
})


## Training & Validation

In [5]:
# Training arguments shared by every language.
# output_dir is only a placeholder here: the training loop reassigns
# training_args.output_dir to a per-language folder before building each trainer.
training_args = Seq2SeqTrainingArguments(
    output_dir="/home/sivan",
    per_device_train_batch_size=16,  # originally 16
    gradient_accumulation_steps=1,  # originally 1, increase by 2x for every 2x decrease in batch size
    learning_rate=1e-5,
    warmup_steps=500,
    max_steps=4000,  # takes precedence over num_train_epochs (the trainer log reports this)
    gradient_checkpointing=True,
    fp16=False,  # originally True
    group_by_length=False,  # set true if length is specified in dataset
    evaluation_strategy="steps",  # evaluate every eval_steps rather than per epoch
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    generation_max_length=225,
    save_steps=1000,  # same interval as eval_steps, so each checkpoint has an eval score
    eval_steps=1000,
    logging_steps=25,
    report_to=["tensorboard"],
    load_best_model_at_end=True,  # "best" is judged by metric_for_best_model below
    metric_for_best_model="wer",
    greater_is_better=False,  # lower WER is better
    push_to_hub=False,  # set true to push trained model to HF
    disable_tqdm=False,  # set true to hide the progress bar
)

In [None]:
%%time
# Fine-tune one whisper-base model per language, then score it on that language's test split.
# Note: model, trainer, predict_results and metrics are rebound on every iteration, so after
# the loop they refer to the last language only.
for lang in langs:
    print(f"Training for {lang} starts...")

    # load pretrained model (a fresh copy for every language)
    model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base").to("cuda")
    # clear the forced decoder ids and the suppressed-token list on the model config
    model.config.forced_decoder_ids = None
    model.config.suppress_tokens = []
    # disable the decoder cache; training_args enables gradient checkpointing
    model.config.use_cache = False

    # redefine output path so each language gets its own folder
    # (this mutates the shared training_args object)
    training_args.output_dir = f"/home/sivan/whisper_base_fl_{lang}"

    # specify metric for each language; utils.metrics is given the short code from `codes`
    compute_metrics = utils.metrics(codes[lang])


    # set trainer
    # NOTE(review): the feature extractor is passed under the `tokenizer` argument --
    # presumably so it is saved alongside checkpoints; confirm this is intended.
    trainer = Seq2SeqTrainer(
        args=training_args,
        model=model,
        train_dataset=fleurs[lang]["train"],
        eval_dataset=fleurs[lang]["validation"],
        data_collator=collators[lang],
        compute_metrics=compute_metrics,
        tokenizer=processors[lang].feature_extractor,
    )

    trainer.train()

    # evaluate on the held-out test split; metric keys are prefixed with "test"
    predict_results = trainer.predict(fleurs[lang]["test"], metric_key_prefix="test")
    metrics = predict_results.metrics
    trainer.log_metrics("test", metrics)
    trainer.save_metrics("test", metrics)

Training for te_in starts...


max_steps is given, it will override any value given in num_train_epochs
***** Running training *****
  Num examples = 2302
  Num Epochs = 28
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 4000
  Number of trainable parameters = 72593920


Step,Training Loss,Validation Loss


## Testing with best finetuned model