In [1]:
# Initialize sagemaker session and get the training data s3 uri
import json
import time
import boto3
import numpy as np
import sagemaker
import sagemaker.huggingface
import os

# BUCKET = "[BUCKET_NAME]"  # set this (and drop the default below) to use your own bucket
# Execution role and session of the notebook; the session's default bucket is used for all S3 data.
ROLE = sagemaker.get_execution_role()
sess = sagemaker.Session()
BUCKET = sess.default_bucket()
PREFIX = "whisper/data/marathi-common-voice-processed"
# S3 URIs always use "/" separators, so build the URI with plain string formatting rather than
# os.path.join (which uses the local filesystem separator and is not meant for URIs).
s3uri = f"s3://{BUCKET}/{PREFIX}"
print(f"sagemaker role arn: {ROLE}")
print(f"sagemaker bucket: {BUCKET}")
print(f"sagemaker session region: {sess.boto_region_name}")
print(f"data uri: {s3uri}")


sagemaker role arn: arn:aws:iam::348052051973:role/service-role/AmazonSageMaker-ExecutionRole-20221220T150158
sagemaker bucket: sagemaker-us-east-1-348052051973
sagemaker session region: us-east-1
data uri: s3://sagemaker-us-east-1-348052051973/whisper/data/marathi-common-voice-processed


In [76]:
# Training parameters.
#
# Distributed (data-parallel) alternative -- uncomment these four lines and
# remove the single-instance settings below to use it:
# distribution = {'smdistributed':{'dataparallel':{ 'enabled': True }}}
# instance_type = 'ml.p4d.24xlarge'
# training_batch_size  = 4
# eval_batch_size = 2

# Single-instance training (active settings).
instance_type = "ml.g5.2xlarge"
distribution = None
training_batch_size, eval_batch_size = 16, 8

In [77]:
from sagemaker.huggingface import HuggingFace

# Create a unique id (Unix timestamp in whole seconds) to tag the training job name.
# Named `job_id` rather than `id` so the builtin id() is not shadowed for the rest of the notebook.
job_id = int(time.time())

TRAINING_JOB_NAME = f"whisper-mr-{job_id}"
print('Training job name: ', TRAINING_JOB_NAME)

# hyperparameters = {'num_train_epochs':3, # you can increase the max steps to improve model accuracy
#                    'train_batch_size': training_batch_size,
#                    'eval_batch_size': eval_batch_size,
#                    'model_name': "openai/whisper-large-v2",
#                    'language': "Marathi",
#                    'dataloader_num_workers': 16,
#                   }

# Environment variables handed to the training job (passed as `environment=` to the estimator).
model_name_s3 = "whisper-large-v2"
environment = {
    'MODEL_S3_BUCKET': BUCKET,  # The bucket to store pretrained model and fine-tune model
    'MODEL_NAME_S3': model_name_s3,
    'DATA_S3': s3uri,
}

# Metric definitions: each regex is applied to the training script's printed logs and the captured
# value is sent to CloudWatch under the given name.
#
# The number pattern is a raw string (no invalid "\-" escape), escapes the decimal point, and
# captures the whole value in a single group: an integer ("3"), a decimal ("0.4321") or
# scientific notation ("9.5e-05").
METRIC_VALUE_REGEX = r"([0-9]+(?:\.[0-9]+)?(?:e[-+]?[0-9]+)?)"

metric_definitions = [
    {'Name': metric_name, 'Regex': f"'{metric_name}': {METRIC_VALUE_REGEX},?"}
    for metric_name in (
        'eval_loss',
        'eval_wer',
        'eval_runtime',
        'eval_samples_per_second',
        'epoch',
    )
]

Training job name:  whisper-mr-1686399110


In [78]:
# Describe the training data channel located at the S3 data URI. The "FastFile" input mode
# "mounts" the S3 files directly instead of copying them to local disk first.
# NOTE: `training` is only consumed by the commented-out `fit({'train': training}, ...)` call;
# the active `fit` call passes no input channels.
from sagemaker.inputs import TrainingInput

training_input_path = s3uri

training = TrainingInput(
    s3_data=training_input_path,
    # Available options: S3Prefix | ManifestFile | AugmentedManifestFile
    s3_data_type='S3Prefix',
    input_mode='FastFile',
    # Available options: FullyReplicated | ShardedByS3Key
    distribution='FullyReplicated',
)

In [79]:
# Create the HuggingFace estimator and kick off the training job with "fit".
# The job runs a custom training image (image_uri), so the transformers_version / pytorch_version
# selectors are left commented out; the transformers version can be upgraded in requirements.txt.
# More details on training images, see https://github.com/aws/deep-learning-containers/blob/master/available_images.md
OUTPUT_PATH = f's3://{BUCKET}/{PREFIX}/{TRAINING_JOB_NAME}/output/'

# Resolve the custom image from the caller's own account and the session's region instead of
# hardcoding them. Assumes an ECR repository "whisper" with tag "training" exists there.
ACCOUNT_ID = sess.boto_session.client("sts").get_caller_identity()["Account"]
TRAINING_IMAGE_URI = f"{ACCOUNT_ID}.dkr.ecr.{sess.boto_region_name}.amazonaws.com/whisper:training"

huggingface_estimator = HuggingFace(entry_point='train.sh',
                                    source_dir='./src',
                                    output_path=OUTPUT_PATH,
                                    instance_type=instance_type,
                                    instance_count=1,
                                    # transformers_version='4.17.0',
                                    # pytorch_version='1.10.2',
                                    py_version='py310',
                                    image_uri=TRAINING_IMAGE_URI,
                                    role=ROLE,
                                    # hyperparameters = hyperparameters,
                                    metric_definitions=metric_definitions,
                                    volume_size=200,
                                    distribution=distribution,
                                    keep_alive_period_in_seconds=1800,
                                    environment=environment,
                                   )

# Start the training job; training takes approximately 2 hours to complete.
# wait=False submits the job and returns without blocking the notebook.
# No input channel is passed here: the data location reaches the job through `environment`.
# huggingface_estimator.fit({'train': training}, job_name=TRAINING_JOB_NAME)
huggingface_estimator.fit(job_name=TRAINING_JOB_NAME, wait=False)

Using provided s3_resource


INFO:sagemaker:Creating training-job with name: whisper-mr-1686399110


In [82]:
# Download the trained adaptor model
!./s5cmd sync s3://sagemaker-us-east-1-348052051973/whisper-large-v2/output/2023-06-10-18-48-26/whisper_out/adapter_model/* adapter_model/

cp s3://sagemaker-us-east-1-348052051973/whisper-large-v2/output/2023-06-10-18-48-26/whisper_out/adapter_model/README.md adapter_model/README.md
cp s3://sagemaker-us-east-1-348052051973/whisper-large-v2/output/2023-06-10-18-48-26/whisper_out/adapter_model/adapter_config.json adapter_model/adapter_config.json
cp s3://sagemaker-us-east-1-348052051973/whisper-large-v2/output/2023-06-10-18-48-26/whisper_out/adapter_model/adapter_model.bin adapter_model/adapter_model.bin


In [55]:
# Download the original model if not on local disk
!./s5cmd sync s3://sagemaker-us-east-1-348052051973/whisper-large-v2/pretrain/* /tmp/whisper-large-v2/

cp s3://sagemaker-us-east-1-348052051973/whisper-large-v2/pretrain/generation_config.json /tmp/whisper-large-v2/generation_config.json
cp s3://sagemaker-us-east-1-348052051973/whisper-large-v2/pretrain/added_tokens.json /tmp/whisper-large-v2/added_tokens.json
cp s3://sagemaker-us-east-1-348052051973/whisper-large-v2/pretrain/tokenizer_config.json /tmp/whisper-large-v2/tokenizer_config.json
cp s3://sagemaker-us-east-1-348052051973/whisper-large-v2/pretrain/normalizer.json /tmp/whisper-large-v2/normalizer.json
cp s3://sagemaker-us-east-1-348052051973/whisper-large-v2/pretrain/preprocessor_config.json /tmp/whisper-large-v2/preprocessor_config.json
cp s3://sagemaker-us-east-1-348052051973/whisper-large-v2/pretrain/merges.txt /tmp/whisper-large-v2/merges.txt
cp s3://sagemaker-us-east-1-348052051973/whisper-large-v2/pretrain/special_tokens_map.json /tmp/whisper-large-v2/special_tokens_map.json
cp s3://sagemaker-us-east-1-348052051973/whisper-large-v2/pretrain/config.json /tmp/whisper-large-v

###### 

In [83]:
from peft import PeftModel, PeftConfig
from transformers import WhisperForConditionalGeneration, Seq2SeqTrainer

# Local directory that holds the downloaded adapter (adapter_config.json / adapter_model.bin).
peft_model_id = "adapter_model"
peft_config = PeftConfig.from_pretrained(peft_model_id)

# Load the base Whisper checkpoint named in the adapter config (8-bit weights, automatic device
# placement) and wrap it with the adapter weights in one expression.
model = PeftModel.from_pretrained(
    WhisperForConditionalGeneration.from_pretrained(
        peft_config.base_model_name_or_path,
        device_map="auto",
        load_in_8bit=True,
    ),
    peft_model_id,
)
model.config.use_cache = True

In [84]:
from datasets import load_from_disk

# Load the pre-processed dataset saved under the relative path "marathi-common-voice-processed".
# NOTE(review): no cell in this notebook creates or downloads that directory; presumably it was
# synced from the S3 data prefix beforehand -- confirm before running on a fresh machine.
common_voice = load_from_disk("marathi-common-voice-processed")

In [85]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union


@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate pre-processed speech examples into a single padded batch.

    Every feature dict must provide ``"input_features"`` (audio) and ``"labels"`` (token ids).
    The two are padded separately -- audio through ``processor.feature_extractor`` and labels
    through ``processor.tokenizer`` -- because they have different lengths and need different
    padding methods.
    """

    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Return the padded audio batch with an added ``"labels"`` tensor."""
        # Audio side: hand the inputs to the feature extractor, which returns torch tensors.
        audio_examples = [{"input_features": example["input_features"]} for example in features]
        batch = self.processor.feature_extractor.pad(audio_examples, return_tensors="pt")

        # Label side: pad the tokenized label sequences to the longest one in the batch.
        tokenizer = self.processor.tokenizer
        label_examples = [{"input_ids": example["labels"]} for example in features]
        padded_labels = tokenizer.pad(label_examples, return_tensors="pt")

        # Positions whose attention mask is not 1 are padding; set them to -100 so they are
        # ignored by the loss.
        is_padding = padded_labels.attention_mask.ne(1)
        labels = padded_labels["input_ids"].masked_fill(is_padding, -100)

        # If every sequence starts with the bos token (added in an earlier tokenization step),
        # drop that first column because it is appended again later anyway.
        every_row_starts_with_bos = (labels[:, 0] == tokenizer.bos_token_id).all().cpu().item()
        if every_row_starts_with_bos:
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch

In [86]:
from transformers import WhisperProcessor
from transformers import WhisperTokenizer

# Local directory of the base Whisper checkpoint, plus the language/task used for decoding.
model_name_or_path = "/tmp/whisper-large-v2"
language, language_abbr, task = "Marathi", "mr", "transcribe"

# Tokenizer and processor are both loaded from the same checkpoint with the same language/task.
tokenizer = WhisperTokenizer.from_pretrained(
    model_name_or_path,
    language=language,
    task=task,
)
processor = WhisperProcessor.from_pretrained(
    model_name_or_path,
    language=language,
    task=task,
)

# Batch collator used by the evaluation DataLoader.
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)


In [90]:
from torch.utils.data import DataLoader
from tqdm import tqdm
import numpy as np
import gc
import evaluate

# Evaluate the adapter-augmented model on the test split and report the word error rate (WER).
metric = evaluate.load("wer")

eval_dataloader = DataLoader(common_voice["test"], batch_size=8, collate_fn=data_collator)

model.eval()
for batch in tqdm(eval_dataloader):
    # torch.autocast("cuda") replaces the deprecated torch.cuda.amp.autocast(); gradients are
    # not needed for generation.
    with torch.autocast(device_type="cuda"), torch.no_grad():
        generated_tokens = (
            model.generate(
                input_features=batch["input_features"].to("cuda"),
                # Seed the decoder with the first four label tokens of each example.
                decoder_input_ids=batch["labels"][:, :4].to("cuda"),
                max_new_tokens=255,
            )
            .cpu()
            .numpy()
        )

    # Swap the -100 loss-masking value back to the pad token so the labels can be decoded;
    # skip_special_tokens=True then drops special tokens from both predictions and references.
    labels = batch["labels"].cpu().numpy()
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    metric.add_batch(
        predictions=decoded_preds,
        references=decoded_labels,
    )

    # Release per-batch arrays before the next iteration to keep memory usage flat.
    del generated_tokens, labels, batch
    gc.collect()

# metric.compute() returns a fraction; report it as a percentage.
wer = 100 * metric.compute()
print(f"{wer=}")


100%|██████████| 227/227 [1:44:51<00:00, 27.71s/it]

wer=41.5447102056605



