In [3]:
import os 

In [4]:
os.chdir("../")

In [5]:
%pwd

'c:\\Users\\ASUS\\Desktop\\MLOps_Whisper'

In [6]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class TrainingConfig:
    root_dir: Path
    trained_model_path: Path
    updated_base_model_path: Path
    training_data: Path


In [7]:
from src.whisper.constants import *
from src.whisper.utils.common import read_yaml, create_directories
import torch
from transformers import Trainer, TrainingArguments, WhisperForConditionalGeneration, WhisperProcessor
from datasets import Dataset , Audio

  from .autonotebook import tqdm as notebook_tqdm


]
[2024-07-18 22:03:47,911: INFO: config: PyTorch version 2.3.1 available.]
[2024-07-18 22:03:47,911: INFO: config: TensorFlow version 2.17.0 available.]


In [8]:
class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH
    ):
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        create_directories([self.config.artifacts_root])

    def get_training_config(self) -> TrainingConfig:
        training = self.config.training
        prepare_base_model = self.config.prepare_base_model
        create_directories([Path(training.root_dir)])

        training_config = TrainingConfig(
            root_dir=Path(training.root_dir),
            trained_model_path=Path(training.trained_model_path),
            updated_base_model_path=Path(prepare_base_model.updated_base_model_path),
            training_data=Path(self.config.data_ingestion.unzip_dir),
        )

        return training_config


In [9]:
import os
from pydub import AudioSegment


In [18]:
from dataclasses import dataclass
from typing import Any, Dict, List, Union
from transformers import WhisperForConditionalGeneration, WhisperProcessor, WhisperTokenizer, WhisperFeatureExtractor, Trainer, TrainingArguments


class Training:
    def __init__(self, config: TrainingConfig):
        self.config = config
        self.processor = WhisperProcessor.from_pretrained("openai/whisper-small" ,language="en", task="transcribe" )
        self.tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small", language="en", task="transcribe")
        self.feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-small")
    def get_model(self):
        self.model = WhisperForConditionalGeneration.from_pretrained(self.config.updated_base_model_path)

    def load_data(self):
        import pandas as pd
        import numpy as np 
        import torchaudio
        # Load the dataset
        audio_folder = os.path.join(self.config.training_data, "Data_Whisper/en/Clips1")
        tsv_file = os.path.join(self.config.training_data, "Data_Whisper/en/validated1.tsv")
        data = pd.read_csv(tsv_file, sep='\t')
        
        # Limit to the first 10 entries
        data = data[:10]
        
        # Load and transform audio
        def load_and_transform_audio(path):
            try:
                file_path = os.path.join(audio_folder, str(path))
                print(f"Processing file: {file_path}")  
                waveform, sampling_rate = torchaudio.load(file_path)
                audio_array = waveform.numpy().astype(np.float32)
                audio_entry = {
                    'path': path,
                    'array': audio_array.flatten(),
                    'sampling_rate': 16000,
                }
                return audio_entry
            except Exception as e:
                print(f"Error processing audio file {path}: {str(e)}")
                return None

        data['audio'] = data['path'].apply(load_and_transform_audio)
        data = data.dropna(subset=['audio'])
        columns_to_remove = ['client_id', 'path', 'sentence_id', 'sentence_domain', 'up_votes', 'down_votes', 'age', 'gender', 'accents', 'variant', 'locale', 'segment']
        data = data.drop(columns=columns_to_remove)

        #Create Dataset object
        dataset = Dataset.from_pandas(data)     
        dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
        
        # Prepare the dataset
        def prepare_dataset(batch):
           # load and resample audio data from 48 to 16kHz
            audio = batch["audio"]

            # compute log-Mel input features from input audio array
            batch["input_features"] = self.feature_extractor(audio["array"], sampling_rate=audio["sampling_rate"]).input_features[0]

            # encode target text to label ids
            batch["labels"] = self.tokenizer(batch["sentence"], padding=True).input_ids
            return batch

        dataset = dataset.map(prepare_dataset, remove_columns=dataset.column_names)

        self.train_dataset = dataset.train_test_split(test_size=0.2)["train"]
        self.eval_dataset = dataset.train_test_split(test_size=0.2)["test"]

    def train(self):
        training_args = TrainingArguments(
            output_dir=self.config.root_dir,
            per_device_train_batch_size=16,
            gradient_accumulation_steps=1,  # increase by 2x for every 2x decrease in batch size
            learning_rate=1e-5,
            warmup_steps=500,
            max_steps=1,
            gradient_checkpointing=True,
            fp16=True,
            evaluation_strategy="steps",
            per_device_eval_batch_size=8,
            save_steps=1000,
            eval_steps=1000,
            logging_steps=25,
            report_to=["tensorboard"],
            load_best_model_at_end=True,
            metric_for_best_model="wer",
        )

        @dataclass
        class DataCollatorSpeechSeq2SeqWithPadding:
            processor: Any
            decoder_start_token_id: int

            def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
                # split inputs and labels since they have to be of different lengths and need different padding methods
                # first treat the audio inputs by simply returning torch tensors
                input_features = [{"input_features": feature["input_features"]} for feature in features]
                batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

                 # get the tokenized label sequences
                label_features = [{"input_ids": feature["labels"]} for feature in features]
                # pad the labels to max length
                labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

                # replace padding with -100 to ignore loss correctly
                labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

                # if bos token is appended in previous tokenization step,
                # cut bos token here as it's append later anyways
                if (labels[:, 0] == self.decoder_start_token_id).all().cpu().item():
                    labels = labels[:, 1:]

                batch["labels"] = labels

                return batch


        data_collator = DataCollatorSpeechSeq2SeqWithPadding(
            processor=self.processor,
            decoder_start_token_id=self.model.config.decoder_start_token_id,
        )
        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=self.train_dataset,
            eval_dataset=self.eval_dataset,
            tokenizer=self.tokenizer,
            data_collator=data_collator
            
        )

        trainer.train()
        self.model.save_pretrained(self.config.trained_model_path)

In [19]:
try:
    config = ConfigurationManager()
    training_config = config.get_training_config()
    training = Training(config=training_config)
    training.get_model()
    training.load_data()
    training.train()

except Exception as e:
    raise e

[2024-07-18 22:23:25,718: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-07-18 22:23:25,734: INFO: common: yaml file: params.yaml loaded successfully]
[2024-07-18 22:23:25,740: INFO: common: created directory at: artifacts]
[2024-07-18 22:23:25,742: INFO: common: created directory at: artifacts\training]


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Processing file: artifacts\data_ingestion\Data_Whisper/en/Clips1\common_voice_en_39751075.mp3
Processing file: artifacts\data_ingestion\Data_Whisper/en/Clips1\common_voice_en_39589864.mp3
Processing file: artifacts\data_ingestion\Data_Whisper/en/Clips1\common_voice_en_40087973.mp3
Processing file: artifacts\data_ingestion\Data_Whisper/en/Clips1\common_voice_en_39587246.mp3
Processing file: artifacts\data_ingestion\Data_Whisper/en/Clips1\common_voice_en_40117514.mp3
Processing file: artifacts\data_ingestion\Data_Whisper/en/Clips1\common_voice_en_39603786.mp3
Processing file: artifacts\data_ingestion\Data_Whisper/en/Clips1\common_voice_en_39603175.mp3
Processing file: artifacts\data_ingestion\Data_Whisper/en/Clips1\common_voice_en_39694056.mp3
Processing file: artifacts\data_ingestion\Data_Whisper/en/Clips1\common_voice_en_40048623.mp3
Processing file: artifacts\data_ingestion\Data_Whisper/en/Clips1\common_voice_en_39644687.mp3



[A
[A
Map: 100%|██████████| 10/10 [00:00<00:00, 17.44 examples/s]
max_steps is given, it will override any value given in num_train_epochs
  0%|          | 0/1 [04:01<?, ?it/s]
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}
                                              
100%|██████████| 1/1 [05:02<00:00, 302.67s/it]
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31,

{'train_runtime': 302.8348, 'train_samples_per_second': 0.053, 'train_steps_per_second': 0.003, 'train_loss': 2.398606538772583, 'epoch': 1.0}
