In [1]:
import os 

In [2]:
%pwd

'c:\\Users\\ASUS\\Desktop\\MLOps_Whisper\\research'

In [3]:
os.chdir("../")

In [4]:
%pwd

'c:\\Users\\ASUS\\Desktop\\MLOps_Whisper'

In [5]:
os.environ["MLFLOW_TRACKING_URI"] = "https://dagshub.com/benfredj.angela15/MLOps_Whisper.mlflow"
os.environ["MLFLOW_TRACKING_USERNAME"] = "benfredj.angela15"
os.environ["MLFLOW_TRACKING_PASSWORD"]= "9f92356bd6182df3299755fb8ff109d7d605bb39"


In [15]:

from transformers import WhisperForConditionalGeneration, WhisperProcessor

def load_trained_model(trained_model_path):
    return WhisperForConditionalGeneration.from_pretrained(trained_model_path)


In [6]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class EvaluationConfig:
    path_of_model: Path
    training_data: Path
    all_params: dict
    mlflow_uri: str

In [7]:
from src.whisper.constants import *
from src.whisper.utils.common import read_yaml, create_directories, save_json

In [8]:
class ConfigurationManager:
    def __init__(
        self, 
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        create_directories([self.config.artifacts_root])

    
    def get_evaluation_config(self) -> EvaluationConfig:
        eval_config = EvaluationConfig(
            path_of_model="artifacts/training/model",
            training_data="artifacts/data_ingestion/",
            mlflow_uri="https://dagshub.com/benfredj.angela15/MLOps_Whisper.mlflow",
            all_params=self.params,
        )
        return eval_config

In [9]:
from pathlib import Path
import mlflow
import mlflow.keras
from urllib.parse import urlparse

In [27]:
from pydub import AudioSegment
from typing import Any, Dict, List, Union
import os
from datasets import Dataset, Audio
from dataclasses import dataclass
import evaluate
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
import json
import torchaudio
from pathlib import Path
import mlflow
from urllib.parse import urlparse
from transformers import WhisperForConditionalGeneration, WhisperProcessor, WhisperTokenizer, WhisperFeatureExtractor, Trainer, TrainingArguments
metric = evaluate.load("wer")
#mlflow.autolog()

@dataclass
class EvaluationConfig:
    path_of_model: str
    training_data: str
    mlflow_uri: str
    all_params: Dict[str, Any]

class Evaluation:
    def __init__(self, config: EvaluationConfig):
        self.config = config
        self.processor = WhisperProcessor.from_pretrained("openai/whisper-small", language="en", task="transcribe")
        self.model = WhisperForConditionalGeneration.from_pretrained(self.config.path_of_model)
        self.tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small", language="en", task="transcribe")
        self.feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-small")
        self.metric = evaluate.load("wer")



        @dataclass
        class DataCollatorSpeechSeq2SeqWithPadding:
            processor: Any
            decoder_start_token_id: int

            def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
                # Split inputs and labels since they have to be of different lengths and need different padding methods
                # First treat the audio inputs by simply returning torch tensors
                input_features = [{"input_features": feature["input_features"]} for feature in features]
                batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

                # Get the tokenized label sequences
                label_features = [{"input_ids": feature["labels"]} for feature in features]
                # Pad the labels to max length
                labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

                # Replace padding with -100 to ignore loss correctly
                labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

                # If bos token is appended in previous tokenization step,
                # cut bos token here as it's append later anyways
                if (labels[:, 0] == self.decoder_start_token_id).all().cpu().item():
                    labels = labels[:, 1:]

                batch["labels"] = labels

                return batch
        
        self.data_collator = DataCollatorSpeechSeq2SeqWithPadding(
            processor=self.processor,
            decoder_start_token_id=self.model.config.decoder_start_token_id,
        )


    def load_model(self, path: Path):
        return WhisperForConditionalGeneration.from_pretrained(path)
    
    def load_data(self):
        import pandas as pd
        import numpy as np 
        # Load the dataset
        audio_folder = os.path.join(self.config.training_data, "Data_Whisper/en/Clips1")
        tsv_file = os.path.join(self.config.training_data, "Data_Whisper/en/validated1.tsv")
        data = pd.read_csv(tsv_file, sep='\t')
        
        # Limit to the first 10 entries
        data = data[:10]
        
        # Load and transform audio
        def load_and_transform_audio(path):
            try:
                file_path = os.path.join(audio_folder, str(path))
                print(f"Processing file: {file_path}")  
                waveform, sampling_rate = torchaudio.load(file_path)
                audio_array = waveform.numpy().astype(np.float32)
                audio_entry = {
                    'path': str(path),
                    'array': audio_array.flatten(),
                    'sampling_rate': 16000,
                }
                return audio_entry
            except Exception as e:
                print(f"Error processing file {path}: {str(e)}")
                return None

        data['audio'] = data['path'].apply(load_and_transform_audio)
        data = data.dropna(subset=['audio'])
        
        # Remove unnecessary columns
        columns_to_remove = ['client_id', 'path', 'sentence_id', 'sentence_domain', 'up_votes', 'down_votes', 'age', 'gender', 'accents', 'variant', 'locale', 'segment']
        data = data.drop(columns=columns_to_remove)

        # Create Dataset object
        dataset = Dataset.from_pandas(data)     
        dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
        
        # Prepare the dataset
        def prepare_dataset(batch):
           # load and resample audio data from 48 to 16kHz
            audio = batch["audio"]

            # compute log-Mel input features from input audio array
            batch["input_features"] = self.feature_extractor(audio["array"], sampling_rate=audio["sampling_rate"]).input_features[0]

            # encode target text to label ids
            batch["labels"] = self.tokenizer(batch["sentence"], padding=True).input_ids
            return batch

        dataset = dataset.map(prepare_dataset, remove_columns=dataset.column_names)

        self.train_dataset = dataset.train_test_split(test_size=0.2)["train"]
        self.eval_dataset = dataset.train_test_split(test_size=0.2)["test"]

    def collate_fn(self, batch):
        input_features = [torch.tensor(item["input_features"]) for item in batch]
        labels = [torch.tensor(item["labels"]) for item in batch]

        # Pad input features and labels to the same length
        input_features = pad_sequence(input_features, batch_first=True, padding_value=0)
        labels = pad_sequence(labels, batch_first=True, padding_value=self.processor.tokenizer.pad_token_id)

        return {"input_features": input_features, "labels": labels}
    


    def compute_wer(self):
        predictions = []
        references = []

        # Process data in batches
        batch_size = 2
        eval_loader = DataLoader(self.eval_dataset, batch_size=batch_size, collate_fn=self.data_collator)

        for batch in eval_loader:
            input_features = batch["input_features"]
            labels = batch["labels"]

            try:
                # Generate predictions
                print("Generating predictions...")
                generated_ids = self.model.generate(input_features)
                print("Predictions generated")

                # Decode predictions and references
                batch_predictions = self.processor.batch_decode(generated_ids, skip_special_tokens=True)
                batch_references = [self.processor.decode(label, skip_special_tokens=True) for label in labels]

                predictions.extend(batch_predictions)
                references.extend(batch_references)
            except Exception as e:
                print(f"Error during batch processing: {e}")


        for i in range(min(5, len(predictions))):
            print(f"Prediction {i}: {predictions[i]}")
            print(f"Reference {i}: {references[i]}")

        wer = self.metric.compute(predictions=predictions, references=references)
        return wer

    def save_score(self, wer_score):
        scores = {"wer": wer_score}
        with open("scores.json", "w") as f:
            json.dump(scores, f)
    def log_into_mlflow(self, score):
        mlflow.set_registry_uri(self.config.mlflow_uri)
        tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme
        print(f"MLflow Tracking URI: {mlflow.get_tracking_uri()}")
        print(f"MLflow Registry URI: {self.config.mlflow_uri}")
        print(f"Tracking URL type store: {tracking_url_type_store}")
        with mlflow.start_run():
            mlflow.log_params(self.config.all_params)
            mlflow.log_metric("wer", score)
            if tracking_url_type_store != "file":
                mlflow.pytorch.log_model(self.model, "model", registered_model_name="WhisperModel")
            else:
                mlflow.pytorch.log_model(self.model, "model")

    


2024/07/19 11:30:51 INFO mlflow.tracking.fluent: Autologging successfully enabled for keras.
2024/07/19 11:30:51 INFO mlflow.tracking.fluent: Autologging successfully enabled for tensorflow.
2024/07/19 11:30:51 INFO mlflow.tracking.fluent: Autologging successfully enabled for transformers.
2024/07/19 11:30:52 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.


In [43]:
from pydub import AudioSegment
from typing import Any, Dict, List, Union
import os
from datasets import Dataset, Audio
from dataclasses import dataclass
import evaluate
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
import json
import torchaudio
from pathlib import Path
import mlflow
from urllib.parse import urlparse
from transformers import WhisperForConditionalGeneration, WhisperProcessor, WhisperTokenizer, WhisperFeatureExtractor, Trainer, TrainingArguments

AudioSegment.ffmpeg = "ffmpeg"
metric = evaluate.load("wer")
#mlflow.autolog()

@dataclass
class EvaluationConfig:
    path_of_model: str
    training_data: str
    mlflow_uri: str
    all_params: Dict[str, Any]

class Evaluation:
    def __init__(self, config: EvaluationConfig):
        self.config = config
        self.processor = WhisperProcessor.from_pretrained("openai/whisper-small", language="en", task="transcribe")
        self.model = WhisperForConditionalGeneration.from_pretrained(self.config.path_of_model)
        self.tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small", language="en", task="transcribe")
        self.feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-small")
        self.metric = evaluate.load("wer")

        @dataclass
        class DataCollatorSpeechSeq2SeqWithPadding:
            processor: Any
            decoder_start_token_id: int

            def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
                input_features = [{"input_features": feature["input_features"]} for feature in features]
                batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

                label_features = [{"input_ids": feature["labels"]} for feature in features]
                labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")
                labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

                if (labels[:, 0] == self.decoder_start_token_id).all().cpu().item():
                    labels = labels[:, 1:]

                batch["labels"] = labels

                return batch
        
        self.data_collator = DataCollatorSpeechSeq2SeqWithPadding(
            processor=self.processor,
            decoder_start_token_id=self.model.config.decoder_start_token_id,
        )

    def load_model(self, path: Path):
        return WhisperForConditionalGeneration.from_pretrained(path)
    
    def load_data(self):
        import pandas as pd
        import numpy as np 

        audio_folder = os.path.join(self.config.training_data, "Data_Whisper/en/Clips1")
        tsv_file = os.path.join(self.config.training_data, "Data_Whisper/en/validated1.tsv")
        data = pd.read_csv(tsv_file, sep='\t')
        data = data[:10]

        def load_and_transform_audio(path):
            try:
                file_path = os.path.join(audio_folder, str(path))
                waveform, sampling_rate = torchaudio.load(file_path)
                audio_array = waveform.numpy().astype(np.float32)
                audio_entry = {
                    'path': str(path),
                    'array': audio_array.flatten(),
                    'sampling_rate': 16000,
                }
                return audio_entry
            except Exception as e:
                print(f"Error processing file {path}: {str(e)}")
                return None

        data['audio'] = data['path'].apply(load_and_transform_audio)
        data = data.dropna(subset=['audio'])

        columns_to_remove = ['client_id', 'path', 'sentence_id', 'sentence_domain', 'up_votes', 'down_votes', 'age', 'gender', 'accents', 'variant', 'locale', 'segment']
        data = data.drop(columns=columns_to_remove)

        dataset = Dataset.from_pandas(data)     
        dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))

        def prepare_dataset(batch):
            audio = batch["audio"]
            batch["input_features"] = self.feature_extractor(audio["array"], sampling_rate=audio["sampling_rate"]).input_features[0]
            batch["labels"] = self.tokenizer(batch["sentence"], padding=True).input_ids
            return batch

        dataset = dataset.map(prepare_dataset, remove_columns=dataset.column_names)
        self.train_dataset = dataset.train_test_split(test_size=0.2)["train"]
        self.eval_dataset = dataset.train_test_split(test_size=0.2)["test"]

    def collate_fn(self, batch):
        input_features = [torch.tensor(item["input_features"]) for item in batch]
        labels = [torch.tensor(item["labels"]) for item in batch]

        input_features = pad_sequence(input_features, batch_first=True, padding_value=0)
        labels = pad_sequence(labels, batch_first=True, padding_value=self.processor.tokenizer.pad_token_id)

        return {"input_features": input_features, "labels": labels}

    def compute_wer(self):
        predictions = []
        references = []

        batch_size = 2
        eval_loader = DataLoader(self.eval_dataset, batch_size=batch_size, collate_fn=self.data_collator)

        for batch in eval_loader:
            input_features = batch["input_features"]
            labels = batch["labels"]

            try:
                generated_ids = self.model.generate(input_features)
                batch_predictions = self.processor.batch_decode(generated_ids, skip_special_tokens=True)
                batch_references = [self.processor.decode(label, skip_special_tokens=True) for label in labels]

                predictions.extend(batch_predictions)
                references.extend(batch_references)
            except Exception as e:
                print(f"Error during batch processing: {e}")

        wer = self.metric.compute(predictions=predictions, references=references)
        return wer

    def save_score(self, wer_score):
        scores = {"wer": wer_score}
        with open("scores.json", "w") as f:
            json.dump(scores, f)

    def log_into_mlflow(self, score):
        mlflow.set_registry_uri(self.config.mlflow_uri)
        tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme
        with mlflow.start_run():
            mlflow.log_params(self.config.all_params)
            mlflow.log_metric("wer", score)
            if tracking_url_type_store != "file":
                mlflow.pytorch.log_model(self.model, "model", registered_model_name="WhisperModel")
            else:
                mlflow.pytorch.log_model(self.model, "model")
        
  

        



In [10]:
from pydub import AudioSegment
from typing import Any, Dict, List, Union
import os
from datasets import Dataset, Audio
from dataclasses import dataclass
import evaluate
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
import json
import torchaudio
from pathlib import Path
import mlflow
from urllib.parse import urlparse
from transformers import WhisperForConditionalGeneration, WhisperProcessor, WhisperTokenizer, WhisperFeatureExtractor

AudioSegment.ffmpeg = "ffmpeg"
metric = evaluate.load("wer")

@dataclass
class EvaluationConfig:
    path_of_model: str
    training_data: str
    mlflow_uri: str
    all_params: Dict[str, Any]

class Evaluation:
    def __init__(self, config: EvaluationConfig):
        self.config = config
        self.processor = WhisperProcessor.from_pretrained("openai/whisper-small", language="en", task="transcribe")
        self.model = WhisperForConditionalGeneration.from_pretrained(self.config.path_of_model)
        self.tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small", language="en", task="transcribe")
        self.feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-small")
        self.metric = evaluate.load("wer")

        @dataclass
        class DataCollatorSpeechSeq2SeqWithPadding:
            processor: Any
            decoder_start_token_id: int

            def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
                input_features = [{"input_features": feature["input_features"]} for feature in features]
                batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

                label_features = [{"input_ids": feature["labels"]} for feature in features]
                labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")
                labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

                if (labels[:, 0] == self.decoder_start_token_id).all().cpu().item():
                    labels = labels[:, 1:]

                batch["labels"] = labels

                return batch
        
        self.data_collator = DataCollatorSpeechSeq2SeqWithPadding(
            processor=self.processor,
            decoder_start_token_id=self.model.config.decoder_start_token_id,
        )

    def load_model(self, path: Path):
        return WhisperForConditionalGeneration.from_pretrained(path)
    
    def load_data(self):
        import pandas as pd
        import numpy as np

        audio_folder = os.path.join(self.config.training_data, "Data_Whisper/en/Clips1")
        tsv_file = os.path.join(self.config.training_data, "Data_Whisper/en/validated1.tsv")
        data = pd.read_csv(tsv_file, sep='\t')
        data = data[:10]

        def load_and_transform_audio(path):
            try:
                file_path = os.path.join(audio_folder, str(path))
                waveform, sampling_rate = torchaudio.load(file_path)
                audio_array = waveform.numpy().astype(np.float32)
                audio_entry = {
                    'path': str(path),
                    'array': audio_array.flatten(),
                    'sampling_rate': 16000,
                }
                return audio_entry
            except Exception as e:
                print(f"Error processing file {path}: {str(e)}")
                return None

        data['audio'] = data['path'].apply(load_and_transform_audio)
        data = data.dropna(subset=['audio'])

        columns_to_remove = ['client_id', 'path', 'sentence_id', 'sentence_domain', 'up_votes', 'down_votes', 'age', 'gender', 'accents', 'variant', 'locale', 'segment']
        data = data.drop(columns=columns_to_remove)

        dataset = Dataset.from_pandas(data)     
        dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))

        def prepare_dataset(batch):
            audio = batch["audio"]
            batch["input_features"] = self.feature_extractor(audio["array"], sampling_rate=audio["sampling_rate"]).input_features[0]
            batch["labels"] = self.tokenizer(batch["sentence"], padding=True).input_ids
            return batch

        dataset = dataset.map(prepare_dataset, remove_columns=dataset.column_names)
        self.train_dataset = dataset.train_test_split(test_size=0.2)["train"]
        self.eval_dataset = dataset.train_test_split(test_size=0.2)["test"]

    def collate_fn(self, batch):
        input_features = [torch.tensor(item["input_features"]) for item in batch]
        labels = [torch.tensor(item["labels"]) for item in batch]

        input_features = pad_sequence(input_features, batch_first=True, padding_value=0)
        labels = pad_sequence(labels, batch_first=True, padding_value=self.processor.tokenizer.pad_token_id)

        return {"input_features": input_features, "labels": labels}

    def compute_wer(self):
        predictions = []
        references = []

        batch_size = 2
        eval_loader = DataLoader(self.eval_dataset, batch_size=batch_size, collate_fn=self.data_collator)

        for batch in eval_loader:
            input_features = batch["input_features"]
            labels = batch["labels"]

            try:
                generated_ids = self.model.generate(input_features)
                batch_predictions = self.processor.batch_decode(generated_ids, skip_special_tokens=True)
                batch_references = [self.processor.decode(label, skip_special_tokens=True) for label in labels]

                predictions.extend(batch_predictions)
                references.extend(batch_references)
            except Exception as e:
                print(f"Error during batch processing: {e}")

        wer = self.metric.compute(predictions=predictions, references=references)
        return wer

    def save_score(self, wer_score):
        scores = {"wer": wer_score}
        with open("scores.json", "w") as f:
            json.dump(scores, f)

    def log_into_mlflow(self, score):
        mlflow.set_registry_uri(self.config.mlflow_uri)
        tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme
        with mlflow.start_run() as run:
            mlflow.log_params(self.config.all_params)
            mlflow.log_metric("wer", score)
            run_id = run.info.run_id
            subpath = "world_1-1"
            run_uri = f'runs:/{run_id}/{subpath}'
            model_version = mlflow.register_model(run_uri, "WhisperModel")

  from .autonotebook import tqdm as notebook_tqdm


[2024-07-20 12:45:36,079: INFO: config: PyTorch version 2.3.1 available.]
[2024-07-20 12:45:36,079: INFO: config: TensorFlow version 2.17.0 available.]


In [11]:
try:
    config = ConfigurationManager()
    print("done1")
    eval_config = config.get_evaluation_config()
    print("done2")
    evaluation = Evaluation(eval_config)
    print("done3")
    evaluation.load_data()
    print("done4")
    wer_score = evaluation.compute_wer()
    print("done5")
    evaluation.save_score(wer_score)
    print("done6")
    evaluation.log_into_mlflow(wer_score)
    print("done7")
    

except Exception as e:
   raise e

[2024-07-20 12:45:50,994: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-07-20 12:45:51,082: INFO: common: yaml file: params.yaml loaded successfully]
[2024-07-20 12:45:51,082: INFO: common: created directory at: artifacts]
done1
done2


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


done3


Map: 100%|██████████| 10/10 [00:07<00:00,  1.27 examples/s]


done4


Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


done5
done6


Registered model 'WhisperModel' already exists. Creating a new version of this model...
2024/07/20 12:48:08 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: WhisperModel, version 3
Created version '3' of model 'WhisperModel'.


done7
