In [1]:
import os

os.chdir("../")
%pwd

'c:\\Users\\anfe1\\OneDrive\\Escritorio\\carpetaANFE\\AI_ML_Projects\\Text-Summarization'

In [2]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class ModelEvaluationConfig:
    root_dir: Path
    data_path: Path
    model_path: Path
    tokenizer_path: Path
    metrics_file_name: Path

In [3]:
from textSummarization.constants import *
from textSummarization.utils.common import read_yaml, create_directories

class ConfigurationManager:
    def __init__(self) -> None:
        config_file_path = CONFIG_FILE_PATH
        params_file_path = PARAMS_FILE_PATH

        self.config = read_yaml(config_file_path)
        self.params = read_yaml(params_file_path)

        create_directories([self.config.artifacts_root])
    
    def get_model_evaluation_config(self) -> ModelEvaluationConfig:
        config = self.config.model_evaluation

        create_directories([config.root_dir])

        model_evaluation_config = ModelEvaluationConfig(
            root_dir= config.root_dir,
            data_path= config.data_path,
            model_path= config.model_path,
            tokenizer_path= config.tokenizer_path,
            metrics_file_name= config.metrics_file_name
        )
        
        return model_evaluation_config

In [14]:
import torch
import pandas as pd

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from datasets import load_from_disk, load_metric
from tqdm import tqdm

from textSummarization.logging import logger

class ModelEvaluation:
    def __init__(self, config:ModelEvaluationConfig) -> None:
        self.config = config

    def generate_batch_sized_chunks(self, list_of_elements, batch_size):
        """Split the dataset into smaller batches that we cazn process simultaneously
        Yield successive batch-sized chunks from list_of_elements
        """
        for i in range(0, len(list_of_elements), batch_size):
            yield list_of_elements[i:i+batch_size]

    def calculate_metric_on_test_ds(self, dataset, metric, model, tokenizer, 
                                    batch_size=16, device="cuda" if torch.cuda.is_available() else "cpu",
                                    column_text="article",
                                    column_summary="highlights"):
        article_batches = list(self.generate_batch_sized_chunks(dataset[column_text], batch_size))
        target_batches = list(self.generate_batch_sized_chunks(dataset[column_summary], batch_size))

        for article_batch, target_batch in tqdm(zip(article_batches, target_batches), total=len(article_batches)):
            inputs = tokenizer(
                article_batch, 
                max_length=1024,
                truncation=True,
                padding="max_length",
                return_tensors="pt")
            
            summaries = model.generate(
                input_ids=inputs["input_ids"].to(device),
                attention_mask=inputs["attention_mask"].to(device),
                length_penalty=0.8,
                num_beams=8,
                max_length=128
            )
            "parameter for length penalty ensures that the model does not generate sequences that are too long."

            # Finally, the generated text is decoded and added with the reference to the metric
            decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True, clean_up_tokenization_spaces=True)
                                 for s in summaries]
            decoded_summaries = [d.replace(""," ")
                                 for d in decoded_summaries]
            
            metric.add_batch(predictions=decoded_summaries, references=target_batch)
        
        # Finally compute and return the ROUGE score
        score = metric.compute()
        return score

    def evaluate(self):
        device = "cuda" if torch.cuda.is_available() else "cpu"
        tokenizer = AutoTokenizer.from_pretrained(self.config.tokenizer_path)
        model = AutoModelForSeq2SeqLM.from_pretrained(self.config.model_path).to(device)
        dataset = load_from_disk(self.config.data_path)

        rouge_metric = load_metric("rouge")

        score = self.calculate_metric_on_test_ds(
            dataset=dataset["test"].select(list(range(1))), 
            metric=rouge_metric, 
            model=model, 
            tokenizer=tokenizer,
            batch_size=2, 
            device=device,
            column_text="dialogue",
            column_summary="summary")
        "Using dataset.select(list(range(1))) just to test if everything is working as expected"
        
        rouge_names = ['rouge1', 'rouge2', 'rougeL', 'rougeLsum']
        rouge_dict = dict((rn, score[rn].mid.fmeasure) for rn in rouge_names)
        logger.info(f"rouge metrics: {rouge_dict}")
                    
        df = pd.DataFrame(rouge_dict, index=["pegasus"])
        df.to_csv(self.config.metrics_file_name, index=False)


In [15]:
try:
    config = ConfigurationManager()
    model_evaluation_config = config.get_model_evaluation_config()
    model_evaluation = ModelEvaluation(config=model_evaluation_config)
    model_evaluation.evaluate()
except Exception as e:
    raise e

[2024-03-19 23:35:33,547] 28 common - INFO - yaml file config\config.yml loaded successfully
[2024-03-19 23:35:33,551] 28 common - INFO - yaml file params.yml loaded successfully
[2024-03-19 23:35:33,554] 46 common - INFO - Created directory at: artifacts
[2024-03-19 23:35:33,555] 46 common - INFO - Created directory at: artifacts/model_evaluation


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
100%|██████████| 1/1 [00:33<00:00, 33.57s/it]

[2024-03-19 23:36:28,143] 83 rouge_scorer - INFO - Using default tokenizer.





[2024-03-19 23:36:28,413] 77 1180938689 - INFO - rouge metrics: {'rouge1': 0.03252032520325203, 'rouge2': 0.0, 'rougeL': 0.03252032520325203, 'rougeLsum': 0.03252032520325203}
