In [1]:
%pwd

'f:\\DS\\TextSummarizer\\Text-Summarizer\\research'

In [2]:
import os
os.chdir("../")

In [3]:
%pwd

'f:\\DS\\TextSummarizer\\Text-Summarizer'

In [4]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class ModelEvaluationConfig:
    root_dir: Path
    data_dir: Path
    model_path: Path
    tokenizer_path: Path
    metric_file_name: Path

In [5]:
from textSummarizer.constants import *
from textSummarizer.utils.common_utils import read_yaml, create_directories

In [6]:
class ConfigurationManager:

    def __init__(
            self,
            config_file_path=CONFIG_FILE_PATH,
            params_file_path=PARAMS_FILE_PATH
            ):
        self.config = read_yaml(config_file_path)
        self.params = read_yaml(params_file_path)

        create_directories([self.config.artifacts_root])

    def fetch_modelevaluation_config(self) -> ModelEvaluationConfig:
        config = self.config.model_evaluation

        create_directories([config.root_dir])

        data_evaluation_config = ModelEvaluationConfig(
            root_dir=config.root_dir,
            data_dir=config.data_dir,
            model_path =config.model_path,
            tokenizer_path = config.tokenizer_path,
            metric_file_name = config.metric_file_name
        )

        return data_evaluation_config


In [7]:
import transformers

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
from datasets import Eva

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from datasets import load_dataset, load_from_disk, load_metric
import torch
import pandas as pd
from tqdm import tqdm
import evaluate

[2024-04-21 02:51:04,788: INFO: config: PyTorch version 2.2.1 available.]


In [33]:
class ModelEvaluation:
    def __init__(
            self,
            config: ModelEvaluationConfig):
        self.config = config


    # Evaluation

    def generate_batch_sized_chunks(self, list_of_elements, batch_size):
        """split the dataset into smaller batches that we can process simultaneously
        Yield successive batch-sized chunks from list_of_elements."""
        for i in range(0, len(list_of_elements), batch_size):
            yield list_of_elements[i : i + batch_size]



    def calculate_metric_on_test_ds(self, dataset, metric, model, tokenizer, 
                                batch_size=16, device="cuda" if torch.cuda.is_available() else "cpu", 
                                column_text="article", 
                                column_summary="highlights"):
        article_batches = list(self.generate_batch_sized_chunks(dataset[column_text], batch_size))
        target_batches = list(self.generate_batch_sized_chunks(dataset[column_summary], batch_size))

        for article_batch, target_batch in tqdm(
            zip(article_batches, target_batches), total=len(article_batches)):
            
            inputs = tokenizer(article_batch, max_length=1024,  truncation=True, 
                            padding="max_length", return_tensors="pt")
            
            summaries = model.generate(input_ids=inputs["input_ids"].to(device),
                            attention_mask=inputs["attention_mask"].to(device), 
                            length_penalty=0.8, num_beams=8, max_length=128)
            ''' parameter for length penalty ensures that the model does not generate sequences that are too long. '''
            
            # Finally, we decode the generated texts, 
            # replace the  token, and add the decoded texts with the references to the metric.
            decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True, 
                                    clean_up_tokenization_spaces=True) 
                for s in summaries]      
            
            decoded_summaries = [d.replace("", " ") for d in decoded_summaries]
            
            
            metric.add_batch(predictions=decoded_summaries, references=target_batch)
            
        #  Finally compute and return the ROUGE scores.
        score = metric.compute()
        return score
    
    def evaluate(self):
        device = "cuda" if torch.cuda.is_available() else "cpu"
        tokenizer = AutoTokenizer.from_pretrained(self.config.tokenizer_path)
        model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(self.config.model_path).to(device)

        #loading the data
        dataset_samsum_pt = load_from_disk(self.config.data_dir)

        rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
        rouge_metric = load_metric('rouge')#evaluate.load('rouge')

        score = self.calculate_metric_on_test_ds(dataset_samsum_pt['test'][0:10], rouge_metric, model_pegasus, tokenizer, batch_size = 2, column_text = 'dialogue', column_summary= 'summary')
        return score
        # rouge_dict = dict((rn, score[rn].mid.fmeasure ) for rn in rouge_names )

        # df = pd.DataFrame(rouge_dict, index = [f'pegasus'] )
        # df.to_csv(self.config.metric_file_name, index=False)

In [40]:
rouge_dict = dict((rn, score2[rn].mid.fmeasure ) for rn in ["rouge1", "rouge2", "rougeL", "rougeLsum"] )

AttributeError: 'numpy.float64' object has no attribute 'mid'

In [24]:
evaluate.load('rouge')

Downloading builder script: 100%|██████████| 6.27k/6.27k [00:00<00:00, 6.50MB/s]


EvaluationModule(name: "rouge", module_type: "metric", features: [{'predictions': Value(dtype='string', id='sequence'), 'references': Sequence(feature=Value(dtype='string', id='sequence'), length=-1, id=None)}, {'predictions': Value(dtype='string', id='sequence'), 'references': Value(dtype='string', id='sequence')}], usage: """
Calculates average rouge scores for a list of hypotheses and references
Args:
    predictions: list of predictions to score. Each prediction
        should be a string with tokens separated by spaces.
    references: list of reference for each prediction. Each
        reference should be a string with tokens separated by spaces.
    rouge_types: A list of rouge types to calculate.
        Valid names:
        `"rouge{n}"` (e.g. `"rouge1"`, `"rouge2"`) where: {n} is the n-gram based scoring,
        `"rougeL"`: Longest common subsequence based scoring.
        `"rougeLsum"`: rougeLsum splits text using `"
"`.
        See details in https://github.com/huggingface/

In [34]:
score2  = model_evaluation_config.evaluate()
score2

100%|██████████| 5/5 [09:26<00:00, 113.26s/it]


[2024-04-21 04:12:36,046: INFO: rouge_scorer: Using default tokenizer.]


{'rouge1': 0.019759699923449933,
 'rouge2': 0.0,
 'rougeL': 0.019863271899249385,
 'rougeLsum': 0.019720047669079623}

In [38]:
score2['rouge1']

0.019759699923449933

In [35]:
type(score)

dict

In [None]:
b = {{'rouge1': 0.019759699923449933,
 'rouge2': 0.0,
 'rougeL': 0.019863271899249385,
 'rougeLsum': 0.019720047669079623},{'rouge1': 0.019759699923449933,
 'rouge2': 0.0,
 'rougeL': 0.019863271899249385,
 'rougeLsum': 0.019720047669079623},{'rouge1': 0.019759699923449933,
 'rouge2': 0.0,
 'rougeL': 0.019863271899249385,
 'rougeLsum': 0.019720047669079623}}

In [32]:
score

{'rouge1': 0.019759699923449933,
 'rouge2': 0.0,
 'rougeL': 0.019863271899249385,
 'rougeLsum': 0.019720047669079623}

In [28]:
try:
    config = ConfigurationManager()
    model_evaluation_config = config.fetch_modelevaluation_config()
    model_evaluation_config = ModelEvaluation(model_evaluation_config)
    score  = model_evaluation_config.evaluate()

except Exception as e:
    raise e

[2024-04-21 03:46:29,528: INFO: common_utils: yaml file: config\config.yaml loaded successfully]


[2024-04-21 03:46:31,166: INFO: common_utils: yaml file: params.yaml loaded successfully]
[2024-04-21 03:46:31,169: INFO: common_utils: Created directory at: artifacts]
[2024-04-21 03:46:31,173: INFO: common_utils: Created directory at: artifacts/model_evaluation]


100%|██████████| 5/5 [07:13<00:00, 86.72s/it]


[2024-04-21 03:55:27,711: INFO: rouge_scorer: Using default tokenizer.]


In [13]:
from transformers import AutoTokenizer

In [14]:
help(AutoTokenizer)

Help on class AutoTokenizer in module transformers.models.auto.tokenization_auto:

class AutoTokenizer(builtins.object)
 |  This is a generic tokenizer class that will be instantiated as one of the tokenizer classes of the library when
 |  created with the [`AutoTokenizer.from_pretrained`] class method.
 |  
 |  This class cannot be instantiated directly using `__init__()` (throws an error).
 |  
 |  Methods defined here:
 |  
 |  __init__(self)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  register(config_class, slow_tokenizer_class=None, fast_tokenizer_class=None, exist_ok=False)
 |      Register a new tokenizer in this mapping.
 |      
 |      
 |      Args:
 |          config_class ([`PretrainedConfig`]):
 |              The configuration corresponding to the model to register.
 |          slow_tokenizer_class ([`PretrainedTokenizer`], *optional*):
 |              The slow tokenizer to register.
 |          fast_tokenizer_class ([`PretrainedTokeni

In [17]:
tokenizer = AutoTokenizer.from_pretrained("google/pegasus-cnn_dailymail")
tokenizer.save_pretrained(os.path.join("artifacts/model_trainer/", "tokenizer"))

('artifacts/model_trainer/tokenizer\\tokenizer_config.json',
 'artifacts/model_trainer/tokenizer\\special_tokens_map.json',
 'artifacts/model_trainer/tokenizer\\spiece.model',
 'artifacts/model_trainer/tokenizer\\added_tokens.json',
 'artifacts/model_trainer/tokenizer\\tokenizer.json')

In [18]:
tokenizer = AutoTokenizer.from_pretrained("artifacts/model_trainer/tokenizer/")#AutoTokenizer.from_pretrained("artifacts/model_trainer/tokenizer")