In [6]:
import os

In [16]:
import torch

In [17]:
# Choose the torch device string once: "cuda" when torch reports CUDA as
# available, otherwise "cpu".
if torch.cuda.is_available():
    device="cuda"
else:
    device="cpu"

In [7]:
# Step the process working directory up one level (os.pardir is "..").
# NOTE(review): presumably this moves from the notebook folder to the project
# root so that `src.*` imports and relative paths resolve — confirm.
# This is not idempotent: running the cell again climbs one more level.
os.chdir(os.pardir)

In [8]:
from pathlib import Path

In [9]:
from dataclasses import dataclass
@dataclass
class ModelEvaluationConfig:
    """Plain container for the settings of the model-evaluation stage.

    The dataclass performs no validation or type conversion: every field
    holds exactly the value it was constructed with, regardless of the
    ``Path`` annotation.
    """

    # Populated from the same-named keys of the ``model_evaluation`` config
    # section; the meanings below are inferred from the names — TODO confirm.
    root_dir: Path            # presumably the stage's output directory
    data_path: Path           # presumably the dataset used for evaluation
    model_path: Path          # presumably the trained model to evaluate
    tokenizer_path: Path      # presumably the tokenizer saved with the model
    metric_file_name: Path    # presumably where the computed metrics are written

In [10]:
from src.textSummarizer.constants import *
from src.textSummarizer.utils.common import read_yaml,create_directories

In [11]:
class ConfigurationManager:
    """Reads the project's YAML files and builds stage-specific config objects.

    On construction both files are parsed with ``read_yaml`` and the value of
    ``config.artifact_root`` is handed to ``create_directories``.
    """

    def __init__(self,
                 config_path=CONFIG_FILE_PATH,
                 params_path=PARAMS_FILE_PATH):
        """Load the configuration and parameter files.

        Args:
            config_path: Location of the main config YAML
                (defaults to ``CONFIG_FILE_PATH``).
            params_path: Location of the params YAML
                (defaults to ``PARAMS_FILE_PATH``).
        """
        self.config=read_yaml(config_path)
        self.params=read_yaml(params_path)
        create_directories([self.config.artifact_root],verbose=True)

    def get_model_evaluation_config(self)->ModelEvaluationConfig:
        """Build a ``ModelEvaluationConfig`` from the ``model_evaluation`` section.

        Returns:
            ModelEvaluationConfig: populated with the section's ``root_dir``,
            ``data_path``, ``model_path``, tokenizer path and
            ``metric_file_name`` values, unmodified.
        """
        config=self.config.model_evaluation
        # The original code read the misspelled key ``tokneizer_path``. Prefer
        # the correct spelling, but keep accepting the misspelled key so a
        # config file that was written to match the typo still loads.
        try:
            tokenizer_path=config.tokenizer_path
        except (AttributeError,KeyError):
            tokenizer_path=config.tokneizer_path
        return ModelEvaluationConfig(
            root_dir=config.root_dir,
            data_path=config.data_path,
            model_path=config.model_path,
            tokenizer_path=tokenizer_path,
            metric_file_name=config.metric_file_name
        )

In [12]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
# Training
from transformers import DataCollatorForSeq2Seq
from transformers import TrainingArguments, Trainer
import torch,pandas as pd
from datasets import load_from_disk
from tqdm import tqdm

In [18]:
class ModelEvaluation:
    """Generates summaries with a seq2seq model and scores them with a metric."""

    def __init__(self,config:ModelEvaluationConfig):
        """Store the evaluation settings on the instance.

        Args:
            config: Settings object for this stage; kept as ``self.config``.
        """
        self.config=config

    def generate_batch_sized_chunks(self,list_of_elements,batch_size):
        """Yield consecutive slices of ``list_of_elements``.

        Args:
            list_of_elements: A sliceable sequence with a length.
            batch_size: Maximum number of items per slice.

        Yields:
            Slices of at most ``batch_size`` items, in order; the final slice
            may be shorter.
        """
        for i in range(0,len(list_of_elements),batch_size):
            yield list_of_elements[i:i+batch_size]

    def calculate_metric_on_test_ds(self,dataset,metric,model,tokenizer,batch_size=16,device=device,column_text='article',column_summary='highlights'):
        """Summarize ``dataset[column_text]`` in batches and score the result.

        Args:
            dataset: Mapping-like object indexed by column name.
            metric: Object exposing ``add_batch(predictions=, references=)``
                and ``compute()``.
            model: Model exposing ``generate``.
            tokenizer: Callable tokenizer that also exposes ``decode``.
            batch_size: Number of examples processed per ``generate`` call.
            device: Device the input tensors are moved to. The default is the
                module-level ``device`` value captured when the class is defined.
            column_text: Column holding the source texts.
            column_summary: Column holding the reference summaries.

        Returns:
            Whatever ``metric.compute()`` returns after all batches were added.
        """
        article_batches=list(self.generate_batch_sized_chunks(dataset[column_text],batch_size))
        target_batches=list(self.generate_batch_sized_chunks(dataset[column_summary],batch_size))
        for article_batch, target_batch in tqdm(zip(article_batches,target_batches),total=len(article_batches)):
            inputs=tokenizer(article_batch,max_length=1024,truncation=True,padding="max_length",return_tensors="pt")
            summaries=model.generate(input_ids=inputs['input_ids'].to(device),
                                    attention_mask=inputs['attention_mask'].to(device),
                                    length_penalty=0.8,num_beams=8,max_length=128)
            decoded_summaries=[tokenizer.decode(s,skip_special_tokens=True,clean_up_tokenization_spaces=True) for s in summaries]
            # Replace the "<n>" newline marker with a space. The previous
            # ``replace("", " ")`` had an empty search string, which inserts a
            # space between every character and corrupts the predictions
            # before they are scored.
            decoded_summaries=[d.replace("<n>"," ") for d in decoded_summaries]

            metric.add_batch(predictions=decoded_summaries,references=target_batch)
        score=metric.compute()
        return score


In [14]:
import evaluate
# ROUGE variants of interest, named as listed here.
rouge_names=[
    'rouge1',
    'rouge2',
    'rougeL',
    'rougeLsum',
]
# Load the ROUGE metric through the `evaluate` library (the recorded cell
# output shows a builder script being downloaded on first use).
rouge_metric=evaluate.load('rouge')

Downloading builder script: 6.27kB [00:00, 1.57MB/s]
