In [1]:
import os
os.chdir("../")
%pwd

'd:\\PracticeProjects\\NLP_Insurance_intent_Detection\\nlp_intent'

In [2]:
# Entity

from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class ModelEvaluationConfig:
    """Read-only bundle of the locations used by the model-evaluation stage.

    Instances are frozen: fields cannot be reassigned after construction.
    """

    # Directory that belongs to the evaluation stage.
    root_dir: Path
    # Directory holding the transformed (encoded) datasets.
    transformed_data_path: Path
    # Directory holding the raw data files.
    data_path: Path
    # Location of the saved model.
    model_path: Path
    # Location of the saved tokenizer.
    tokenizer_path: Path
    # File the evaluation metrics are written to.
    metric_file_name: Path

In [3]:
# Configuration Manager

from intent.constants import *
from intent.utils.common import read_yaml, create_directories

class ConfigurationManager:
    """Loads the project's YAML files and hands out per-stage configuration objects.

    Args:
        config_filepath: Location of the main configuration YAML
            (defaults to ``CONFIG_FILE_PATH``).
        params_filepath: Location of the parameters YAML
            (defaults to ``PARAMS_FILE_PATH``).
    """

    def __init__(self, config_filepath = CONFIG_FILE_PATH, params_filepath = PARAMS_FILE_PATH):
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        # Make sure the artifacts root is there before any stage directory is requested beneath it.
        create_directories([self.config.artifacts_root])

    def get_model_evaluation_config(self) -> ModelEvaluationConfig:
        """Build the configuration for the model-evaluation stage.

        Reads only the ``model_evaluation`` section of the main configuration;
        no entry of the parameters file is required for evaluation.

        Returns:
            ModelEvaluationConfig: The six path settings of the
            ``model_evaluation`` section.
        """
        config = self.config.model_evaluation
        create_directories([config.root_dir])
        # NOTE(review): values are passed through exactly as loaded from YAML, while
        # ModelEvaluationConfig annotates them as Path — confirm whether a conversion is wanted.
        return ModelEvaluationConfig(
            root_dir = config.root_dir,
            transformed_data_path = config.transformed_data_path,
            data_path = config.data_path,
            model_path = config.model_path,
            tokenizer_path = config.tokenizer_path,
            metric_file_name = config.metric_file_name
        )

In [4]:
# Component

import os
import pandas as pd
from intent.logging import logger
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import Trainer
from datasets import load_from_disk, load_dataset
from sklearn.metrics import f1_score, accuracy_score
import torch

class ModelEvaluation:
    """Scores the saved sequence-classification model on the test split.

    Args:
        config: Locations of the encoded test split, the raw ``test.csv`` and
            ``categories.json`` files, the saved model, and the output files.
    """

    def __init__(self, config: "ModelEvaluationConfig"):
        self.config = config

    @staticmethod
    def _build_label_lookup(category_names):
        """Return ``{category name: integer id}``, ids following the order of ``category_names``."""
        return {name: label_id for label_id, name in enumerate(category_names)}

    @staticmethod
    def _encode_labels(categories, label_lookup):
        """Translate a pandas Series of category names into integer label ids.

        Raises:
            KeyError: If any category is absent from ``label_lookup``; the
                message lists every offending value.
        """
        unknown = sorted({str(name) for name in categories.unique() if name not in label_lookup})
        if unknown:
            raise KeyError(f"Categories not present in the label file: {unknown}")
        return categories.map(label_lookup)

    def evaluate(self):
        """Predict on the encoded test split and persist predictions and metrics.

        Writes ``eval_results.csv`` (text, true label id, predicted label id)
        into ``config.root_dir`` and a one-row metrics CSV to
        ``config.metric_file_name``.

        Returns:
            dict: ``{"f1": ..., "accuracy": ...}`` as produced by
            :meth:`evaluate_metrics`.

        Raises:
            KeyError: If ``test.csv`` contains a category missing from
                ``categories.json``.
            ValueError: If the number of predictions differs from the number
                of rows in ``test.csv``.
        """
        device = "cuda" if torch.cuda.is_available() else "cpu"
        test_dataset_encoded = load_from_disk(os.path.join(self.config.transformed_data_path, "test_dataset_encoded"))
        test_dataset = pd.read_csv(os.path.join(self.config.data_path, "test.csv"))
        banking77_label = load_dataset("json", data_files = os.path.join(self.config.data_path, "categories.json"))

        # Label ids are the positions of the names in the categories file.
        label_lookup = self._build_label_lookup(banking77_label["train"]["text"])
        y_test = self._encode_labels(test_dataset["category"], label_lookup)

        model = AutoModelForSequenceClassification.from_pretrained(self.config.model_path).to(device)
        trainer = Trainer(model = model)
        preds_output = trainer.predict(test_dataset_encoded)
        y_preds = preds_output.predictions.argmax(-1)

        # Predictions come from the encoded split while texts/labels come from test.csv;
        # they are paired purely by position, so the row counts must agree.
        if len(y_preds) != len(test_dataset):
            raise ValueError(
                f"Got {len(y_preds)} predictions for {len(test_dataset)} rows in test.csv; "
                "the encoded test split and test.csv are out of sync."
            )

        test_results = pd.DataFrame(
            {
                "text": test_dataset["text"],
                "category_name": y_test,
                "predicted_category": y_preds,
            }
        )
        test_results.to_csv(os.path.join(self.config.root_dir, "eval_results.csv"), index = False)

        metrics = self.evaluate_metrics(y_preds, y_test)
        score_df = pd.DataFrame(metrics, index=[0])
        score_df.to_csv(self.config.metric_file_name)
        logger.info(f"Evaluation metrics: {metrics}")
        return metrics

    def evaluate_metrics(self, y_preds, y_test):
        """Compute weighted F1 and accuracy.

        Args:
            y_preds: Predicted label ids.
            y_test: True label ids, in the same order as ``y_preds``.

        Returns:
            dict: ``{"f1": weighted F1 score, "accuracy": accuracy score}``.
        """
        return {
            "f1" : f1_score(y_test, y_preds, average= "weighted"),
            "accuracy" : accuracy_score(y_test, y_preds),
        }

  from .autonotebook import tqdm as notebook_tqdm


[2024-05-20 22:54:42,189: INFO: config: PyTorch version 2.3.0+cu118 available.]


In [5]:
# Pipeline component
# Any failure propagates unchanged with its original traceback; a handler that
# only re-raises would add nothing here.
config = ConfigurationManager()
model_evaluation_config = config.get_model_evaluation_config()
model_evaluation = ModelEvaluation(config = model_evaluation_config)
model_evaluation.evaluate()

[2024-05-20 22:54:42,412: INFO: common: YAML file: D:\PracticeProjects\NLP_Insurance_intent_Detection\nlp_intent\config\config.yaml loaded successfully]
[2024-05-20 22:54:42,414: INFO: common: YAML file: D:\PracticeProjects\NLP_Insurance_intent_Detection\nlp_intent\params.yaml loaded successfully]
[2024-05-20 22:54:42,416: INFO: common: created directory at: artifacts]
[2024-05-20 22:54:42,416: INFO: common: created directory at: artifacts/model_evaluation]


100%|██████████| 385/385 [00:06<00:00, 55.80it/s]
