In [1]:
import os
# Move the working directory to the parent of the current one, presumably so the
# project-relative paths used by later cells resolve from there -- TODO confirm.
# NOTE(review): this is not idempotent -- re-running the cell climbs one more
# level each time, so run it once per kernel session.
os.chdir("../")
# Display the resulting working directory as a sanity check.
%pwd

'd:\\PracticeProjects\\NLP_Insurance_intent_Detection\\nlp_intent'

In [2]:
# Entity

from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings consumed by the data-transformation stage."""

    # Directory the encoded train/test datasets are saved under.
    root_dir: Path
    # Directory that holds train.csv, test.csv and categories.json.
    data_path: Path
    # Identifier passed to AutoTokenizer.from_pretrained.
    tokenizer_name: Path

In [3]:
# Configuration Manager
from intent.constants import *
from intent.utils.common import read_yaml, create_directories

class ConfigurationManager:
    """Reads the project's YAML settings and hands out per-stage config objects."""

    def __init__(self, config_filepath = CONFIG_FILE_PATH, params_filepath = PARAMS_FILE_PATH):
        """Load both YAML files and pass the artifacts root to create_directories.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the parameters YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the settings object for the data-transformation stage.

        As a side effect, the stage's root directory is passed to
        create_directories before the config object is built.

        Returns:
            A DataTransformationConfig filled from the ``data_transformation``
            section of the loaded configuration.
        """
        section = self.config.data_transformation
        create_directories([section.root_dir])
        return DataTransformationConfig(
            root_dir = section.root_dir,
            data_path = section.data_path,
            tokenizer_name = section.tokenizer_name,
        )

In [4]:
# Components
import os
from intent.logging import logging
from transformers import AutoTokenizer
from datasets import load_dataset, Dataset

class DataTransformation:
    """Turns the raw intent CSV files into tokenized datasets saved on disk."""

    def __init__(self, config:DataTransformationConfig):
        """Keep the stage config and load the tokenizer it names.

        Args:
            config: Paths and tokenizer identifier for this stage.
        """
        self.config = config
        self.tokenizer = AutoTokenizer.from_pretrained(config.tokenizer_name)

    def _tokenize(self, batch):
        """Tokenize the ``text`` column of one mapped batch."""
        # NOTE(review): padding=True pads to the longest text in the batch being
        # mapped, so rows coming from different map batches may differ in
        # length -- confirm the training step pads dynamically.
        return self.tokenizer(batch["text"], padding = True, truncation = True, return_tensors = "pt")

    def _encode_split(self, split, label_ids):
        """Convert one raw split into a tokenized dataset with a ``labels`` column.

        Args:
            split: Dataset exposing ``text`` and ``category`` columns.
            label_ids: Mapping from category name to integer label id.

        Returns:
            The tokenized dataset; ``text`` is dropped and ``labels`` holds the ids.

        Raises:
            KeyError: If the split contains a category absent from ``label_ids``.
        """
        frame = split.to_pandas()
        labels = frame["category"].map(label_ids)
        # Series.map leaves NaN for categories without an id; report them all at
        # once instead of failing on the first one with no context.
        unknown = frame.loc[labels.isna(), "category"].unique().tolist()
        if unknown:
            raise KeyError(f"categories missing from categories.json: {unknown}")
        labelled = frame[["text"]].assign(labels = labels.astype("int64"))
        dataset = Dataset.from_pandas(labelled)
        return dataset.map(self._tokenize, batched = True, remove_columns = ["text"])

    def transform_data(self):
        """Label-encode, tokenize and save the train and test splits.

        Reads ``train.csv``, ``test.csv`` and ``categories.json`` from
        ``config.data_path`` and writes ``train_dataset_encoded`` and
        ``test_dataset_encoded`` under ``config.root_dir``.

        Raises:
            KeyError: If a split contains a category that is not listed in
                ``categories.json``.
        """
        data_dir = self.config.data_path
        raw_splits = load_dataset("csv", data_files = {
            "train" : os.path.join(data_dir, "train.csv"),
            "test" : os.path.join(data_dir, "test.csv"),
        })
        # The single JSON file is exposed under the "train" split.
        categories = load_dataset("json", data_files = os.path.join(data_dir, "categories.json"))
        # A category's position in categories.json is its integer label id.
        label_ids = {name: idx for idx, name in enumerate(categories["train"]["text"])}

        # Encode both splits first so nothing is written if either one fails.
        encoded = {
            split_name: self._encode_split(raw_splits[split_name], label_ids)
            for split_name in ("train", "test")
        }
        for split_name, dataset in encoded.items():
            dataset.save_to_disk(os.path.join(self.config.root_dir, f"{split_name}_dataset_encoded"))


In [5]:
# Pipeline component
# Run the data-transformation stage end to end: build its config, then
# label-encode, tokenize and save both splits.
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransformation(config = data_transformation_config)
    data_transformation.transform_data()
except Exception:
    # Bare raise re-raises the active exception as-is, without adding a
    # redundant re-raise frame to the traceback.
    raise

[2024-05-17 01:15:41,090: INFO: common: YAML file: D:\PracticeProjects\NLP_Insurance_intent_Detection\nlp_intent\config\config.yaml loaded successfully]
[2024-05-17 01:15:41,094: INFO: common: YAML file: D:\PracticeProjects\NLP_Insurance_intent_Detection\nlp_intent\params.yaml loaded successfully]
[2024-05-17 01:15:41,096: INFO: common: created directory at: artifacts]
[2024-05-17 01:15:41,097: INFO: common: created directory at: artifacts/data_transformation]


Generating train split: 10003 examples [00:00, 135277.44 examples/s]
Generating test split: 3080 examples [00:00, 188916.04 examples/s]
Generating train split: 77 examples [00:00, 8262.00 examples/s]
Map: 100%|██████████| 10003/10003 [00:00<00:00, 19300.69 examples/s]
Map: 100%|██████████| 3080/3080 [00:00<00:00, 24667.57 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 10003/10003 [00:00<00:00, 1572078.20 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 3080/3080 [00:00<00:00, 563312.97 examples/s]
