In [1]:
import os 

In [2]:
%pwd

'd:\\Aiprojects\\Textsummarization\\text-summarization\\Notebooks'

In [3]:
os.chdir("../")
%pwd

'd:\\Aiprojects\\Textsummarization\\text-summarization'

In [4]:
import sys 
from dataclasses import dataclass
from pathlib import Path

@dataclass
class DataTransformationConfig:
    """Settings consumed by the data-transformation (tokenization) stage."""

    # Directory handed to ``create_directory`` when the config is built.
    root_dir: Path
    # Location of the dataset read with ``load_from_disk``.
    data_path: Path
    # Identifier passed to ``AutoTokenizer.from_pretrained``.
    tokenizer_name: str
    # Destination given to ``save_to_disk`` for the tokenized dataset.
    transformed_path: Path

In [5]:
from src.constants import *
from src.utils import read_yaml ,create_directory

In [6]:
class ConfigurationManager:
    """Loads the project's YAML files and builds per-stage config objects."""

    def __init__(self, config_path=CONFIG_FILE_PATH, params_path=PARAMS_FILE_PATH):
        """Read both YAML files with ``read_yaml`` and keep the results.

        Args:
            config_path: Location of the main configuration YAML.
            params_path: Location of the parameters YAML.
        """
        self.config = read_yaml(config_path)
        self.params = read_yaml(params_path)

    def get_data_transformation_config(self):
        """Build the config object for the data-transformation stage.

        The ``data_transformation`` section of the loaded config is read and
        its ``root_dir`` is passed to ``create_directory`` (presumably creating
        it if missing; confirm against ``src.utils``).

        Returns:
            DataTransformationConfig: populated from the section's
            ``root_dir``, ``data_path``, ``tokenizer_name`` and
            ``transformed_path`` entries.
        """
        section = self.config.data_transformation
        create_directory([section.root_dir])

        return DataTransformationConfig(
            root_dir=section.root_dir,
            data_path=section.data_path,
            tokenizer_name=section.tokenizer_name,
            transformed_path=section.transformed_path,
        )


In [7]:
from transformers import AutoTokenizer
from datasets import load_dataset , load_from_disk 
from src.logging.logger import get_logger
import os 
import sys 


# Bind the project logger for this notebook. NOTE: the name `logging` is the
# same as the standard-library module's; later cells call `.info` / `.error`
# on this object, so it has to stay bound under this name.
logging = get_logger(__name__)


  from .autonotebook import tqdm as notebook_tqdm


In [12]:
class DataTransfomation:
    """Tokenizes a saved dialogue/summary dataset and writes the result to disk.

    The ``max_length`` limits are class attributes so they can be tuned by
    subclassing or per instance instead of editing the method body.
    """

    # Token caps applied (with truncation) to the model input and the target.
    MAX_INPUT_LENGTH = 1000
    MAX_TARGET_LENGTH = 120

    def __init__(self, config: "DataTransformationConfig"):
        """Store the stage config and load the tokenizer it names.

        Args:
            config: Supplies ``tokenizer_name``, ``data_path`` and
                ``transformed_path``.
        """
        self.config = config
        self.tokenizer = AutoTokenizer.from_pretrained(config.tokenizer_name)

    def tokenization(self, example_batch):
        """Encode one batch of examples for sequence-to-sequence training.

        Args:
            example_batch: Mapping with ``'dialogue'`` and ``'summary'``
                entries; called by ``Dataset.map(..., batched=True)``.

        Returns:
            dict: ``input_ids`` and ``attention_mask`` from the encoded
            dialogues, plus ``labels`` holding the ``input_ids`` of the
            encoded summaries.
        """
        input_encodings = self.tokenizer(
            example_batch['dialogue'],
            max_length=self.MAX_INPUT_LENGTH,
            truncation=True,
        )

        # `text_target=` is the supported replacement for the deprecated
        # `as_target_tokenizer()` context manager: the tokenizer encodes the
        # summaries in target mode and returns their encodings.
        output_encodings = self.tokenizer(
            text_target=example_batch['summary'],
            max_length=self.MAX_TARGET_LENGTH,
            truncation=True,
        )

        return {
            'input_ids': input_encodings['input_ids'],
            'attention_mask': input_encodings['attention_mask'],
            'labels': output_encodings['input_ids'],
        }

    def apply_tokenization(self):
        """Load the dataset, tokenize every example, and save the result.

        Reads from ``config.data_path`` with ``load_from_disk``, maps
        ``tokenization`` over it in batches, and writes the output to
        ``config.transformed_path``. Progress is reported via the logger.
        """
        sam_data = load_from_disk(self.config.data_path)
        logging.info("data loaded from disk for tokenization")

        sam_data_tk = sam_data.map(self.tokenization, batched=True)
        logging.info("data tokenized")

        sam_data_tk.save_to_disk(self.config.transformed_path)
        logging.info("transformed data saved")
    

In [13]:
from src.Exception import CustomException

In [14]:
try:
    # Run the stage end to end: read config, build the transformer,
    # then tokenize the dataset and save it.
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransfomation(config=data_transformation_config)
    data_transformation.apply_tokenization()
except Exception as e:
    # Wrap the failure in the project exception type and log it before
    # propagating, so the cell still fails loudly.
    error = CustomException(e, sys)
    logging.error(error)
    # `from e` records the original exception as the explicit cause, so the
    # traceback shows it as the direct cause of the wrapped error.
    raise error from e

Map: 100%|██████████| 14732/14732 [00:01<00:00, 9285.74 examples/s]
Map: 100%|██████████| 819/819 [00:00<00:00, 7746.54 examples/s]
Map: 100%|██████████| 818/818 [00:00<00:00, 8530.71 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 14732/14732 [00:00<00:00, 552356.70 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 819/819 [00:00<00:00, 119035.80 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 818/818 [00:00<00:00, 114704.98 examples/s]
