In [1]:
import os


In [2]:
%pwd


'd:\\Text-Summarizer\\research'

In [3]:
# Step out of the notebook's own folder (research/) into the project root so
# that relative paths such as config/config.yaml resolve. The guard keeps this
# cell idempotent: re-running it must not climb one directory higher each time.
if 'PROJECT_ROOT' not in globals():
    os.chdir('../')
    PROJECT_ROOT = os.getcwd()

In [4]:
%pwd

'd:\\Text-Summarizer'

In [7]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class dataTransformationConfig:
    """Immutable settings consumed by the data-transformation stage."""

    # Directory the stage creates and saves its tokenized output under.
    rootDir: Path
    # Location of the on-disk dataset that is loaded and tokenized.
    dataPath: Path
    # Identifier handed to AutoTokenizer.from_pretrained. Annotated as str,
    # not Path: presumably a Hub model id (the recorded run shows a Hub cache
    # warning) - confirm against config.yaml. Wrapping such an id in Path
    # would rewrite its '/' separator on Windows.
    tokenizerName: str


In [6]:
from textSummarizer.utils.common import read_yaml,createDir
from textSummarizer.constants import configFilePath,paramsFilePath

In [9]:
from logging import config


class configurationManager:
    """Loads the project's YAML settings and builds per-stage config objects."""

    def __init__(
        self,
        config_FilePath = configFilePath,
        params_FilePath = paramsFilePath):
        """Read both YAML files and create the top-level artifacts directory.

        Args:
            config_FilePath: YAML file handed to read_yaml; the result is
                kept as ``self.config``.
            params_FilePath: YAML file handed to read_yaml; the result is
                kept as ``self.params``.
        """
        self.config = read_yaml(config_FilePath)
        self.params = read_yaml(params_FilePath)

        artifacts_dir = self.config.artifacts_root
        createDir([artifacts_dir])

    # Getters for the data-ingestion and data-validation stages are
    # intentionally not defined in this notebook; only the transformation
    # stage is exercised here.

    def getDataTransformationConfig(self)->dataTransformationConfig:
        """Build the data-transformation settings from the loaded config.

        Creates the stage's root directory before returning.

        Returns:
            dataTransformationConfig populated from the ``dataTransformation``
            section (rootDir, dataPath, tokenizerName).
        """
        section = self.config.dataTransformation
        createDir([section.rootDir])

        return dataTransformationConfig(
            rootDir=section.rootDir,
            dataPath=section.dataPath,
            tokenizerName=section.tokenizerName,
        )

In [11]:
import os
from textSummarizer.logging import logger
from transformers import AutoTokenizer
from datasets import load_dataset,load_from_disk

In [12]:
class dataTransformation:
    """Tokenizes the dialogue/summary dataset and saves the encoded copy."""

    def __init__(self,config:"dataTransformationConfig") -> None:
        """Keep the stage config and load the tokenizer it names.

        Args:
            config: Supplies ``tokenizerName`` (used here) plus ``dataPath``
                and ``rootDir`` (used by ``convert``).
        """
        self.config = config
        self.tokenizer = AutoTokenizer.from_pretrained(config.tokenizerName)

    def convertToFeatures(self,exampleBatch,maxInputLength=1024,maxTargetLength=128):
        """Encode one batch of examples into model inputs and labels.

        Args:
            exampleBatch: Mapping with 'dialogue' and 'summary' entries
                (presumably lists of strings, since ``convert`` maps with
                ``batched=True`` - verify against the dataset).
            maxInputLength: Truncation limit passed as ``max_length`` when
                encoding the dialogues.
            maxTargetLength: Truncation limit passed as ``max_length`` when
                encoding the summaries.

        Returns:
            dict with 'input_ids' and 'attention_mask' taken from the dialogue
            encoding and 'labels' taken from the summary encoding's input_ids.
        """
        inputEncoding = self.tokenizer(
            exampleBatch['dialogue'],
            max_length=maxInputLength,
            truncation=True,
        )

        # text_target= encodes the summaries in target mode; it replaces the
        # deprecated `with tokenizer.as_target_tokenizer():` context manager.
        targetEncoding = self.tokenizer(
            text_target=exampleBatch['summary'],
            max_length=maxTargetLength,
            truncation=True,
        )
        return {
            'input_ids':inputEncoding['input_ids'],
            'attention_mask': inputEncoding['attention_mask'],
            'labels':targetEncoding['input_ids']
        }

    def convert(self):
        """Load the dataset at ``config.dataPath``, encode it, and save it.

        The encoded dataset is written to ``<config.rootDir>/samsumDataset``.
        """
        dataset_samsum = load_from_disk(self.config.dataPath)
        dataset_samsum_pt = dataset_samsum.map(self.convertToFeatures,batched=True)
        dataset_samsum_pt.save_to_disk(os.path.join(self.config.rootDir,'samsumDataset'))
        

In [13]:
# Run the data-transformation stage end to end: build its config, load the
# tokenizer, then encode and save the dataset. Any failure propagates unchanged.
config_manager = configurationManager()
data_transformation_config = config_manager.getDataTransformationConfig()
data_transformation = dataTransformation(config=data_transformation_config)
data_transformation.convert()

[2024-08-20 18:31:10,592: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-08-20 18:31:10,611: INFO: common: yaml file: params.yaml loaded successfully]
[2024-08-20 18:31:10,614: INFO: common: Created director at: artifacts]
[2024-08-20 18:31:10,620: INFO: common: Created director at: artifacts/data_transformation]


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Map: 100%|██████████| 14732/14732 [00:26<00:00, 565.00 examples/s]
Map: 100%|██████████| 819/819 [00:01<00:00, 739.51 examples/s]
Map: 100%|██████████| 818/818 [00:01<00:00, 760.73 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 14732/14732 [00:00<00:00, 42928.01 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 819/819 [00:00<00:00, 22360.96 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 818/818 [00:00<00:00, 19453.20 examples/s]
