In [4]:
# Print the kernel's current working directory (shell escape) before changing it.
!pwd

/home/sagemaker-user/Text-Summarizer/notebooks


In [5]:
# Move the kernel's working directory up one level (from notebooks/ to its parent).
# Presumably this is so the `src.*` imports further down resolve — confirm.
# NOTE(review): the path is relative, so re-running this cell moves up another level.
%cd ..

/home/sagemaker-user/Text-Summarizer


# Configuration

In [1]:
from dataclasses import dataclass

In [20]:
@dataclass
class TokenizationConfig:
    """Settings consumed by the tokenization step.

    Attributes:
        direct: Directory the tokenized dataset is saved to.
        model_name: Identifier handed to ``AutoTokenizer.from_pretrained``.
        data_path: Location of the dataset read with ``load_from_disk``.
    """

    direct: str
    model_name: str
    data_path: str

# Configuration Manager

In [11]:
from src.config import ConfigManager
from src.utils import create_dirs
from os.path import join

In [24]:
class ConfigManager(ConfigManager):
    """Notebook-local extension of the imported ``ConfigManager``.

    Adds the accessor for the tokenization step on top of whatever the
    base class from ``src.config`` already provides.
    """

    def get_tokenization_config(self):
        """Assemble the ``TokenizationConfig`` for the tokenization step.

        Reads the ``tokenization`` section of the loaded configuration,
        resolves its output folder under ``self.artifacts``, passes that
        folder to ``create_dirs`` (presumably creating it — confirm against
        ``src.utils``), and takes the input dataset path from
        ``self.get_data_config().save_path``.

        Returns:
            TokenizationConfig: output directory, tokenizer model name and
            input dataset path.
        """
        section = self.config.tokenization
        output_dir = join(self.artifacts, section.folder)
        create_dirs([output_dir])

        # Same evaluation order as a direct constructor call: model name
        # first, then the data-step lookup.
        model_name = section.model_name
        source_path = self.get_data_config().save_path

        return TokenizationConfig(
            direct=output_dir,
            model_name=model_name,
            data_path=source_path,
        )

# Tokenization

In [17]:
from transformers import AutoTokenizer
from datasets import load_from_disk

In [29]:
class Tokenization:
    """Tokenize a dialogue/summary dataset and save the result to disk.

    Args:
        config: Tokenization settings; ``model_name`` selects the tokenizer,
            ``data_path`` is the dataset to load and ``direct`` is where the
            tokenized dataset is written.
        max_input_length: Truncation limit (in tokens) for ``dialogue`` text.
        max_target_length: Truncation limit (in tokens) for ``summary`` text.
    """

    def __init__(self, config: "TokenizationConfig",
                 max_input_length: int = 1024,
                 max_target_length: int = 128):
        self.config = config
        self.max_input_length = max_input_length
        self.max_target_length = max_target_length
        self.tokenizer = AutoTokenizer.from_pretrained(config.model_name)

    def to_ids(self, examples):
        """Convert a batch of examples into model inputs and labels.

        Args:
            examples: Mapping with ``'dialogue'`` and ``'summary'`` entries,
                as supplied by ``Dataset.map(..., batched=True)``.

        Returns:
            dict: ``input_ids`` and ``attention_mask`` from the tokenized
            dialogues, and ``labels`` holding the tokenized summaries' ids.
        """
        inputs = self.tokenizer(
            examples['dialogue'],
            max_length=self.max_input_length,
            truncation=True,
        )

        # `text_target=` tokenizes in target mode; it replaces the deprecated
        # `as_target_tokenizer()` context manager.
        targets = self.tokenizer(
            text_target=examples['summary'],
            max_length=self.max_target_length,
            truncation=True,
        )

        return {
            'input_ids': inputs['input_ids'],
            # The key must be exactly 'attention_mask' so the mapped dataset
            # gets the column under its expected name.
            'attention_mask': inputs['attention_mask'],
            'labels': targets['input_ids'],
        }

    def tokenize(self):
        """Load the dataset, tokenize every split, and save it to ``config.direct``."""
        data = load_from_disk(self.config.data_path)
        tokenized_data = data.map(self.to_ids, batched=True)
        tokenized_data.save_to_disk(self.config.direct)

# Run the step

In [30]:
# Build the tokenization settings, then tokenize the dataset and save it.
# No try/except wrapper: catching an exception only to re-raise it unchanged
# adds nothing, so failures surface with their original traceback.
config = ConfigManager().get_tokenization_config()
Tokenization(config).tokenize()

[2024-10-08 07:01:16,587: INFO: utils: The file: config.yaml loaded successfully...]
[2024-10-08 07:01:16,589: INFO: utils: The file: config.yaml loaded successfully...]




Map:   0%|          | 0/14732 [00:00<?, ? examples/s]



Map:   0%|          | 0/819 [00:00<?, ? examples/s]

Map:   0%|          | 0/818 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/14732 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/819 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/818 [00:00<?, ? examples/s]