In [22]:
!pwd

/home/sagemaker-user/Text-Summarizer/notebooks


In [23]:
%cd ..

/home/sagemaker-user/Text-Summarizer


# Load the data

In [19]:
from datasets import load_dataset
from datasets.utils.logging import disable_progress_bar

# datasets.config.DOWNLOADED_DATASETS_PATH = Path(target_path)

disable_progress_bar()

In [18]:
# Load the "Samsung/samsum" dataset; the result is a DatasetDict with
# train / test / validation splits, each holding 'id', 'dialogue' and 'summary'.
samsum = load_dataset("Samsung/samsum")

In [3]:
samsum

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 818
    })
})

In [8]:
samsum['train'][0]

{'id': '13818513',
 'dialogue': "Amanda: I baked  cookies. Do you want some?\r\nJerry: Sure!\r\nAmanda: I'll bring you tomorrow :-)",
 'summary': 'Amanda baked cookies and will bring Jerry some tomorrow.'}

In [6]:
# Index the row first, then the column: only the first example is read,
# instead of fetching the entire 'dialogue' column just to take one item.
print(samsum['train'][0]['dialogue'])

Amanda: I baked  cookies. Do you want some?
Jerry: Sure!
Amanda: I'll bring you tomorrow :-)


# Data Configuration

In [27]:
from dataclasses import dataclass

In [25]:
@dataclass
class DataConfig:
    """Settings for the data download step."""
    # Dataset identifier handed to `load_dataset` (e.g. "Samsung/samsum").
    name: str
    # Directory the downloaded dataset is saved to.
    save_path: str

# Configuration Manager

In [31]:
from src.constants import CONFIG_PATH, PARAMS_PATH
from src.utils import create_dirs, read_yaml
import os

In [26]:
class ConfigManager:
    """Reads the project's YAML config and params files and builds step configs.

    Attributes:
        params: Parsed contents of the parameters file.
        config: Parsed contents of the configuration file.
        artifacts: Root artifacts directory taken from
            ``config.artifacts_folder_dir``.
    """

    def __init__(self, config_path=CONFIG_PATH, params_path=PARAMS_PATH):
        """Load both YAML files and create the artifacts directory.

        Args:
            config_path: Path of the configuration YAML file.
            params_path: Path of the parameters YAML file.
        """
        # Parameters come from their own file; reading `config_path` here would
        # silently make `self.params` a second copy of the configuration.
        self.params = read_yaml(params_path)
        self.config = read_yaml(config_path)

        self.artifacts = self.config.artifacts_folder_dir
        create_dirs([self.artifacts])

    def get_data_config(self):
        """Build the configuration for the data download step.

        Returns:
            DataConfig: ``name`` is taken from ``config.data.name`` and
            ``save_path`` is ``config.data.folder`` joined onto the artifacts
            directory.
        """
        data_config = self.config.data

        return DataConfig(
            name=data_config.name,
            save_path=os.path.join(self.artifacts, data_config.folder)
        )

# Data Class

In [32]:
from src import logger
from src.utils import get_size

In [29]:
class Data:
    """Loads a dataset by name and stores it on disk."""

    def __init__(self, data_config: DataConfig):
        """Keep the data settings (`name`, `save_path`) used by `download`."""
        self.config = data_config

    def download(self):
        """Load the configured dataset, save it to the configured path and log the result.

        The log line reports the value of `get_size(path)` labelled as MB.
        """
        name, path = self.config.name, self.config.save_path

        # Fetch the dataset and write it straight to the target directory.
        load_dataset(name).save_to_disk(path)
        logger.info(f'The data "{name}" loaded successfully and saved at "{path}" with size {get_size(path)} MB')

# Run the step

In [30]:
# Build the data configuration and run the download step.
# No try/except wrapper: a handler that only re-raises adds nothing, so any
# failure is left to propagate with its original traceback.
config = ConfigManager().get_data_config()
Data(config).download()

[2024-10-07 12:37:56,744: INFO: utils: The file: config.yaml loaded successfully...]
[2024-10-07 12:37:56,746: INFO: utils: The file: config.yaml loaded successfully...]
[2024-10-07 12:37:56,747: INFO: utils: 
Directory 'artifacts' was created successfully...]
[2024-10-07 12:37:57,053: INFO: 3290135551: The data "Samsung/samsum" loaded successfully and saved at "artifacts/data" with size 10.05 MB]
