In [1]:
import os

In [4]:
%pwd

'd:\\AkshatOP\\InShorts'

In [3]:
os.chdir("../")

In [5]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConfig:
    root_dir: Path
    data_path: Path
    tokenizer_name: Path

In [7]:
from InShorts.constants import *
from InShorts.utils.common import read_yaml, create_directiories

In [24]:
class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directiories([self.config.artifacts_root])

    
    def get_data_transformation_config(self) -> DataTransformationConfig:
        # print(self.config)
        config = self.config.data_transformation

        create_directiories([config.root_dir])

        data_transformation_config = DataTransformationConfig(
            root_dir=config.root_dir,
            data_path=config.data_path,
            tokenizer_name = config.tokenizer_name
        )

        return data_transformation_config

In [10]:
import os
from InShorts.logging import logger
from transformers import AutoTokenizer
from datasets import load_dataset, load_from_disk

  from .autonotebook import tqdm as notebook_tqdm


In [44]:
class DataTransformation:
    def __init__(self, config: DataTransformationConfig):
        self.config = config
        self.tokenizer = AutoTokenizer.from_pretrained(config.tokenizer_name)
    
    def convert_examples_to_features(self,example_batch):
        input_encodings = self.tokenizer(example_batch['dialogue'] , max_length = 1024, truncation = True )
        
        with self.tokenizer.as_target_tokenizer():
            target_encodings = self.tokenizer(example_batch['summary'], max_length = 128, truncation = True)
            
        print(input_encodings)
        return {
            'input_ids' : input_encodings['input_ids'],
            'attention_mask': input_encodings['attention_mask'],
            'labels': target_encodings['input_ids']
        }
    

    def convert(self):
        dataset_samsum = load_dataset(self.config.data_path)
        # print(dataset_samsum)
        dataset_samsum_pt = dataset_samsum.map(self.convert_examples_to_features, batched = True)
        dataset_samsum_pt.save_to_disk(os.path.join(self.config.root_dir,"samsum_dataset"))

In [45]:
try:
    config = ConfigurationManager()
    print(1)
    data_transformation_config = config.get_data_transformation_config()
    print(2)
    data_transformation = DataTransformation(config=data_transformation_config)
    data_transformation.convert()
except Exception as e:
    raise e

[2023-10-14 16:01:49,130: INFO: common] yaml file config\config.yaml loaded successfully
[2023-10-14 16:01:49,133: INFO: common] yaml file params.yaml loaded successfully
[2023-10-14 16:01:49,136: INFO: common] directory artifacts created successfully
1
[2023-10-14 16:01:49,138: INFO: common] directory artifacts/data_transformation created successfully
2


Map:   7%|▋         | 1000/14732 [00:00<00:04, 3148.03 examples/s]

{'input_ids': [[12195, 151, 125, 7091, 3659, 107, 842, 119, 245, 181, 152, 10508, 151, 7435, 147, 12195, 151, 125, 131, 267, 650, 119, 3469, 29344, 1], [18038, 151, 2632, 127, 119, 6228, 118, 115, 136, 2974, 152, 10463, 151, 35884, 130, 329, 107, 18038, 151, 2587, 314, 1242, 10463, 151, 1509, 1], [4776, 151, 4451, 108, 180, 131, 116, 164, 152, 5377, 151, 6843, 4301, 83678, 108, 125, 140, 313, 112, 171, 1425, 113, 1549, 155, 2371, 164, 64428, 4776, 151, 463, 368, 119, 511, 124, 557, 152, 5377, 151, 4384, 119, 235, 108, 18857, 1549, 111, 1596, 2073, 56616, 161, 418, 5377, 151, 3183, 3469, 125, 131, 267, 696, 161, 130, 116, 111, 171, 579, 5377, 151, 184, 195, 313, 112, 38244, 114, 5713, 167, 1088, 113, 1553, 125, 131, 267, 1461, 181, 38244, 316, 9609, 4776, 151, 321, 557, 1549, 125, 1253, 881, 93882, 3111, 241, 4911, 207, 5033, 118, 557, 18576, 4776, 151, 168, 288, 1107, 5377, 151, 1516, 108, 1556, 125, 131, 267, 171, 120, 4776, 151, 125, 163, 172, 303, 450, 121, 12397, 115, 9994, 11485, 

Map:  14%|█▎        | 2000/14732 [00:00<00:04, 3011.28 examples/s]

{'input_ids': [[6549, 151, 842, 119, 133, 1017, 118, 109, 3578, 136, 232, 152, 2739, 151, 1894, 145, 666, 113, 313, 118, 114, 6280, 3572, 585, 2739, 151, 111, 119, 152, 6549, 151, 184, 4337, 160, 126, 289, 565, 111, 145, 666, 1556, 119, 192, 172, 112, 171, 110, 116, 307, 424, 152, 2739, 151, 1807, 146, 2739, 151, 188, 1173, 120, 145, 131, 216, 5628, 415, 200, 110, 151, 470, 6549, 151, 125, 235, 108, 155, 119, 131, 216, 146, 254, 2635, 114, 948, 1777, 3572, 152, 2739, 151, 145, 84804, 267, 108, 120, 131, 116, 146, 288, 214, 108, 119, 235, 152, 2739, 151, 145, 2364, 14061, 111, 5744, 5572, 112, 24190, 116, 111, 16704, 26408, 6549, 151, 125, 236, 107, 6549, 151, 125, 131, 208, 146, 334, 125, 131, 208, 734, 118, 114, 895, 6280, 155, 1556, 145, 256, 275, 3572, 172, 120, 118, 114, 1339, 167, 125, 137, 508, 126, 152, 2739, 151, 485, 192, 129, 255, 147, 2739, 151, 125, 131, 208, 334, 119, 131, 267, 298, 126, 188, 172, 145, 171, 8537, 6549, 151, 109, 641, 113, 2027, 119, 166, 115, 109, 8338, 33

Map:  20%|██        | 3000/14732 [00:01<00:04, 2469.21 examples/s]

{'input_ids': [[7413, 151, 7069, 108, 119, 186, 152, 10283, 151, 12098, 108, 5206, 108, 180, 131, 116, 164, 152, 7413, 151, 3350, 131, 116, 1921, 152, 10283, 151, 353, 131, 116, 220, 1921, 108, 125, 595, 131, 144, 133, 166, 108, 188, 385, 180, 119, 245, 108, 408, 117, 124, 109, 3029, 7413, 151, 842, 7573, 147, 1516, 3113, 1], [5503, 151, 7069, 152, 226, 9421, 4457, 131, 116, 685, 152, 91324, 151, 2657, 108, 136, 117, 128, 3113, 155, 128, 4457, 131, 116, 344, 107, 9752, 108, 178, 518, 169, 685, 264, 111, 687, 165, 147, 5503, 151, 15320, 108, 419, 6969, 107, 91324, 151, 842, 119, 217, 364, 152, 5503, 151, 566, 108, 188, 395, 334, 579, 140, 5557, 147, 91324, 151, 24350, 107, 202, 121, 39948, 147, 1], [13652, 151, 4366, 125, 179, 114, 9906, 152, 6241, 151, 36977, 208, 401, 220, 13652, 151, 447, 146, 152, 6241, 151, 125, 272, 131, 144, 235, 6241, 151, 188, 1045, 172, 364, 119, 131, 267, 8808, 147, 13652, 151, 343, 125, 12146, 245, 245, 6241, 151, 236, 401, 12146, 13652, 151, 110, 152, 6241,

Map:  27%|██▋       | 4000/14732 [00:01<00:04, 2578.19 examples/s]

{'input_ids': [[8994, 151, 241, 127, 475, 152, 21033, 151, 792, 8994, 151, 6514, 21033, 151, 145, 131, 216, 828, 128, 5825, 8994, 151, 7953, 134, 109, 1801, 1076, 21033, 151, 12916, 118, 214, 134, 2653, 46026, 1858, 131, 116, 8994, 151, 6514, 21033, 151, 145, 131, 216, 134, 238, 122, 109, 9006, 21033, 151, 329, 134, 238, 8994, 151, 64530, 21033, 151, 145, 131, 216, 2096, 8994, 151, 1838, 118, 4911, 21033, 151, 12373, 164, 112, 416, 12253, 112, 42783, 8994, 151, 180, 152, 21033, 151, 7026, 117, 264, 107, 285, 131, 267, 129, 774, 112, 236, 119, 107, 28590, 164, 8994, 151, 792, 21033, 151, 1226, 107, 125, 131, 267, 823, 342, 112, 1335, 118, 119, 8994, 151, 532, 131, 267, 129, 186, 115, 377, 5963, 21033, 151, 10762, 110, 107, 2625, 131, 267, 1335, 118, 119, 8994, 151, 236, 4911, 783, 401, 10898, 1], [2538, 151, 842, 119, 506, 133, 136, 3355, 152, 6549, 151, 1089, 610, 107, 6549, 151, 125, 346, 1838, 118, 7413, 112, 650, 126, 112, 213, 107, 2538, 151, 4810, 108, 110, 307, 757, 147, 1], [340

Map:  34%|███▍      | 5000/14732 [00:01<00:03, 2490.56 examples/s]

{'input_ids': [[5861, 151, 774, 177, 232, 401, 110, 84683, 32099, 7867, 151, 112, 119, 314, 108, 1216, 119, 109, 229, 110, 107, 21443, 135, 3360, 108, 109, 20212, 156, 5861, 151, 4233, 117, 792, 112, 214, 130, 210, 107, 3695, 8867, 111, 806, 7867, 151, 21360, 124, 1242, 5861, 151, 171, 119, 309, 133, 114, 2243, 152, 10096, 152, 111, 13152, 6425, 7867, 151, 394, 347, 183, 147, 5861, 151, 60698, 225, 146, 112, 393, 109, 1505, 111, 109, 2417, 107, 107, 7867, 151, 111, 163, 108, 119, 137, 1102, 833, 347, 19193, 109, 2316, 147, 722, 117, 126, 122, 119, 152, 5861, 151, 149, 2115, 127, 308, 108, 109, 12220, 117, 167, 281, 120, 186, 117, 220, 154, 6298, 108, 80248, 288, 108, 155, 145, 131, 216, 1226, 7867, 151, 532, 715, 119, 131, 267, 129, 350, 112, 331, 352, 396, 5861, 151, 715, 167, 107, 412, 571, 108, 149, 4067, 127, 9545, 111, 7555, 127, 3135, 262, 113, 149, 274, 2115, 124, 109, 14341, 7867, 151, 12916, 111, 236, 107, 5861, 151, 134, 109, 990, 157, 243, 126, 192, 289, 134, 583, 296, 390, 

Map:  41%|████      | 6000/14732 [00:02<00:03, 2557.41 examples/s]

{'input_ids': [[18038, 151, 125, 188, 419, 142, 369, 124, 9015, 7070, 151, 3350, 135, 152, 18038, 151, 2481, 108, 2383, 2670, 3410, 151, 10762, 108, 155, 119, 419, 142, 369, 506, 132, 188, 114, 1285, 152, 18038, 151, 1205, 114, 1285, 108, 155, 125, 11415, 111, 145, 4337, 506, 124, 109, 685, 7070, 151, 1335, 108, 171, 119, 288, 245, 112, 696, 112, 2481, 152, 18038, 151, 125, 272, 131, 144, 235, 610, 7720, 151, 12223, 124, 108, 127, 119, 1651, 152, 3410, 151, 463, 301, 117, 126, 152, 18038, 151, 168, 131, 116, 114, 1392, 2102, 115, 85065, 108, 264, 131, 116, 109, 494, 3180, 151, 18038, 151, 110, 105, 12014, 940, 11896, 2314, 3410, 151, 1894, 108, 85065, 131, 116, 146, 120, 571, 429, 108, 155, 401, 288, 108, 2481, 152, 7720, 151, 722, 249, 171, 157, 369, 152, 7070, 151, 125, 666, 119, 172, 128, 494, 18038, 151, 125, 171, 108, 155, 125, 131, 261, 174, 264, 118, 384, 231, 7720, 151, 412, 233, 199, 249, 152, 18038, 151, 285, 243, 1466, 1052, 446, 232, 3410, 151, 26964, 152, 18038, 151, 1179,




TypeError: TextEncodeInput must be Union[TextInputSequence, Tuple[InputSequence, InputSequence]]