In [1]:
# Goal: turn every sample of the dataset below into one flat token
# sequence laid out like this:
#
#  [CLS] [SENT1_TOK1] ... [SENT1_TOKn] [SEP] [SENT2_TOK1] ... [SENT2_TOKn] [SEP] [SENT3_TOK1]  ... [SENT3_TOKn] [SEP]

# Toy data: two samples, holding three and five sentences respectively.
# Each sample is a dict with a single 'sentences' key mapping to a list of strings.
dataset = [
    {'sentences': list(sentence_group)}
    for sentence_group in (
        (
            'This is a sentence.',
            'This is another sentence.',
            'Together, they make a paragraph.',
        ),
        (
            'This sentence belongs to another sample',
            'Overall, the dataset is made of multiple samples.',
            'Each sample is made of multiple sentences.',
            'Samples might have a different number of sentences.',
            'And that is the story!',
        ),
    )
]

# Let's use smashed to do that!

In [2]:
from transformers.models.auto.tokenization_auto import AutoTokenizer
from smashed.interfaces.simple import (
    TokenizerMapper,
    MultiSequenceStriderMapper,
    TokensSequencesPaddingMapper,
    AttentionMaskSequencePaddingMapper,
    SequencesConcatenateMapper,
    Python2TorchMapper
)
from pprint import pprint

# Load the pretrained tokenizer shared by every mapper in this notebook.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Step 1: tokenize every sentence of every sample on its own.
# No special tokens are inserted at this stage (add_special_tokens=False),
# and truncation is enabled with max_length=80.
tokenize_mapper = TokenizerMapper(
    tokenizer=tokenizer,
    input_field='sentences',
    max_length=80,
    truncation=True,
    add_special_tokens=False,
)
tokenized_dataset = tokenize_mapper.map(dataset, remove_columns=True)

# Show the tokenized samples, followed by the number of samples.
pprint(tokenized_dataset)
print(len(tokenized_dataset))


[{'attention_mask': [[1, 1, 1, 1, 1], [1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1]],
  'input_ids': [[2023, 2003, 1037, 6251, 1012],
                [2023, 2003, 2178, 6251, 1012],
                [2362, 1010, 2027, 2191, 1037, 20423, 1012]]},
 {'attention_mask': [[1, 1, 1, 1, 1, 1],
                     [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
                     [1, 1, 1, 1, 1, 1, 1, 1],
                     [1, 1, 1, 1, 1, 1, 1, 1, 1],
                     [1, 1, 1, 1, 1, 1]],
  'input_ids': [[2023, 6251, 7460, 2000, 2178, 7099],
                [3452,
                 1010,
                 1996,
                 2951,
                 13462,
                 2003,
                 2081,
                 1997,
                 3674,
                 8168,
                 1012],
                [2169, 7099, 2003, 2081, 1997, 3674, 11746, 1012],
                [8168, 2453, 2031, 1037, 2367, 2193, 1997, 11746, 1012],
                [1998, 2008, 2003, 1996, 2466, 999]]}]
2


In [4]:
# Step 2: regroup the tokenized sentences into new examples, each capped at
# 3 sentences (max_stride_count) or 512 tokens (max_length), with lengths
# measured on the 'input_ids' field.
strider_mapper = MultiSequenceStriderMapper(
    tokenizer=tokenizer,
    length_reference_field='input_ids',
    max_length=512,
    max_stride_count=3,
)
strided_dataset = strider_mapper.map(tokenized_dataset)

# Inspect the first resulting example only.
pprint(strided_dataset[0])


{'attention_mask': [[1, 1, 1, 1, 1], [1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1]],
 'input_ids': [[2023, 2003, 1037, 6251, 1012],
               [2023, 2003, 2178, 6251, 1012],
               [2362, 1010, 2027, 2191, 1037, 20423, 1012]]}


In [5]:
# Step 3: insert the special tokens. Judging by the printed output, the first
# sequence of 'input_ids' gets a leading 101 and every sequence a trailing 102
# (presumably [CLS] and [SEP]), while 'attention_mask' grows by matching 1s.
# Note how two mappers are combined into a single pipeline with `>>`.
input_ids_padder = TokensSequencesPaddingMapper(
    input_field='input_ids',
    tokenizer=tokenizer,
)
attention_mask_padder = AttentionMaskSequencePaddingMapper(
    input_field='attention_mask',
    tokenizer=tokenizer,
)
padding_mappers = input_ids_padder >> attention_mask_padder

padded_dataset = padding_mappers.map(strided_dataset)

# Inspect the first example only.
pprint(padded_dataset[0])


{'attention_mask': [[1, 1, 1, 1, 1, 1, 1],
                    [1, 1, 1, 1, 1, 1],
                    [1, 1, 1, 1, 1, 1, 1, 1]],
 'input_ids': [[101, 2023, 2003, 1037, 6251, 1012, 102],
               [2023, 2003, 2178, 6251, 1012, 102],
               [2362, 1010, 2027, 2191, 1037, 20423, 1012, 102]]}


In [6]:
# Step 4: join the per-sentence sequences of each example, so that every
# field holds one flat sequence per example.
concat_mapper = SequencesConcatenateMapper()
concatenated_dataset = concat_mapper.map(padded_dataset)

# Inspect the first example only.
first_concatenated_example = concatenated_dataset[0]
pprint(first_concatenated_example)


{'attention_mask': [1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1],
 'input_ids': [101,
               2023,
               2003,
               1037,
               6251,
               1012,
               102,
               2023,
               2003,
               2178,
               6251,
               1012,
               102,
               2362,
               1010,
               2027,
               2191,
               1037,
               20423,
               1012,
               102]}


In [7]:
# Step 5: convert every field from Python lists to torch tensors, casting
# 'input_ids' to int64 and 'attention_mask' to float16.
tensor_mapper = Python2TorchMapper(
    field_cast_map=dict(input_ids='int64', attention_mask='float16')
)
final_dataset = tensor_mapper.map(concatenated_dataset)

# Inspect the first example only.
pprint(final_dataset[0])

{'attention_mask': tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1.], dtype=torch.float16),
 'input_ids': tensor([  101,  2023,  2003,  1037,  6251,  1012,   102,  2023,  2003,  2178,
         6251,  1012,   102,  2362,  1010,  2027,  2191,  1037, 20423,  1012,
          102])}
