In [2]:
from transformers import DataCollatorWithPadding, AutoTokenizer

texts = ["This movie is great", "The acting was poor"] 
labels = [1, 0]

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Tokenize without padding 
tokenized_no_padding = tokenizer(texts, padding=False, truncation=True)

# Tokenize with padding
tokenized_with_padding = tokenizer(texts, padding=True, truncation=True) 

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

print("Without per-sample padding:")
batches = data_collator([(tokenized_no_padding, labels)])
batch= batches[0]
print(batch)

print("\nWith per-sample padding:")
batches = data_collator([(tokenized_with_padding, labels)])
print(batches)

'''
Output:

Without per-sample padding:
{
  'input_ids': [[101, 2054, 2003, 1037, 1011, 102, 0], [101, 2057, 4997, 1005, 102]],  
  'token_type_ids': [[0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0]],
  'attention_mask': [[1, 1, 1, 1, 1, 1, 0], [1, 1, 1, 1, 1]],
  'labels': [1, 0] 
}

With per-sample padding: 
{
  'input_ids': [[101, 2054, 2003, 1037, 1011, 102], [101, 2057, 4997, 1005, 102, 0]],
  'token_type_ids': [[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0]],
  'attention_mask': [[1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 0]], 
  'labels': [1, 0]
}
'''

Downloading tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Without per-sample padding:


AttributeError: 'list' object has no attribute 'keys'

In [1]:
import torch

class SimpleDataCollator:
    def __init__(self, tokenizer, max_length=None):
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __call__(self, batch):
        texts = [item['text'] for item in batch]
        labels = torch.tensor([item['label'] for item in batch])

        # Tokenize and pad the text
        encoding = self.tokenizer(texts, padding=True, truncation=True, max_length=self.max_length, return_tensors='pt')

        # The tokenizer's output is a dictionary with keys like 'input_ids' and 'attention_mask'
        batch = {key: value for key, value in encoding.items()}
        batch['labels'] = labels

        return batch


In [18]:
print(tokenizer.encode("this ys test"))

[5661, 331, 82, 1332]


In [15]:
from transformers import GPT2Tokenizer
# Example Usage
tokenizer = GPT2Tokenizer.from_pretrained('gpt2', bos_token='<|startoftext|>', eos_token='<|endoftext|>', pad_token='<|pad|>') # Initialize your tokenizer
data_collator = SimpleDataCollator(tokenizer, max_length=128)

# Suppose you have a dataset where each item is a dictionary with 'text' and 'label' keys
sample_dataset = [{'text': "Example sentence", 'label': 0},
{'text': "Example2 sentence", 'label': 0},
{'text': "Example3 sentence works very well", 'label': 1},
{'text': "Example4 sentence", 'label': 1},
{'text': "Example5 sentence", 'label': 0},
{'text': "Example6 sentence", 'label': 0}]

# Collate a batch of samples
collated_batch = data_collator(sample_dataset)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [16]:
print(collated_batch)
print(type(collated_batch))

{'input_ids': tensor([[16281,  6827, 50258, 50258, 50258, 50258],
        [16281,    17,  6827, 50258, 50258, 50258],
        [16281,    18,  6827,  2499,   845,   880],
        [16281,    19,  6827, 50258, 50258, 50258],
        [16281,    20,  6827, 50258, 50258, 50258],
        [16281,    21,  6827, 50258, 50258, 50258]]), 'attention_mask': tensor([[1, 1, 0, 0, 0, 0],
        [1, 1, 1, 0, 0, 0],
        [1, 1, 1, 1, 1, 1],
        [1, 1, 1, 0, 0, 0],
        [1, 1, 1, 0, 0, 0],
        [1, 1, 1, 0, 0, 0]]), 'labels': tensor([0, 0, 1, 1, 0, 0])}
<class 'dict'>
