In [1]:
import torch

class SimpleDataCollator:
    """Callable that turns a list of ``{'text': ..., 'label': ...}`` items into one batch.

    The texts are passed to the wrapped tokenizer in a single call (with padding
    and truncation switched on and PyTorch tensors requested), and the labels
    are gathered into one tensor stored under the ``'labels'`` key.
    """

    def __init__(self, tokenizer, max_length=None):
        """Remember the tokenizer and the ``max_length`` value forwarded to it.

        Args:
            tokenizer: Callable accepting a list of texts plus the keyword
                arguments ``padding``, ``truncation``, ``max_length`` and
                ``return_tensors``; its result must expose ``.items()``.
            max_length: Passed unchanged as the tokenizer's ``max_length``
                argument on every call. Defaults to ``None``.
        """
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __call__(self, batch):
        """Collate ``batch`` into a plain ``dict``.

        Args:
            batch: Iterable of items indexable by ``'text'`` and ``'label'``.
                It is traversed twice (texts first, then labels).

        Returns:
            A ``dict`` holding every key/value pair produced by the tokenizer
            plus a ``'labels'`` entry built with ``torch.tensor``.
        """
        sentences = [example['text'] for example in batch]
        targets = torch.tensor([example['label'] for example in batch])

        # One tokenizer call for the whole batch; padding/truncation are
        # delegated to the tokenizer itself.
        tokenized = self.tokenizer(
            sentences,
            padding=True,
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt',
        )

        # Copy the tokenizer output (e.g. 'input_ids', 'attention_mask') into
        # a regular dict before attaching the labels.
        collated = dict(tokenized.items())
        collated['labels'] = targets
        return collated


In [3]:
from transformers import GPT2Tokenizer
# Example usage: load the pretrained 'gpt2' tokenizer, passing explicit
# BOS, EOS and pad token strings, then wrap it in the collator.
tokenizer = GPT2Tokenizer.from_pretrained(
    'gpt2',
    bos_token='<|startoftext|>',
    eos_token='<|endoftext|>',
    pad_token='<|pad|>',
)
data_collator = SimpleDataCollator(tokenizer, max_length=128)

# Toy dataset: every item is a dict with a 'text' string and an integer 'label'.
sample_dataset = [
    {'text': "Example sentence", 'label': 0},
    {'text': "Example2 sentence", 'label': 0},
    {'text': "Example3 sentence", 'label': 1},
    {'text': "Example4 sentence", 'label': 1},
    {'text': "Example5 sentence", 'label': 0},
    {'text': "Example6 sentence", 'label': 0},
]

# Feed the whole list through the collator as a single batch.
collated_batch = data_collator(sample_dataset)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
# Show the collated batch followed by its Python type, one per line.
print(collated_batch, type(collated_batch), sep='\n')

{'input_ids': tensor([[16281,  6827, 50258],
        [16281,    17,  6827],
        [16281,    18,  6827],
        [16281,    19,  6827],
        [16281,    20,  6827],
        [16281,    21,  6827]]), 'attention_mask': tensor([[1, 1, 0],
        [1, 1, 1],
        [1, 1, 1],
        [1, 1, 1],
        [1, 1, 1],
        [1, 1, 1]]), 'labels': tensor([0, 0, 1, 1, 0, 0])}
<class 'dict'>
