In [2]:
from datasets import Dataset

In [10]:
dataset = Dataset.from_dict({
    "a": [1, 2, 3],
    "b": [4, 5, 6],
    "c": [7, 8, 9]
})

mapped_dataset= dataset.map(
    lambda x: {"d": [i * 2 for i in x["a"]]},
    batched=True,
)

# Print the mapped dataset with keys and values
for row in mapped_dataset:
    print(row)

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

{'a': 1, 'b': 4, 'c': 7, 'd': 2}
{'a': 2, 'b': 5, 'c': 8, 'd': 4}
{'a': 3, 'b': 6, 'c': 9, 'd': 6}


In [84]:
import transformers

tokenizer = transformers.AutoTokenizer.from_pretrained("gpt2")
print(type(tokenizer))
tokenizer.add_special_tokens({"pad_token": "[PAD]"})
tokenizer.padding_side = "left"
tokenizer.add_prefix_space = True  # Ensure the tokenizer adds a space before tokens

<class 'transformers.models.gpt2.tokenization_gpt2_fast.GPT2TokenizerFast'>


In [107]:
def slide_window(text_batch):
    text_batch['input_words'] = []
    text_batch['output_words'] = []
    text_batch['input_ids_raw'] = []
    text_batch['output_ids_raw'] = []
    text_batch['tokens'] = []
    text_batch['input_text'] = []
    text_batch['output_text'] = []

    text_batch['input_ids'] = []
    text_batch['output_ids'] = []

    for text in text_batch['raw_text']:
        tokens = tokenizer.tokenize(text)
        text_batch['tokens'].append(tokens)

        # Create input and output tokens for sliding window
        input_tokens = tokens[:-1]
        output_tokens = tokens[1:]

        input_ids = tokenizer.convert_tokens_to_ids(input_tokens)
        output_ids = tokenizer.convert_tokens_to_ids(output_tokens)

        text_batch['input_words'].append(input_tokens)
        text_batch['output_words'].append(output_tokens)

        text_batch['input_ids_raw'].append(input_ids)
        text_batch['output_ids_raw'].append(output_ids)
    
        input_text = tokenizer.convert_tokens_to_string(input_tokens)
        output_text = tokenizer.convert_tokens_to_string(output_tokens)
        
        text_batch['input_text'].append(input_text)
        text_batch['output_text'].append(output_text)

        input_ids = tokenizer(input_text, return_tensors="pt", padding="max_length", truncation=True, max_length=10).input_ids[0]
        output_ids = tokenizer(output_text, return_tensors="pt", padding="max_length", truncation=True, max_length=10).input_ids[0]

        text_batch['input_ids'].append(input_ids)
        text_batch['output_ids'].append(output_ids)


    return text_batch 

dataset = Dataset.from_dict({
    "raw_text": ["hello this is ajay"],
})

slided_window_dataset = dataset.map(
    slide_window,
    batched=True,
    batch_size=4,
)

for row in slided_window_dataset:
    for key, value in row.items():
        print(f"{key}: {value}")


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

raw_text: hello this is ajay
input_words: ['hello', 'Ġthis', 'Ġis', 'Ġa']
output_words: ['Ġthis', 'Ġis', 'Ġa', 'jay']
input_ids_raw: [31373, 428, 318, 257]
output_ids_raw: [428, 318, 257, 33708]
tokens: ['hello', 'Ġthis', 'Ġis', 'Ġa', 'jay']
input_text: hello this is a
output_text:  this is ajay
input_ids: [50257, 50257, 50257, 50257, 50257, 50257, 31373, 428, 318, 257]
output_ids: [50257, 50257, 50257, 50257, 50257, 50257, 428, 318, 257, 33708]


In [111]:
from transformers import GPT2Tokenizer

# Load GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Set the padding token to EOS (GPT-2 does not have a default PAD token)
tokenizer.pad_token = tokenizer.eos_token

# Input text
text = "Let's test this tokenizer."

# Step 1: Tokenization
tokens = tokenizer.tokenize(text)
print("Tokens:", tokens)
# Example output: ['Let', "'s", 'test', 'this', 'tokenizer', '.']

# Step 2: Conversion to Input IDs
input_ids = tokenizer.convert_tokens_to_ids(tokens)
print("Input IDs:", input_ids)
# Example output: [1212, 39, 2649, 428, 25641, 13]

# Step 3: Adding Special Tokens (GPT-2 does not use special tokens like [CLS] or [SEP])
input_ids_with_special_tokens = input_ids
print("Input IDs with Special Tokens:", input_ids_with_special_tokens)

# Step 4: Padding (manually pad to a fixed length, e.g., 10)
max_length = 10
padded_input_ids = input_ids_with_special_tokens + [tokenizer.pad_token_id] * (max_length - len(input_ids_with_special_tokens))
print("Padded Input IDs:", padded_input_ids)
# Example output: [1212, 39, 2649, 428, 25641, 13, 50256, 50256, 50256, 50256]

# Step 5: Generating Attention Mask
attention_mask = [1 if token != tokenizer.pad_token_id else 0 for token in padded_input_ids]
print("Attention Mask:", attention_mask)
# Example output: [1, 1, 1, 1, 1, 1, 0, 0, 0, 0]



Tokens: ['Let', "'s", 'Ġtest', 'Ġthis', 'Ġtoken', 'izer', '.']
Input IDs: [5756, 338, 1332, 428, 11241, 7509, 13]
Input IDs with Special Tokens: [5756, 338, 1332, 428, 11241, 7509, 13]
Padded Input IDs: [5756, 338, 1332, 428, 11241, 7509, 13, 50256, 50256, 50256]
Attention Mask: [1, 1, 1, 1, 1, 1, 1, 0, 0, 0]


In [112]:
# Batch processing with automatic padding
encoded_inputs = tokenizer(
    ["Let's test this tokenizer.", "Short text."],
    padding=True,
    max_length=10,
    return_tensors="pt"
)

print("Input IDs:", encoded_inputs['input_ids'])
# Example output:
# tensor([[1212,   39, 2649,  428, 25641,   13, 50256, 50256, 50256, 50256],
#         [1212,   39,   13,     ...,     ...,     ...,     ...,     ...]])

print("Attention Mask:", encoded_inputs['attention_mask'])
# Example output:
# tensor([[1,   1,   1,   ...,   ...,   ...,   ...],
#         [1 , ..., ..., ..., ..., ..., ...]])


Input IDs: tensor([[ 5756,   338,  1332,   428, 11241,  7509,    13],
        [16438,  2420,    13, 50256, 50256, 50256, 50256]])
Attention Mask: tensor([[1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 0, 0, 0, 0]])


