In [1]:
%pip install datasets transformers

Collecting datasets
  Using cached datasets-3.4.0-py3-none-any.whl.metadata (19 kB)
Collecting transformers
  Using cached transformers-4.49.0-py3-none-any.whl.metadata (44 kB)
Collecting filelock (from datasets)
  Downloading filelock-3.18.0-py3-none-any.whl.metadata (2.9 kB)
Collecting numpy>=1.17 (from datasets)
  Using cached numpy-2.2.3-cp313-cp313-macosx_14_0_arm64.whl.metadata (62 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Using cached pyarrow-19.0.1-cp313-cp313-macosx_12_0_arm64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Using cached dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting pandas (from datasets)
  Using cached pandas-2.2.3-cp313-cp313-macosx_11_0_arm64.whl.metadata (89 kB)
Collecting requests>=2.32.2 (from datasets)
  Using cached requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting tqdm>=4.66.3 (from datasets)
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting xxhash (from datasets)
  Using cached x

In [None]:
from datasets import load_dataset, load_from_disk
from transformers import AutoTokenizer
from torch.utils.data import DataLoader
import torch
import os
import torch.nn as nn
import torch.nn.functional as F

# --- Model / data hyperparameters ---
# Transformer shape: embedding width, attention heads per block, number of blocks.
n_embd, n_head, n_layer = 768, 12, 12
# max_len: length the tokenizer pads/truncates to; block_size: side of the causal mask in Head.
max_len, block_size = 512, 64
# Dropout probability shared by the attention and feed-forward modules.
dropout = 0.2

# Pretrained tokenizer for the "bert-base-uncased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def get_max_length(dataset, tokenizer):
    """Return the largest token count found in the ``"text"`` field of ``dataset``.

    Every example's text is passed to ``tokenizer`` (with no extra options),
    the number of resulting ``input_ids`` is written to a ``"length"`` column
    through ``dataset.map``, and the maximum of that column is returned.
    """
    def _count_tokens(row):
        encoded = tokenizer(row["text"])
        return {"length": len(encoded["input_ids"])}

    lengths = dataset.map(_count_tokens)["length"]
    return max(lengths)

# Toggle read further down in this cell: when True, max_len is a fixed constant;
# when False it is measured from the dataset via get_max_length().
fixed_max_len = True

def tokenize_function(examples):
    """Tokenize the ``"text"`` field of ``examples`` with the module-level ``tokenizer``.

    Truncation is enabled and padding is set to ``"max_length"``, with the
    target length taken from the module-level ``max_len`` at call time.
    """
    texts = examples["text"]
    encoding_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": max_len,
    }
    return tokenizer(texts, **encoding_options)

# Directory holding the on-disk cache of the tokenized 10k-article subset.
tokenized_dataset_path = "wiki_10k_tokenized"

if not os.path.exists(tokenized_dataset_path):
    print("Local cache of dataset not found, downloading and tokenizing dataset...")
    # Load dataset (small subset of 10,000 samples, fixed shuffle seed)
    ds = load_dataset("wikimedia/wikipedia", "20231101.en", split="train")
    ds = ds.shuffle(seed=42).select(range(10000))  # Select only 10,000 samples
    # Select only the 'text' column
    ds = ds.remove_columns([col for col in ds.column_names if col != "text"])
    # Tokenize the dataset and cache the result for later runs
    ds = ds.map(tokenize_function, batched=True)
    ds.save_to_disk(tokenized_dataset_path)
else:
    print("Local cache of dataset found, loading tokenized dataset...")
    ds = load_from_disk(tokenized_dataset_path)

# Only re-measure the sequence length when it is not fixed. Previously the fixed
# branch re-assigned a hard-coded 512, which would silently override any other
# max_len configured at the top of the cell (and used by tokenize_function).
if not fixed_max_len:
    max_len = get_max_length(ds, tokenizer)

# Create the dataloader: each batch is the stacked `input_ids` of 8 examples.
dataloader = DataLoader(
    ds.with_format('torch'),
    batch_size=8,
    collate_fn=lambda x: torch.stack([d['input_ids'] for d in x])
)

# Pull one batch to inspect its contents and shape.
ex = next(iter(dataloader))
print(ex)
print(ex.shape)

vocab_size = tokenizer.vocab_size
print(f"Vocabulary size: {vocab_size}")

# Sanity check: run the batch through a throw-away embedding table and show the
# resulting shape. (As a bare mid-cell expression this value was discarded, and
# n_embd was needlessly re-assigned to the value it already has.)
print(nn.Embedding(vocab_size, n_embd)(ex).shape)

eval_iters = 100  # batches averaged per split by the (commented-out) estimate_loss below

# @torch.no_grad()
# def estimate_loss():
#     out = {}
#     model.eval()
#     for split in ['train', 'val']:
#         losses = torch.zeros(eval_iters)
#         for k in range(eval_iters):
#             X, Y = get_batch(split)
#             logits, loss = model(X, Y)
#             losses[k] = loss.item()
#         out[split] = losses.mean()
#     model.train()
#     return out

class Head(nn.Module):
    """One head of causal (masked) self-attention.

    Reads the module-level hyperparameters ``n_embd`` (input width),
    ``block_size`` (side length of the causal mask, i.e. the longest sequence
    this head can process) and ``dropout`` at construction time.

    Args:
        head_size: Width of the key/query/value projections of this head.
    """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # Lower-triangular causal mask, registered as a buffer (not a parameter).
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply masked self-attention.

        Args:
            x: Tensor of shape (B, T, n_embd) with T no larger than the mask size.

        Returns:
            Tensor of shape (B, T, head_size).

        Raises:
            ValueError: If T exceeds the size of the causal mask.
        """
        _, T, _ = x.shape
        max_T = self.tril.size(0)
        if T > max_T:
            # Without this check the mask slice below is (max_T, max_T) and
            # masked_fill fails with an opaque broadcasting error.
            raise ValueError(
                f"sequence length {T} exceeds the causal mask size {max_T}"
            )
        k = self.key(x)    # (B, T, head_size)
        q = self.query(x)  # (B, T, head_size)
        # compute attention scores ("affinities"); the dot products run over
        # head_size dimensions, so scale by 1/sqrt(head_size) rather than by
        # the input width n_embd.
        wei = q @ k.transpose(-2, -1) * k.shape[-1] ** -0.5  # (B, T, hs) @ (B, hs, T) -> (B, T, T)
        # Each position may only attend to itself and earlier positions.
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))  # (B, T, T)
        wei = F.softmax(wei, dim=-1)  # (B, T, T)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x)  # (B, T, head_size)
        out = wei @ v  # (B, T, T) @ (B, T, hs) -> (B, T, hs)
        return out

class MultiHeadAttention(nn.Module):
    """Multiple heads of self-attention run in parallel.

    The head outputs are concatenated along the last dimension, projected to
    ``n_embd`` (module-level) and passed through dropout.

    Args:
        num_heads: Number of ``Head`` modules to create.
        head_size: Output width of each head.
    """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        # The concatenated head outputs are num_heads * head_size wide; that
        # equals n_embd only when n_embd is divisible by num_heads, so size the
        # projection input from the heads instead of assuming n_embd.
        self.proj = nn.Linear(num_heads * head_size, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return the projected concatenation of every head's output for ``x``."""
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

class FeedForward(nn.Module):
    """Two-layer MLP: widen to 4x ``n_embd``, ReLU, project back, then dropout.

    The dropout probability comes from the module-level ``dropout``.
    """

    def __init__(self, n_embd):
        super().__init__()
        hidden_width = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, n_embd),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the MLP stack."""
        transformed = self.net(x)
        return transformed
    
class Block(nn.Module):
    """Transformer block: self-attention, then feed-forward.

    Each sub-layer sees a layer-normalised input and its output is added back
    to that sub-layer's un-normalised input (residual connection).
    """

    def __init__(self, n_embd, n_head):
        """
        Args:
            n_embd: Embedding dimension.
            n_head: Number of attention heads; each gets ``n_embd // n_head`` dims.
        """
        super().__init__()
        self.sa = MultiHeadAttention(n_head, n_embd // n_head)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        """Apply the attention and feed-forward sub-layers with residuals."""
        after_attention = x + self.sa(self.ln1(x))
        return after_attention + self.ffwd(self.ln2(after_attention))

class WikiCompleteModel(nn.Module):
    def __init__(self, vocab_size, n_embd, n_head, n_layer, max_len, dropout=0.2):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, n_embd)
        self.decoder = nn.TransformerDecoderx

In [None]:
# Look up which special-token ids the current tokenizer defines.
bos_token_id, cls_token_id = tokenizer.bos_token_id, tokenizer.cls_token_id

print("BOS token ID:", bos_token_id)
print("CLS token ID:", cls_token_id)  # every sequence starts with this token

BOS token ID: None
CLS token ID: 101


In [None]:
def get_tokenized_dataset(
    tokenize_function,
    tokenized_dataset_path="wiki_10k_tokenized",
    num_samples=10000,
    seed=42,
):
    """Load the tokenized Wikipedia subset from disk, building it on first use.

    If ``tokenized_dataset_path`` exists it is loaded as-is. Otherwise the
    ``wikimedia/wikipedia`` ``20231101.en`` train split is loaded, shuffled
    with ``seed``, cut down to ``num_samples`` rows, reduced to its ``"text"``
    column, tokenized with ``tokenize_function`` and saved to that path.

    Note: the cache is identified by its path only. After changing
    ``tokenize_function``, ``num_samples`` or ``seed``, use a different path
    (or delete the old directory), otherwise the stale cache is returned.

    Args:
        tokenize_function: Batched mapping function passed to ``Dataset.map``.
        tokenized_dataset_path: Directory used as the on-disk cache.
        num_samples: Number of shuffled rows to keep when building the cache.
        seed: Shuffle seed used when building the cache.

    Returns:
        The tokenized dataset.
    """
    if os.path.exists(tokenized_dataset_path):
        print("Local cache of dataset found, loading tokenized dataset...")
        return load_from_disk(tokenized_dataset_path)

    print("Local cache of dataset not found, downloading and tokenizing dataset...")
    ds = load_dataset("wikimedia/wikipedia", "20231101.en", split="train")
    # Keep a reproducible random subset.
    ds = ds.shuffle(seed=seed).select(range(num_samples))
    # Select only the 'text' column
    ds = ds.remove_columns([col for col in ds.column_names if col != "text"])
    # Tokenize the dataset and cache it for later runs
    ds = ds.map(tokenize_function, batched=True)
    ds.save_to_disk(tokenized_dataset_path)
    return ds

In [None]:
def create_sequences(examples, block_size=128):
    """Slide a window of ``block_size`` tokens over one tokenized example.

    For every start index ``s`` in ``range(len(ids) - block_size)`` the window
    ``ids[s:s + block_size]`` becomes an input sequence and the token right
    after it, ``ids[s + block_size]``, becomes its label.

    Args:
        examples: Mapping with an ``"input_ids"`` entry.
        block_size: Number of tokens per input window.

    Returns:
        ``{"input_ids": [...windows...], "labels": [...next tokens...]}``.
        Both lists are empty when ``"input_ids"`` is not a plain ``list``.
    """
    token_ids = examples["input_ids"]

    # Only plain lists are windowed; any other type produces no sequences.
    if not isinstance(token_ids, list):
        return {"input_ids": [], "labels": []}

    window_starts = range(len(token_ids) - block_size)
    return {
        "input_ids": [token_ids[start:start + block_size] for start in window_starts],
        "labels": [token_ids[start + block_size] for start in window_starts],
    }

In [None]:
# Load the tokenized Wikipedia subset from its local cache (built and saved on first run).
tds = get_tokenized_dataset(tokenize_function)

Local cache of dataset found, loading tokenized dataset...


In [None]:
from datasets import Dataset

# `tds` is the tokenized dataset; keep just the token ids of every row.
data = [dict(input_ids=row["input_ids"]) for row in tds]

# Expand each row into its 128-token sliding windows and next-token labels.
new_data = [create_sequences(row, block_size=128) for row in data]

# Flatten the per-row results column by column and wrap them in a new Dataset.
tokenized_dataset = Dataset.from_dict({
    column: [item for windows in new_data for item in windows[column]]
    for column in ("input_ids", "labels")
})

# Show the dataset summary
print(tokenized_dataset)

Dataset({
    features: ['input_ids', 'labels'],
    num_rows: 3840000
})


In [None]:
# Print the first three examples of the windowed dataset, then stop.
i = 0
for ex in tokenized_dataset:
    print(ex)
    if i >= 2:
        break
    i += 1

{'input_ids': [101, 20287, 2361, 6738, 2003, 1037, 4696, 1038, 2273, 1005, 1055, 2334, 3827, 2284, 1999, 9803, 2588, 6738, 1999, 2563, 1012, 1996, 2744, 1005, 2334, 1005, 2965, 2008, 2023, 3827, 4324, 2111, 2006, 2128, 2386, 2094, 2000, 1996, 2334, 5434, 1012, 1996, 3827, 2003, 3498, 2011, 2010, 9995, 1005, 1055, 3827, 2326, 1012, 2381, 6738, 3827, 2441, 1999, 6940, 1010, 1998, 2003, 1997, 1037, 5171, 6652, 2640, 1012, 19180, 2350, 2001, 1996, 2197, 2711, 1998, 2069, 2450, 2000, 2022, 6472, 2012, 6738, 1999, 4579, 1012, 2016, 2018, 2042, 7979, 1997, 1996, 4028, 1997, 2014, 3129, 1012, 2019, 4538, 1000, 2306, 2122, 3681, 1000, 4076, 1996, 3827, 1005, 1055, 2381, 2013, 14378, 2683, 2000, 4579, 1012, 1996, 4538, 2001, 2881, 1998, 2580, 2011, 2961, 6487, 16955, 1998, 3985, 2441], 'labels': 2011}
{'input_ids': [20287, 2361, 6738, 2003, 1037, 4696, 1038, 2273, 1005, 1055, 2334, 3827, 2284, 1999, 9803, 2588, 6738, 1999, 2563, 1012, 1996, 2744, 1005, 2334, 1005, 2965, 2008, 2023, 3827, 4324, 2

In [1]:
from data import WikipediaDataset
from transformers import AutoTokenizer
# Batch size passed to both dataloaders below.
batch_size = 32

# From here on the notebook uses the GPT-2 tokenizer and its vocabulary size.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
vocab_size = tokenizer.vocab_size

dataset = WikipediaDataset(
    tokenizer=tokenizer, max_len=1024, block_size=128, regenerate=True, num_samples=100
)

# Request the "train" loader first, then the "val" loader.
train_dataloader, val_dataloader = (
    dataset.get_test_train_dataloaders(split, batch_size) for split in ("train", "val")
)

  from .autonotebook import tqdm as notebook_tqdm


Local cache of sliding window dataset not found, creating dataset...
Local cache of dataset not found, downloading and tokenizing dataset...


Saving the dataset (1/1 shards): 100%|██████████| 100/100 [00:00<00:00, 25901.96 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 83055/83055 [00:00<00:00, 1590675.43 examples/s]


In [3]:
# Decode and print the first sample of each of the first six training batches.
i = 0
for ex in train_dataloader:
    input_ids = ex['input_ids']
    print(tokenizer.decode(input_ids.tolist()[0]))
    i += 1
    if i >= 6:
        break

Achilleía), attested in Attica in the fourth century BC (IG II² 1617) and, in the form Achillia, on a stele in Halicarnassus as the name of a female gladiator fighting an "Amazon".

Achilles' name can be analyzed as a combination of  () "distress, pain, sorrow, grief" and  () "people, soldiers, nation", resulting in a proto-form *Akhí-lāu̯os "he who has the people distressed" or "he whose people have distress". The grief or distress of
eskjöld and student of both Hugh Akston and Robert Stadler. He began working while still in school to show that he could have been successful without the aid of his family's wealth. Later, Francisco bankrupts the d'Anconia business to put it out of others' reach. His full name is given as "Francisco Domingo Carlos Andres Sebastián d'Anconia".

John Galt

John Galt is the primary male hero of Atlas Shrugged. He initially appears as an unnamed menial worker for Taggart Transcontinental, who often dines with Eddie
, automobiles, swimming pools, and other pa

{'bos_token': '<|endoftext|>',
 'eos_token': '<|endoftext|>',
 'unk_token': '<|endoftext|>'}

In [7]:
# 101 was the CLS id reported for the BERT tokenizer earlier in the notebook;
# check what that same id decodes to under the tokenizer currently loaded.
tokenizer.decode(101)

'�'

In [8]:
# Encode a short prompt; return_tensors="pt" asks for the ids as a PyTorch tensor.
tokenizer.encode("Once upon a time,", return_tensors="pt")

tensor([[7454, 2402,  257,  640,   11]])

In [9]:
# Encode a prompt, then decode it again to confirm the text survives the round trip.
context = tokenizer.encode("Kanye West", return_tensors="pt")
print(tokenizer.decode(context[0]))  # prints the decoded text for the first (only) row of ids

Kanye West
