In [None]:
# Instal dependencies
!pip install -q datasets

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, random_split, Subset
from datasets import load_dataset
from transformers import GPT2Tokenizer

In [None]:
# Load the wikitext dataset
subset_dataset = "wikitext-2-v1"
full_dataset = "wikitext-103-v1" # 50x bigger
dataset = load_dataset("wikitext", subset_dataset)

# Display the dataset
print(dataset)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['text'],
        num_rows: 36718
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 3760
    })
})


In [None]:
# Parameters
batch_size = 64
num_epochs = 5
learning_rate = 0.001
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load dataset
dataset = load_dataset("wikitext", "wikitext-2-v1")

# Load tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Set the pad token to the EOS token if it's not already defined
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Function to encode examples using the tokenizer
def encode(examples):
    return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=512)

# Encode the dataset
dataset = dataset.map(encode, batched=True)
dataset.set_format(type='torch', columns=['input_ids'])



In [None]:
class FullyConnectedModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim=512):
        super(FullyConnectedModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.fc1 = nn.Linear(embedding_dim, 512)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(512, vocab_size)

    def forward(self, x):
        x = self.embedding(x)  # x needs to be long here
        x = x.mean(dim=1)  # Sum or average embeddings
        x = self.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# Init Model
model = FullyConnectedModel(vocab_size=len(tokenizer))
model.to(device)
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
# Adjusting train_epoch to fix label issues
def train_epoch(model, data_loader, optimizer, loss_fn, device):
    model.train()
    total_loss = 0
    for batch in data_loader:
        inputs = batch['input_ids'].to(device)  # Keep inputs as Long for embedding
        labels = torch.roll(inputs, -1, dims=1)  # Shift inputs for next-token prediction
        outputs = model(inputs)

        # Adjust labels for loss calculation (assuming single token prediction for simplification)
        loss = loss_fn(outputs, labels[:, -1])  # Take the last token's output vs shifted label

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    return total_loss / len(data_loader)

def evaluate_perplexity(model, data_loader, loss_fn, device):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch in data_loader:
            inputs = batch['input_ids'].to(device)
            labels = torch.roll(inputs, -1, dims=1)[:, -1]  # Last token prediction, labels are indices

            outputs = model(inputs)
            outputs = outputs  # Ensure this is [N, C]

            loss = loss_fn(outputs, labels)  # Check that outputs are [N, C] and labels are [N]
            total_loss += loss.item()

    avg_loss = total_loss / len(data_loader)
    perplexity = torch.exp(torch.tensor(avg_loss)).item()
    return perplexity


In [None]:
# Scaling Experiment
data_sizes = [1000, 5000, 10000, 15000, 20000, 25000, 30000, 35000]  # Define different sizes to test
results = {}
for size in data_sizes:
    # Create a subset of the dataset
    subset = Subset(dataset['train'], indices=range(size))
    train_loader = DataLoader(subset, batch_size=batch_size, shuffle=True)

    # Train the model
    for epoch in range(num_epochs):
        train_loss = train_epoch(model, train_loader, optimizer, loss_fn, device)
        print(f'Training Size: {size}, Epoch: {epoch+1}, Loss: {train_loss}')

    # Evaluate Perplexity
    perplexity = evaluate_perplexity(model, train_loader, loss_fn, device)
    print(f'Training Size: {size}, Perplexity: {perplexity}')
    results[size] = perplexity

    print("Results:", results)

Training Size: 1000, Epoch: 1, Loss: 6.236616432666779
Training Size: 1000, Epoch: 2, Loss: 3.5576252788305283
Training Size: 1000, Epoch: 3, Loss: 3.1031273901462555
Training Size: 1000, Epoch: 4, Loss: 2.958975672721863
Training Size: 1000, Epoch: 5, Loss: 2.9033003747463226
Training Size: 1000, Perplexity: 17.348674774169922
Results: {1000: 17.348674774169922}
Training Size: 5000, Epoch: 1, Loss: 3.692117724237563
Training Size: 5000, Epoch: 2, Loss: 2.968621881702278
Training Size: 5000, Epoch: 3, Loss: 2.8175183232826524
Training Size: 5000, Epoch: 4, Loss: 2.698087683206872
Training Size: 5000, Epoch: 5, Loss: 2.61581422256518
Training Size: 5000, Perplexity: 12.298166275024414
Results: {1000: 17.348674774169922, 5000: 12.298166275024414}
Training Size: 10000, Epoch: 1, Loss: 3.183851831278224
Training Size: 10000, Epoch: 2, Loss: 2.7666637472286344
Training Size: 10000, Epoch: 3, Loss: 2.5707231236111587
Training Size: 10000, Epoch: 4, Loss: 2.454671415553731
Training Size: 1000

In [None]:
results = {1000: 17.348674774169922, 5000: 12.298166275024414, 10000: 10.060457229614258, 15000: 8.110535621643066, 20000: 7.5504984855651855, 25000: 7.670215129852295, 30000: 6.471180438995361, 35000: 5.9110236167907715}