## Imports

In [6]:
import requests
import torch

from torch.utils.data import Dataset

## Download the Shakespeare Dataset

In [7]:
# Download the Tiny Shakespeare corpus (plain text, roughly 1.1M characters).
url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'

# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
# Fail loudly on 4xx/5xx instead of silently using an error page as the corpus.
response.raise_for_status()
text = response.text

print(f"Dataset length: {len(text)} characters")
print(f"First 200 characters:\n{text[:200]}")

Dataset length: 1115394 characters
First 200 characters:
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you


In [None]:
class Config:
    """Hyperparameters shared by the model definition and the training loop."""

    # --- Model ---
    n_layer: int = 12      # Number of layers
    n_head: int = 8        # Number of attention heads
    n_embd: int = 768      # Embedding dimension
    block_size: int = 128  # Maximum sequence length

    # --- Training ---
    batch_size: int = 128


config = Config()

## Character Dataset Class

In [10]:
class CharDataset(Dataset):
    """
    Character-level dataset for language modeling.

    Each item is a single ``(x, y)`` pair of integer-encoded characters, where
    ``x`` holds ``block_size`` consecutive characters of the text and ``y`` is
    the same window shifted one character to the right (the next-character
    targets). Batching is left to whatever consumes the dataset.

    Args:
        config: object exposing a ``block_size`` attribute (length of ``x``).
        data: the full text to sample windows from (a ``str``).
    """

    def __init__(self, config, data):
        self.config = config
        self.data = data

        # Unique characters, sorted so the char <-> index mapping is deterministic.
        chars = sorted(set(data))
        self.vocab_size = len(chars)

        print(f"Vocabulary size: {self.vocab_size}")
        print(f"Unique characters: {''.join(chars)}")

        # Character to index and index to character mappings
        self.stoi = {ch: i for i, ch in enumerate(chars)}
        self.itos = {i: ch for i, ch in enumerate(chars)}

    def get_vocab_size(self):
        """Return the number of distinct characters in the text."""
        return self.vocab_size

    def __len__(self):
        """Number of full (block_size + 1)-character windows in the text."""
        # Clamp at zero: a text shorter than block_size has no valid window,
        # and a negative value would make len() raise ValueError.
        return max(0, len(self.data) - self.config.block_size)

    def __getitem__(self, idx):
        """
        Return the ``(x, y)`` pair for the window starting at ``idx``.

        Negative indices count from the end, as for a list.

        Raises:
            IndexError: if ``idx`` is outside ``[-len(self), len(self))``.
                Raising here is also what lets ``for x, y in dataset`` stop.
        """
        n = len(self)
        idx = int(idx)  # accept numpy / 0-d tensor indices as well as ints
        if idx < 0:
            idx += n
        if not 0 <= idx < n:
            raise IndexError(f"index {idx} out of range for dataset of length {n}")

        # chunk of (block_size + 1) characters from the data
        chunk = self.data[idx:idx + self.config.block_size + 1]

        # Encode every character to an integer
        dix = [self.stoi[ch] for ch in chunk]

        # Input is first block_size characters, target is the same but shifted by one
        x = torch.tensor(dix[:-1], dtype=torch.long)
        y = torch.tensor(dix[1:], dtype=torch.long)

        return x, y

    def encode(self, text):
        """Encode string to list of integers.

        Raises:
            KeyError: if ``text`` contains a character absent from the vocabulary.
        """
        return [self.stoi[ch] for ch in text]

    def decode(self, indices):
        """Decode an iterable of integer indices (list or 1-D tensor) to a string."""
        # int() normalizes 0-d tensor / numpy scalars, which would otherwise
        # miss the dict lookup because tensors hash by identity.
        return ''.join([self.itos[int(i)] for i in indices])

# Build the dataset over the full text, then record its vocabulary size on the
# shared config.
dataset = CharDataset(config=config, data=text)
config.vocab_size = dataset.get_vocab_size()

Vocabulary size: 65
Unique characters: 
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
