In [222]:
from torch.utils.data import Dataset
import os
import torch
import tiktoken
import sentencepiece as spm
import subprocess

import nltk

import torch.nn as nn
import torch.nn.functional as F

In [223]:
# !pip freeze > requirements.txt

### **Dataset**:

The Shakespeare dataset contains the complete works of William Shakespeare, including his plays, poems, and sonnets.

[**Download link**](https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt)

In a character-level language model, each character in the input data is mapped to its respective index from a dictionary. The input to the model is in the form (B, N), where B is the batch size and N is the number of tokens for each sequence. The model was tested with B=N=128, but feel free to explore different values.

An interface for the dataset class that takes care of tokenization is provided below.



```python
from torch.utils.data import Dataset

class CharDataset(Dataset):
    """
    Emits batches of characters.

    Adapted from "https://github.com/karpathy/minGPT".
    """

    def __init__(self, config, data):

        chars = ... # get characters from the input data
        self.stoi = { ch:i for i,ch in enumerate(chars) } # map characters to integer indices

        ...

    def get_vocab_size(self):
        raise NotImplementedError()

    def __len__(self):
        raise NotImplementedError()

    def __getitem__(self, idx):
        # grab a chunk of (block_size + 1) characters from the data
        # encode every character to an integer
        # return the chunk and the shifted version as tensors
        pass
```




In [224]:
def check_requirements() -> bool:
    try:
        result = subprocess.run(
            ["pip", "install", "-r", "requirements.txt"],
            check=True,  # Raise an exception if the command fails
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True
        )
        print(result.stdout)  # Optional: Print installation output
        return True
    except subprocess.CalledProcessError as e:
        print(f"Error installing requirements: {e.stderr}")
        return False


In [225]:
# check_requirements()

In [226]:
# hyperparameters
batch_size = 32  # how many independent sequences will we process in parallel?
block_size = 8  # what is the maximum context length for predictions?
max_iters = 3000
eval_interval = 300
learning_rate = 1e-2
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
# ------------

In [227]:
if not os.path.exists("Dataset.txt"):
    os.system("wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt")
    os.rename("input.txt", 'Dataset.txt')

In [228]:
class CharDataset(Dataset):
    """
    Emits batches of characters.

    Adapted from "https://github.com/karpathy/minGPT".
    """

    def __init__(self, data: str, mode: str = "normal"):

        self.tokens = set(nltk.word_tokenize(data))
        self.mode = mode

        if mode == "normal":
            self.chars = sorted(set(train_text))  # get characters from the input data

            self.stoi = {ch: i for i, ch in enumerate(self.chars)}  # map characters to integer indices
            self.itos = {i: ch for i, ch in enumerate(self.chars)}  # map integer indices to characters
            self.vocab_size = len(self.chars)

        elif mode == "sentencepiece":
            self.vocab_size = min(len(self.tokens), 10770)
            spm.SentencePieceTrainer.train(model_prefix='shakespeare', input='Dataset.txt',
                                           vocab_size=10770, unk_id=0, bos_id=1, eos_id=2, pad_id=3)


        elif mode == "tiktoken":
            self.enc = tiktoken.get_encoding("gpt2")
            self.vocab_size = self.enc.max_token_value + 1

    def encode(self, text):
        if self.mode == "normal":
            return [self.stoi[s] for s in text]
        elif self.mode == "sentencepiece":
            sp = spm.SentencePieceProcessor(model_file='shakespeare.model')
            return sp.encode(text)
        elif self.mode == "tiktoken":
            return self.enc.encode(text)

    def decode(self, tokens):
        if self.mode == "normal":
            return ''.join([self.itos[t] for t in tokens])
        elif self.mode == "sentencepiece":
            sp = spm.SentencePieceProcessor(model_file='shakespeare.model')
            return sp.decode(tokens)
        elif self.mode == "tiktoken":
            return self.enc.decode(tokens)

    def get_vocab_size(self):
        return self.vocab_size

    def __len__(self):
        return self.vocab_size

    def __getitem__(self, idx):
        # grab a chunk of (block_size + 1) characters from the data
        # encode every character to an integer
        # return the chunk and the shifted version as tensors
        pass

    # ```

In [229]:
with open("Dataset.txt", "r") as file:
    train_text = file.read()

print(train_text[:500])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor


In [230]:
char_dataset1 = CharDataset(train_text, mode="normal")
char_dataset2 = CharDataset(train_text, mode="sentencepiece")
char_dataset3 = CharDataset(train_text, mode="tiktoken")

In [231]:
print(
    f"Normal encoding: Length of sequence = {len(char_dataset1.encode(train_text))}, Vocab size = {char_dataset1.get_vocab_size()}")

print(
    f"SentencePiece encoding: Length of sequence = {len(char_dataset2.encode(train_text))}, Vocab size = {char_dataset2.get_vocab_size()}")

print(
    f"TikToken encoding: Length of sequence = {len(char_dataset3.encode(train_text))}, Vocab size = {char_dataset3.get_vocab_size()}")

Normal encoding: Length of sequence = 1115394, Vocab size = 65
SentencePiece encoding: Length of sequence = 290364, Vocab size = 10770
TikToken encoding: Length of sequence = 338025, Vocab size = 50257


In [232]:
data = torch.tensor(char_dataset1.encode(train_text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:1000])

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
      

In [233]:
data2 = torch.tensor(char_dataset2.encode(train_text), dtype=torch.long)
print(data2.shape, data2.dtype)
print(data2[:1000])

torch.Size([290364]) torch.int64
tensor([  160,   346,     5,  1001,    54,  1671,   208,   953,     4,   181,
           27,   147,     6,   421,     5,   997,     4,   147,     6,   160,
          346,     5,   112,    58,    47,  1968,   540,    10,   292,   117,
           10,  4422,    19,   421,     5,  7385,     6,  1968,     6,   160,
          346,     5,   160,     4,    15,   109,  1602,   479,    26,  1795,
          785,    10,     7,   397,     6,   421,     5,   184,   109,     8,
           72,     4,    54,   109,     8,    72,     6,   160,   346,     5,
          248,    96,   461,    37,     4,    11,    54,     8,    65,    34,
         1763,    78,    59,   227,  3029,     6,   244,     8,    72,    16,
         7476,    19,   421,     5,   165,    73,  4082,    64,     8,    72,
           13,   107,    29,    28,   230,     5,   293,     4,   293,    21,
           92,   282,   346,     5,   727,   314,     4,    68,  1339,     6,
          160,   346,     5,   

In [234]:
n = int(0.9 * len(data))

train_data = data[:n]
val_data = data[n:]

In [235]:
context_length = 8

print(train_data[:context_length])
print(char_dataset1.decode(train_data[:context_length].tolist()))

tensor([18, 47, 56, 57, 58,  1, 15, 47])
First Ci


In [236]:
train_data2 = data2[:n]
val_data2 = data2[n:]

print(train_data2[:context_length])
print(char_dataset2.decode(train_data2[:context_length].tolist()))

tensor([ 160,  346,    5, 1001,   54, 1671,  208,  953])
First Citizen: Before we proceed any further


In [237]:
x = train_data[:context_length]
y = train_data[1:context_length + 1]

for context in range(1, context_length):
    print(f"context = {context}, input = {x[:context].tolist()}, target = {y[context - 1]}")

context = 1, input = [18], target = 47
context = 2, input = [18, 47], target = 56
context = 3, input = [18, 47, 56], target = 57
context = 4, input = [18, 47, 56, 57], target = 58
context = 5, input = [18, 47, 56, 57, 58], target = 1
context = 6, input = [18, 47, 56, 57, 58, 1], target = 15
context = 7, input = [18, 47, 56, 57, 58, 1, 15], target = 47


In [238]:
torch.manual_seed(1337)
batch_size = 4
context_length = 8


def get_batch(data):
    start_idx = torch.randint(low=0, high=len(data) - context_length, size=(batch_size,))
    x = torch.stack([data[i: i + context_length] for i in start_idx])
    y = torch.stack([data[i + 1: i + 1 + context_length] for i in start_idx])
    x, y = x.to(device), y.to(device)
    return x, y


xb, yb = get_batch(train_data)
print("input")
print(xb.shape)
print(xb)
print("target")
print(yb.shape)
print(yb)

input
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
target
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])


In [239]:
torch.manual_seed(1337)


class BigramLangModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()

        self.token_embedding = nn.Embedding(vocab_size, vocab_size)

    def forward(self, indices, targets=None):
        logits = self.token_embedding(indices)  # B = batch_size,T = context_length, C= vocab_size

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            loss = F.cross_entropy(logits.view(B * T, C), targets.view(B * T))

        return logits, loss

    def generate(self, init_token, max_new_tokens):
        sequence = init_token
        for itr in range(max_new_tokens):
            logits, loss = self(sequence)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)
            # next_token = torch.argmax(probs, dim=-1)
            # next_token = next_token.unsqueeze(1)
            sequence = torch.cat((sequence, next_token), dim=1)
        return sequence

In [240]:
model = BigramLangModel(char_dataset1.get_vocab_size())
m = model.to(device)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)
initial_token = torch.tensor(char_dataset1.encode('\n'), dtype=torch.long, device=device).unsqueeze(0)
# 0 == new line char
print(
    f"Generated Sequence : {char_dataset1.decode(m.generate(init_token=initial_token, max_new_tokens=100)[0].tolist())}")

torch.Size([4, 8, 65])
tensor(4.8786, grad_fn=<NllLossBackward0>)
Generated Sequence : 
Sr?qP-QWktXoL&jLDJgOLVz'RIoDqHdhsV&vLLxatjscMpwLERSPyao.qfzs$Ys$zF-w,;eEkzxjgCKFChs!iWW.ObzDnxA Ms$3


In [241]:
torch.manual_seed(1337)
xb2, yb2 = get_batch(train_data2)
m2 = BigramLangModel(char_dataset2.get_vocab_size())
m2 = m2.to(device)
logits, loss = m2(xb2, yb2)
print(logits.shape)
print(loss)
initial_token = torch.tensor(char_dataset2.encode('I Love You'), dtype=torch.long, device=device).unsqueeze(0)

print(
    f"Generated Sequence : {char_dataset2.decode(m2.generate(init_token=initial_token, max_new_tokens=100)[0].tolist())}")

torch.Size([4, 8, 10770])
tensor(9.9281, grad_fn=<NllLossBackward0>)
Generated Sequence : I Love You contewould RevoltALO First ample cry tideitted courage jade Whestuff Coyingbeseem Giv Despis exclaimscheshysicpipe dashovato spectatorssati interruptrgetfive kinsmenband unwa provinc growpierce arise Dick Suppl fare fatThat praisestcheryank inducedzard swim Pluck nowertake attireabsolve lead doth scornmi Monday orphan trudge lies fea qua Ea gulf dreadfulbaby lad Love Beggarsting knock chafeitedentiies kindred sitting parleGood valour oceanstateRAN scourabble cloudedper wedlock unre agreed emptiebo thereof does unfe perfume Hast cormorantMore uncl


In [242]:
optimizer = torch.optim.Adam(m.parameters(), lr=0.001)

In [243]:
@torch.no_grad()
def estimate_loss(train_data, val_data):
    out = {}
    model.eval()
    for data in [train_data, val_data]:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(data)
            logits, loss = model(X, Y)
            losses[k] = loss.item()

        out['train' if data is train_data else 'val'] = losses.mean()
    model.train()
    return out

In [244]:
batch_size == 32


def train(model, data, max_iters=3000, epochs=10, steps=100, eval_interval=300):
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for iter in range(max_iters):
            # every once in a while evaluate the loss on train and val sets
            if iter % eval_interval == 0:
                losses = estimate_loss(train_data, val_data)
                print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
                total_loss += losses['train']

            xb, yb = get_batch(data)
            logits, loss = model(xb, yb)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        print(f"epoch {epoch}: avg loss: {total_loss * eval_interval / max_iters}")
        print("-" * 50)


In [245]:
train(m, train_data)

step 0: train loss 4.7440, val loss 4.6967
step 300: train loss 4.5138, val loss 4.4888
step 600: train loss 4.2941, val loss 4.2918
step 900: train loss 4.1194, val loss 4.1230
step 1200: train loss 3.9569, val loss 3.9653
step 1500: train loss 3.7917, val loss 3.7838
step 1800: train loss 3.6330, val loss 3.6549
step 2100: train loss 3.5254, val loss 3.5100
step 2400: train loss 3.4064, val loss 3.4232
step 2700: train loss 3.2964, val loss 3.2902
epoch 0: avg loss: 3.928102970123291
--------------------------------------------------
step 0: train loss 3.1862, val loss 3.2309
step 300: train loss 3.1400, val loss 3.1440
step 600: train loss 3.0516, val loss 3.0946
step 900: train loss 3.0006, val loss 3.0209
step 1200: train loss 2.9819, val loss 2.9670
step 1500: train loss 2.9297, val loss 2.8944
step 1800: train loss 2.8478, val loss 2.8745
step 2100: train loss 2.8137, val loss 2.8218
step 2400: train loss 2.7848, val loss 2.7803
step 2700: train loss 2.7476, val loss 2.7577
epoc

In [246]:
initial_token = torch.tensor(char_dataset1.encode('\n'), dtype=torch.long).unsqueeze(0)

In [247]:
print(
    f"Generated Sequence : {char_dataset1.decode(m.generate(init_token=initial_token, max_new_tokens=100)[0].tolist())}")

Generated Sequence : 

JUSe, sham cthe, m I havers indouberorkingive is n k:
CARUKII VKEisieage ie
NIO: t toferan,
ERI kes


In [248]:
train(m2, train_data2, max_iters=1, epochs=1, steps=1, eval_interval=1)

step 0: train loss 2.4662, val loss 2.4642
epoch 0: avg loss: 2.466169834136963
--------------------------------------------------


In [263]:
initial_token = torch.tensor(char_dataset2.encode('I love'), dtype=torch.long, device=device).unsqueeze(0)

In [264]:
print(
    f"Generated Sequence : {char_dataset2.decode(m2.generate(init_token=initial_token, max_new_tokens=100)[0].tolist())}")

Generated Sequence : I love bar beguiledneel rot Hourrage knees CyNC fineid Hoo murderous Slander backs Follow knavery banishmentangleriesursuivant stablejunction hugaxskilfullargebreach Err nailjury rein wooers verdict manors match condemned secrecyST Wjealous strangely hareARDIN worthie tortoise flatterers dearly sweeten open Renown force bear ishailoming wail elbow Rescuemesred whet availcameespairARI three Rome speaking barbemer furr recllows terr size contempt bird barren inv auster interest Envframedhersquis butbawdcarries Claudio cit breed fourth argosyIN vent confu Sir volu happi
