In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from tqdm import tqdm

In [3]:
# Load the whole training corpus into memory as one string.
with open('input.txt', 'r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

In [4]:
# Report the corpus size in characters.
print("num of chars", len(text))

num of chars 1115394


In [5]:
# Peek at the first 100 characters of the corpus
print(text[:100])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [6]:
# Build the vocabulary: every distinct character that occurs in the corpus,
# in sorted order. We'll use each character as one token.
chars = sorted(set(text))
print(''.join(chars))
print(len(chars))


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [7]:
# Character-level tokenizer: a token id is simply the character's position in
# the sorted vocabulary `chars`. (For a more complex tokenizer, see OpenAI's
# tiktoken.)
stoi = dict(zip(chars, range(len(chars))))  # character -> integer id
itos = dict(enumerate(chars))               # integer id -> character


def encode(chars):
    """Return the list of integer ids for the characters in `chars`.

    Raises KeyError for a character that is not in the vocabulary.
    """
    return [stoi[x] for x in chars]


def decode(ints):
    """Return the list of characters for the integer ids in `ints`.

    The result is a list of one-character strings, not a string; callers join
    it with ''.join(...) when they need text.
    """
    return [itos[x] for x in ints]


# Round-trip a sample sentence through the tokenizer.
test_str = "From either sides the river lies"
test_str_code = encode(test_str)
test_str_decode = decode(test_str_code)
print(test_str_code)
print(test_str_decode)

[18, 56, 53, 51, 1, 43, 47, 58, 46, 43, 56, 1, 57, 47, 42, 43, 57, 1, 58, 46, 43, 1, 56, 47, 60, 43, 56, 1, 50, 47, 43, 57]
['F', 'r', 'o', 'm', ' ', 'e', 'i', 't', 'h', 'e', 'r', ' ', 's', 'i', 'd', 'e', 's', ' ', 't', 'h', 'e', ' ', 'r', 'i', 'v', 'e', 'r', ' ', 'l', 'i', 'e', 's']


In [8]:
# Tokenize the entire corpus into a single 1-D tensor of int64 token ids, then
# keep the first 90% for training and hold out the last 10%.
# Note: `test_data` is the held-out (validation) portion.
data = torch.tensor(encode(text), dtype=torch.long)
split_point = int(0.9 * len(data))
train_data, test_data = data[:split_point], data[split_point:]

In [9]:
# Configuration for get_batch, which draws random fixed-length chunks of the
# token stream.
batch_size = 8  # number of chunks drawn per batch
block_size = 8  # number of tokens in each chunk

# Prefer the GPU whenever PyTorch can see one.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

def get_batch(data, num_sequences=None, context_len=None, target_device=None):
    """Sample a random batch of (input, target) token windows from `data`.

    Each input row is a contiguous window of `context_len` tokens starting at
    a uniformly random position; the matching target row is the same window
    shifted one token to the right, so y[b, t] is the token that follows
    x[b, t] in `data`.

    Args:
        data: 1-D tensor of token ids to sample from.
        num_sequences: rows per batch. Defaults to the module-level
            `batch_size`.
        context_len: tokens per row. Defaults to the module-level
            `block_size`.
        target_device: device the batch is moved to. Defaults to the
            module-level `device`.

    Returns:
        (x, y): two tensors of shape (num_sequences, context_len) on
        `target_device`.

    Raises:
        ValueError: if `data` has fewer than `context_len + 1` tokens, since
            the shifted target window would run past the end.
    """
    # Fall back to the notebook-wide settings only when no override is given,
    # so the globals are looked up lazily at call time (as before).
    n_seq = batch_size if num_sequences is None else num_sequences
    ctx = block_size if context_len is None else context_len
    dev = device if target_device is None else target_device

    if len(data) <= ctx:
        raise ValueError(
            f"data has {len(data)} tokens but at least {ctx + 1} are needed "
            f"to cut an input/target window of length {ctx}"
        )

    # randint's upper bound is exclusive, so the largest start is
    # len(data) - ctx - 1 and the target window ends exactly at len(data).
    starts = torch.randint(len(data) - ctx, (n_seq,)).tolist()
    x = torch.stack([data[s:s + ctx] for s in starts])
    y = torch.stack([data[s + 1:s + ctx + 1] for s in starts])
    return x.to(dev), y.to(dev)

# Draw one training batch and show its input token ids.
x, y = get_batch(train_data)
print(x)

tensor([[61, 43, 50, 41, 53, 51, 43,  2],
        [39, 51, 54, 58, 53, 52, 11,  0],
        [ 0, 35, 46, 53, 57, 43,  1, 57],
        [58, 53,  1, 51, 39, 49, 43,  1],
        [52, 53, 59, 56,  1, 51, 47, 52],
        [47, 50, 50,  8,  0, 20, 53, 61],
        [59, 54, 53, 52,  1, 51, 43,  8],
        [27, 30, 23, 10,  0, 21, 58,  1]])


In [10]:
# How one chunk of data is used: a chunk of block_size tokens yields
# block_size (context, target) pairs, one for every context length from 1 up
# to block_size.
x_example = train_data[:block_size]
y_example = train_data[1:block_size+1]

for context_len in range(1, block_size + 1):
    x = x_example[:context_len]     # the first context_len tokens
    y = y_example[context_len - 1]  # the token that follows them
    print("----------------------")
    print("Training input is ", x)
    print("Target is ", y)

----------------------
Training input is  tensor([18])
Target is  tensor(47)
----------------------
Training input is  tensor([18, 47])
Target is  tensor(56)
----------------------
Training input is  tensor([18, 47, 56])
Target is  tensor(57)
----------------------
Training input is  tensor([18, 47, 56, 57])
Target is  tensor(58)
----------------------
Training input is  tensor([18, 47, 56, 57, 58])
Target is  tensor(1)
----------------------
Training input is  tensor([18, 47, 56, 57, 58,  1])
Target is  tensor(15)
----------------------
Training input is  tensor([18, 47, 56, 57, 58,  1, 15])
Target is  tensor(47)
----------------------
Training input is  tensor([18, 47, 56, 57, 58,  1, 15, 47])
Target is  tensor(58)


In [11]:
# Test the data with a simple baseline: a bigram language model.
# Its only parameter is a (vocab_size x vocab_size) embedding table; row i holds
# the unnormalized scores (logits) for the token that follows token i.
class BigramLanguageModel(nn.Module):
    """Bigram language model: scores the next token from the current token alone.

    Shape letters used in the method docs: B = batch size, T = sequence
    length, C = vocab_size.
    """

    def __init__(self, vocab_size):
        """Create the (vocab_size, vocab_size) table of next-token logits."""
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, x, target=None):
        """Look up next-token logits and, if targets are given, the loss.

        Args:
            x: integer tensor of token ids, shape (B, T).
            target: optional integer tensor of next-token ids with the same
                shape as `x`.

        Returns:
            (logits, loss). `logits` has shape (B, T, C) and holds
            unnormalized scores (apply softmax to get probabilities).
            `loss` is the mean cross entropy between logits and `target`,
            or the plain int 0 when `target` is None.
        """
        logits = self.token_embedding_table(x)

        if target is None:
            loss = 0
        else:
            # F.cross_entropy wants (N, C) logits and (N,) class indices, so
            # flatten every leading dimension. reshape (unlike view) also
            # accepts non-contiguous tensors, e.g. a transposed target.
            C = logits.shape[-1]
            loss = F.cross_entropy(logits.reshape(-1, C), target.reshape(-1))

        return logits, loss

    @torch.no_grad()  # sampling needs no gradients; skip building the autograd graph
    def generate(self, x, predict_len):
        """Extend each sequence in `x` by `predict_len` sampled tokens.

        Written generically: the whole history is fed to the model at every
        step, even though a bigram model only looks at the last token.

        Args:
            x: integer tensor of token ids, shape (B, T).
            predict_len: number of new tokens to append to every row.

        Returns:
            Integer tensor of shape (B, T + predict_len): the input followed
            by the sampled continuation.
        """
        for _step in range(predict_len):
            logits, _loss = self(x)
            logits = logits[:, -1, :]  # keep only the last time step -> (B, C)
            probs = F.softmax(logits, dim=-1)
            char_new = torch.multinomial(probs, 1)  # one sampled id per row -> (B, 1)
            x = torch.cat([x, char_new], dim=1)
        return x

# Smoke-test the untrained model on one training batch.
model = BigramLanguageModel(len(chars)).to(device)
x, y = get_batch(train_data)
logits, loss = model(x, y)
print(logits.shape)
print(loss)

# Show the first sequence of the batch, then the same sequence extended by 8
# sampled tokens. .tolist() turns the id tensor into plain ints for decode().
prompt_text = ''.join(decode(x[0].tolist()))
print(prompt_text)
x_predicted = model.generate(x, 8)
generated_text = ''.join(decode(x_predicted[0].tolist()))
print(generated_text)

torch.Size([8, 8, 65])
tensor(4.6223, grad_fn=<NllLossBackward0>)
 argumen
 argumenkyzTwoes


In [15]:
# Train the Bigram model with AdamW.
# Note: `epochs` counts optimizer steps (one random mini-batch each), not full
# passes over the training data.
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)

epochs = 10000
for _ in tqdm(range(epochs)):
    # Clear old gradients first; set_to_none=True saves memory and is faster.
    optimizer.zero_grad(set_to_none=True)
    x, y = get_batch(train_data)
    _, loss = model(x, y)
    loss.backward()
    optimizer.step()

# Loss of the final mini-batch only.
print(loss)

100%|██████████| 10000/10000 [00:06<00:00, 1568.66it/s]

tensor(2.3114, grad_fn=<NllLossBackward0>)





In [20]:
# Sample from the trained model: draw a fresh batch as prompts, print the
# first prompt, then print it again followed by 100 generated characters.
x, y = get_batch(train_data)

# .tolist() turns the id tensor into plain ints for decode().
prompt_text = ''.join(decode(x[0].tolist()))
print(prompt_text)

x_predicted = model.generate(x, 100)
generated_text = ''.join(decode(x_predicted[0].tolist()))
print(generated_text)

not have
not havenp, saves l fod,
Cllengathoy tlout,
Pe,
BHA: wh meverep t isese The.
CHouth ESSThavety'ancag thes' n
