In [15]:
import torch
from sklearn.model_selection import train_test_split

In [1]:
with open('input.txt', 'r', encoding = "utf-8") as file:
    text = file.read()

In [27]:
print(len(text))
vocab = len(sorted(set(text)))
print(vocab)

1115393
65


In [4]:
#find all unique characters in the text
chars = sorted(list(set(text)))
print('total chars:',chars,  len(chars))

total chars: ['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'] 65


In [7]:
#create tokens for each char
tokens = {char: idx for idx, char in enumerate(set(text))}
snekot = {idx: char for idx, char in enumerate(set(text))}
encode = lambda s: [tokens[c] for c in s]
decode = lambda a: ''.join([snekot[i] for i in a])
print(encode("Hello there"))
print(decode(encode("Hello there")))


[54, 35, 19, 19, 6, 61, 63, 59, 35, 32, 35]
Hello there


In [9]:
import sentencepiece as spm
spm.SentencePieceTrainer.train(input='input.txt', model_prefix='model', vocab_size=10000)


[89, 212, 25, 1370, 5]

In [12]:
sp = spm.SentencePieceProcessor(model_file='model.model')
sp.Encode("My name is John.")
sp.DecodeIds([89, 212, 25, 1370, 5])

'My name is John.'

In [14]:
data = torch.tensor(encode(text), dtype=torch.int64)

In [16]:
train_data, val_data = train_test_split(data, test_size=0.1, random_state=42)

In [17]:
block_size = 10
train_data[:block_size+1]
#this simulates a basic ngram proposition, where 50 implies 18. 50 and 18 implies 18. 50, 18, and 18 implies 63. etc.
#this trains the transformer to be used to context of n=1 all the way up to n=block_size.

tensor([50, 18, 18, 63, 37, 61, 36, 27, 32, 19, 57])

In [18]:
#blocks/chunks are groups of tokens
#batches are groups of blocks/chunks

In [19]:
torch.manual_seed(0)
block_size = 10
batch_size = 4
def get_batch(split):
    data = train_data if split == "train" else val_data
    ix = torch.randint(0,len(data) - block_size - 1, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y

In [24]:
xb,yb = get_batch('train')
print(xb.shape,'\n',xb)
print(yb.shape,'\n',yb)
for b in range(batch_size):
    for c in range(block_size):
        con = xb[b, :c+1]
        tar = yb[b, c]
        print("When input is", con, "target is", tar)

torch.Size([4, 10]) 
 tensor([[32, 59, 50, 35, 16, 59, 19,  6, 18,  6],
        [60, 18, 63, 23, 61, 50, 32, 19, 52,  9],
        [54, 59, 63, 63, 47,  5, 61, 41, 61, 25],
        [14, 43,  5, 47, 18, 58, 32, 61,  5, 49]])
torch.Size([4, 10]) 
 tensor([[59, 50, 35, 16, 59, 19,  6, 18,  6,  5],
        [18, 63, 23, 61, 50, 32, 19, 52,  9, 18],
        [59, 63, 63, 47,  5, 61, 41, 61, 25, 23],
        [43,  5, 47, 18, 58, 32, 61,  5, 49, 23]])
When input is tensor([32]) target is tensor(59)
When input is tensor([32, 59]) target is tensor(50)
When input is tensor([32, 59, 50]) target is tensor(35)
When input is tensor([32, 59, 50, 35]) target is tensor(16)
When input is tensor([32, 59, 50, 35, 16]) target is tensor(59)
When input is tensor([32, 59, 50, 35, 16, 59]) target is tensor(19)
When input is tensor([32, 59, 50, 35, 16, 59, 19]) target is tensor(6)
When input is tensor([32, 59, 50, 35, 16, 59, 19,  6]) target is tensor(18)
When input is tensor([32, 59, 50, 35, 16, 59, 19,  6, 18]) 

In [52]:
import torch.nn as nn
import torch.nn.functional as F
class Bigram(nn.Module):
    def __init__(self,  vocab):
        super().__init__()
        self.token_embedding_table= nn.Embedding(vocab, vocab)
    def forward(self, idx, targets=None):
        logits= self.token_embedding_table(idx)#creates a [batch, chunk, vocab] tensor
            #logits represent the next scores for each embedding
       
        
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B * T, C)#reshaping for the loss fn
            targets = targets.view(-1)#reshaping for the loss fn
            loss = F.cross_entropy(logits, targets)
        return logits, loss
    def generate(self, idx, max):
        #idx is the current token
        for _ in range(max):
            #get predictions
            logits, loss = self(idx)
            logits = logits[:,-1,:]#get the timestep to form (B,C) tensor
            probs= F.softmax(logits, dim=-1)
            next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, next), dim=1)
        return idx
m = Bigram(vocab)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)


torch.Size([40, 65])
tensor(4.5501, grad_fn=<NllLossBackward0>)


In [54]:
#testing
idx = torch.zeros((1, 1), dtype=torch.long)
print(decode(m.generate(idx, 100)[0].tolist()))

MzLM


In [55]:
#training
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [66]:
batch_size=32
for i in range(10000):
    x,y = get_batch('train')
    logits, loss = m(x, y)
    optimizer.zero_grad(set_to_none=True)#set grads to 0
    loss.backward()#do backward prop
    optimizer.step()#update params based on grad values
print(loss.item())

3.302943706512451


In [67]:
print(decode(m.generate(idx, 100)[0].tolist()))

Mhd h
bTac'ohrremhabsysiwhwo:yi 
i ratgni n en,c eehau nuioitot,meulve ;bao l ee uilufsla
cItsri,tn
X


In [81]:
torch.manual_seed(0)
B, T,C = 8,4,2
x = torch.randn(B,T,C) #b is batches, t is chunks, c is vocab size
x.shape

torch.Size([8, 4, 2])

In [82]:
xbow = torch.zeros(B, T, C)
for b in range(B):
    for t in range(T):
        xprev = x[b, :t+1] 
        xbow[b, t] = torch.mean(xprev, 0)

In [83]:
xbow[0]

tensor([[-1.1258, -1.1524],
        [-0.6882, -0.7931],
        [-0.1759, -0.2981],
        [-0.2109, -0.7524]])

In [88]:
#@can be used for matrix multiplication
#more efficient way of memory using matrix averaging.
tril = torch.tril(torch.ones(T, T))
weights = torch.tril(torch.zeros(T, T)) #lower triange matrix
weights = weights/weights.sum(1, keepdim=True)#make each row sum to 1, as probabilities should be from 0 to 1
xbow2 = weights@x

In [92]:
weights = torch.tril(torch.ones(T, T)) #lower triange matrix
weights = weights.masked_fill(tril == 0, float('-inf'))
weights = F.softmax(weights, dim=-1)
weights #again, same thing, but uses a softmax function, which exponentiates the values and then normalizes them
xbow3 = weights@x
xbow3
#basically, the weights are used to represent each time step as a weighted average of all the previous time steps

tensor([[[-1.1258, -1.1524],
         [-0.6882, -0.7931],
         [-0.1759, -0.2981],
         [-0.2109, -0.7524]],

        [[ 0.3223, -1.2633],
         [ 0.3361, -0.4776],
         [ 0.2640,  0.0942],
         [ 0.4772,  0.0088]],

        [[-1.3527, -1.6959],
         [-0.3930, -0.4512],
         [-0.0624, -0.8192],
         [-0.1321, -0.1511]],

        [[ 0.7502, -0.5855],
         [ 0.2884, -0.2010],
         [ 0.6554,  0.3948],
         [ 0.7281,  0.0852]],

        [[-0.6136,  0.0316],
         [-0.5531,  0.1400],
         [-0.2222,  0.1308],
         [-0.0064,  0.2084]],

        [[-0.1023,  0.7924],
         [-0.1960,  0.4225],
         [ 0.0436,  1.0491],
         [-0.3345,  0.3901]],

        [[-0.6731,  0.8728],
         [ 0.1911,  0.5253],
         [ 0.0506,  0.2196],
         [ 0.1738,  0.0659]],

        [[-0.4462,  0.7440],
         [ 0.5374,  2.0773],
         [-0.1521,  0.9735],
         [ 0.3408,  0.5922]]])