In [1]:
import torch
import torch.nn as nn
from  torch.nn import functional as F
torch.manual_seed(1337)

<torch._C.Generator at 0x7f0194147050>

In [2]:
with open("input.txt", "r", encoding = "utf-8") as f:
    text=f.read()
print(len(text))    

1115394


In [3]:
vocab = list(set(text))
vocab.sort()
print("".join(vocab))
vocab_size = len(vocab)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz


In [4]:
#encoder
ctoi = { vocab[i]:i for i in range(len(vocab))}
itoc = { i:vocab[i] for i in range(len(vocab))}
def encode(s): return [ ctoi[i] for i in s]
def decode(t): return "".join([ itoc[i] for i in t])

tokens = encode("hi ho")
s = decode(tokens)
print(tokens, s)

[46, 47, 1, 46, 53] hi ho


In [5]:
tokens = encode(text)
print(tokens[:20])
print(decode(tokens[:20]))

[18, 47, 56, 57, 58, 1, 15, 47, 58, 47, 64, 43, 52, 10, 0, 14, 43, 44, 53, 56]
First Citizen:
Befor


In [6]:
data = torch.tensor(tokens, dtype=torch.long)
n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]

In [7]:
batch_size = 4
block_size = 8

def get_batch(data):
    indices = torch.randint(len(data)-block_size-1, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in indices], dim=0)
    y = torch.stack([data[i+1:i+block_size+1] for i in indices], dim=0)
    return x,y   

x,y =  get_batch(train_data)    
print(x)
print(y)

tensor([[53, 59,  6,  1, 58, 56, 47, 40],
        [49, 43, 43, 54,  1, 47, 58,  1],
        [13, 52, 45, 43, 50, 53,  8,  0],
        [ 1, 39,  1, 46, 53, 59, 57, 43]])
tensor([[59,  6,  1, 58, 56, 47, 40, 59],
        [43, 43, 54,  1, 47, 58,  1, 58],
        [52, 45, 43, 50, 53,  8,  0, 26],
        [39,  1, 46, 53, 59, 57, 43,  0]])


In [11]:
loss_fn = nn.CrossEntropyLoss()

class Bigram(nn.Module):
    def __init__(self):
        super().__init__()
        self.model = nn.Linear(vocab_size, vocab_size, bias=False)
    def forward(self, x):
        return self.model(x)

class Generator(nn.Module):
    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, x, y = None):
        lx = F.one_hot(x.reshape(-1), vocab_size).type(torch.float32)
        #print(lx.dtype, lx.shape)
        p = self.model(lx)
        if y!=None:
            ly = F.one_hot(y.reshape(-1), vocab_size).type(torch.float32)
            loss = loss_fn(p, ly)
        else:
            loss = None
        return p, loss
    
    def generate(self, count):
        s = torch.zeros((1,1), dtype=torch.long)
        out = s
        for i in range(count):
            #print(s)
            p, _ = self.forward(s)
            probs = F.softmax(p, dim=1)
            s = torch.multinomial(probs,1)
            #print("sample", ss)
            out = torch.cat([out, s], dim=1)

        return decode(out[0].tolist())

In [12]:
def train(model, lr, iterations):
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

    for j in range(iterations):
        for i in range(1000):   
            x,y =  get_batch(train_data)    
            _, loss = model(x,y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()    
        print(loss.item())

    torch.save(model.state_dict(), "./mymodel.pth")    

In [106]:
bm = Generator(Bigram())
#print(bm.generate(200))
train(bm, lr=1e-3, iterations=10)
print(bm.generate(200))

3.572542190551758
3.1686506271362305
3.091691255569458
2.9778928756713867
2.7426908016204834
2.5406339168548584
2.8756484985351562
2.8838741779327393
2.4446263313293457
2.3781981468200684

ONGRou,ave lxine de t, dyowg jQVZBUSe SSentit as GR: st oteGAn,kY3Lcpil t wal s the y at-S:
An t k;
ISae h Y: the' JArk amisth sthimd I


Antelean pl five thatheit of d Buem hisishint swo,
ds ck!gishi


In [107]:
class Attention(nn.Module):
    def __init__(self, input_size, output_size):
        super().__init__()
        # KQV size
        self.output_size = output_size
        self.key = nn.Linear(input_size, output_size, bias=False)
        self.query = nn.Linear(input_size, output_size, bias=False)
        self.value = nn.Linear(input_size, output_size, bias=False)

    def forward(self, x):
        em_key = self.key(x)
        em_query = self.query(x)
        em_value = self.value(x)

        # the attentions matrix must be the size of the context
        # as it is in reality an adjacency matrix
        att = em_key @ em_query.transpose(-2,-1)

        #print (att.shape)

        att /= self.output_size ** 0.5
        att = F.softmax(att, dim=1)
        return att @ em_value 



In [108]:
dropout=0.0

class Block(nn.Module):
    def __init__(self, num_heads, embedding_size):
        super().__init__()

        self.ln1 = nn.LayerNorm(embedding_size)

        attention_size = embedding_size//num_heads
        self.head = nn.ModuleList( [Attention(embedding_size, attention_size) for _ in range(num_heads)])

        self.linear = nn.Linear(embedding_size, embedding_size)
        self.dp1 = nn.Dropout(dropout)
        self.ln2 = nn.LayerNorm(embedding_size)

        self.ff = nn.Sequential(
            nn.Linear(embedding_size, 4 * embedding_size),
            nn.ReLU(),
            nn.Linear(4 * embedding_size, embedding_size),
            nn.Dropout(dropout),
        )


    def forward(self, x):

        x = self.ln1(x)
        x = x + torch.cat([head(x) for head in self.head], dim=-1)
        x = self.linear(x)
        x = self.dp1(x)

        x = self.ln2(x)
        x = x + self.ff(x)

        return x


class ChatGPT(nn.Module):
    def __init__(self, num_heads, embedding_size):
        super().__init__()

        self.pos = torch.arange(0, block_size, dtype=torch.long)

        self.tok_embedding = nn.Embedding(vocab_size, embedding_size)
        self.pos_embedding = nn.Embedding(block_size, embedding_size)

        self.blocks = nn.ModuleList( [Block(num_heads, embedding_size) for _ in range(num_heads)])

        self.ln = nn.LayerNorm(embedding_size) # final layer norm
        self.linear = nn.Linear(embedding_size, vocab_size)

    def forward(self, x):

        te = self.tok_embedding(x)
        pe = self.pos_embedding(self.pos)
        x = te+pe

        for block in self.blocks:
            x = block(x)
        x = self.ln(x)
        x = self.linear(x)

        return x

c = ChatGPT(2,8)

In [109]:
B = 1
T = 8

x = torch.zeros((B,T), dtype=torch.long)

for i in range(8):
    x[0,i]=i
print(x)

tensor([[0, 1, 2, 3, 4, 5, 6, 7]])


In [112]:
bm = Generator(ChatGPT(2,8))
#print(bm.generate(200))
train(bm, lr=1e-3, iterations=10)
print(bm.generate(200))


In [None]:
x = torch.randn(2, 1, 3)
y = torch.randn(2, 3, 1)

In [None]:
print(x)
print(y)
print(x @ y)


tensor([[[-0.3257, -0.8034, -1.1248]],

        [[ 0.0711,  1.4793,  0.4070]]])
tensor([[[ 0.3168],
         [ 1.1549],
         [-1.8994]],

        [[-0.3102],
         [ 0.0455],
         [-0.2954]]])
tensor([[[ 1.1053]],

        [[-0.0750]]])


In [None]:
x[0] @ y[0]


tensor([[1.1053]])

In [50]:
torch.arange(0, 5, dtype=torch.long)

tensor([0, 1, 2, 3, 4])