In [1]:
import torch
import torch.nn as nn
from  torch.nn import functional as F
torch.manual_seed(1337)

<torch._C.Generator at 0x7f3a1817b050>

In [2]:
with open("input.txt", "r", encoding = "utf-8") as f:
    text=f.read()
print(len(text))    

1115394


In [3]:
vocab = list(set(text))
vocab.sort()
print("".join(vocab))
vocab_size = len(vocab)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz


In [4]:
#encoder
ctoi = { vocab[i]:i for i in range(len(vocab))}
itoc = { i:vocab[i] for i in range(len(vocab))}
def encode(s): return [ ctoi[i] for i in s]
def decode(t): return "".join([ itoc[i] for i in t])

tokens = encode("hi ho")
s = decode(tokens)
print(tokens, s)

[46, 47, 1, 46, 53] hi ho


In [5]:
tokens = encode(text)
print(tokens[:20])
print(decode(tokens[:20]))

[18, 47, 56, 57, 58, 1, 15, 47, 58, 47, 64, 43, 52, 10, 0, 14, 43, 44, 53, 56]
First Citizen:
Befor


In [6]:
data = torch.tensor(tokens, dtype=torch.long)
n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]

In [41]:
batch_size = 4
block_size = 8

def get_batch(data):
    indices = torch.randint(len(data)-block_size-1, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in indices], dim=0)
    y = torch.stack([data[i+1:i+block_size+1] for i in indices], dim=0)
    return x,y   

x,y =  get_batch(train_data)    
print(x)
print(y)

tensor([[43,  1, 49, 47, 52, 45,  1, 56],
        [50, 53, 57, 43,  1, 58, 46, 43],
        [59,  1, 39, 56, 58,  1, 61, 53],
        [56, 57, 43,  1, 46, 39, 60, 43]])
tensor([[ 1, 49, 47, 52, 45,  1, 56, 43],
        [53, 57, 43,  1, 58, 46, 43,  1],
        [ 1, 39, 56, 58,  1, 61, 53, 51],
        [57, 43,  1, 46, 39, 60, 43,  1]])


In [38]:
def train(model, lr, iterations):
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

    for j in range(iterations):
        for i in range(1000):   
            x,y =  get_batch(train_data)    
            _, loss = model(x,y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()    
        print(loss.item())

    torch.save(model.state_dict(), "./mymodel.pth")    

In [89]:
loss_fn = nn.CrossEntropyLoss()

class Bigram(nn.Module):
    def __init__(self):
        super().__init__()
        self.model = nn.Embedding(vocab_size, vocab_size)
    def forward(self, x):
        return self.model(x)

class Generator(nn.Module):
    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, x, y = None):
        #lx = F.one_hot(x.reshape(-1), vocab_size).type(torch.float32)
        #print(lx.dtype, lx.shape)
        p = self.model(x)
        if y!=None:
            ly = F.one_hot(y, vocab_size).type(torch.float32)
            loss = loss_fn(p.permute(0,2,1), ly.permute(0,2,1))
        else:
            loss = None
        return p, loss
    
    def generate(self, count):
        s = torch.zeros((1,1), dtype=torch.long)
        out = s
        for i in range(count):
            #print(s)
            p, _ = self.forward(s)
            probs = F.softmax(p[0], dim=1)
            s = torch.multinomial(probs,1)
            #print("sample", ss)
            out = torch.cat([out, s], dim=1)

        return decode(out[0].tolist())

In [91]:

bm = Generator(Bigram())
print(bm.generate(200))
train(bm, lr=1e-3, iterations=10)
print(bm.generate(200))


3cnIFf.,u?e' cvOu?!&YXdDbn?&fffWtxvef,;bn'r$xWpnbUazN!YSaq3vTipGhJkTWR:G3!harUEe'bnzSENC;UjTF
:'dvgXXKZElpJPsDbkXdv
3ZAAgrAg$$jgOdN:i:HqOvZhFFNNLm3aQiuMrUi.xDY!T E;gnl, iOf'$;OwqXmQJdVcbJ,m'.rIocFP.;S
3.894418954849243
3.5484983921051025
3.3535237312316895
3.000521659851074
2.7436137199401855
2.645657777786255
2.6030640602111816
2.6691739559173584
2.526980400085449
2.479642868041992

MyQzhe!, do t ar MbedK:
HEShind r.f ts ckioca lin sause.
DYremawock
AUn as RGvinengongis
TO:
Lnd imyo,
JIn d wige me fr'ss grm.
WshOZ$CKAngahe Gwim, ayarr ard e cdend mngronsFlad thin t ft:

F-s.

O, 


In [92]:
class Attention(nn.Module):
    def __init__(self, input_size, output_size):
        super().__init__()
        # KQV size
        self.output_size = output_size
        self.key = nn.Linear(input_size, output_size, bias=False)
        self.query = nn.Linear(input_size, output_size, bias=False)
        self.value = nn.Linear(input_size, output_size, bias=False)

    def forward(self, x):
        em_key = self.key(x)
        em_query = self.query(x)
        em_value = self.value(x)

        # the attentions matrix must be the size of the context
        # as it is in reality an adjacency matrix
        att = em_key @ em_query.transpose(-2,-1)

        #print (att.shape)

        att /= self.output_size ** 0.5

        sz = att.shape[-1]
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        att += mask

        att = F.softmax(att, dim=1)
        return att @ em_value 



In [120]:
dropout=0.0

class Block(nn.Module):
    def __init__(self, num_heads, embedding_size):
        super().__init__()

        self.ln1 = nn.LayerNorm(embedding_size)

        attention_size = embedding_size//num_heads
        self.head = nn.ModuleList( [Attention(embedding_size, attention_size) for _ in range(num_heads)])

        self.linear = nn.Linear(embedding_size, embedding_size)
        self.dp1 = nn.Dropout(dropout)
        self.ln2 = nn.LayerNorm(embedding_size)

        self.ff = nn.Sequential(
            nn.Linear(embedding_size, 4 * embedding_size),
            nn.ReLU(),
            nn.Linear(4 * embedding_size, embedding_size),
            nn.Dropout(dropout),
        )


    def forward(self, x):

        x = self.ln1(x)
        x = x + torch.cat([head(x) for head in self.head], dim=-1)
        x = self.linear(x)
        x = self.dp1(x)

        x = self.ln2(x)
        x = x + self.ff(x)

        return x


class ChatGPT(nn.Module):
    def __init__(self, num_blocks, num_heads, embedding_size):
        super().__init__()

        self.pos = torch.arange(0, block_size, dtype=torch.long)

        self.tok_embedding = nn.Embedding(vocab_size, embedding_size)
        self.pos_embedding = nn.Embedding(block_size, embedding_size)

        self.blocks = nn.ModuleList( [Block(num_heads, embedding_size) for _ in range(num_blocks)])

        self.ln = nn.LayerNorm(embedding_size) # final layer norm
        self.linear = nn.Linear(embedding_size, vocab_size)

    def forward(self, x):

        te = self.tok_embedding(x)
        pe = self.pos_embedding(self.pos)
        x = te+pe

        for block in self.blocks:
            x = block(x)
        x = self.ln(x)
        x = self.linear(x)

        return x

c = ChatGPT(4,8,8*4)

In [121]:
class Generator(nn.Module):
    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, x, y = None):
        #lx = F.one_hot(x.reshape(-1), vocab_size).type(torch.float32)
        #print(lx.dtype, lx.shape)
        p = self.model(x)
        if y!=None:
            ly = F.one_hot(y, vocab_size).type(torch.float32)
            loss = loss_fn(p.permute(0,2,1), ly.permute(0,2,1))
        else:
            loss = None
        return p, loss
    
    def generate(self, count):
        s = torch.zeros((1,8), dtype=torch.long)
        out = s
        for i in range(count):
            #print(s)
            p, _ = self.forward(out[:,-8:])
            probs = F.softmax(p[0], dim=1)
            s = torch.multinomial(probs,1)
            #print("sample", ss)
            out = torch.cat([out, s[-1].unsqueeze(1)], dim=1)

        return decode(out[0].tolist())

In [134]:
#cg = Generator(ChatGPT(4,8,64))
train(cg, lr=1e-4, iterations=1)



0.42737531661987305


In [138]:

s = torch.zeros((1,8), dtype=torch.long)
out = s
for i in range(2000):
    #print(s)
    p, _ = cg(out[:,-8:])
    probs = F.softmax(p[0], dim=1)
    s = torch.multinomial(probs,1)
    #print("sample", ss)
    out = torch.cat([out, s[-1].unsqueeze(1)], dim=1)

print( decode(out[0].tolist()))










Stwhalite of is pe'dsend;
Aly enppevot nor acunkbecospis nosthe fey nicn alos met ave; oen. mese fareme:szeg,
It,
Thit do It entsent,
Sand!
'd ceres'dPWif aness of an!be oftse-taly geor astenuglhU:reves not sony tiv: la?
Heprad elsteat te.

I'rdpok isg incadetse!s?
Seger corangatcs lohate
CIbpay bogod to tae,'s fosent,
Fingsegporit at,n nce anfanginn,
Tigt whonge ant inete int lakcannd,
Go rorosengo uren Hencest enocer;
Ram to to dsal' seWitecncanontte, imes,
Mical,
I est' the at, istiate the sind itest ian.

Drise cadant fory,
dt to meise and ant, aveing!
Nongeasssceb oro fat odrseen ok coonente, ind asenlord minnsas sonecn faveantelea yo ancond ancu .

Yat oren hale, cand to my sat-O
Itsetls ripowears;
ewen! jut aveelt nor morat aint:
Yoseensd
eonuhinggelsf, pavessimek,, ke ached aenleent otl-ed
Had
And a manc,
Youdtinsang, esson ske are lave.

G ihat at moynst avhthsingefced sanke youse nat and tend, aso an no sago steis a faverche!
OUNr incakdeng larsinn athere, chagre ku p