In [1]:
import torch
import torch.nn as nn
from  torch.nn import functional as F
# Seed PyTorch's global RNG so batch sampling, weight init and text sampling repeat between runs.
torch.manual_seed(1337)
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [2]:
# Read the entire training corpus into memory as a single string.
with open("input.txt", "r", encoding = "utf-8") as f:
    text=f.read()
# Corpus length in characters.
print(len(text))    

1115394


In [3]:
# Character-level vocabulary: every distinct character of the corpus, in sorted order.
vocab = sorted(set(text))
vocab_size = len(vocab)
print("".join(vocab))


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz


In [4]:
# Encoder / decoder: lookup tables between characters and their index in `vocab`.
ctoi = {ch: idx for idx, ch in enumerate(vocab)}
itoc = dict(enumerate(vocab))


def encode(s):
    """Return the list of token ids for string `s`, one id per character."""
    return [ctoi[ch] for ch in s]


def decode(t):
    """Return the string spelled by the iterable of token ids `t`."""
    return "".join(itoc[idx] for idx in t)


# Round-trip a short sample through both functions.
tokens = encode("hi ho")
s = decode(tokens)
print(tokens, s)

[46, 47, 1, 46, 53] hi ho


In [5]:
# Encode the full corpus; this rebinds `tokens`, which previously held the "hi ho" sample.
tokens = encode(text)
# Show the first 20 ids and the characters they decode to.
print(tokens[:20])
print(decode(tokens[:20]))

[18, 47, 56, 57, 58, 1, 15, 47, 58, 47, 64, 43, 52, 10, 0, 14, 43, 44, 53, 56]
First Citizen:
Befor


In [6]:
# Hold the whole encoded corpus as one tensor of token ids on the selected device.
data = torch.tensor(tokens, dtype=torch.long, device = device)
# First 90% of the tokens are used for training, the remaining 10% for validation.
n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]

In [7]:
def get_batch(data, batch_size = 4, block_size = 8 ):
    """Sample a random batch of (input, target) windows from a 1-D token tensor.

    Args:
        data: 1-D tensor of token ids to sample from.
        batch_size: number of windows in the batch.
        block_size: length of every window.

    Returns:
        A pair ``(x, y)`` of tensors with shape ``(batch_size, block_size)``;
        ``y`` holds the same windows as ``x`` shifted one position to the right,
        i.e. the next token for every position of ``x``.
    """
    # Exclusive upper bound for the window start offsets.
    start_limit = len(data) - block_size - 1
    starts = torch.randint(start_limit, (batch_size,))

    inputs, targets = [], []
    for start in starts:
        stop = start + block_size
        inputs.append(data[start:stop])
        targets.append(data[start + 1:stop + 1])

    return torch.stack(inputs, dim=0), torch.stack(targets, dim=0)

# Draw one batch with the default sizes and print it; each row of y is the matching row of x shifted by one token.
x,y =  get_batch(train_data)    
print(x)
print(y)

tensor([[53, 59,  6,  1, 58, 56, 47, 40],
        [49, 43, 43, 54,  1, 47, 58,  1],
        [13, 52, 45, 43, 50, 53,  8,  0],
        [ 1, 39,  1, 46, 53, 59, 57, 43]], device='cuda:0')
tensor([[59,  6,  1, 58, 56, 47, 40, 59],
        [43, 43, 54,  1, 47, 58,  1, 58],
        [52, 45, 43, 50, 53,  8,  0, 26],
        [39,  1, 46, 53, 59, 57, 43,  0]], device='cuda:0')


In [8]:
def compute_loss(model, dataset, eval_iters=100, batch_size=64):
    """Estimate the mean loss of `model` on random batches drawn from `dataset`.

    The model is put in eval mode and gradients are disabled while the batches
    are evaluated. The model is always left in training mode afterwards, even
    if evaluation raises.

    Args:
        model: module exposing ``get_context_size()`` and returning
            ``(logits, loss)`` when called as ``model(x, y)``.
        dataset: 1-D tensor of token ids passed to ``get_batch``.
        eval_iters: number of random batches to average over (default 100).
        batch_size: number of windows per evaluation batch (default 64).

    Returns:
        The average loss over the evaluated batches as a Python float.
    """
    context_size = model.get_context_size()
    model.eval()
    try:
        with torch.no_grad():
            total = 0
            for _ in range(eval_iters):
                x, y = get_batch(dataset, batch_size, context_size)
                _, loss = model(x, y)
                total += loss
            return float((total / eval_iters).cpu())
    finally:
        # Callers (the training loop) expect the model back in training mode.
        model.train()

def train(model, lr, batch_size, iterations, iter_eval, run_name=""):
    """Train `model` on `train_data` with AdamW and save its weights.

    Prints the train/validation loss before training, and again every
    `iter_eval` iterations (prefixed with the evaluation index).

    Args:
        model: wrapper module exposing ``get_context_size()``, returning
            ``(logits, loss)`` from ``model(x, y)`` and wrapping a network
            available as ``model.model`` with a ``name`` attribute.
        lr: AdamW learning rate.
        batch_size: number of windows per training batch.
        iterations: number of optimisation steps.
        iter_eval: evaluate and print the losses every this many steps.
        run_name: optional suffix for the checkpoint file name.

    Side effects:
        Writes ``./<model.model.name>.pth`` when `run_name` is empty, otherwise
        ``./<model.model.name>_<run_name>.pth``. The un-suffixed name is the
        one `Experiment` tries to load; the previous ``<name>_.pth`` spelling
        (dangling underscore) could never be found by it.
    """
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

    print(compute_loss(model, train_data), compute_loss(model, val_data))

    context_size = model.get_context_size()
    # Make sure dropout is active regardless of the mode the caller left the model in.
    model.train()

    for it in range(iterations):
        x, y = get_batch(train_data, batch_size, context_size)
        _, loss = model(x, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if it % iter_eval == 0:
            print(it//iter_eval, compute_loss(model, train_data), compute_loss(model, val_data))

    # Only append the separator when there is a run name to append.
    suffix = f"_{run_name}" if run_name else ""
    torch.save(model.state_dict(), f"./{model.model.name}{suffix}.pth")

In [9]:
# Module-level cross-entropy criterion with PyTorch's default settings.
loss_fn = nn.CrossEntropyLoss()

class Bigram(nn.Module):
    """Bigram model: an embedding table whose row for a token holds the scores of the next token."""

    def __init__(self):
        super().__init__()
        # Name used by the training loop when building the checkpoint file name.
        self.name = f"bigram_{vocab_size}"
        # One row of `vocab_size` scores per token id.
        self.model = nn.Embedding(vocab_size, vocab_size)

    def forward(self, x):
        """Look up the score row of every token id in `x`."""
        scores = self.model(x)
        return scores

class Generator(nn.Module):
    """Wrap a next-token model with loss computation and sampling.

    This version feeds only the most recent token back into the model, so it
    suits models with a context of one token (see `get_context_size`).
    """

    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, x, y = None):
        """Run the wrapped model and optionally compute the loss.

        Args:
            x: tensor of token ids, shape (batch, time).
            y: optional tensor of target token ids with the same shape as `x`.

        Returns:
            ``(logits, loss)`` where `logits` has shape (batch, time, classes)
            and `loss` is the mean cross-entropy, or None when `y` is None.
        """
        p = self.model(x)
        if y is None:
            return p, None
        # cross_entropy expects the class dimension second: (batch, classes, time).
        # Integer class targets give the same value as one-hot float targets
        # without materialising a (batch, time, classes) tensor.
        loss = F.cross_entropy(p.permute(0, 2, 1), y)
        return p, loss

    def generate(self, count):
        """Sample `count` tokens, starting from token id 0, and decode them.

        Runs in eval mode without gradient tracking and restores the previous
        training mode afterwards.

        Returns:
            The decoded string, including the initial token id 0.
        """
        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                s = torch.zeros((1, 1), dtype=torch.long, device=device)
                out = s
                for _ in range(count):
                    p, _ = self.forward(s)
                    # p[0] has shape (1, classes): scores for the single input token.
                    probs = F.softmax(p[0], dim=1)
                    s = torch.multinomial(probs, 1)
                    out = torch.cat([out, s], dim=1)
                return decode(out[0].tolist())
        finally:
            self.train(was_training)

    def get_context_size(self):
        """Number of tokens the model is trained on per window (always 1 here)."""
        return 1

In [10]:
# Bigram baseline: sample 200 tokens from the untrained model, train it, then sample again.
bm = Generator(Bigram()).to(device)
print(bm.generate(200))
train(bm, lr=1e-3, batch_size=4, iterations=10000, iter_eval=1000)
print(bm.generate(200))


yq$;tfBfROkNdcuwdZZTkOMl;,ertK
w:!PLCkMBbeA$3:XaSGJO-3p&M-c?KL3auhpFYVXJFhNNNuhq$OMxv.tbVFYdXlrFZaAeNuw:cPPyREFkHDEZaYJFzyWNuX
Yo3&$LMtofBimzLB!!&V!Ox;Kl;l;ZcKe3 ixYeYEFngmi;;lxWvHFGEZEQG EsSXHB;kW3 J
4.627649307250977 4.62070894241333
0 4.639685153961182 4.646108150482178
1 4.330250263214111 4.339630126953125
2 4.061417579650879 4.086883544921875
3 3.82150936126709 3.805820941925049
4 3.6014459133148193 3.639233350753784
5 3.446291446685791 3.463571310043335
6 3.2874019145965576 3.3281590938568115
7 3.202650308609009 3.2097079753875732
8 3.050213575363159 3.106464147567749
9 2.992992401123047 3.0160021781921387

BYGENilerjbouselplind me l.
lishe cnchiry:
Uug;Mnisspllw y.O:ur n'SIREDmopetelivIEjMPithy wJd mothakllo W,Coo wh VCeiib3MI'Thom bMxWivDThenghim$Fs p-LK3gAY-xT3b

ALENxmntcrurt f so;;3QQDLETm:
EN,CI ma


In [11]:
class Attention(nn.Module):
    """Single head of causal (masked) scaled dot-product self-attention.

    Args:
        context_size: maximum sequence length; fixes the size of the causal mask.
        input_size: size of the last dimension of the input.
        output_size: size of the key/query/value projections and of the output.
    """

    def __init__(self, context_size, input_size, output_size):
        super().__init__()
        # KQV size
        self.output_size = output_size
        self.key = nn.Linear(input_size, output_size, bias=False)
        self.query = nn.Linear(input_size, output_size, bias=False)
        self.value = nn.Linear(input_size, output_size, bias=False)

        # Additive causal mask: 0 on and below the diagonal, -inf above it, so
        # that position i can only attend to positions <= i.
        mask = torch.triu(torch.full((context_size, context_size), float('-inf')), diagonal=1)
        self.register_buffer("mask", mask)

    def forward(self, x):
        """Apply causal self-attention.

        Args:
            x: tensor of shape (..., time, input_size) with
                ``time <= context_size``.

        Returns:
            Tensor of shape (..., time, output_size).

        Raises:
            ValueError: if the sequence is longer than the mask allows.
        """
        seq_len = x.shape[-2]
        max_len = self.mask.shape[0]
        if seq_len > max_len:
            raise ValueError(
                f"sequence length {seq_len} exceeds the attention context size {max_len}"
            )

        em_key = self.key(x)
        em_query = self.query(x)
        em_value = self.value(x)

        # The attention matrix is (time, time): in effect a weighted adjacency
        # matrix between positions of the sequence.
        att = em_query @ em_key.transpose(-2, -1)
        att = att / self.output_size ** 0.5

        # Use only the part of the mask covering the actual sequence so inputs
        # shorter than the full context are accepted.
        att = att + self.mask[:seq_len, :seq_len]

        att = F.softmax(att, dim=-1)
        return att @ em_value

In [12]:
# Dropout probability used by Block, both after the attention projection and at the end of the feed-forward stack.
dropout=0.2

class Block(nn.Module):
    """Pre-norm transformer block: multi-head attention and a feed-forward net, each with a residual connection.

    Args:
        context_size: sequence length passed to every attention head.
        num_heads: number of attention heads; each head outputs
            ``embedding_size // num_heads`` features.
        embedding_size: feature size of the block's input and output.
    """

    def __init__(self, context_size, num_heads, embedding_size):
        super().__init__()
        head_size = embedding_size // num_heads
        hidden_size = 4 * embedding_size

        # Attention sub-layer: norm -> heads -> output projection -> dropout.
        self.ln1 = nn.LayerNorm(embedding_size)
        heads = [Attention(context_size, embedding_size, head_size) for _ in range(num_heads)]
        self.head = nn.ModuleList(heads)
        self.linear = nn.Linear(embedding_size, embedding_size)
        self.dp1 = nn.Dropout(dropout)

        # Feed-forward sub-layer: norm -> expand 4x -> ReLU -> project back -> dropout.
        self.ln2 = nn.LayerNorm(embedding_size)
        self.ff = nn.Sequential(
            nn.Linear(embedding_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, embedding_size),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        """Return `x` after the attention and feed-forward residual updates."""
        normed = self.ln1(x)
        # Every head sees the same normalised input; their outputs are
        # concatenated along the feature dimension.
        per_head = [attend(normed) for attend in self.head]
        attended = self.linear(torch.cat(per_head, dim=-1))
        x = x + self.dp1(attended)

        x = x + self.ff(self.ln2(x))
        return x


class ChatGPT(nn.Module):
    """Decoder-only transformer over token ids.

    Token and learned position embeddings are summed, passed through a stack
    of `Block`s and a final layer norm, then projected to `vocab_size` scores.

    Args:
        context_size: maximum sequence length (number of position embeddings).
        num_blocks: number of `Block`s in the stack.
        num_heads: attention heads per block.
        embedding_size: feature size used throughout the network.
    """

    def __init__(self, context_size, num_blocks, num_heads, embedding_size):
        super().__init__()
        # Used to build checkpoint file names.
        self.name = f"gpt_{context_size}_{num_blocks}_{num_heads}_{embedding_size}"
        self.context_size = context_size
        # Position indices 0..context_size-1, kept as a buffer so they move with the module.
        pos = torch.arange(0, context_size, dtype=torch.long)
        self.register_buffer("pos", pos)

        self.tok_embedding = nn.Embedding(vocab_size, embedding_size)
        self.pos_embedding = nn.Embedding(context_size, embedding_size)

        self.blocks = nn.Sequential( *[Block(context_size, num_heads, embedding_size) for _ in range(num_blocks)])

        self.ln = nn.LayerNorm(embedding_size) # final layer norm
        self.linear = nn.Linear(embedding_size, vocab_size)

    def forward(self, x):
        """Compute next-token scores for every position.

        Args:
            x: tensor of token ids whose last dimension is time, with
                ``time <= context_size``.

        Returns:
            Tensor of shape ``x.shape + (vocab_size,)``.

        Raises:
            ValueError: if the sequence is longer than `context_size`.
        """
        seq_len = x.shape[-1]
        if seq_len > self.context_size:
            raise ValueError(
                f"sequence length {seq_len} exceeds the model context size {self.context_size}"
            )

        te = self.tok_embedding(x)
        # Only take as many positions as there are tokens. Adding all
        # `context_size` position embeddings would silently broadcast a
        # length-1 input up to the full context length.
        # NOTE(review): inputs shorter than context_size additionally require
        # the Block stack to accept them.
        pe = self.pos_embedding(self.pos[:seq_len])
        x = te + pe

        x = self.blocks(x)
        x = self.ln(x)

        x = self.linear(x)

        return x

In [13]:
class Generator(nn.Module):
    """Wrap a next-token model with loss computation and prompt-based sampling.

    The wrapped model must expose a ``context_size`` attribute.
    """

    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, x, y = None):
        """Run the wrapped model and optionally compute the loss.

        Args:
            x: tensor of token ids, shape (batch, time).
            y: optional tensor of target token ids with the same shape as `x`.

        Returns:
            ``(logits, loss)`` where `logits` has shape (batch, time, classes)
            and `loss` is the mean cross-entropy, or None when `y` is None.
        """
        p = self.model(x)
        if y is None:
            return p, None
        # cross_entropy expects the class dimension second: (batch, classes, time).
        # Integer class targets give the same value as one-hot float targets
        # without materialising a (batch, time, classes) tensor.
        loss = F.cross_entropy(p.permute(0, 2, 1), y)
        return p, loss

    def generate(self, count, str=" "):
        """Sample `count` tokens that continue the prompt `str`.

        The prompt is left-padded with token id 0 up to the model's context
        size, and the model is always fed the most recent `context_size`
        tokens. Runs in eval mode without gradient tracking and restores the
        previous training mode afterwards.

        Args:
            count: number of tokens to sample.
            str: prompt text; may be empty or longer than the context.

        Returns:
            The prompt followed by the sampled text (padding removed).
        """
        context_size = self.model.context_size
        prompt_ids = encode(str)
        # Amount of left padding needed to fill the context; none for long prompts.
        pad_len = max(context_size - len(prompt_ids), 0)

        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                padding = torch.zeros((1, pad_len), dtype=torch.long, device=device)
                prompt = torch.tensor([prompt_ids], dtype=torch.long, device=device)
                out = torch.cat([padding, prompt], dim=1)

                for _ in range(count):
                    p, _ = self.forward(out[:, -context_size:])
                    # Only the scores of the last position predict the next token.
                    probs = F.softmax(p[0, -1], dim=-1)
                    next_token = torch.multinomial(probs, 1)
                    out = torch.cat([out, next_token.unsqueeze(0)], dim=1)

                return decode(out[0, pad_len:].tolist())
        finally:
            # Previously `self.train()` sat after the return and never ran.
            self.train(was_training)

    def get_context_size(self):
        """Window length the wrapped model is trained on."""
        return self.model.context_size

In [14]:
def Experiment(context_size = 8, num_blocks = 4, num_heads = 8, embedding_size = 64):
    """Build a `ChatGPT` model wrapped in a `Generator`, resuming from a checkpoint when possible.

    Prints the configuration and the checkpoint file name, then tries to load
    ``<model name>.pth`` from the working directory. Loading is best-effort: a
    missing file is expected on the first run, and any other failure is
    reported but does not stop the notebook, so the model then starts from
    its random initialisation.

    Args:
        context_size: sequence length of the model.
        num_blocks: number of transformer blocks.
        num_heads: attention heads per block.
        embedding_size: feature size of the model.

    Returns:
        The `Generator` wrapping the model, moved to `device`.
    """
    print("configuration", context_size, num_blocks, num_heads, embedding_size)
    gen = ChatGPT(context_size, num_blocks, num_heads, embedding_size)
    cg = Generator(gen).to(device)

    checkpoint = gen.name+".pth"
    print(checkpoint)
    try:
        # NOTE: torch.load unpickles the file; only load checkpoints you created yourself.
        # map_location lets a checkpoint saved on a GPU be loaded on a CPU-only machine.
        cg.load_state_dict(torch.load(checkpoint, map_location=device))
    except FileNotFoundError:
        # No checkpoint yet: keep the freshly initialised weights.
        pass
    except Exception as err:
        # Stay best-effort, but do not hide why the checkpoint was ignored.
        print(f"could not load {checkpoint}: {err!r}; using freshly initialised weights")

    return cg


In [15]:
# Small configuration: 8-token context, 4 blocks, 8 heads, 64-dimensional embeddings.
e = Experiment(context_size = 8, num_blocks = 4, num_heads = 8, embedding_size = 8*8)
# Sample 500 tokens before and after training to compare the generated text.
print(e.generate(500))
train(e, lr=1e-4, batch_size=64, iterations=2000, iter_eval=100 )
print(e.generate(500))


configuration 8 4 8 64
gpt_8_4_8_64.pth
 UUJ$rA!xpD:: x;fDbRaxnxeWGsdGQ3qf3alANY!jtdogKW?':-cj$QN.Sia!nlkCn$x&OqCCxDNsa33 sPPu:KyYTg!D$UQ3ayF;:eDxqQa3x !Ed' an?McW$NfZF,xaKc$3cN&S'MYJ&f-QAc&Y$wxsUX$sf- IR.?'Bp$DUx3&snfcYl$-e
qN3an$:bm tfrxJakN.OEYt3-?YXNeOqxowgpffQ&xcnva$bYk,mo-hh.JYDKnxxhkrNx',Ts3MY;KL$a!&-d j'L f?xYXcPadTT$xGxfmUXfk'ZjRxOagfoaqq!UH$f!QWIJ$xxkNroBNzYNWgYysQaefxLPfhfy,$eKVP$:ulczdfjxBXKEz$Dc$xfp cTv:;!!PWYMeDT-cYTfWlvkr,ckxx3.N
VObWWfZx,NJmafcxXeaNul ;$Ha?YW:fbgdg'?soQ:-fPxMVflz3FfcBqyf-h!NfHfZ-fg3NUdV:
GnxXd$ ;f&IzDkd
4.343344211578369 4.341740131378174
0 4.32589864730835 4.31722354888916
1 3.397289276123047 3.4277865886688232
2 3.1568245887756348 3.1844165325164795
3 2.9406394958496094 2.953723907470703
4 2.7771823406219482 2.796318292617798
5 2.6856842041015625 2.6903133392333984
6 2.621943712234497 2.6182382106781006
7 2.581357002258301 2.5816054344177246
8 2.5337867736816406 2.53439998626709
9 2.496969699859619 2.5030245780944824
10 2.463001012802124 2.48658084869

In [17]:
# Larger configuration: 256-token context, 6 blocks, 6 heads, 384-dimensional embeddings.
e2 = Experiment(context_size = 256, num_blocks = 6, num_heads = 6, embedding_size = 6*64)
# Train for 3000 steps, printing the losses every 50 steps, then sample 1500 tokens.
train(e2, lr=1e-4, batch_size=64, iterations=3000, iter_eval=50 )
print(e2.generate(1500))

configuration 256 6 6 384


gpt_256_6_6_384.pth
4.3363566398620605 4.341015338897705
0 3.981755018234253 3.9955132007598877
1 2.6786558628082275 2.700760841369629
2 2.5303521156311035 2.542194366455078
3 2.4951159954071045 2.506218910217285
4 2.474208116531372 2.488203287124634
5 2.4575226306915283 2.475525140762329
6 2.441577911376953 2.459533452987671
7 2.423408031463623 2.4427058696746826
8 2.4047064781188965 2.427353620529175
9 2.3934988975524902 2.4105567932128906
10 2.35853910446167 2.377129077911377
11 2.3211984634399414 2.347334146499634
12 2.280453681945801 2.3132970333099365
13 2.2331511974334717 2.2614188194274902
14 2.178341865539551 2.2184650897979736
15 2.136789560317993 2.1865761280059814
16 2.0976104736328125 2.14700984954834
17 2.058650016784668 2.1214194297790527
18 2.029344320297241 2.0925214290618896
19 1.9949886798858643 2.0658798217773438
20 1.9657407999038696 2.050616979598999
21 1.9388052225112915 2.0284903049468994
22 1.913833737373352 2.0178563594818115
23 1.8914642333984375 2.0059235095

In [None]:
# Overfitting test: same architecture as e2, trained for 15000 steps.
# run_name is included in the file name of the checkpoint written by train().
e3 = Experiment(context_size = 256, num_blocks = 6, num_heads = 6, embedding_size = 6*64)
train(e3, lr=1e-4, batch_size=64, iterations=15000, iter_eval=100, run_name = "overfitting" )
print(e3.generate(500))