In [13]:
import torch
import torch.nn as nn
from torch.nn import functional as F

In [1]:
import os
print(os.getcwd())

c:\source-code\matrix-gpt


In [29]:
# read it in to inspect it
with open(r'text\tinymatrix_clean.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [3]:
print("len: ", len(text))

len:  147064


In [11]:
# let's look at the first 1000 characters
print(text[0:1000])

(Cellular)
Cypher: Yeah.

Trinity: Is everything in place?

Cypher: You weren't supposed to relieve me.

Trinity: I know, but I felt like taking your shift.

Cypher: You like him, don't you? You like watching him.

Trinity: Don't be ridiculous.

Cypher: We're going to kill him, do you understand that?

Trinity: Morpheus believes he is The One.

Cypher: Do you?

Trinity: It doesn't matter what I believe.

Cypher: You don't, do you?

Trinity: Did you hear that?

Cypher: Hear what?

Trinity: Are you sure this line is clean?

Cypher: Yeah, 'course I'm sure.

Trinity: I better go.

(Hotel room)
Cop: Freeze, Police. Hands on your head. Do it. Do it now.

(Street)
Agent Smith: Lieutenant...

Lieutenant: Oh shit.

Agent Smith: Lieutenant, you were given specific orders.

Lieutenant: Hey, I'm just doing my job. You give me that juris-my-dick-tion crap, you can cram it up your ass.

Agent Smith: The orders were for your protection.

Lieutenant: I think we can handle one little girl.... I sent tw

In [30]:
# here are all the unique characters that occur in this text
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)

# vocabs for tinyshakespeare:             !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
# vocab for tinymatrix (before cleaning): !#%()*,-./0123456789:;?ABCDEFGHIJKLMNOPQRSTUVWYZ[]`abcdefghijklmnopqrstuvwxyz{}âèé–‘’“”…
# vocab for tinymatrix (after cleaning):  !#%()*,-./0123456789:;?ABCDEFGHIJKLMNOPQRSTUVWYZ[]`abcdefghijklmnopqrstuvwxyz{}âèé–


 !#%'()*,-./0123456789:;?ABCDEFGHIJKLMNOPQRSTUVWYZ[]abcdefghijklmnopqrstuvwxyz{}âèé–
85


In [19]:
# create a mapping from characters to integers
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

word_to_encode = "hello"
# word_to_encode = " !#%()*,-./0123456789:;?ABCDEFGHIJKLMNOPQRSTUVWYZ[]`abcdefghijklmnopqrstuvwxyz{}âèé–‘’“”…"
print(f"encoding (stoi) and decoding (itos) for {word_to_encode}:")
print(f"         {word_to_encode} --> \n    ", encode(word_to_encode), end="\n    -->")
print(" ", decode(encode(word_to_encode)))

encoding (stoi) and decoding (itos) for hello:
         hello --> 
     [61, 58, 65, 65, 68]
    -->  hello


In [26]:
a = decode(encode(text[0:500]))

with open('test.txt', 'w') as file:
    file.write(a)

In [22]:
# encode into a torch.Tensor
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:50]) # the first 50 characters after encoding

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56])


In [23]:
n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]

In [25]:
batch_size = 4
block_size = 8

def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    # generate randint from 0 to the size of the data with output size (8,)
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y

In [26]:
encode("hello wo")

[46, 43, 50, 50, 53, 1, 61, 53]

In [27]:
torch.manual_seed(1337)
xb, yb = get_batch('train')
print('inputs:')
print(xb.shape)
print(xb)
print('targets:')
print(yb.shape)
print(yb)

inputs:
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
targets:
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])


In [28]:
class BigramLanguageModel(nn.Module):

    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        logits = self.token_embedding_table(idx) # (B, T, C)
        # print(logits.shape)

        if targets==None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T,C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            logits, loss = self(idx)
            
            logits = logits[:, -1]
            # print(logits)
            probs = F.softmax(logits, dim=-1)
            # print(probs)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
            # print('--')
        return idx

In [29]:
torch.manual_seed(1337)
m = BigramLanguageModel(vocab_size)
# logits, loss = m(xb, yb)
# print(logits)
# print(loss)
print(decode(m.generate(torch.zeros((1,1), dtype=torch.long), max_new_tokens=100)[0].tolist()))


Sr?qP-QWktXoL&jLDJgOLVz'RIoDqHdhsV&vLLxatjscMpwLERSPyao.qfzs$Ys$zF-w,;eEkzxjgCKFChs!iWW.ObzDnxA Ms$3


In [30]:
optimizer = torch.optim.AdamW(m.parameters(), lr = 1e-3)

In [31]:
batch_size = 32
for steps in range(5000):

    xb, yb = get_batch('train')

    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

    print(loss.item())

4.704006195068359
4.721118927001953
4.653193473815918
4.706261157989502
4.780904293060303
4.751267910003662
4.8395490646362305
4.667973041534424
4.743716716766357
4.774043083190918
4.6908278465271
4.789143085479736
4.61777925491333
4.650947093963623
4.886447429656982
4.703796863555908
4.757591724395752
4.65510892868042
4.709283828735352
4.6745147705078125
4.760501384735107
4.7892632484436035
4.653748512268066
4.6619181632995605
4.673007488250732
4.66577672958374
4.7301106452941895
4.755304336547852
4.712186813354492
4.745501518249512
4.726755619049072
4.735108375549316
4.777461051940918
4.643350601196289
4.6651835441589355
4.79764461517334
4.717412948608398
4.683647155761719
4.81886100769043
4.613771915435791
4.573785781860352
4.560741901397705
4.81563138961792
4.6061553955078125
4.619696140289307
4.725419521331787
4.650487899780273
4.5941481590271
4.7202863693237305
4.699342250823975
4.6724138259887695
4.727972984313965
4.66152286529541
4.616766929626465
4.599857807159424
4.6533403396

In [32]:
print(decode(m.generate(torch.zeros((1,1), dtype=torch.long), max_new_tokens=500)[0].tolist()))


Mavanghtang d wsramangT:
D: nd TINT:
Fof;AURKIfediTus, be:ghere
Dyo.O'd, t fed inks cerVis benes. owepwnof pre, athar wowir W:
ALI u
Bue;bOgfatho rend thic; is be wara!
PO.
WGListh R.
JO:
TERI! ce, ince:'
t.
Sin honk$zlerseestindovrer wat boue nodgh flle, mOM:

SCKpoto AKIFwinthind d me sesete gkerw'DWhe arDERe

HET:
ROLINanxueile to he bis wllagGomieg.
DWhelat s tropghallll,
Wh trnenQJMmllel $fupat iBOFokKENor-mu ed de atos are th, a!Oun my s l:
E-de pond
TII'henJAllong bogave tthe s me : t
Yo-
