In [14]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt

# Hyperparameters shared by the cells below
LR = 1e-2       # presumably the learning rate; no visible cell reads it yet
n_embd = 100    # width of each row of the embedding table C
n_embd2 = 50    # width of the layer that feeds the output projection Why
block_size = 8  # context length: previous characters per training example

%matplotlib inline
# Dedicated random generator with a fixed seed, for reproducibility
g = torch.Generator()
g.manual_seed(2147483647)

# Last expression of the cell: shows the installed torch version
torch.__version__

'2.1.1+cu121'

In [1]:
import random
# Read the raw names, trimming surrounding whitespace and dropping blank lines.
# Read-only mode is sufficient here: the cleaned list is written back in a
# separate step below.
with open("names.txt", "r") as f:
	words = [word.strip() for word in f.read().splitlines()] # strip leading/trailing whitespace
	names = [w for w in words if w] # get rid of any empty strings

# Persist the cleaned list so the file on disk matches what the notebook uses.
# NOTE: this overwrites names.txt in place every time the cell runs.
with open("names.txt", "w") as f:
	joined = "\n".join(names)
	f.write(joined)

# Name-length statistics, both measured from the data (raises ValueError if
# names.txt contained no non-blank lines).
min_chars = min(len(v) for v in names)
max_chars = max(len(v) for v in names)

# '.' stands in for both the start and the end token: every name ends with a
# period, and there is no separate start token. It must sit at index 0 because
# build_dset pads the initial context with 0, so a '.' that already occurs in
# the data is removed from the set before the token is prepended.
chars = ['.'] + sorted(set("".join(names)) - {'.'})
vocab_size = len(chars)
print("names: ", names[:5])
print("number of names: ", len(names))
print("(list of chars, count): ", ("".join(chars), vocab_size))
print("(max word length, min word length): ", (max_chars, min_chars))

# character <-> integer id lookup tables
atoi = {ch:i for i,ch in enumerate(chars)}
itoa = {i:ch for i,ch in enumerate(chars)}

# adding end token to each name; from here on `names` holds lists of characters
names = [list(name) + ['.'] for name in names]

names:  ['nain', 'augustine', 'lanay', 'kalany', 'marijose']
number of names:  32033
(list of chars, count):  ('.abcdefghijklmnopqrstuvwxyz', 27)
(max word length, min word length):  (15, 1)


In [4]:
class Embedding:
    """Lookup table that maps integer ids to rows of a random weight matrix."""

    def __init__(self, num_embeddings, embedding_dim):
        # One row per id, every entry drawn from a standard normal.
        table_shape = (num_embeddings, embedding_dim)
        self.weight = torch.randn(table_shape)

    def __call__(self, x):
        """Index ``weight`` with ``x`` and return the selected rows."""
        rows = self.weight[x]
        return rows

    def parameters(self):
        """Return the tensors owned by this layer (just the weight table)."""
        owned = [self.weight]
        return owned

class Linear:
    """Affine layer computing ``input @ gain`` plus an optional ``bias``.

    Both tensors are initialised from a standard normal; ``bias`` is ``None``
    when the layer is built with ``bias=False``.
    """

    def __init__(self, in_features, out_features, bias=True, dtype=None):
        self.gain = torch.randn((in_features, out_features), dtype=dtype)
        # The bias is drawn after the gain, so the order of random draws is fixed.
        if bias:
            self.bias = torch.randn(out_features, dtype=dtype)
        else:
            self.bias = None

    def __call__(self, input: torch.Tensor):
        """Apply the layer to ``input`` and return the result."""
        projected = input @ self.gain
        if self.bias is None:
            return projected
        # Added in place onto the freshly created matmul result.
        projected += self.bias
        return projected

    def parameters(self):
        """Return ``[gain]`` or ``[gain, bias]`` depending on whether a bias exists."""
        tensors = [self.gain]
        if self.bias is not None:
            tensors.append(self.bias)
        return tensors

class Tanh:
    """Elementwise tanh activation that keeps its latest output on ``self.out``."""

    def __call__(self, x):
        """Apply tanh to ``x``, store the result on ``self.out`` and return it."""
        activated = torch.tanh(x)
        self.out = activated
        return activated

    def parameters(self):
        """This layer owns no tensors, so the list is always empty."""
        return list()

In [35]:
# model parameters
# Every random tensor draws from the seeded generator `g`, so a fresh
# Restart & Run All reproduces the same initial weights.
C = torch.randn(vocab_size, n_embd, generator=g) # embedding table, one row per character
Wxh = torch.randn(n_embd, n_embd + n_embd2, generator=g) # input to hidden
Whh = torch.randn(n_embd + n_embd2, n_embd2, generator=g) # hidden to hidden
Why = torch.randn(n_embd2, vocab_size, generator=g) # hidden to output
bh = torch.zeros(n_embd + n_embd2,) # bias for hidden layer
by = torch.zeros(vocab_size,) # bias for output layer

# hidden layer RNN states
# NOTE(review): `states` is (1, n_embd) and Whh is not square, so as declared
# the hidden state cannot be multiplied by Whh and fed back unchanged —
# confirm the intended hidden size before wiring up the recurrence.
states = torch.zeros((1, n_embd))

# Trainable tensors. The list previously named W1, b1, W2, W3, b3, which are
# not defined anywhere in this notebook (NameError on a fresh kernel); their
# positions correspond to Wxh, bh, Whh, Why, by.
params = [C, Wxh, bh, Whh, Why, by]
for p in params:
	p.requires_grad = True


In [9]:
# build_dset slides a fixed-size window over every name to create
# (context, next character) training pairs.
def build_dset(dset, ctxt_len):
    """Turn sequences of characters into context/target index tensors.

    Each sequence in ``dset`` starts from a context of ``ctxt_len`` zeros.
    For every character, the current context is recorded as an input and the
    character's ``atoi`` index as its target; the context then drops its
    oldest entry and takes the new index on the right.

    Returns ``(X, Y)`` built with ``torch.tensor`` from the collected
    contexts and targets.
    """
    inputs, targets = [], []
    for sequence in dset:
        window = [0] * ctxt_len
        for symbol in sequence:
            token = atoi[symbol]
            inputs.append(window)
            targets.append(token)
            # A new list is built each step, so stored windows are never mutated.
            window = [*window[1:], token]

    return torch.tensor(inputs), torch.tensor(targets)
# Split boundaries at 80% and 90% of the names, taken in their current order
n1, n2 = (int(frac * len(names)) for frac in (0.8, 0.9))

# train / validation / test tensors, built in that order
(X_train, Y_train), (X_val, Y_val), (X_test, Y_test) = (
    build_dset(part, block_size)
    for part in (names[:n1], names[n1:n2], names[n2:])
)

# Preview the first five training pairs as "context => next character"
for c, d in zip(X_train[:5], Y_train[:5]):
    print(''.join(itoa[i] for i in c.tolist()), "=>", itoa[int(d)])



........ => n
.......n => a
......na => i
.....nai => n
....nain => .


In [64]:
def rnn(x, y, hidden):
    """Unfinished stub for an RNN step.

    NOTE(review): not usable as written. ``torch.tanh()`` is called without
    an input, so calling this function is expected to raise; ``x`` and ``y``
    are never read, and there is no ``return``, so the new ``hidden`` would
    not reach the caller. The intended recurrence still has to be written.
    """
    hidden = torch.tanh()
    
# Scratch forward pass on the first 1000 training examples.
# NOTE(review): this cell does not run on a fresh kernel. W1, b1, xh and
# states_0 are not defined in any visible cell, and `hidden` is read on the
# right-hand side below before any visible assignment. W1 / b1 are presumably
# the earlier names of Wxh / bh — confirm before renaming.
X= X_train[:1000]
Y = Y_train[:1000]
emb = C[X]  # look up one embedding row per context position
l1 = emb @ W1
l1 += b1  # in-place add onto the matmul result
# NOTE(review): `hidden.pad(0)` — verify this is the intended call; padding in
# torch is normally done through torch.nn.functional.pad.
hidden = torch.tanh(l1 + Whh @ hidden.pad(0) + bh)
xh.shape
# Earlier feed-forward continuation, kept commented out:
# l2 = l1 @ W2
# l3 = l2 @ W3
# l3 += b3
# logits = F.softmax(l3, dim=-1)
# Last expression of the cell; the recorded output lists three shapes while
# this tuple has two, so that output likely comes from an older version.
states_0.shape, l1.shape
# logits.sum(axis=2)

(torch.Size([1, 100]), torch.Size([1000, 8, 150]), torch.Size([1000, 150]))

In [105]:
# Demo input: one word, encoded as one one-hot float row per character
word = "hello"
print("word: ", word)
ixes = F.one_hot(
    torch.tensor(list(map(atoi.__getitem__, word))),
    num_classes=vocab_size,
).float()


word:  hello


data has 228145 characters, 27 unique.
----
 hv
crwrmrkmzxpgcsoyqhaopdvmwmptvgrctffrtrzospfmzyvrwsdjhukttflqulqzyxzqqnzfxsspjizjbanjoftgtkzqp
tzizndwakopzgteezeywbvrzxslnozz
kxbcvtplsdqqzuswv
rurvrxkk
efmdhlueijyeobycilj
prpnmlerijxxsybpflhz
brr 
----
iter 0, loss: 82.395928
----
 
gf
nz
a
dsaiknacl
aua
aiknk
rianwua
koa
ammeg
vianbnacu
ahrnwirpa
alaiaelij
anagl
lsamfiqia
ldmwanayanfilvhekjluaskne
acayaye
tjkraianr
piaiqiang
lme
aiailnazmlaya
rvauailyanpezhaia
ara
a
aql
lesiaif 
----
iter 100, loss: 83.431205
----
 
eiaiahehrray
bhnnbntmt
neouieh
erbmmed
nskkbhkliklnaslnudoera
mh
aaitashel
ira
eunii
erisi
eau
jnluohe
jxinahlnbn
vi
ie
ebw
ai
qyosaebohr
fhgos
srakaj
quhioap
irkerny
nay
haln

kllrdaird
oeiilnbelabt 
----
iter 200, loss: 82.803108
----
 a
yml
anrramklan

arb
sha
ardrny
anrau
riaieyanmaya
ora
alzylmar
en
lmliah
hteamar
mk
daeet
andrnma
ame
argy
ljla

yzrlmame
ala
iae

lgynulclmi
eeonamcvorle
vkrakghanayeinjaalraginglrhuramaid
wvbrarah 
----
iter 300, loss: 81.915724


KeyboardInterrupt: 