<a href="https://colab.research.google.com/github/abduljunaid02/LLMs/blob/main/Makemore_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt # for making figures
%matplotlib inline

In [4]:
# Load the training names, one name per line.
# The context manager closes the file as soon as it has been read instead of
# leaving the handle open for the garbage collector.
with open('names.txt', 'r') as f:
  words = f.read().splitlines()
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [5]:


# Vocabulary: every distinct character that appears in the names, sorted.
chars = sorted(set(''.join(words)))
# Character -> integer index, numbering the characters from 1 upward ...
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
# ... and reserving index 0 for the '.' token appended to each word.
stoi['.'] = 0
# Inverse mapping (integer index -> character) for decoding.
itos = {idx: ch for ch, idx in stoi.items()}
print(itos)


{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}


In [48]:
# Build the training set: each example pairs a context of `block_size`
# character indices (X) with the index of the character that follows it (Y).

block_size = 3 # context length: number of preceding characters used to predict the next one
X, Y = [], []
for w in words:
  # every word starts from a context filled with index 0 (the '.' token)
  context = [0] * block_size
  for ch in w + '.':
    ix = stoi[ch]
    X.append(context)
    Y.append(ix)
    # slide the window: drop the oldest index, append the newest
    context = [*context[1:], ix]

# convert the accumulated Python lists into tensors
X = torch.tensor(X)
Y = torch.tensor(Y)

In [49]:
  # Sanity check: shapes and dtypes of the context tensor X and the label tensor Y.
  X.shape, Y.shape, X.dtype, Y.dtype

(torch.Size([228146, 3]), torch.Size([228146]), torch.int64, torch.int64)

In [8]:
# C is the embedding lookup table: one row per character (27 rows), each row a
# 2-dimensional vector drawn from a standard normal distribution.
C = torch.randn(27,2)
# Each of the 27 characters is represented by its own 2-dimensional embedding (a row of C).

In [9]:
# Indexing C[5] selects row 5 of C. This is equivalent to one-hot encoding the
# index 5 and matrix-multiplying that one-hot vector with the whole matrix C:
#F.one_hot(torch.tensor(5),num_classes=27).float() @ C

C[5]

tensor([-1.4749,  0.9151])

In [10]:
# Indexing with a tensor of indices returns one embedding row per index.

C[torch.tensor([5,6,7])]

tensor([[-1.4749,  0.9151],
        [-0.2999,  0.6962],
        [ 0.5007,  0.2831]])

In [11]:

C[X].shape

# X is the dataset of contexts with shape (N, 3). Indexing C with X embeds every
# context character at once, giving shape (N, 3, 2): N examples, 3 context
# positions, 2 embedding dimensions.
# If C held 5-dimensional embeddings, the result would be (N, 3, 5).
# NOTE(review): the recorded output shows N = 32 while the dataset cell reports
# 228146 rows — the output appears to come from a run on a smaller dataset; re-run to confirm.

torch.Size([32, 3, 2])

In [12]:
# Embed every context in the dataset; emb is the input to the hidden layer.
emb = C[X]
emb.shape

torch.Size([32, 3, 2])

In [13]:
# Hidden-layer weights and biases.
# Drawn from a standard normal (randn) rather than uniform [0, 1) (rand), so the
# initial values are centred on zero and match how w2 and the seeded
# parameters (C, W1, b1, W2, b2) are initialised elsewhere in this notebook.

# 6 inputs = 3 context characters x 2 embedding dimensions; 100 hidden neurons
w1 = torch.randn(6, 100)
b1 = torch.randn(100)

In [14]:
# emb is 3-dimensional (examples, context positions, embedding dims), so it
# cannot be matrix-multiplied with the (6, 100) weights directly.
# Flatten the embeddings of each example into a single row of 6 values first;
# passing -1 lets torch infer the number of rows.

h = (emb.view(-1, 6) @ w1 + b1).tanh()
h.shape

torch.Size([32, 100])

In [15]:
# Output-layer weights and biases.
# b2 is drawn with randn (standard normal) instead of rand (uniform [0, 1)) so
# that it is initialised the same way as w2 and the other parameters.

w2 = torch.randn(100, 27) # 100 inputs from the hidden layer, 27 outputs (one per character)
b2 = torch.randn(27)

In [16]:
# Output layer: one logit per character (27 columns) for every example.
logits = h @ w2 + b2

In [17]:
# Expected shape: (number of examples, 27).
logits.shape

torch.Size([32, 27])

In [18]:
# Softmax by hand: exponentiate the 27 logits in each row, then divide by the
# row total so that every row becomes a probability distribution.
counts = torch.exp(logits)
prob = counts / counts.sum(1, keepdim=True)
prob.shape

torch.Size([32, 27])

In [19]:
# Same shape as logits: one probability per character for each example.
prob.shape

torch.Size([32, 27])

In [20]:
# Advanced indexing: for every example i, pick prob[i, Y[i]] — the probability
# the model assigned to the correct next character out of the 27 in that row.
# The row index spans all of Y instead of a hard-coded 32, so the two index
# tensors always have matching lengths regardless of the dataset size.
prob[torch.arange(Y.shape[0]), Y]

tensor([8.8819e-14, 5.9384e-12, 3.3697e-08, 3.4759e-08, 3.6165e-05, 1.5945e-01,
        6.6612e-07, 7.3043e-10, 4.8271e-05, 9.2898e-14, 2.1269e-10, 3.3449e-05,
        1.8259e-09, 9.9476e-08, 1.8675e-09, 7.2076e-10, 1.5526e-14, 4.8154e-17,
        6.0614e-09, 3.0683e-05, 3.0906e-03, 1.1244e-07, 1.5560e-06, 1.1734e-06,
        5.8804e-08, 7.5055e-16, 5.0728e-01, 9.7256e-08, 7.3120e-11, 2.0486e-01,
        7.0513e-03, 1.1462e-05])

In [21]:
# Negative log-likelihood: the mean of -log(probability assigned to the correct character).
# Row indices cover all of Y (not a hard-coded 32) so the index tensors have matching lengths.

loss = -prob[torch.arange(Y.shape[0]), Y].log().mean()
loss

tensor(17.0177)

In [22]:
# F.cross_entropy computes the same negative log-likelihood directly from the logits.
# Exponentiating a large positive logit by hand overflows to inf, and the normalisation then yields NaN.
# Internally PyTorch subtracts the largest logit from the inputs before exponentiating, so the exp stays finite.

F.cross_entropy(logits, Y)

tensor(17.0177)

In [50]:
#<------------------RESPECTABLE CODE HERE ------------------------------>#

# Seeded generator: every run draws the same initial parameters.
g = torch.Generator().manual_seed(2147483647)

# Draw order matters for reproducibility: C, W1, b1, W2, b2.
C = torch.randn(27, 2, generator=g)    # embedding table: 27 characters -> 2-d vectors
W1 = torch.randn(6, 100, generator=g)  # hidden layer: 6 inputs -> 100 neurons
b1 = torch.randn(100, generator=g)
W2 = torch.randn(100, 27, generator=g) # output layer: 100 neurons -> 27 logits
b2 = torch.randn(27, generator=g)

# All trainable tensors collected in one list.
parameters = [C, W1, b1, W2, b2]

In [51]:
# Total number of trainable scalars across all parameter tensors.
sum(p.numel() for p in parameters)


3481

In [52]:
# Turn on gradient tracking for every parameter so that backward() fills in .grad.
for p in parameters:
  p.requires_grad_(True)

In [53]:
# Full-batch gradient descent: 1000 steps over the entire dataset (N = X.shape[0] examples).
for _ in range(1000):
  # Forward pass
  emb = C[X] # (N, 3, 2)
  h = torch.tanh(emb.view(-1, 6) @ W1 + b1) # (N, 100)
  logits = h @ W2 + b2 # (N, 27)
  loss = F.cross_entropy(logits, Y)
  print(loss.item())

  # Backward pass: clear gradients left over from the previous step, then backpropagate
  for p in parameters:
    p.grad = None
  loss.backward()

  # Update: step each parameter against its gradient with step size 0.1.
  # The in-place update runs under no_grad so autograd does not record it;
  # this replaces the legacy pattern of mutating p.data directly.
  with torch.no_grad():
    for p in parameters:
      p -= 0.1 * p.grad


19.505226135253906
17.08449363708496
15.776531219482422
14.833340644836426
14.002603530883789
13.253260612487793
12.57991886138916
11.983101844787598
11.47049331665039
11.051856994628906
10.709586143493652
10.407632827758789
10.127808570861816
9.864365577697754
9.61450481414795
9.376440048217773
9.148944854736328
8.931111335754395
8.722230911254883
8.521749496459961
8.329227447509766
8.144326210021973
7.966792106628418
7.796451091766357


KeyboardInterrupt: 