<a href="https://colab.research.google.com/github/YasminBougammoura/nlp/blob/main/S_gpt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import torch
import torch.nn as nn
from torch.nn import functional as F

In [1]:
with open('input.txt','r', encoding = 'utf-8') as f:
  text = f.read()

In [2]:
chars = sorted(list(set(text)))
vocab_size = len(chars)
#print(''.join(chars))
#print(vocab_size)

In [4]:
# TOKENIZE INPUT TEXT ---> see openai tokenizer
# Creating a mapping from characters to integers

stoi = {ch:i for i,ch in enumerate(chars)} #creating table for encoding
itos = {i:ch for i,ch in enumerate(chars)}
encode = lambda s: [stoi[c] for c in s]
decode = lambda l: ''.join([itos[i] for i in l])

In [7]:
# ENCODE OUR DATA (INPUT TEXT) AND STORE IT INTO A TENSOR

data = torch.tensor(encode(text), dtype=torch.long)
#print(data.shape, data.dtype)
#print(data[:1000])

In [8]:
# TRAINING AND TEST DATASET SPLITTING

n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]

In [9]:
# SAMPLING CHUNKS TO TRAIN

block_size = 8 # what is the maximum context length for prediction?
#train_data[:block_size+1]

In [None]:
'''
# Understanding the code

x = train_data[:block_size]
y = train_data[1:block_size+1]
for t in range(block_size):
  context = x[:t+1]
  target = y[t]
  print(f"when input is {context} the target: {target}")
'''

In [None]:
# BATCHES SIZE (model parallelism)

torch.manual_seed(1337)
batch_size = 4 # how many independent sequences will we process in parallel?


def get_batch(split): #function for any arbitrary split
  data = train_data if split == 'train' else val_data
  ix = torch.randint(len(data)- block_size, (batch_size,)) # 4 numbers randomly generated between 0 and (my data - block size)

  # We positioned the batch and now we take the 1-dim tensors and stack'em up at rows
  # so they all become a row in a 4x8 tensor
  x = torch.stack([data[i:i+block_size] for i in ix])
  y = torch.stack([data[i+1:i+block_size+1] for i in ix]) # y is the offset of x (so we add 1)

  return x,y

xb, yb = get_batch('train')
print('inputs:')
print(xb.shape)
print(xb)
print('targets:')
print(yb.shape)
print(yb)

print('----')

for b in range(batch_size):
  for t in range(block_size):
    context = xb[b,:t+1]
    target = yb[b,t]
    print(f"when input is {context.tolist()} the target is: {target}")

In [None]:
# USING BIGRAM NEURAL NETWORK MODEL

torch.manual_seed(1337)

# I'm taking the index idx of my input, passing it into the embedding table and
# what happens is that every integer of my input is referring to the table
# and is going to pluck out a row of the table corresponding to its index
# Torch is arranging all of this in a (Batch, Time, Channel) tensor - (4,8,65)
# This is interpreted as the logits - scores for the next char sequence

class BigramLanguageModel(nn.Module):

  def __init__(self, vocab_size):
    super().__init__()
    self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

  def forward(self, idx, targets = None):
    logits = self.token_embedding_table(idx) # Arranged as (B = 4,T = 8,C = 65) tensor

    if targets is None:
      loss = None
    else:
      B,T,C = logits.shape
      logits = logits.view(B*T, C)
      targets=targets.view(B*T)
      loss = F.cross_entropy(logits,targets)

    return logits, loss

  def generate(self, idx, max_new_tokens):

    for _ in range(max_new_tokens):
      #gets predictions
      logits, loss = self(idx)
      #focus only on last time step
      logits = logits[:,-1,:]
      #apply softmax to get probabilities
      probs = F.softmax(logits, dim=1)
      #sample from the distribution
      idx_next = torch.multinomial(probs, num_samples=1)
      #append sampled index to the running sequence
      idx = torch.cat((idx,idx_next), dim=1)
    return idx


m = BigramLanguageModel(vocab_size)
logits, loss = m(xb,yb)
print(logits.shape)
print(loss)
# We expect the loss to be the negative of loglikelihood, thus
# -ln(1/65) = 4.174..
'''
idx = torch.zeros((1,1),dtype=torch.long) # 1 by 1 tensor holding a zero
'''
print(decode(m.generate(torch.zeros((1,1),dtype=torch.long),max_new_tokens = 100)[0].tolist()))

torch.Size([32, 65])
tensor(4.8786, grad_fn=<NllLossBackward0>)

Sr?qP-QWktXoL&jLDJgOLVz'RIoDqHdhsV&vLLxatjscMpwLERSPyao.qfzs$Ys$zF-w,;eEkzxjgCKFChs!iWW.ObzDnxA Ms$3


In [None]:
# TRAINING THE MODEL

#Optimizer object
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

batch_size = 32
for steps in range(10000):

  #sample a batch of data
  xb,yb = get_batch('train')

  #evaluate the loss
  logits, loss = m(xb,yb)
  optimizer.zero_grad(set_to_none=True) #zeroing gradients from previous steps
  loss.backward() #obtaining gradients from current step
  optimizer.step() #optimizing and updating parameters

print(loss.item())

2.5727508068084717


In [None]:
print(decode(m.generate(torch.zeros((1,1),dtype=torch.long),max_new_tokens = 400)[0].tolist()))


Iyoteng h hasbe pave pirance
Rie hicomyonthar's
Plinseard ith henoure wounonthioneir thondy, y heltieiengerofo'dsssit ey
KIN d pe wither vouprrouthercc.
hathe; d!
My hind tt hinig t ouchos tes; st yo hind wotte grotonear 'so it t jod weancotha:
h hay.JUCle n prids, r loncave w hollular s O:
HIs; ht anjx?

DUThinqunt.

LaZAnde.
athave l.
KEONH:
ARThanco be y,-hedarwnoddy scace, tridesar, wnl'shenou


In [None]:
# self-attention trick

torch.manual_seed(1337)

B,T,C = 4,8,2
x = torch.randn(B,T,C)
x.shape

torch.Size([4, 8, 2])

In [None]:
# We want x[b,t] = mean_{i<=t}x[b,t]
xbow = torch.zeros((B,T,C)) # x bag of words
for b in range(B):
  for t in range(T):
    xprev = x[b,:t+1] # previous token included thus dimension is (t,C)
    xbow[b,t] = torch.mean(xprev,0)

In [None]:
# We can have the same computation but faster using matrix multiplication

wei = torch.tril(torch.ones(T,T))
wei = wei/wei.sum(1,keepdim=True)
xbow2 = wei @ x # (B,T,T,) @ (B,T,C) ---> (B,T,C)
torch.allclose(xbow,xbow2) # due to numerical precision

False

In [None]:
# We can again use softmax to compute this calculation

tril = torch.tril(torch.ones(T,T))
wei = torch.zeros((T,T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim = 1)
xbow3 = wei @ x

In [None]:
# Small self-attention for a single individual head
torch.manual_seed(1337)
B,T,C = 4,8,32
x = torch.randn(B,T,C)

head_size = 16
key = nn.Linear(C, head_size, bias = False)
query = nn.Linear(C, head_size, bias = False)
value = nn.Linear(C, head_size, bias = False)
k = key(x) #(B,T,16)
q = query(x) #(B,T,16)

# Communication
wei = q @ k.transpose(-2,-1) # (B,T,16) @ (B,16,T) --> (B,T,T)


tril = torch.tril(torch.ones(T,T))
#wei = torch.zeros((T,T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim = 1)

v = value(x)
out = wei @ x
#out = wei @ x

out.shape

In [None]:
k = torch.randn(B,T,head_size)
q = torch.randn(B,T, head_size)
wei = q @ k.transpose(--2,-1) * head_size**-0.5 # for variance