In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


**Set Parameters for the model**

In [2]:
context_length=128
batch_size=32
num_embed=128
num_heads=8
num_layers=6
dropout=0.2
max_iterations=5000
learning_rate=3e-4

**Read Input text**

In [3]:
with open('/content/drive/MyDrive/GPT_From_Scratch/hindi_song_lyrics.txt', 'r', encoding='utf-8') as f:
    text = f.read()

print("Length of data = ",len(text))
vocab_size=len(set(text))
unique_char_list=list(set(text))
vocab_dict={char:i for i,char in enumerate(unique_char_list)}

Length of data =  2592


In [4]:
vocab_size

48

**Functions for Text to Numbers and Numbers to Text**

In [5]:
def get_numerical_tokens(vocab_dict,text):
  numerical_tokens=[vocab_dict[c] for c in text ]
  return numerical_tokens

def get_text_from_tokens(vocab_dict,numerical_tokens):
  text=[list(vocab_dict.keys())[num] for num in numerical_tokens]
  return text


Split into train and test

In [6]:
import torch

data_tensor=torch.tensor(data=get_numerical_tokens(vocab_dict,text),dtype=torch.long)
training_volume=len(data_tensor)*0.9
train_data=data_tensor[:int(training_volume)]
test_data=data_tensor[int(training_volume):]

print("Length of train data :", train_data.shape)
print("Length of test data :", test_data.shape)

x=train_data[:context_length]
y=train_data[1:context_length+1]

Length of train data : torch.Size([2332])
Length of test data : torch.Size([260])


**Visualize the input and output**

In [7]:
for i in range(context_length):
  context=x[:i+1]
  output=y[i]
  print(f"When input = {context} then output is {output}")

When input = tensor([47]) then output is 13
When input = tensor([47, 13]) then output is 23
When input = tensor([47, 13, 23]) then output is 20
When input = tensor([47, 13, 23, 20]) then output is 3
When input = tensor([47, 13, 23, 20,  3]) then output is 11
When input = tensor([47, 13, 23, 20,  3, 11]) then output is 20
When input = tensor([47, 13, 23, 20,  3, 11, 20]) then output is 3
When input = tensor([47, 13, 23, 20,  3, 11, 20,  3]) then output is 27
When input = tensor([47, 13, 23, 20,  3, 11, 20,  3, 27]) then output is 34
When input = tensor([47, 13, 23, 20,  3, 11, 20,  3, 27, 34]) then output is 20
When input = tensor([47, 13, 23, 20,  3, 11, 20,  3, 27, 34, 20]) then output is 30
When input = tensor([47, 13, 23, 20,  3, 11, 20,  3, 27, 34, 20, 30]) then output is 13
When input = tensor([47, 13, 23, 20,  3, 11, 20,  3, 27, 34, 20, 30, 13]) then output is 23
When input = tensor([47, 13, 23, 20,  3, 11, 20,  3, 27, 34, 20, 30, 13, 23]) then output is 20
When input = tensor([4

**Function for Creating batches**

In [8]:
torch.manual_seed(42)

def get_batch(data_specifier):
  data=train_data if data_specifier=='train' else test_data
  random_indices=torch.randint(len(data)-context_length-1,(batch_size,))
  x=torch.stack([data[rand_index:rand_index+context_length] for rand_index in random_indices])
  y=torch.stack([data[rand_index+1:rand_index+context_length+1] for rand_index in random_indices])
  return x,y


In [9]:
xb,yb=get_batch("train")
print("Input")
print(xb.shape)
print(xb)
print("Output")
print(yb.shape)
print(yb)
print("-----------------")

for b in range(batch_size):
  for i in range(context_length):
    context=xb[b][:i+1]
    output=yb[b][i]
    #print(f"when input is {context} output is {output}")

Input
torch.Size([32, 128])
tensor([[20,  7,  3,  ..., 11, 12, 43],
        [20, 23, 43,  ...,  5,  3, 20],
        [29, 32, 32,  ..., 11, 44, 30],
        ...,
        [43, 43, 18,  ..., 16, 43, 33],
        [13, 23,  1,  ..., 11, 33, 43],
        [11, 20, 36,  ..., 43, 43, 42]])
Output
torch.Size([32, 128])
tensor([[ 7,  3, 43,  ..., 12, 43, 33],
        [23, 43, 11,  ...,  3, 20,  3],
        [32, 32,  2,  ..., 44, 30, 43],
        ...,
        [43, 18, 11,  ..., 43, 33, 11],
        [23,  1,  5,  ..., 33, 43, 42],
        [20, 36, 11,  ..., 43, 42, 32]])
-----------------


  Self Attention

In [10]:
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

class Head(nn.Module):
  def __init__(self,head_size) :
    super().__init__()
    self.key=nn.Linear(num_embed,head_size,bias=False)
    self.query=nn.Linear(num_embed,head_size,bias=False)
    self.value=nn.Linear(num_embed,head_size,bias=False)

    self.register_buffer('tril',torch.tril(torch.ones(context_length,context_length)))
    self.Dropout=nn.Dropout(dropout)

  def forward(self, x):
    B,T,C = x.shape

    k=self.key(x) #(B,T,C)
    q=self.query(x) #(B,T,C)
    v=self.value(x) #(B,T,C)
    x=torch.randn(B,T,C)

    wei=q @ k.transpose(-2,-1) * C**-0.5 #(B,T,C) @ (B,C,T) ---> (B,T,T)
    wei = wei.masked_fill(self.tril[:T,:T]==0,float('-inf'))
    wei=F.softmax(wei,dim=-1) #(B,T,T)
    wei=self.Dropout(wei)
    out=wei @ v
    return out


**Function to create Multi head attention**

In [11]:
class MultiHeadAttention(nn.Module):
  def __init__(self,num_heads,head_size):
    super().__init__()
    self.heads=nn.ModuleList([Head(head_size) for _ in range(num_heads)])
    self.proj=nn.Linear(num_embed,num_embed)

  def forward(self,x):
    out=torch.cat([h(x) for h in self.heads],dim=-1)
    out=self.proj(out)
    return out

In [12]:
class FeedForward(nn.Module):
  def __init__(self,num_embed):
    super().__init__()
    self.net=nn.Sequential(
        nn.Linear(num_embed,4*num_embed),
        nn.ReLU(),
        nn.Linear(4*num_embed,num_embed),
        nn.Dropout(dropout)

    )

  def forward(self,x):
    return self.net(x)

**Class for Block of Decoders**

In [13]:
class Block(nn.Module):
  def __init__(self,num_embed,num_heads):
    super().__init__()
    head_size=num_embed//num_heads
    self.self_attention=MultiHeadAttention(num_heads,head_size)
    self.ffd=FeedForward(num_embed)
    self.ln1=nn.LayerNorm(num_embed)
    self.ln2=nn.LayerNorm(num_embed)

  def forward(self,x):
    x=x+self.self_attention(self.ln1(x))
    x=x+self.ffd(self.ln2(x))
    return x


In [14]:
class BiagramLLMModel(nn.Module):
  def __init__(self):
    super().__init__()
    self.embedding_table=nn.Embedding(vocab_size,num_embed)
    self.pos_embeddings=nn.Embedding(context_length,num_embed )

    self.blocks=nn.Sequential(

       * [Block(num_embed,num_heads=num_heads) for _ in range(num_layers)]
    )
    self.ln_f=nn.LayerNorm(num_embed)
    self.lm_head=nn.Linear(num_embed,vocab_size)

  def forward(self,idx,targets=None):

    B,T=idx.shape
    tokens_emb=self.embedding_table(idx)
    pos_emb=self.pos_embeddings(torch.arange(T))
    x=tokens_emb+pos_emb
    x=self.blocks(x)
    x=self.ln_f(x)
    logits=self.lm_head(x)

    if targets is None:
      loss=None
    else:
      B,T,C=logits.shape
      #print(logits.shape)
      logits=logits.view(B*T,C)
      targets=targets.view(B*T)
      loss=F.cross_entropy(logits,targets)

    return logits,loss


  def generate(self,idx,max_new_tokens):
    for _ in range(max_new_tokens):
      idx_cond=idx[:,-context_length:]
      logits,loss=self(idx_cond)
      logits=logits[:,-1,:]
      probs=F.softmax(logits,dim=-1)
      idx_next=torch.multinomial(probs,num_samples=1)
      idx=torch.cat((idx,idx_next),dim=1)

    return idx

biagram_model=BiagramLLMModel()
output,loss=biagram_model(xb,yb)
print(output.shape)
print(loss)

torch.Size([4096, 48])
tensor(4.0230, grad_fn=<NllLossBackward0>)


**Train the model**

In [15]:
optimizer=torch.optim.AdamW(biagram_model.parameters(),lr=learning_rate)

for steps in range(max_iterations):

  #get batch data
  xb,yb=get_batch('train')

  #get loss
  logits,loss=biagram_model(xb,yb)
  optimizer.zero_grad(set_to_none=True)
  loss.backward()
  optimizer.step()

  #print(loss.item())
print(loss.item())


0.061518020927906036


In [16]:
print("".join(get_text_from_tokens(vocab_dict,biagram_model.generate(torch.zeros((1,1), dtype=torch.long),1000)[0])))

Naati Babta

Kaise hum jaane
Humein kya pata

Galliyan, galliyan
Teri galliyan
Mujhko bhaavein
Teri galliyan

Muskurane ki wajah tum ho
Gungunane ki wajah tum ho
Zinda rehne ke liye main sabab dhoondhta hoon

Tera hone laga hoon
Khud ko main tujh se juda hone laagi

Tera hone laga hoon
Khud ko main tujh se juda hone laga hoon

Sun raha hai na tu
Ro raha hoon main
Sun raha hai na tu
Kyun ro raha hoon main

Phir le aaya dil majboor kya kya keeje
Raas na aaya rehna door kya keeje

Tum se hi din hota hai
Surmayi shaam aati hai
Tum se hi, tum se hi

Kabira maan jaa
Kabira maan jaa
Aaja tujhko pukare teri parchhaiyaan

Tujhko jo paaya
Toh jeena aaya
Ab yeh lamha theheher jaaye
Bas jaaye tujh mein kahin

Laung da lashkara
O baby teri yaadon ka
Marikhi ban gaya aashiyana

Tera yaar hoon main
Tera yaar hoon main
Zindagi bhar ka saath hai
Ye rishta humara

Ae dil hai mushkil jeena yahan
Zara hatke zara bachke
Yeh hai Bombay meri jaan

Hum tere bin ab reh nahi sakte
Tere bina kya wajood mera

Cha

In [17]:
torch.save(biagram_model.state_dict(), '/content/drive/MyDrive/GPT_From_Scratch/biagram_model.pth')