In [1]:
!pip install pytorch_lightning

In [2]:
!pip install torch

In [3]:
import math
from dataclasses import dataclass

import torch
import torch.nn as nn
import torch.nn.functional as F
import pytorch_lightning as pl
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2TokenizerFast
from datasets import load_dataset
from sklearn.metrics import f1_score, accuracy_score
from torchmetrics import Accuracy, F1Score, Precision, Perplexity

## **1.Model_Components**

##### **SelfAttention**

In [4]:
class SelfAttention(nn.Module):
  def __init__(self, n_embd: int, seq_len: int, attn_pdrop: float= 0.0, resid_pdrop: float= 0.0):
    super().__init__(self)
    self.key = nn.Linear(n_embd, n_embd, bias= False)
    self.query = nn.Linear(n_embd, n_embd, bias= False)
    self.value = nn.Linear(n_embd, n_embd, bias=False)
    self.attn_drop = nn.Dropout(attn_pdrop)
    self.resid_drop = nn.Dropout(resid_pdrop)
    self.register_buffer('mask',torch.tril(torch.ones(seq_len,seq_len)).view(1,1,seq_len,seq_len))
    self.n_embd = n_embd

  def forward(self,x):
    B,T,C = x.size()
    k = self.key(x)
    q = self.query(x)
    v = self.value(x)
    att = (q @ k.transpose(-2,-1)) / math.sqrt(C)
    att = att.masked_fill(self.mask[:,:,:T,:T] == 0, float('-inf'))
    att = F.softmax(att,dim=-1)
    att = self.attn_drop(att)
    y = att @ v
    y = self.resid_drop(y)
    return y

##### **MultiHeadAttention**

In [5]:
class MultiHeadAttention(nn.Module):
  def __init__(self,n_embd:int,n_head:int,seq_len:int,attn_pdrop:float=0.0,resid_pdrop:float=0.0):
    super().__init__()
    assert n_embd % n_head == 0
    self.n_head = n_head
    self.head_dim = n_embd // n_head
    # concatenated attention weights : (B, T, n_embd) -> (B, T, 3 * n_embd)
    self.c_attn = nn.Linear(n_embd, 3 * n_embd, bias = False)
    self.out_proj = nn.Linear(n_embd, n_embd, bias = False)
    self.attn_drop = nn.Dropout(attn_pdrop)
    self.resid_drop = nn.Dropout(resid_pdrop)
    self.register_buffer('mask',torch.tril(torch.ones(seq_len,seq_len)).view(1,1,seq_len,seq_len))

  def forward(self,x):
    B,T,C = x.shape
    qkv = self.c_attn(x)   # Concat
    q,k,v = qkv.split(C, dim = 2)  # dim = 2 -> تقسیم میشود C بعد سوم به تکه هایی به اندازه
    q = q.view(B,T,self.n_head, C // self.n_head).transpose(1,2)
    k = k.view(B,T,self.n_head, C // self.n_head).transpose(1,2)
    v = v.view(B,T,self.n_head, C // self.n_head).transpose(1,2)
    att = (q @ k.transpose(-2,-1)) / math.sqrt(C//self.n_head)
    att = att.masked_fill(self.mask[:,:,:T,:T] == 0, float('-inf'))
    att = F.softmax(att, dim = -1)
    att = self.attn_drop(att)
    y = att @ v
    y = y.transpose(1,2).contiguous().view(B,T,C)
    y = self.resid_drop(self.out_proj(y))
    return y

##### **TransformerBlock**

In [6]:
class TransformerBlock(nn.Module):
  def __init__(self, n_embd: int, n_head: int, seq_len: int, mlp_ratio = 4.0, attn_pdrop=0.0, resid_pdrop=0.0):
    super().__init__()
    self.ln1 = nn.LayerNorm(n_embd)
    self.attn = MultiHeadAttention(n_embd,n_head,seq_len,attn_pdrop,resid_pdrop)
    self.ln2 = nn.LayerNorm(n_embd)
    self.mlp = nn.Sequential(
        nn.Linear(n_embd,int(mlp_ratio * n_embd)),
        nn.GELU(),
        nn.Linear(int(mlp_ratio * n_embd) ,n_embd),
        nn.Dropout(resid_pdrop)
    )

  def forward(self,x):
    x = x + self.attn(self.ln1(x))
    x = x + self.mlp(self.ln2(x))
    return x

##### **ConvBlock**

In [7]:
class ConvBlock(nn.Module):
  def __init__(self, n_embd: int, pdrop= 0.0):
    super().__init__()
    self.ln = nn.LayerNorm(n_embd)
    self.conv3 = nn.Conv1d(n_embd,n_embd,kernel_size=3,padding=1,groups=1)
    self.conv5 = nn.Conv1d(n_embd,n_embd,kernel_size=5,padding=2,groups=1)
    self.proj = nn.Linear(2*n_embd,n_embd)
    self.drop = nn.Dropout(pdrop)
    self.act = nn.GELU()

  def forward(self,x):
    B,T,C = x.shape
    h = self.ln(x)
    h = h.transpose(1,2)
    y3 = self.conv3(h)
    y5 = self.conv5(h)
    y = torch.cat([y3,y5],dim=1).transpose(1,2)
    y = self.proj(self.act(y))
    y = self.drop(y)
    return x + y

##### **MLPBlock**

In [8]:
class MLPBlock(nn.Module):
  def __init__(self,n_embd:int,mlp_ratio=4.0,pdrop=0.0):
    super().__init__()
    self.ln = nn.LayerNorm(n_embd)
    self.fc1 = nn.Linear(n_embd,int(mlp_ratio * n_embd))
    self.fc2 = nn.Linear(int(mlp_ratio * n_embd),n_embd)
    self.drop = nn.Dropout(pdrop)
    self.act = nn.GELU()

  def forward(self,x):
    h = self.ln(x)
    h = self.fc2(self.act(self.fc1(h)))
    h = self.drop(h)
    return x+h

##### **MODEL**

In [None]:
class HybridLM(nn.Module):
  def __init__(self, vocab_size:int, seq_len:int, n_layer:int, n_head:int, n_embd:int, attn_pdrop=0.0, resid_pdrop=0.0):
    super().__init__()
    assert n_layer == 10, "This script assumes 10 total layers in a fixed hybrid plan"
    self.seq_len = seq_len
    self.tok_emb = nn.Embedding(vocab_size, n_embd) # Added n_embd here
    self.pos_emb = nn.Parameter(torch.zeros(1,seq_len,n_embd))
    self.drop = nn.Dropout(resid_pdrop)
    self.layers = nn.ModuleList([
        ConvBlock(n_embd,pdrop=resid_pdrop), # 1, changed dropout to pdrop
        ConvBlock(n_embd,pdrop=resid_pdrop), # 2, changed dropout to pdrop
        TransformerBlock(n_embd,n_head,seq_len,attn_pdrop=attn_pdrop,resid_pdrop=resid_pdrop), # 3
        TransformerBlock(n_embd,n_head,seq_len,attn_pdrop=attn_pdrop,resid_pdrop=resid_pdrop), # 4
        TransformerBlock(n_embd,n_head,seq_len,attn_pdrop=attn_pdrop,resid_pdrop=resid_pdrop), # 5
        TransformerBlock(n_embd,n_head,seq_len,attn_pdrop=attn_pdrop,resid_pdrop=resid_pdrop), # 6
        TransformerBlock(n_embd,n_head,seq_len,attn_pdrop=attn_pdrop,resid_pdrop=resid_pdrop), # 7
        MLPBlock(n_embd,mlp_ratio=4.0,pdrop=resid_pdrop), # 8, changed dropout to pdrop
        MLPBlock(n_embd,mlp_ratio=4.0,pdrop=resid_pdrop), # 9, changed dropout to pdrop
        TransformerBlock(n_embd,n_head,seq_len,attn_pdrop=attn_pdrop,resid_pdrop=resid_pdrop) # 10
    ])
    self.ln_f = nn.LayerNorm(n_embd)
    self.head = nn.Linear(n_embd,vocab_size,bias=False)
    self.apply(self._init_weights)

  def _init_weights(self,m):
    if isinstance(m,nn.Linear):
      nn.init.normal_(m.weight,mean=0.0,std=0.02)
      if m.bias is not None: nn.init.zeros_(m.bias)
    elif isinstance(m,nn.Embedding):
      nn.init.normal_(m.weight,mean=0.0,std=0.02)

  def forward(self,idx,targets=None):
    B,T = idx.size()
    assert T <= self.seq_len
    x = self.tok_emb(idx) + self.pos_emb[:,:T,:]
    x = self.drop(x)
    for layer in self.layers:
      x = layer(x)
    x = self.ln_f(x)
    logits = self.head(x)
    loss = None
    if targets is not None:
      loss = F.cross_entropy(logits.view(-1,logits.size(-1)),targets.view(-1))
    return logits,loss

  @torch.no_grad()
  def generate(self,idx,max_new_tokens=100,temperature=1.0,top_k=None):
    for _ in range(max_new_tokens):
      idx_cond = idx[:,-self.seq_len:]
      logits,_ = self(idx_cond)
      logits = logits[:,-1,:] / max(1e-8,temperature) # Corrected temperature variable name
      if top_k is not None :
        v,_ = torch.topk(logits,top_k)
        logits[logits < v[:,[-1]]] = -float('Inf')
      probs = F.softmax(logits,dim=-1)
      next_id = torch.multinomial(probs,num_samples=1)
      idx = torch.cat((idx,next_id),dim=1)
    return idx

## **2.Dataset-Wrapper**

In [9]:
class TokenDataset(Dataset):
  def __init__(self,data_ids,seq_len):
    self.data = data_ids
    self.seq_len = seq_len

  def __len__(self):
    return len(self.data) // self.seq_len

  def __getitem__(self,idx):
    start = idx * self.seq_len
    x = self.data[start : start + self.seq_len]
    y = self.data[start + 1 : start + self.seq_len + 1]
    return x, y # Corrected to return x and y separately

## **3.Lightning-Module**

In [10]:
class LitHybridLM_Models(pl.LightningModule):
  def __init__(self,ModelClass,vocab_size,seq_len=256,num_classes=4,n_layer=10,n_head=6,n_embd=384,lr=3e-4):
    super().__init__()
    self.save_hyperparameters()
    self.model = ModelClass(vocab_size, seq_len, n_layer, n_head, n_embd)
    self.lr = lr
    self.perplexity = Perplexity(ignore_index=-100)

  def training_step(self, batch, batch_idx):
    x, y = batch
    logits, loss = self.model(x,y)
    self.log("train_loss", loss, prog_bar=True)
    return loss

  def validation_step(self, batch, batch_idx):
    x,y = batch
    logits,loss = self.model(x,y)
    ppl = self.perplexity(logits,y)

    self.log("val_loss",loss,prog_bar=True)
    self.log("val_perplexity",ppl)

  def configure_optimizers(self):
    return torch.optim.AdamW(self.parameters(),lr=self.lr)

## **4. Load Dataset & Tokenizer**

In [11]:
dataset = load_dataset("mikasenghaas/wikitext-2")

tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/493 [00:00<?, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/6.20M [00:00<?, ?B/s]

data/validation-00000-of-00001.parquet:   0%|          | 0.00/641k [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/713k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/17556 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1841 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2183 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

## **5.Utilities**

In [12]:
def encode_texts(texts):
  enc = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)
  return enc["input_ids"].view(-1)

In [13]:
def data_loader(train_size:int,val_size:int,seq_len:int = 256 , batch_size:int=24):
  train_texts = dataset["train"]["text"][:train_size]
  val_texts = dataset["test"]["text"][:val_size]

  train_ids = encode_texts(train_texts)
  val_ids = encode_texts(val_texts)

  #seq_len = 256
  #batch_size = 24
  train_ds = TokenDataset(train_ids, seq_len = seq_len)
  val_ds = TokenDataset(val_ids, seq_len = seq_len)

  train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
  val_loader = DataLoader(val_ds, batch_size=batch_size)

  return train_loader,val_loader

In [14]:
def train_method(ModelClass,train_size:int,val_size:int,n_layer:int,seq_len:int=256,batch_size:int = 24):
  train_loader,val_loader = data_loader(train_size,val_size,seq_len,batch_size)

  lit_model = LitHybridLM_Models(ModelClass= ModelClass,vocab_size=len(tokenizer),n_layer=n_layer)

  trainer = pl.Trainer(
      max_epochs=3,
      accelerator="auto",
      devices=1,
      precision=16,
      log_every_n_steps=10
  )

  trainer.fit(lit_model, train_loader, val_loader)
  return trainer

## **6.Train**


In [None]:
trainer = train_method(ModelClass= HybridLM,train_size=30000,val_size=5000,n_layer=10)

/usr/local/lib/python3.12/dist-packages/lightning_fabric/connector.py:571: `precision=16` is supported for historical reasons but its usage is discouraged. Please set your precision to 16-mixed instead!
INFO:pytorch_lightning.utilities.rank_zero:Using 16bit Automatic Mixed Precision (AMP)
INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
/usr/local/lib/python3.12/dist-packages/pytorch_lightning/utilities/model_summary/model_summary.py:231: Precision 16-mixed is not supp

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=3` reached.


In [None]:
val_perp = trainer.callback_metrics["val_perplexity"].item()
val_loss = trainer.callback_metrics["val_loss"].item()
print(f"Validation Loss : {val_loss:.4f} | Perplexity : {val_perp:.2f}")

Validation Loss : 0.0242 | Perplexity : 1.02


## **7.Labratory(3)**

Dataset : __"mikasenghaas/wikitext-2"__

Changing the model architecture,
Changing train_size and val_size

### **Train(14)**

model ( 1 Layers ) :    MLPBlock

train_size : 30000  

val_size : 5000

In [None]:
class HybridLM_14(nn.Module):
  def __init__(self, vocab_size:int, seq_len:int, n_layer:int, n_head:int, n_embd:int, attn_pdrop=0.0, resid_pdrop=0.0):
    super().__init__()
    assert n_layer == 1, "This script assumes 2 total layers in a fixed hybrid plan"
    self.seq_len = seq_len
    self.tok_emb = nn.Embedding(vocab_size, n_embd) # Added n_embd here
    self.pos_emb = nn.Parameter(torch.zeros(1,seq_len,n_embd))
    self.drop = nn.Dropout(resid_pdrop)
    self.layers = nn.ModuleList([
      #  ConvBlock(n_embd,pdrop=resid_pdrop), # 1, changed dropout to pdrop
      #  ConvBlock(n_embd,pdrop=resid_pdrop), # 2, changed dropout to pdrop
      #  TransformerBlock(n_embd,n_head,seq_len,attn_pdrop=attn_pdrop,resid_pdrop=resid_pdrop), # 3
      #  TransformerBlock(n_embd,n_head,seq_len,attn_pdrop=attn_pdrop,resid_pdrop=resid_pdrop), # 4
      #  TransformerBlock(n_embd,n_head,seq_len,attn_pdrop=attn_pdrop,resid_pdrop=resid_pdrop), # 5
      #  TransformerBlock(n_embd,n_head,seq_len,attn_pdrop=attn_pdrop,resid_pdrop=resid_pdrop), # 6
      #  TransformerBlock(n_embd,n_head,seq_len,attn_pdrop=attn_pdrop,resid_pdrop=resid_pdrop), # 7
        MLPBlock(n_embd,mlp_ratio=4.0,pdrop=resid_pdrop), # 8, changed dropout to pdrop
     #   MLPBlock(n_embd,mlp_ratio=4.0,pdrop=resid_pdrop), # 9, changed dropout to pdrop
     #   TransformerBlock(n_embd,n_head,seq_len,attn_pdrop=attn_pdrop,resid_pdrop=resid_pdrop) # 10
    ])
    self.ln_f = nn.LayerNorm(n_embd)
    self.head = nn.Linear(n_embd,vocab_size,bias=False)
    self.apply(self._init_weights)

  def _init_weights(self,m):
    if isinstance(m,nn.Linear):
      nn.init.normal_(m.weight,mean=0.0,std=0.02)
      if m.bias is not None: nn.init.zeros_(m.bias)
    elif isinstance(m,nn.Embedding):
      nn.init.normal_(m.weight,mean=0.0,std=0.02)

  def forward(self,idx,targets=None):
    B,T = idx.size()
    assert T <= self.seq_len
    x = self.tok_emb(idx) + self.pos_emb[:,:T,:]
    x = self.drop(x)
    for layer in self.layers:
      x = layer(x)
    x = self.ln_f(x)
    logits = self.head(x)
    loss = None
    if targets is not None:
      loss = F.cross_entropy(logits.view(-1,logits.size(-1)),targets.view(-1))
    return logits,loss

  @torch.no_grad()
  def generate(self,idx,max_new_tokens=100,temperature=1.0,top_k=None):
    for _ in range(max_new_tokens):
      idx_cond = idx[:,-self.seq_len:]
      logits,_ = self(idx_cond)
      logits = logits[:,-1,:] / max(1e-8,temperature) # Corrected temperature variable name
      if top_k is not None :
        v,_ = torch.topk(logits,top_k)
        logits[logits < v[:,[-1]]] = -float('Inf')
      probs = F.softmax(logits,dim=-1)
      next_id = torch.multinomial(probs,num_samples=1)
      idx = torch.cat((idx,next_id),dim=1)
    return idx

In [None]:
trainer_14 = train_method(ModelClass=HybridLM_14,train_size=30000,val_size=5000,n_layer=1)

/usr/local/lib/python3.12/dist-packages/lightning_fabric/connector.py:571: `precision=16` is supported for historical reasons but its usage is discouraged. Please set your precision to 16-mixed instead!
INFO:pytorch_lightning.utilities.rank_zero:Using 16bit Automatic Mixed Precision (AMP)
INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
/usr/local/lib/python3.12/dist-packages/pytorch_lightning/utilities/model_summary/model_summary.py:231: Precision 16-mixed is not supp

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=3` reached.


In [None]:
val_perp_14 = trainer_14.callback_metrics["val_perplexity"].item()
val_loss_14 = trainer_14.callback_metrics["val_loss"].item()
print(f"Validation Loss : {val_loss_14:.4f} | Perplexity : {val_perp_14:.2f}")

Validation Loss : 1.4109 | Perplexity : 5.15


### **Train(15)**

model ( 5 Layers ) : ConvBlock -> TransformerBlock ( * 3 ) -> MLPBlock

train_size : 30000

val_size : 5000

In [None]:
class HybridLM_15(nn.Module):
  def __init__(self, vocab_size:int, seq_len:int, n_layer:int, n_head:int, n_embd:int, attn_pdrop=0.0, resid_pdrop=0.0):
    super().__init__()
    assert n_layer == 5, "This script assumes 5 total layers in a fixed hybrid plan"
    self.seq_len = seq_len
    self.tok_emb = nn.Embedding(vocab_size, n_embd) # Added n_embd here
    self.pos_emb = nn.Parameter(torch.zeros(1,seq_len,n_embd))
    self.drop = nn.Dropout(resid_pdrop)
    self.layers = nn.ModuleList([
      #  ConvBlock(n_embd,pdrop=resid_pdrop), # 1, changed dropout to pdrop
        ConvBlock(n_embd,pdrop=resid_pdrop), # 2, changed dropout to pdrop
      #  TransformerBlock(n_embd,n_head,seq_len,attn_pdrop=attn_pdrop,resid_pdrop=resid_pdrop), # 3
      #  TransformerBlock(n_embd,n_head,seq_len,attn_pdrop=attn_pdrop,resid_pdrop=resid_pdrop), # 4
        TransformerBlock(n_embd,n_head,seq_len,attn_pdrop=attn_pdrop,resid_pdrop=resid_pdrop), # 5
        TransformerBlock(n_embd,n_head,seq_len,attn_pdrop=attn_pdrop,resid_pdrop=resid_pdrop), # 6
        TransformerBlock(n_embd,n_head,seq_len,attn_pdrop=attn_pdrop,resid_pdrop=resid_pdrop), # 7
     #   MLPBlock(n_embd,mlp_ratio=4.0,pdrop=resid_pdrop), # 8, changed dropout to pdrop
        MLPBlock(n_embd,mlp_ratio=4.0,pdrop=resid_pdrop), # 9, changed dropout to pdrop
     #   TransformerBlock(n_embd,n_head,seq_len,attn_pdrop=attn_pdrop,resid_pdrop=resid_pdrop) # 10
    ])
    self.ln_f = nn.LayerNorm(n_embd)
    self.head = nn.Linear(n_embd,vocab_size,bias=False)
    self.apply(self._init_weights)

  def _init_weights(self,m):
    if isinstance(m,nn.Linear):
      nn.init.normal_(m.weight,mean=0.0,std=0.02)
      if m.bias is not None: nn.init.zeros_(m.bias)
    elif isinstance(m,nn.Embedding):
      nn.init.normal_(m.weight,mean=0.0,std=0.02)

  def forward(self,idx,targets=None):
    B,T = idx.size()
    assert T <= self.seq_len
    x = self.tok_emb(idx) + self.pos_emb[:,:T,:]
    x = self.drop(x)
    for layer in self.layers:
      x = layer(x)
    x = self.ln_f(x)
    logits = self.head(x)
    loss = None
    if targets is not None:
      loss = F.cross_entropy(logits.view(-1,logits.size(-1)),targets.view(-1))
    return logits,loss

  @torch.no_grad()
  def generate(self,idx,max_new_tokens=100,temperature=1.0,top_k=None):
    for _ in range(max_new_tokens):
      idx_cond = idx[:,-self.seq_len:]
      logits,_ = self(idx_cond)
      logits = logits[:,-1,:] / max(1e-8,temperature) # Corrected temperature variable name
      if top_k is not None :
        v,_ = torch.topk(logits,top_k)
        logits[logits < v[:,[-1]]] = -float('Inf')
      probs = F.softmax(logits,dim=-1)
      next_id = torch.multinomial(probs,num_samples=1)
      idx = torch.cat((idx,next_id),dim=1)
    return idx

In [None]:
trainer_15 = train_method(ModelClass=HybridLM_15,train_size=30000,val_size=5000,n_layer=5)

/usr/local/lib/python3.12/dist-packages/lightning_fabric/connector.py:571: `precision=16` is supported for historical reasons but its usage is discouraged. Please set your precision to 16-mixed instead!
INFO:pytorch_lightning.utilities.rank_zero:Using 16bit Automatic Mixed Precision (AMP)
INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
/usr/local/lib/python3.12/dist-packages/pytorch_lightning/utilities/model_summary/model_summary.py:231: Precision 16-mixed is not supp

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

In [None]:
val_perp_15 = trainer_15.callback_metrics["val_perplexity"].item()
val_loss_15 = trainer_15.callback_metrics["val_loss"].item()
print(f"Validation Loss : {val_loss_15:.4f} | Perplexity : {val_perp_15:.2f}")

Validation Loss : 0.0234 | Perplexity : 1.02


### **Train(16)**

model ( 1 Layers ) : ConvBlock

train_size : 30000

val_size : 5000

In [15]:
class HybridLM_16(nn.Module):
  def __init__(self, vocab_size:int, seq_len:int, n_layer:int, n_head:int, n_embd:int, attn_pdrop=0.0, resid_pdrop=0.0):
    super().__init__()
    assert n_layer == 1, "This script assumes 5 total layers in a fixed hybrid plan"
    self.seq_len = seq_len
    self.tok_emb = nn.Embedding(vocab_size, n_embd) # Added n_embd here
    self.pos_emb = nn.Parameter(torch.zeros(1,seq_len,n_embd))
    self.drop = nn.Dropout(resid_pdrop)
    self.layers = nn.ModuleList([
      #  ConvBlock(n_embd,pdrop=resid_pdrop), # 1, changed dropout to pdrop
        ConvBlock(n_embd,pdrop=resid_pdrop), # 2, changed dropout to pdrop
      #  TransformerBlock(n_embd,n_head,seq_len,attn_pdrop=attn_pdrop,resid_pdrop=resid_pdrop), # 3
      #  TransformerBlock(n_embd,n_head,seq_len,attn_pdrop=attn_pdrop,resid_pdrop=resid_pdrop), # 4
      #  TransformerBlock(n_embd,n_head,seq_len,attn_pdrop=attn_pdrop,resid_pdrop=resid_pdrop), # 5
      #  TransformerBlock(n_embd,n_head,seq_len,attn_pdrop=attn_pdrop,resid_pdrop=resid_pdrop), # 6
      #  TransformerBlock(n_embd,n_head,seq_len,attn_pdrop=attn_pdrop,resid_pdrop=resid_pdrop), # 7
     #   MLPBlock(n_embd,mlp_ratio=4.0,pdrop=resid_pdrop), # 8, changed dropout to pdrop
      #  MLPBlock(n_embd,mlp_ratio=4.0,pdrop=resid_pdrop), # 9, changed dropout to pdrop
     #   TransformerBlock(n_embd,n_head,seq_len,attn_pdrop=attn_pdrop,resid_pdrop=resid_pdrop) # 10
    ])
    self.ln_f = nn.LayerNorm(n_embd)
    self.head = nn.Linear(n_embd,vocab_size,bias=False)
    self.apply(self._init_weights)

  def _init_weights(self,m):
    if isinstance(m,nn.Linear):
      nn.init.normal_(m.weight,mean=0.0,std=0.02)
      if m.bias is not None: nn.init.zeros_(m.bias)
    elif isinstance(m,nn.Embedding):
      nn.init.normal_(m.weight,mean=0.0,std=0.02)

  def forward(self,idx,targets=None):
    B,T = idx.size()
    assert T <= self.seq_len
    x = self.tok_emb(idx) + self.pos_emb[:,:T,:]
    x = self.drop(x)
    for layer in self.layers:
      x = layer(x)
    x = self.ln_f(x)
    logits = self.head(x)
    loss = None
    if targets is not None:
      loss = F.cross_entropy(logits.view(-1,logits.size(-1)),targets.view(-1))
    return logits,loss

  @torch.no_grad()
  def generate(self,idx,max_new_tokens=100,temperature=1.0,top_k=None):
    for _ in range(max_new_tokens):
      idx_cond = idx[:,-self.seq_len:]
      logits,_ = self(idx_cond)
      logits = logits[:,-1,:] / max(1e-8,temperature) # Corrected temperature variable name
      if top_k is not None :
        v,_ = torch.topk(logits,top_k)
        logits[logits < v[:,[-1]]] = -float('Inf')
      probs = F.softmax(logits,dim=-1)
      next_id = torch.multinomial(probs,num_samples=1)
      idx = torch.cat((idx,next_id),dim=1)
    return idx

In [17]:
trainer_16 = train_method(ModelClass=HybridLM_16,train_size=30000,val_size=5000,n_layer=1)

INFO:pytorch_lightning.utilities.rank_zero:Using 16bit Automatic Mixed Precision (AMP)
INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name       | Type        | Params | Mode 
---------------------------------------------------
0 | model      | HybridLM_16 | 40.2 M | train
1 | perplexity | _Perplexity | 0      | train
---------------------------------------------------
40.2 M    Trainable params
0         Non-traina

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=3` reached.


In [18]:
val_perp_16 = trainer_16.callback_metrics["val_perplexity"].item()
val_loss_16 = trainer_16.callback_metrics["val_loss"].item()
print(f"Validation Loss : {val_loss_16:.4f} | Perplexity : {val_perp_16:.2f}")

Validation Loss : 0.0221 | Perplexity : 1.02


### **Train(17)**

model ( 1 Layers ) : TransformerBlock

train_size : 30000

val_size : 5000

In [19]:
class HybridLM_17(nn.Module):
  def __init__(self, vocab_size:int, seq_len:int, n_layer:int, n_head:int, n_embd:int, attn_pdrop=0.0, resid_pdrop=0.0):
    super().__init__()
    assert n_layer == 1, "This script assumes 5 total layers in a fixed hybrid plan"
    self.seq_len = seq_len
    self.tok_emb = nn.Embedding(vocab_size, n_embd) # Added n_embd here
    self.pos_emb = nn.Parameter(torch.zeros(1,seq_len,n_embd))
    self.drop = nn.Dropout(resid_pdrop)
    self.layers = nn.ModuleList([
      #  ConvBlock(n_embd,pdrop=resid_pdrop), # 1, changed dropout to pdrop
      #  ConvBlock(n_embd,pdrop=resid_pdrop), # 2, changed dropout to pdrop
        TransformerBlock(n_embd,n_head,seq_len,attn_pdrop=attn_pdrop,resid_pdrop=resid_pdrop), # 3
      #  TransformerBlock(n_embd,n_head,seq_len,attn_pdrop=attn_pdrop,resid_pdrop=resid_pdrop), # 4
      #  TransformerBlock(n_embd,n_head,seq_len,attn_pdrop=attn_pdrop,resid_pdrop=resid_pdrop), # 5
      #  TransformerBlock(n_embd,n_head,seq_len,attn_pdrop=attn_pdrop,resid_pdrop=resid_pdrop), # 6
      #  TransformerBlock(n_embd,n_head,seq_len,attn_pdrop=attn_pdrop,resid_pdrop=resid_pdrop), # 7
     #   MLPBlock(n_embd,mlp_ratio=4.0,pdrop=resid_pdrop), # 8, changed dropout to pdrop
      #  MLPBlock(n_embd,mlp_ratio=4.0,pdrop=resid_pdrop), # 9, changed dropout to pdrop
     #   TransformerBlock(n_embd,n_head,seq_len,attn_pdrop=attn_pdrop,resid_pdrop=resid_pdrop) # 10
    ])
    self.ln_f = nn.LayerNorm(n_embd)
    self.head = nn.Linear(n_embd,vocab_size,bias=False)
    self.apply(self._init_weights)

  def _init_weights(self,m):
    if isinstance(m,nn.Linear):
      nn.init.normal_(m.weight,mean=0.0,std=0.02)
      if m.bias is not None: nn.init.zeros_(m.bias)
    elif isinstance(m,nn.Embedding):
      nn.init.normal_(m.weight,mean=0.0,std=0.02)

  def forward(self,idx,targets=None):
    B,T = idx.size()
    assert T <= self.seq_len
    x = self.tok_emb(idx) + self.pos_emb[:,:T,:]
    x = self.drop(x)
    for layer in self.layers:
      x = layer(x)
    x = self.ln_f(x)
    logits = self.head(x)
    loss = None
    if targets is not None:
      loss = F.cross_entropy(logits.view(-1,logits.size(-1)),targets.view(-1))
    return logits,loss

  @torch.no_grad()
  def generate(self,idx,max_new_tokens=100,temperature=1.0,top_k=None):
    for _ in range(max_new_tokens):
      idx_cond = idx[:,-self.seq_len:]
      logits,_ = self(idx_cond)
      logits = logits[:,-1,:] / max(1e-8,temperature) # Corrected temperature variable name
      if top_k is not None :
        v,_ = torch.topk(logits,top_k)
        logits[logits < v[:,[-1]]] = -float('Inf')
      probs = F.softmax(logits,dim=-1)
      next_id = torch.multinomial(probs,num_samples=1)
      idx = torch.cat((idx,next_id),dim=1)
    return idx

In [20]:
trainer_17 = train_method(ModelClass=HybridLM_17,train_size=30000,val_size=5000,n_layer=1)

/usr/local/lib/python3.12/dist-packages/lightning_fabric/connector.py:571: `precision=16` is supported for historical reasons but its usage is discouraged. Please set your precision to 16-mixed instead!
INFO:pytorch_lightning.utilities.rank_zero:Using 16bit Automatic Mixed Precision (AMP)
INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
/usr/local/lib/python3.12/dist-packages/pytorch_lightning/utilities/model_summary/model_summary.py:231: Precision 16-mixed is not supp

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=3` reached.


In [21]:
val_perp_17 = trainer_17.callback_metrics["val_perplexity"].item()
val_loss_17 = trainer_17.callback_metrics["val_loss"].item()
print(f"Validation Loss : {val_loss_17:.4f} | Perplexity : {val_perp_17:.2f}")

Validation Loss : 1.2972 | Perplexity : 4.45
