In [None]:
!pip install pytorch_lightning

In [2]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from torch.nn.utils.rnn import pad_sequence
from sklearn.metrics import f1_score
from datasets import load_dataset

In [6]:
# ===== Mini Dataset =====
# Each example is (sentence, tags): the sentence is split on whitespace later on,
# and there is exactly one tag per resulting token.
data = [
    ("Ali lives in Tehran", "PER O O LOC".split()),
    ("Sara works at Google", "PER O O ORG".split()),
    ("I saw John in Paris", "O O PER O LOC".split()),
]

# In this notebook label2id plays a tokenizer-like role for the tags:
# it turns each tag string into an integer class id.
label2id = {tag: idx for idx, tag in enumerate(("O", "PER", "LOC", "ORG"))}
# Inverse lookup (class id -> tag string), derived from label2id.
id2label = {idx: tag for tag, idx in label2id.items()}

In [4]:
# ===== vocab =====
# Id 0 is taken by the padding token; every other token receives the next free id
# in order of first appearance across the sentences in `data`.
vocab = {"<PAD>": 0}
for sentence, _ in data:
  for tok in sentence.split():
    # len(vocab) is evaluated before insertion, so a new token gets the next id;
    # a token that is already present keeps its existing id.
    vocab.setdefault(tok, len(vocab))

In [5]:
# Bare last expression: the notebook renders the token -> id mapping as the cell output.
vocab

{'<PAD>': 0,
 'Ali': 1,
 'lives': 2,
 'in': 3,
 'Tehran': 4,
 'Sara': 5,
 'works': 6,
 'at': 7,
 'Google': 8,
 'I': 9,
 'saw': 10,
 'John': 11,
 'Paris': 12}

In [246]:
class MiniNERDataset(Dataset):
  """Map-style dataset that yields ``(input_ids, label_ids)`` tensors for token tagging.

  Args:
    data: sequence of ``(sentence, labels)`` pairs; ``sentence`` is a string that is
      split on whitespace and ``labels`` holds one tag per resulting token.
    vocab: mapping from token string to integer id.
    label2id: mapping from tag string to integer class id.
  """

  def __init__(self, data, vocab, label2id):
    self.data = data
    self.vocab = vocab
    self.label2id = label2id
    print("MiniNERDataset")

  def __len__(self):
    """Return the number of ``(sentence, labels)`` examples."""
    return len(self.data)

  def __getitem__(self, idx):
    """Encode example ``idx`` as two 1-D ``torch.long`` tensors.

    Returns:
      ``(input_ids, label_ids)``, both of length ``len(sentence.split())``.

    Raises:
      ValueError: if the number of tokens differs from the number of labels.
        Without this check the mismatch would go unnoticed until padding and
        silently misalign tags with tokens.

    Lookups use plain indexing, so with dict mappings an unknown token or tag
    raises ``KeyError``.
    """
    sentence, labels = self.data[idx]
    tokens = sentence.split()
    if len(tokens) != len(labels):
      raise ValueError(
        f"example {idx}: {len(tokens)} tokens but {len(labels)} labels"
      )
    input_ids = [self.vocab[tok] for tok in tokens]
    label_ids = [self.label2id[label] for label in labels]
    # Explicit dtype: torch.tensor([]) would otherwise default to float32, which an
    # embedding layer and cross-entropy targets do not accept.
    return (
      torch.tensor(input_ids, dtype=torch.long),
      torch.tensor(label_ids, dtype=torch.long),
    )

**collate_fn**

متدی برای پد کردن داده ها برای اینکه همه هم اندازه باشند

یک بچ از داده ها را میگیرد و همه داده های آن بچ را پد میکند

این متد در دیتالودر استفاده میشه

در واقع **هر بار که دیتالودر یک بچ برمیگردونه این متد روی اون بچ اعمال میشه**



In [305]:
def collate_fn(batch):
  """Pad a list of ``(input_ids, label_ids)`` pairs to a common length.

  Token ids are right-padded with 0 and label ids with -100; both results are
  batch-first tensors. The raw and padded sequences are printed as a trace.

  Returns:
    ``(input_ids_padded, label_ids_padded)``.
  """
  print("\n\n🟨 START 🟨")
  print("🍁🍁🍁🍁🍁🍁🍁🍁🍁🍁🍁🍁 collate_fn 🍁🍁🍁🍁🍁🍁🍁🍁🍁🍁🍁🍁")

  # Transpose [(ids, labels), ...] into one tuple of id tensors and one of label tensors.
  token_seqs, tag_seqs = zip(*batch)
  print(f"\n\t| input_ids : {token_seqs} ")
  print(f"\t| label_ids : {tag_seqs} ")

  padded_tokens = pad_sequence(token_seqs, batch_first=True, padding_value=0)
  padded_tags = pad_sequence(tag_seqs, batch_first=True, padding_value=-100)
  print(f"\t| pad_input_ids: {padded_tokens} ")
  print(f"\t| pad_label_ids: {padded_tags} ")

  return padded_tokens, padded_tags

In [273]:
# ===== Simple Encoder =====
class SimpleEncoder(nn.Module):
  """Embedding layer followed by a single bidirectional LSTM.

  Args:
    vocab_size: number of rows in the embedding table.
    hidden_dim: embedding width and LSTM hidden size per direction.
    padding_idx: token id treated as padding by the embedding (its vector is
      zero-initialised and receives no gradient). Defaults to 0, the id this
      notebook assigns to ``<PAD>``; pass ``None`` for an ordinary trainable row.
  """

  def __init__(self, vocab_size=1000, hidden_dim=32, padding_idx=0):
    super().__init__()
    self.embedding = nn.Embedding(vocab_size, hidden_dim, padding_idx=padding_idx)
    self.lstm = nn.LSTM(hidden_dim, hidden_dim, batch_first=True, bidirectional=True)
    print("Simple Encoder")

  def forward(self, input_ids):
    """Embed ``input_ids`` and return the LSTM output sequence.

    The LSTM is batch-first and bidirectional, so the last dimension of the
    result is ``2 * hidden_dim``. Each stage prints a trace line.
    """
    print("\t↪------ SimpleEncoder-class ------")
    print("\t\t ↪|Forward method :")
    print("\t\t\t ↪|lstm کلاس انکودر : یک لایه امبدینگ + یه لایه")
    print("\t\t\t ↪|➡➡➡➡➡ embedding ⬅⬅⬅⬅⬅")
    x = self.embedding(input_ids)
    print("\t\t\t ↪|➡➡➡➡➡ lstm ⬅⬅⬅⬅⬅")
    # nn.LSTM returns (output, (h_n, c_n)); only the per-token output is used.
    x, _ = self.lstm(x)
    print("\t\t\t ↪|OUT-OF-FORWARD (خروج از انکودر)\n")
    return x

In [274]:
# ===== TokenClassifier Model =====
class TokenClassifier(nn.Module):
  """Encoder plus a per-token linear classification head.

  Args:
    encoder: module exposing an ``nn.LSTM`` as ``encoder.lstm`` and returning the
      LSTM output sequence from ``forward``.
    num_labels: number of output classes per token.
  """

  def __init__(self, encoder, num_labels):
    super().__init__()
    self.encoder = encoder
    # The LSTM output width is hidden_size per direction; derive the factor from
    # the LSTM itself instead of assuming it is always bidirectional.
    lstm = encoder.lstm
    num_directions = 2 if lstm.bidirectional else 1
    self.classifier = nn.Linear(lstm.hidden_size * num_directions, num_labels)
    print("TokenClassification")

  def forward(self, input_ids):
    """Run the encoder, then the linear head; returns per-token logits.

    Each stage prints a trace line.
    """
    print("➡➡➡➡➡ TokenClassifier ⬅⬅⬅⬅⬅")
    print("\t ↪|forward method :")
    print("\t\t ↪|Encoder-Layer")
    print("\t\t ↪|Linear-Layer")
    print("\n➡➡➡➡➡ Encoder-Layer ⬅⬅⬅⬅⬅")
    x = self.encoder(input_ids)
    print("➡➡➡➡➡ Linear-Layer ⬅⬅⬅⬅⬅")
    logits = self.classifier(x)
    return logits

In [303]:
# ===== Lightning Module =====
class NERModule(pl.LightningModule):
  """LightningModule wrapping a token classifier: data loading, loss and optimizer.

  Args:
    model: module mapping a batch of token ids to per-token logits.
    lr: learning rate passed to Adam.
    dataset: training dataset; when omitted, a ``MiniNERDataset`` is built from the
      notebook-level ``data``, ``vocab`` and ``label2id``.
    batch_size: batch size used by ``train_dataloader``.
  """

  def __init__(self, model, lr=1e-3, dataset=None, batch_size=2):
    super().__init__()
    self.model = model
    # -100 is CrossEntropyLoss's default ignore_index; it is spelled out because
    # collate_fn pads label sequences with -100 so padded positions add no loss.
    self.loss_fn = nn.CrossEntropyLoss(ignore_index=-100)
    self.lr = lr
    self.batch_size = batch_size
    if dataset is None:
      dataset = MiniNERDataset(data, vocab, label2id)
    self.dataset = dataset  # kept as an attribute so train_dataloader can reuse it
    print("Lightning Module")

  def train_dataloader(self):
    """Return a shuffling DataLoader over ``self.dataset`` that pads via ``collate_fn``."""
    print("🟪🟪🟪🟪🟪🟪🟪🟪🟪🟪🟪 train_dataloader method 🟪🟪🟪🟪🟪🟪🟪🟪🟪🟪🟪\n")
    return DataLoader(
      self.dataset,
      batch_size=self.batch_size,
      shuffle=True,
      collate_fn=collate_fn,
    )

  def training_step(self, batch, batch_idx):
    """Compute, print and log the cross-entropy loss for one padded batch."""
    print("\n◼◼◼◼◼◼◼◼◼◼◼◼  training_step  ◼◼◼◼◼◼◼◼◼◼◼◼")
    input_ids, labels = batch
    print("\n◻◻◻◻◻◻◻◻◻◻◻◻ call model ◻◻◻◻◻◻◻◻◻◻◻◻")
    logits = self.model(input_ids)
    print("\n◻◻◻◻◻◻◻◻◻◻◻◻ calculate loss ◻◻◻◻◻◻◻◻◻◻◻◻")
    # Flatten every token position into one row of logits and one target;
    # reshape (unlike view) also accepts non-contiguous tensors.
    loss = self.loss_fn(logits.reshape(-1, logits.shape[-1]), labels.reshape(-1))
    print(f"loss : {loss}")
    self.log("train_loss", loss)
    return loss

  def configure_optimizers(self):
    """Build a single Adam optimizer over all parameters, print it and return it."""
    optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
    print("\n\n\n 🟩🟩🟩🟩🟩🟩🟩🟩 configure_optimizers method 🟩🟩🟩🟩🟩🟩🟩🟩")
    print(f"optimizer : {optimizer}\n\n")
    return optimizer

In [308]:
# ===== Training =====
# Seed Python, NumPy and torch RNGs so weight initialisation and DataLoader
# shuffling (and therefore the printed batches and losses) repeat on every run.
pl.seed_everything(42, workers=True)

print("\n-- Encoder -->")
encoder = SimpleEncoder(vocab_size=len(vocab))
print("\n-- Model -->")
model = TokenClassifier(encoder, num_labels=len(label2id))
print("\n-- NERModule -->")
ner_module = NERModule(model)


# CPU-only run for 3 epochs with no logger attached.
trainer = pl.Trainer(max_epochs=3, accelerator='cpu', logger=False)
print("\n-- START Training -->")
trainer.fit(ner_module)

INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.callbacks.model_summary:
  | Name    | Type             | Params | Mode 
-----------------------------------------------------
0 | model   | TokenClassifier  | 17.6 K | train
1 | loss_fn | CrossEntropyLoss | 0      | train
-----------------------------------------------------
17.6 K    Trainable params
0         Non-trainable params
17.6 K    Total params
0.070     Total estimated model params size (MB)
6         Modules in train mode
0         Modules in eval mode



-- Encoder -->
Simple Encoder

-- Model -->
TokenClassification

-- NERModule -->
MiniNERDataset
Lightning Module

-- START Training -->



 🟩🟩🟩🟩🟩🟩🟩🟩 configure_optimizers method 🟩🟩🟩🟩🟩🟩🟩🟩
optimizer : Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    decoupled_weight_decay: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 0.001
    maximize: False
    weight_decay: 0
)


🟪🟪🟪🟪🟪🟪🟪🟪🟪🟪🟪 train_dataloader method 🟪🟪🟪🟪🟪🟪🟪🟪🟪🟪🟪



Training: |          | 0/? [00:00<?, ?it/s]



🟨 START 🟨
🍁🍁🍁🍁🍁🍁🍁🍁🍁🍁🍁🍁 collate_fn 🍁🍁🍁🍁🍁🍁🍁🍁🍁🍁🍁🍁

	| input_ids : (tensor([ 9, 10, 11,  3, 12]), tensor([5, 6, 7, 8])) 
	| label_ids : (tensor([0, 0, 1, 0, 2]), tensor([1, 0, 0, 3])) 
	| pad_input_ids: tensor([[ 9, 10, 11,  3, 12],
        [ 5,  6,  7,  8,  0]]) 
	| pad_label_ids: tensor([[   0,    0,    1,    0,    2],
        [   1,    0,    0,    3, -100]]) 

◼◼◼◼◼◼◼◼◼◼◼◼  training_step  ◼◼◼◼◼◼◼◼◼◼◼◼

◻◻◻◻◻◻◻◻◻◻◻◻ call model ◻◻◻◻◻◻◻◻◻◻◻◻
➡➡➡➡➡ TokenClassifier ⬅⬅⬅⬅⬅
	 ↪|forward method :
		 ↪|Encoder-Layer
		 ↪|Linear-Layer

➡➡➡➡➡ Encoder-Layer ⬅⬅⬅⬅⬅
	↪------ SimpleEncoder-class ------
		 ↪|Forward method :
			 ↪|lstm کلاس انکودر : یک لایه امبدینگ + یه لایه
			 ↪|➡➡➡➡➡ embedding ⬅⬅⬅⬅⬅
			 ↪|➡➡➡➡➡ lstm ⬅⬅⬅⬅⬅
			 ↪|OUT-OF-FORWARD (خروج از انکودر)

➡➡➡➡➡ Linear-Layer ⬅⬅⬅⬅⬅

◻◻◻◻◻◻◻◻◻◻◻◻ calculate loss ◻◻◻◻◻◻◻◻◻◻◻◻
loss : 1.3405110836029053


🟨 START 🟨
🍁🍁🍁🍁🍁🍁🍁🍁🍁🍁🍁🍁 collate_fn 🍁🍁🍁🍁🍁🍁🍁🍁🍁🍁🍁🍁

	| input_ids : (tensor([1, 2, 3, 4]),) 
	| label_ids : (tensor([1, 0, 0, 2]),) 
	| pad_input_ids: ten

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=3` reached.


---

اگر توجه کنید، متد training_step در بالا شش بار فراخوانی شده است

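به این دلیل که تعداد ایپاک برابر 3 است و هر ایپاک 2 بچ دارد (3 نمونه با batch-size = 2 یعنی ⌈3/2⌉ = 2 بچ در هر ایپاک)

epochs = 3 , batches per epoch = ceil(3 samples / batch-size 2) = 2 ----> 3 × 2 = 6 × call training_step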

