<a href="https://colab.research.google.com/github/YasJanam/NLP_MODELS_2/blob/main/TinyNERModel_1/TinyNERModel_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pytorch_lightning

In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from torch.nn.utils.rnn import pad_sequence
from sklearn.metrics import f1_score
from datasets import load_dataset

In [3]:
# ===== Mini Dataset =====
data = [
    ("Ali lives in Tehran", ["PER","O","O","LOC"]),
    ("Sara works at Google", ["PER","O","O","ORG"]),
    ("I saw John in Paris", ["O","O","PER","O","LOC"])
]

label2id = {"O":0, "PER":1, "LOC":2, "ORG":3}
id2label = {0:"O", 1:"PER", 2:"LOC", 3:"ORG"}

In [4]:
# ===== vocab =====
vocab = {"<PAD>":0}
for sentence, _ in data:
  for tok in sentence.split():
    if tok not in vocab:
      vocab[tok] = len(vocab)

In [5]:
class MiniNERDataset(Dataset):
  def __init__(self, data, vocab, label2id):
    self.data = data
    self.vocab = vocab
    self.label2id = label2id

  def __len__(self):
    return len(self.data)

  def __getitem__(self, idx):
    tokens, labels = self.data[idx]
    tokens = tokens.split()
    input_ids = [self.vocab[tok] for tok in tokens]
    label_ids = [self.label2id[l] for l in labels]
    return torch.tensor(input_ids), torch.tensor(label_ids)

In [6]:
def collate_fn(batch):
  input_ids, label_ids = zip(*batch)
  input_ids_padded = pad_sequence(input_ids, batch_first=True, padding_value=0)
  label_ids_padded = pad_sequence(label_ids, batch_first=True, padding_value=-100)
  return input_ids_padded, label_ids_padded

In [7]:
# ===== Simple Encoder =====
class SimpleEncoder(nn.Module):
  def __init__(self, vocab_size=1000, hidden_dim=32):
    super().__init__()
    self.embedding = nn.Embedding(vocab_size, hidden_dim)
    self.lstm = nn.LSTM(hidden_dim, hidden_dim, batch_first=True, bidirectional=True)

  def forward(self, input_ids):
    x = self.embedding(input_ids)
    x,_ = self.lstm(x)
    return x

In [12]:
# ===== TokenClassifier Model =====
class TokenClassifier(nn.Module):
  def __init__(self, encoder, num_labels):
    super().__init__()
    self.encoder = encoder
    self.classifier = nn.Linear(encoder.lstm.hidden_size * 2, num_labels)

  def forward(self, input_ids):
    x = self.encoder(input_ids)
    logits = self.classifier(x)
    return logits

In [9]:
# ===== Lightning Module =====
class NERModule(pl.LightningModule):
  def __init__(self, model, lr=1e-3):
    super().__init__()
    self.model = model
    self.loss_fn = nn.CrossEntropyLoss()
    self.lr = lr
    self.dataset = MiniNERDataset(data, vocab, label2id) # Store dataset as an attribute


  def train_dataloader(self):
    return DataLoader(self.dataset, batch_size=2, shuffle=True, collate_fn=collate_fn) # Use collate_fn

  def training_step(self, batch, batch_idx):
    # Unpack the tuple provided by the DataLoader with collate_fn
    input_ids, labels = batch

    logits = self.model(input_ids)
    loss = self.loss_fn(logits.view(-1, logits.shape[-1]), labels.view(-1))
    self.log("train_loss", loss)
    return loss


  def configure_optimizers(self):
    return torch.optim.Adam(self.parameters(), lr=self.lr)

In [10]:
dataset = MiniNERDataset(data, vocab, label2id)
loader = DataLoader(dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)

In [11]:
# ===== Training =====
encoder = SimpleEncoder(vocab_size=len(vocab))
model = TokenClassifier(encoder, num_labels=len(label2id))
ner_module = NERModule(model)

trainer = pl.Trainer(max_epochs=3, accelerator='cpu',logger=False)
trainer.fit(ner_module, loader)

INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
/usr/local/lib/python3.12/dist-packages/pytorch_lightning/callbacks/model_checkpoint.py:751: Checkpoint directory /content/checkpoints exists and is not empty.
INFO:pytorch_lightning.callbacks.model_summary:
  | Name    | Type             | Params | Mode 
-----------------------------------------------------
0 | model   | TokenClassifier  | 17.6 K | train
1 | loss_fn | CrossEntropyLoss | 0      | train
-----------------------------------------------------
17.6 K    Trainable params
0         Non-trainable p

Training: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=3` reached.


In [14]:
print(ner_module)

NERModule(
  (model): TokenClassifier(
    (encoder): SimpleEncoder(
      (embedding): Embedding(13, 32)
      (lstm): LSTM(32, 32, batch_first=True, bidirectional=True)
    )
    (classifier): Linear(in_features=64, out_features=4, bias=True)
  )
  (loss_fn): CrossEntropyLoss()
)
