In [None]:
!pip install pytorch_lightning  seqeval evaluate

In [3]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from torch.nn.utils.rnn import pad_sequence
from sklearn.metrics import f1_score
from datasets import load_dataset
from evaluate import load as load_metric
from transformers import AutoTokenizer

In [4]:
dataset = load_dataset("manu/wnut_17")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/926 [00:00<?, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/309k [00:00<?, ?B/s]

data/validation-00000-of-00001.parquet:   0%|          | 0.00/67.1k [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/116k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3394 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1009 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1287 [00:00<?, ? examples/s]

In [5]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 3394
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 1009
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 1287
    })
})

In [20]:
# Dataset classes
id2label ={'0': "O",
            '1': "B-corporation",
            '2': "I-corporation",
            '3': "B-creative-work",
            '4': "I-creative-work",
            '5': "B-group",
            '6': "I-group",
            '7': "B-location",
            '8': "I-location",
            '9': "B-person",
            '10': "I-person",
            '11': "B-product",
            '12': "I-product"}

label2id = {k:v for v,k in id2label.items()}

In [7]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
vocab_size = tokenizer.vocab_size

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [9]:
def Preprocess_Dataset_2(dataset, split: str, tokenizer):
    processed_data = []
    for example in dataset[split]:
        tokens = example['tokens']
        ner_tags = example['ner_tags']

        # Tokenize the entire sentence
        tokenized_input = tokenizer(
            tokens,
            is_split_into_words=True,
            padding=False,
            truncation=False,
            return_offsets_mapping=True
        )

        word_ids = tokenized_input.word_ids()
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(ner_tags[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx

        processed_data.append({
            'input_ids': tokenized_input['input_ids'],
            'attention_mask': tokenized_input['attention_mask'],
            'labels': label_ids
        })
    return processed_data

In [10]:
train_data = Preprocess_Dataset_2(dataset,'train',tokenizer)
validation_data = Preprocess_Dataset_2(dataset,'validation',tokenizer)
test_data = Preprocess_Dataset_2(dataset,'test',tokenizer)

In [11]:
class NERDataset(Dataset):
  def __init__(self, data, tokenizer):
    self.data = data
    self.tokenizer = tokenizer

  def __len__(self):
    return len(self.data)

  def __getitem__(self, idx):
    return {

            "input_ids": torch.tensor(self.data[idx]['input_ids']),
            "attention_mask": torch.tensor(self.data[idx]['attention_mask']),
            "labels": torch.tensor(self.data[idx]['labels'])
    }

**collate_fn**

متدی برای پد کردن داده ها برای اینکه همه هم اندازه باشند

یک بچ از داده ها را میگرد و همه داده های آن بچ را پد میکند

این متد در دیتالودر استفاده میشه

در واقع **هر بار که دیتالودر یک بچ برمیگردونه این متد روی اون بچ اعمال میشه**

In [12]:
def collate_fn(batch):
  input_ids, attention_mask, label_ids = zip(*[(item['input_ids'], item['attention_mask'], item['labels']) for item in batch])

  input_ids_padded = pad_sequence(input_ids, batch_first=True, padding_value=0)
  attention_mask_padded = pad_sequence(attention_mask, batch_first=True, padding_value=0)
  label_ids_padded = pad_sequence(label_ids, batch_first=True, padding_value=-100)

  return {
      "input_ids": input_ids_padded,
      "attention_mask": attention_mask_padded,
      "labels": label_ids_padded
  }

In [13]:
# ===== Simple Encoder =====
class SimpleEncoder(nn.Module):
  def __init__(self, vocab_size= vocab_size, hidden_dim=32):
    super().__init__()
    self.embedding = nn.Embedding(vocab_size, hidden_dim)
    self.lstm = nn.LSTM(hidden_dim, hidden_dim, batch_first=True, bidirectional=True)

  def forward(self, input_ids):
    x = self.embedding(input_ids)
    x,_ = self.lstm(x)
    return x

In [14]:
# ===== TokenClassifier Model =====
class TokenClassifier(nn.Module):
  def __init__(self, encoder, num_labels):
    super().__init__()
    self.encoder = encoder
    self.classifier = nn.Linear(encoder.lstm.hidden_size * 2, num_labels)

  def forward(self, input_ids):
    x = self.encoder(input_ids)
    logits = self.classifier(x)
    return logits

In [15]:
# ===== Lightning Module =====
metric = load_metric("seqeval")

class NERModule(pl.LightningModule):
  def __init__(self, model, lr=1e-3):
    super().__init__()
    self.model = model
    self.loss_fn = nn.CrossEntropyLoss()
    self.lr = lr
    self.train_dataset = NERDataset(train_data, vocab_size) # Store dataset as an attribute
    self.validation_dataset = NERDataset(validation_data, vocab_size)


  def forward(self, input_ids):
      return self.model(input_ids)

  def train_dataloader(self):
    return DataLoader(self.train_dataset, batch_size=3, shuffle=True, collate_fn=collate_fn) # Use collate_fn

  def val_dataloader(self):
      return DataLoader(self.validation_dataset, batch_size=3, collate_fn=collate_fn)


  def training_step(self, batch, batch_idx):
    # Unpack the tuple provided by the DataLoader with collate_fn
    input_ids = batch['input_ids']
    labels = batch['labels']

    logits = self.model(input_ids)
    loss = self.loss_fn(logits.view(-1, logits.shape[-1]), labels.view(-1))
    self.log("train_loss", loss)
    return loss

  def validation_step(self,batch, batch_idx):
    input_ids = batch['input_ids']
    labels = batch['labels']

    logits = self.model(input_ids)
    pred = torch.argmax(logits, dim=-1)

    loss = self.loss_fn(logits.view(-1, logits.shape[-1]), labels.view(-1))
    self.log("val_loss", loss)

    # Convert predictions and labels to lists for seqeval
    predictions = []
    references = []
    for i in range(pred.shape[0]):
        preds_row = []
        labels_row = []
        for j in range(pred.shape[1]):
            # Only include non-padded tokens (labels != -100)
            if labels[i][j].item() != -100:
                preds_row.append(id2label[str(pred[i][j].item())])
                labels_row.append(id2label[str(labels[i][j].item())])
        predictions.append(preds_row)
        references.append(labels_row)

    results = metric.compute(predictions=predictions, references=references)

    # Log each metric individually
    for key, value in results.items():
        if isinstance(value, dict):
            # Log nested metrics (precision, recall, f1, number)
            for sub_key, sub_value in value.items():
                self.log(f"val_{key}_{sub_key}", sub_value, prog_bar=True)
        else:
            # Log overall metrics (accuracy, f1, precision, recall)
            self.log(f"val_{key}", value, prog_bar=True)



  def configure_optimizers(self):
    return torch.optim.Adam(self.parameters(), lr=self.lr)

Downloading builder script: 0.00B [00:00, ?B/s]

In [18]:
# ===== Training =====
device = "cuda" if torch.cuda.is_available() else "cpu"
num_classes = 13

encoder = SimpleEncoder(vocab_size=vocab_size)
model = TokenClassifier(encoder, num_labels=num_classes)
ner_module = NERModule(model)

trainer = pl.Trainer(max_epochs=6, accelerator=device ,logger=False)
trainer.fit(ner_module)

INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.callbacks.model_summary:
  | Name    | Type             | Params | Mode 
-----------------------------------------------------
0 | model   | TokenClassifier  | 994 K  | train
1 | loss_fn | CrossEntropyLoss | 0      | train
-----------------------------------------------------
994 K     Trainable params
0         Non-trainable params
994 K     Total params
3.978     Total estimated model params size (MB)
6         Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)


Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=6` reached.


##### **نتایج ارزیابی بالا در زیر تمیز نوشته شده**
در فرم ژوپیتری کد نتایج مشخص نیستند.ولی نسخه دیگر را در کولب باز کنید نتایج کامل مشخص اند

In [26]:
  {
    "corporation": {
        "precision": 0.0,
        "recall": 0.0,
        "f1": 0.0,
        "number": 971
    },
    "creative-work": {
        "precision": 0.0,
        "recall": 0.0,
        "f1": 0.0,
        "number": 1110
    },
    "group": {
        "precision": 0.0,
        "recall": 0.0,
        "f1": 0.0,
        "number": 1110
    },
    "location": {
        "precision": 0.022,
        "recall": 0.0165,
        "f1": 0.0183,
        "number": 813
    },
    "person": {
        "precision": 0.102,
        "recall": 0.071,
        "f1": 0.0803,
        "number": 1780
    },
    "product": {
        "precision": 0.0,
        "recall": 0.0,
        "f1": 0.0,
        "number": 1300
    },
    "overall_precision": 0.0818,
    "overall_recall": 0.0417,
    "overall_f1": 0.0527,
    "overall_accuracy": 0.909
}

{'corporation': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 971},
 'creative-work': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1110},
 'group': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1110},
 'location': {'precision': 0.022,
  'recall': 0.0165,
  'f1': 0.0183,
  'number': 813},
 'person': {'precision': 0.102, 'recall': 0.071, 'f1': 0.0803, 'number': 1780},
 'product': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1300},
 'overall_precision': 0.0818,
 'overall_recall': 0.0417,
 'overall_f1': 0.0527,
 'overall_accuracy': 0.909}

In [19]:
# ============ Test ============
test_dataset = NERDataset(test_data, vocab_size)
test_dataloader = DataLoader(test_dataset, batch_size=3, collate_fn=collate_fn)

# Put the model in evaluation mode
ner_module.eval()
predictions = []
references = []

with torch.no_grad():
    for batch in test_dataloader:
        input_ids = batch['input_ids']
        labels = batch['labels']

        logits = ner_module.model(input_ids)  # Explicitly call the model's forward
        pred = torch.argmax(logits, dim=-1)

        # Convert predictions and labels to lists for seqeval
        for i in range(pred.shape[0]):
            preds_row = []
            labels_row = []
            for j in range(pred.shape[1]):
                # Only include non-padded tokens (labels != -100)
                if labels[i][j].item() != -100:
                    preds_row.append(id2label[str(pred[i][j].item())])
                    labels_row.append(id2label[str(labels[i][j].item())])
            predictions.append(preds_row)
            references.append(labels_row)


# Compute metrics
metric = load_metric("seqeval")
results = metric.compute(predictions=predictions, references=references)

# Convert numpy/torch objects to standard Python types
def convert_to_python_types(obj):
    if isinstance(obj, dict):
        return {k: convert_to_python_types(v) for k, v in obj.items()}
    elif isinstance(obj, list):
        return [convert_to_python_types(item) for item in obj]
    elif isinstance(obj, (int, float, str, bool, type(None))):
        return obj
    elif isinstance(obj, (torch.Tensor)):
        return obj.tolist()
    elif hasattr(obj, 'item'): # Handle numpy scalars
        return obj.item()
    else:
        return str(obj) # Convert other types to string

results_python = convert_to_python_types(results)


import json
print(json.dumps(results_python, indent=4))

{
    "corporation": {
        "precision": 0.0,
        "recall": 0.0,
        "f1": 0.0,
        "number": 66
    },
    "creative-work": {
        "precision": 0.0,
        "recall": 0.0,
        "f1": 0.0,
        "number": 142
    },
    "group": {
        "precision": 0.16666666666666666,
        "recall": 0.006060606060606061,
        "f1": 0.011695906432748537,
        "number": 165
    },
    "location": {
        "precision": 0.08196721311475409,
        "recall": 0.06666666666666667,
        "f1": 0.07352941176470587,
        "number": 150
    },
    "person": {
        "precision": 0.1986754966887417,
        "recall": 0.06993006993006994,
        "f1": 0.10344827586206898,
        "number": 429
    },
    "product": {
        "precision": 0.0,
        "recall": 0.0,
        "f1": 0.0,
        "number": 127
    },
    "overall_precision": 0.13712374581939799,
    "overall_recall": 0.037998146431881374,
    "overall_f1": 0.059506531204644414,
    "overall_accuracy": 0.920140

#### **📌 توضیح کلاس‌ها:**

🟥**person** → اسم آدم‌ها (مثل: Elon Musk, Taylor Swift)

🟪**location** → مکان‌ها (شهر، کشور، منطقه، آدرس؛ مثل: New York, Mount Everest)

🟩**corporation** → شرکت‌ها و سازمان‌ها (مثل: Google, United Nations)

🟨**product** → محصولات (مثل: iPhone 15, Coca-Cola)

🟦**group** → گروه‌ها و تیم‌ها (مثل: The Beatles, Manchester United)

🟧**creative-work** → آثار هنری یا رسانه‌ای (مثل: Harry Potter, Game of Thrones, Star Wars)


این‌ها در واقع کلاس‌های موجودیت هستن که دیتاست تعریف کرده ⚡

 (یعنی هیچ موجودیتی نیست) می‌گیره  O یعنی هر توکنی که داخل متن هست، یا ⚡

 یا یکی از این برچسب‌ها

