## Task 4

In [2]:
## Named Entity Recognition (NER) from News Articles

In [3]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, AutoConfig
from tqdm import tqdm
import matplotlib.pyplot as plt


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
def load_sentences(filepath):
    final = []
    sentences = []
    with open(filepath, 'r', encoding="utf-8") as f:
        for line in f.readlines():
            if line.strip() == "" or line.startswith("-DOCSTART-"):
                if len(sentences) > 0:
                    final.append(sentences)
                    sentences = []
            else:
                parts = line.split()
                sentences.append((parts[0], parts[-1]))
    return final

# Update this to your folder where train.txt, valid.txt, test.txt exist
base_path = r"C:\Users\Nisha\Desktop\Internship_2\Task_4\\"

train_samples = load_sentences(base_path + "train.txt")
valid_samples = load_sentences(base_path + "valid.txt")
test_samples  = load_sentences(base_path + "test.txt")

samples = train_samples + test_samples

schema = ['_'] + sorted({tag for sentence in samples for _, tag in sentence})
tag2id = {tag: i for i, tag in enumerate(schema)}
id2tag = {i: tag for tag, i in tag2id.items()}

print(f"Train: {len(train_samples)}, Valid: {len(valid_samples)}, Test: {len(test_samples)}")
print(f"Entity tags: {schema}")


Train: 14041, Valid: 3250, Test: 3453
Entity tags: ['_', 'B-LOC', 'B-MISC', 'B-ORG', 'B-PER', 'I-LOC', 'I-MISC', 'I-ORG', 'I-PER', 'O']


In [5]:
MODEL_NAME = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_and_align_labels(sentences):
    input_ids = []
    labels = []

    for sentence in sentences:
        tokens = []
        tag_ids = []

        for word, tag in sentence:
            word_tokens = tokenizer.tokenize(word)
            if not word_tokens:
                continue
            tokens.extend(word_tokens)
            tag_ids.extend([tag2id[tag]] + [tag2id['O']]*(len(word_tokens)-1))

        tokens = ["[CLS]"] + tokens + ["[SEP]"]
        tag_ids = [tag2id['O']] + tag_ids + [tag2id['O']]

        input_ids.append(tokenizer.convert_tokens_to_ids(tokens))
        labels.append(tag_ids)

    max_len = max(len(x) for x in input_ids)
    input_ids = [x + [0]*(max_len-len(x)) for x in input_ids]
    labels = [x + [tag2id['O']]*(max_len-len(x)) for x in labels]

    return torch.tensor(input_ids), torch.tensor(labels)

X_train, y_train = tokenize_and_align_labels(train_samples)
X_valid, y_valid = tokenize_and_align_labels(valid_samples)
X_test, y_test   = tokenize_and_align_labels(test_samples)


In [6]:
BATCH_SIZE = 8

train_dataset = TensorDataset(X_train, y_train)
valid_dataset = TensorDataset(X_valid, y_valid)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE)


In [7]:
config = AutoConfig.from_pretrained(MODEL_NAME, num_labels=len(schema))
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME, config=config)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [None]:
from torch.optim import AdamW

EPOCHS = 3
optimizer = AdamW(model.parameters(), lr=5e-5)
loss_fn = torch.nn.CrossEntropyLoss()

train_losses, valid_losses = [], []

for epoch in range(EPOCHS):
    # Training
    model.train()
    total_loss = 0
    for X, y in train_loader:
        X, y = X.to(device), y.to(device)
        outputs = model(X).logits
        loss = loss_fn(outputs.view(-1, len(schema)), y.view(-1))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    train_losses.append(total_loss / len(train_loader))




We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


In [None]:
    # Validation
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for X, y in valid_loader:
            X, y = X.to(device), y.to(device)
            outputs = model(X).logits
            loss = loss_fn(outputs.view(-1, len(schema)), y.view(-1))
            total_loss += loss.item()
    valid_losses.append(total_loss / len(valid_loader))

    print(f"Epoch {epoch+1}/{EPOCHS} | Train Loss: {train_losses[-1]:.4f} | Valid Loss: {valid_losses[-1]:.4f}")

In [None]:
plt.figure(figsize=(10,6))
plt.plot(train_losses, label="Train Loss")
plt.plot(valid_losses, label="Valid Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.show()
