In [47]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader, random_split
import torch.nn as nn

Data Preprocessing

In [None]:
dataset = pd.read_csv(r".\NER dataset.csv", encoding='ISO-8859-1')

print(dataset.head(50))
print(dataset.isna().sum())
print(dataset['Tag'].value_counts())

     Sentence #           Word  POS    Tag
0   Sentence: 1      Thousands  NNS      O
1           NaN             of   IN      O
2           NaN  demonstrators  NNS      O
3           NaN           have  VBP      O
4           NaN        marched  VBN      O
5           NaN        through   IN      O
6           NaN         London  NNP  B-geo
7           NaN             to   TO      O
8           NaN        protest   VB      O
9           NaN            the   DT      O
10          NaN            war   NN      O
11          NaN             in   IN      O
12          NaN           Iraq  NNP  B-geo
13          NaN            and   CC      O
14          NaN         demand   VB      O
15          NaN            the   DT      O
16          NaN     withdrawal   NN      O
17          NaN             of   IN      O
18          NaN        British   JJ  B-gpe
19          NaN         troops  NNS      O
20          NaN           from   IN      O
21          NaN           that   DT      O
22         

In [49]:
# Filling Na
dataset = dataset.ffill()

In [50]:
# Grouping words and tags by sentence

sentences = []
labels = []

grouped = dataset.groupby('Sentence #')

for _, group in grouped:
    words = list(group['Word'])
    tags = list(group['Tag'])
    sentences.append(words)
    labels.append(tags)

In [51]:
print(len(sentences))
print(len(labels))

47959
47959


In [52]:
class SentenceVocab:
    def __init__(self):
        
        self.max_sent_len = 0
        self.n_words = 0
        self.word2idx = {}
        self.idx2word = {}

        # Special tokens
        self.PAD_TOKEN = "<PAD>"
        self.UNK_TOKEN = "<UNK>"

        self.addWord(self.PAD_TOKEN) # index 0
        self.addWord(self.UNK_TOKEN) # index 1

    def addSentence(self, sentence):
        if len(sentence) > self.max_sent_len:
            self.max_sent_len = len(sentence)

        for word in sentence:
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2idx:
            self.word2idx[word] = self.n_words
            self.idx2word[self.n_words] = word
            self.n_words += 1

class TagVocab:
    def __init__(self):
        self.n_tags = 0
        self.tag2idx = {}
        self.idx2tag = {}

    def addTags(self, tags):
        for tag in tags:
            self.addTag(tag)

    def addTag(self, tag):
        if tag not in self.tag2idx:
            self.tag2idx[tag] = self.n_tags
            self.idx2tag[self.n_tags] = tag
            self.n_tags += 1


In [53]:
word_vocab = SentenceVocab()
tag_vocab = TagVocab()

for word, tag_seq in zip(sentences, labels):
    word_vocab.addSentence(word)
    tag_vocab.addTags(tag_seq)

Creating Dataloader class

In [54]:
class NERDataset(Dataset):
    def __init__(self, sentences, tags, word_vocab, tag_vocab):
        """
        sentences: list of list of words
        tags: list of list of tags
        word_vocab: Vocab object for words
        tag_vocab: Vocab object for tags
        """

        self.sentences = sentences
        self.tags = tags
        self.word_vocab = word_vocab
        self.tag_vocab = tag_vocab

    def __len__(self):
        return len(self.sentences)
    
    def __getitem__(self, idx):
        return self.sentences[idx], self.tags[idx]
        
def pad_sequence(sequences, pad_value):
    max_len = max(len(seq) for seq in sequences)
    return [seq + [pad_value]*(max_len- len(seq)) for seq in sequences]

def create_collate_fn(word_vocab:SentenceVocab, tag_vocab:TagVocab):
    def collate_fn(batch):
        """
        batch: list of (sentence, tag) pairs
        """

        sentences, tags = zip(*batch)

        # convert words to IDs
        word_ids = [[word_vocab.word2idx.get(w, word_vocab.word2idx['<UNK>']) for w in sent] for sent in sentences]
        tag_ids = [[tag_vocab.tag2idx.get(t) for t in tag_seq] for tag_seq in tags]

        # pad sequences
        word_ids = pad_sequence(word_ids, word_vocab.word2idx["<PAD>"])
        tag_ids = pad_sequence(tag_ids, tag_vocab.tag2idx["O"]) # default pad tag = "O" (outside)
        
        # Create attention mask (1 for real tokens, 0 for pad)
        attention_masks = [[1 if token != word_vocab.word2idx["<PAD>"] else 0 for token in seq] for seq in word_ids]

        return (
            torch.tensor(word_ids, dtype=torch.long),
            torch.tensor(tag_ids, dtype=torch.long),
            torch.tensor(attention_masks, dtype=torch.long)
        )
    
    return collate_fn

def get_device():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    return device

Creating Encoder Only Transformer

In [55]:
class MultiHeadAttention(nn.Module):
    def __init__(self, hidden_size, num_heads=8, masked=False):
        super(MultiHeadAttention, self).__init__()

        assert hidden_size % num_heads == 0, "hidden_size must be divisible by num_heads"

        self.hidden_size = hidden_size
        self.head_dim = hidden_size // num_heads
        self.num_heads = num_heads
        self.masked = masked

        # Defining query, key and value weights
        self.query_W = nn.Linear(self.hidden_size, hidden_size)
        self.key_W = nn.Linear(self.hidden_size, self.hidden_size)
        self.value_W = nn.Linear(self.hidden_size, self.hidden_size)

        # Output Layer
        self.output_layer = nn.Linear(self.hidden_size, self.hidden_size)

    
    def forward(self, x, mask):
        batch_size, seq_len, _ = x.size()

        key = self.key_W(x)
        query = self.query_W(x)
        value = self.value_W(x)

        Q = self.split_head(query)
        K = self.split_head(key)
        V = self.split_head(value)

        # Calculate Attention Scores.
        scores = torch.matmul(Q, K.transpose(-2, -1)) / (self.head_dim**0.5)

        if self.masked and mask is not None:
            scores = scores.masked_fill(mask==0, float("-1e20"))

        attn_weights = torch.softmax(scores, dim=-1)
        attn_output = torch.matmul(attn_weights, V)

        # Concatenate heads
        # Note: view() requires the underlying tensor to be contiguous. If you try to call view() on a non-contiguous tensor, PyTorch will raise a runtime error.
        attn_output = attn_output.transpose(1, 2).contiguous().view(batch_size, seq_len, self.num_heads * self.head_dim)

        # Final output projection
        output = self.output_layer(attn_output)

        return output

    def split_head(self, x):
        batch_size, seq_len, _ = x.size()
        return x.reshape(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
    
class TransformerBlock(nn.Module):
    def __init__(self, embd_size, heads, dropout, forward_expansion):
        super(TransformerBlock, self).__init__()

        self.attention = MultiHeadAttention(embd_size, heads)

        # Normalization layers
        self.norm_1 = nn.LayerNorm(embd_size)
        self.norm_2 = nn.LayerNorm(embd_size)

        # Feedforward Network
        self.feedforward = nn.Sequential(
            nn.Linear(embd_size, forward_expansion*embd_size),
            nn.ReLU(),
            nn.Linear(forward_expansion*embd_size, embd_size)
        )

        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
    
        # implementing pre normalization
        attn_out = self.attention(self.norm_1(x), mask)

        x = x + self.dropout(attn_out)

        ff_out = self.feedforward(self.norm_2(x))
        out = x + self.dropout(ff_out)

        return out


class Encoder(nn.Module):
    def __init__(self, embd_dim, heads, forward_expansion, dropout, vocab_size, max_sent_len):
        super(Encoder, self).__init__()

        self.embd_dim = embd_dim
        self.heads = heads

        self.word_embd = nn.Embedding(vocab_size, embd_dim)
        self.pos_embd = nn.Embedding(max_sent_len, embd_dim)

        self.layers = nn.ModuleList(
            [
                TransformerBlock(
                    embd_dim,
                    heads,
                    dropout,
                    forward_expansion
                )
            ]
        )

        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):

        N, seq_length = x.shape
        positions = torch.arange(0, seq_length).expand(N, seq_length).to(x.device)

        out = self.dropout(self.word_embd(x) + self.pos_embd(positions))

        for layer in self.layers:
            out = layer(out, mask) # Key, Query, Value all will be same

        return out
    
class NERNeuralNetwork(nn.Module):
    def __init__(self, embd_dim, heads, forward_expansion, dropout, vocab_size, max_sent_len, total_tags):
        super(NERNeuralNetwork, self).__init__()

        self.encoder = Encoder(embd_dim=embd_dim, heads=heads, forward_expansion=forward_expansion, dropout=dropout, vocab_size=vocab_size, max_sent_len=max_sent_len)
        self.linear = nn.Linear(embd_dim, total_tags)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, attn_mask):
        x = self.encoder(x, attn_mask)
        x = self.dropout(x)
        x = self.linear(x)
        return x

In [56]:
# Implementing Training Loop
import os
import time
from torch import optim
from tqdm import tqdm

device = get_device()

def train(dataloader, ner_model, n_epochs, print_every, learning_rate):
    start = time.time()

    ner_model = ner_model.to(device)

    optimizer = optim.Adam(ner_model.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss(ignore_index=0)

    # Initialize epoch progress bar
    epoch_bar = tqdm(range(1, n_epochs + 1), desc="Training", unit="epoch")

    for epoch in epoch_bar:
        
        ner_model.train()
        total_loss = 0
        num_batches = 0

        # Initialize batch progress bar for current epoch
        batch_bar = tqdm(dataloader, desc=f"Epoch {epoch}/{n_epochs}", leave=False)

        for inputs, labels, attn_mask in batch_bar:

            inputs = inputs.to(device)
            labels = labels.to(device)
            attn_mask = attn_mask.to(device)

            # Get predictions from the model
            output = ner_model(inputs, attn_mask)

            # Empty gradients
            optimizer.zero_grad()

            loss = criterion(output.view(-1, output.shape[-1]), labels.view(-1))

            # calculate gradients w.r.t loss function.
            loss.backward()

            # Update the weights.
            optimizer.step()

            total_loss += loss.item()
            num_batches += 1

            # Update batch progress bar
            batch_bar.set_postfix({'Loss': f'{loss.item():.4f}'})

        # Calculate average loss for this epoch
        avg_loss = total_loss / num_batches
        
        # Update epoch progress bar
        epoch_bar.set_postfix({'Avg Loss': f'{avg_loss:.4f}'})

        # Print summary every few epochs
        if epoch % print_every == 0:
            tqdm.write(f"Epoch {epoch}/{n_epochs} - Avg Loss: {avg_loss:.4f}")

    print(f"Training finished in {time.time() - start:.2f}s")

    print("Saving model...")
    ner_path = os.path.join("./", "ner_model.pth")
    torch.save(ner_model.state_dict(), ner_path)
    print(f"Model saved to {ner_path}")

def evaluate(dataloader, ner_model):
    # set the model to evaluate mode.
    ner_model.eval()
    total_loss = 0.0
    total_correct = 0
    total_tokens = 0

    criterion = nn.CrossEntropyLoss(ignore_index=0)

    with torch.no_grad():
        for inputs, labels, attn_mask in dataloader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            attn_mask = attn_mask.to(device)

            outputs = ner_model(inputs, attn_mask)

            loss = criterion(outputs.view(-1, outputs.shape[-1]), labels.view(-1))
            total_loss += loss.item()

            # Calculate accuracy (optional)
            predictions = outputs.argmax(dim=-1)  # pick class with max logit
            mask = labels != 0  # ignore PAD tokens (if 0 is PAD)
            correct = (predictions == labels) & mask
            total_correct += correct.sum().item()
            total_tokens += mask.sum().item()

    avg_loss = total_loss / len(dataloader)
    accuracy = total_correct / total_tokens if total_tokens > 0 else 0.0
    return avg_loss, accuracy

In [57]:
dataset = NERDataset(sentences=sentences, tags=labels, word_vocab=word_vocab, tag_vocab=tag_vocab)

# Define split sizes (e.g., 80% train, 20% test)
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size

my_collate_fn = create_collate_fn(word_vocab, tag_vocab)

# Random split
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

# Create dataloaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=my_collate_fn)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=my_collate_fn)

# train_dataloader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=my_collate_fn)

ner_model = NERNeuralNetwork(embd_dim=256, heads=8, forward_expansion=3, dropout=0.5, vocab_size=word_vocab.n_words, max_sent_len=word_vocab.max_sent_len, total_tags=tag_vocab.n_tags)

train(dataloader=train_loader, ner_model=ner_model, print_every=10, learning_rate=0.001, n_epochs=50)

print('Evaluating the model.')

ner_model.load_state_dict(torch.load("./ner_model.pth"))

val_loss, val_acc = evaluate(ner_model=ner_model, dataloader=test_loader)
print(f"Validation Loss: {val_loss:.4f}, Accuracy: {val_acc:.4f}")

Training:  20%|██        | 10/50 [03:08<12:33, 18.84s/epoch, Avg Loss=0.5032]

Epoch 10/50 - Avg Loss: 0.5032


Training:  40%|████      | 20/50 [06:09<09:03, 18.10s/epoch, Avg Loss=0.3594]

Epoch 20/50 - Avg Loss: 0.3594


Training:  60%|██████    | 30/50 [09:10<06:04, 18.22s/epoch, Avg Loss=0.2766]

Epoch 30/50 - Avg Loss: 0.2766


Training:  80%|████████  | 40/50 [12:18<03:09, 18.94s/epoch, Avg Loss=0.2177]

Epoch 40/50 - Avg Loss: 0.2177


Training: 100%|██████████| 50/50 [15:28<00:00, 18.56s/epoch, Avg Loss=0.1724]


Epoch 50/50 - Avg Loss: 0.1724
Training finished in 928.10s
Saving model...
Model saved to ./ner_model.pth
Evaluating the model.
Validation Loss: 1.1910, Accuracy: 0.8502
