In [2]:
!pip install datasets --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[?25h

Import packages

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, TensorDataset

from sklearn.metrics import accuracy_score, f1_score, balanced_accuracy_score, precision_score, recall_score

import numpy as np

from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(f"Using {device}...")

Using cuda...


Load CoNLL 2003 dataset

In [4]:
from datasets import load_dataset

# Load CoNLL 2003 dataset
conll2003 = load_dataset("conll2003")

# Accessing train, validation, and test splits
train_dataset = conll2003['train']
validation_dataset = conll2003['validation']
test_dataset = conll2003['test']


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading data:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/312k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/283k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

Download GloVe Embeddings

In [5]:
import torchtext.vocab as vocab

# Load GloVe embeddings
glove = vocab.GloVe(name='6B', dim=100)  # Load 100-dimensional GloVe embeddings

.vector_cache/glove.6B.zip: 862MB [02:39, 5.41MB/s]                           
100%|█████████▉| 399999/400000 [00:19<00:00, 20024.01it/s]


Pre-Processing Data

In [6]:
# Convert words to GloVe embeddings
def words_to_embeddings(words):
    embeddings = []
    for word in words:
        if word.lower() in glove.stoi:
            embeddings.append(glove.vectors[glove.stoi[word.lower()]])
        else:
            # Use zero vectors for out-of-vocabulary words
            embeddings.append(torch.zeros(100))
    return embeddings

# Convert labels to numerical representations
def convert_labels_to_indices(labels, split):
    label_indices = []
    for label in labels:
        if isinstance(label, str):
            label_indices.append(conll2003[split].features['ner_tags'].feature.str2int(label))
        else:
            # Handle the case where labels are already in numerical format
            label_indices.append(label)
    return torch.tensor(label_indices)


# Process the dataset
def process_dataset(dataset, split):
    tokenized_sentences = []
    label_indices = []
    for example in dataset:
        tokenized_sentences.append(words_to_embeddings(example['tokens']))
        label_indices.append(convert_labels_to_indices(example['ner_tags'], split))
    return tokenized_sentences, label_indices


# Process train, validation, and test datasets
train_sentences, train_labels = process_dataset(train_dataset, 'train')
validation_sentences, validation_labels = process_dataset(validation_dataset, 'validation')
test_sentences, test_labels = process_dataset(test_dataset, 'test')



Pad all the vectors

In [7]:
# Find the maximum sentence length among all splits
max_sentence_length = max(
    max(len(sentence) for sentence in train_sentences),
    max(len(sentence) for sentence in validation_sentences),
    max(len(sentence) for sentence in test_sentences)
)

# Pad sentences and labels to the maximum length
def pad_data(sentences, labels, max_length):
    padded_sentences = []
    padded_labels = []
    for sentence, label in zip(sentences, labels):
        # Calculate the number of padding tokens needed for sentences
        num_padding_tokens = max_length - len(sentence)
        # Pad the sentence
        padded_sentence = torch.nn.functional.pad(torch.stack(sentence), pad=(0, 0, 0, num_padding_tokens), mode='constant', value=0)
        padded_sentences.append(padded_sentence)

        # Pad the label
        padded_label = torch.nn.functional.pad(label, pad=(0, num_padding_tokens), mode='constant', value=0)
        padded_labels.append(padded_label)

    padded_sentences = torch.stack(padded_sentences, dim=0)
    padded_labels = torch.stack(padded_labels, dim=0)

    return padded_sentences, padded_labels

# Pad all splits
train_sentences_padded, train_labels_padded = pad_data(train_sentences, train_labels, max_sentence_length)
validation_sentences_padded, validation_labels_padded = pad_data(validation_sentences, validation_labels, max_sentence_length)
test_sentences_padded, test_labels_padded = pad_data(test_sentences, test_labels, max_sentence_length)


Check shapes of the arrays are similar

In [8]:
print(train_sentences_padded.shape, train_labels_padded.shape)
print(validation_sentences_padded.shape, validation_labels_padded.shape)
print(test_sentences_padded.shape, test_labels_padded.shape)


torch.Size([14041, 124, 100]) torch.Size([14041, 124])
torch.Size([3250, 124, 100]) torch.Size([3250, 124])
torch.Size([3453, 124, 100]) torch.Size([3453, 124])


# Models

Creating the Expert for the MoE. I've used a simple MLP as an expert

In [9]:
class Expert(nn.Module):
  """Simple expert for MoE layer"""
  def __init__(self, input_dim, expert_dim):
    super(Expert, self).__init__()
    self.input_dim = input_dim
    self.expert_dim = expert_dim

    self.fc1 = nn.Linear(input_dim, expert_dim*2)
    self.relu = nn.ReLU()
    self.dp1 = nn.Dropout(0.1)
    self.fc2 = nn.Linear(expert_dim*2, expert_dim)
    self.sigmoid = nn.Sigmoid()
    self.dp2 = nn.Dropout(0.1)

  def forward(self, x):
    """
    Forward pass of the expert
    """
    x = self.fc1(x)
    x = self.relu(x)
    x = self.dp1(x)
    x = self.fc2(x)
    x = self.sigmoid(x)
    x = self.dp2(x)
    return x

The MoE Layer

In [10]:
class MoE_Layer(nn.Module):
  """Custom Mixture of Experts Layer"""
  def __init__(self, input_dim, expert_dim, num_experts):
    super(MoE_Layer, self).__init__()
    self.input_dim = input_dim
    self.expert_dim = expert_dim
    self.num_experts = num_experts

    # Expert weights (weights for each expert input)
    self.expert_weights = nn.Linear(input_dim, num_experts, bias=True)
    self.softmax = nn.Softmax(dim=-1)

    # Expert layers
    self.experts = nn.ModuleList([Expert(input_dim, expert_dim) for _ in range(num_experts)])

  def forward(self, x):
    """
    Forward pass of the MoE layer:
      1. Compute expert weights based on input
      2. Pass input through each expert
      3. Gate expert outputs with weights
      4. Combine gated outputs

    Args:
      x: Input tensor (batch_size, input_dim)

    Returns:
      output: Combined output tensor (batch_size, expert_dim)
    """

    # 1. Compute expert weights
    weights = self.expert_weights(x) # (batch_size, num_experts)
    weights = self.softmax(weights) # Normalize weights using softmax

    # 2. Pass input through each expert
    expert_outputs = []
    for expert in self.experts:
      out = expert(x) # Pass through Expert
      expert_outputs.append(out) # (batch_size, expert_dim)

    # 3. Gate expert outputs with weights
    # print("exp_op0", torch.stack(expert_outputs, dim=0).shape)
    # print("exp_op1", torch.stack(expert_outputs, dim=1).shape)
    # print("exp_op2", torch.stack(expert_outputs, dim=2).shape)
    # print("weights", weights.shape)
    # print("weights_unsq", weights.unsqueeze(-1).shape)
    gated_outputs = torch.stack(expert_outputs, dim=2) * weights.unsqueeze(-1) # (batch_size, num_experts, expert_dim)

    # 4. Combine gated outputs
    output = torch.sum(gated_outputs, dim=2) # (batch_size, expert_dim)
    # print(torch.sum(gated_outputs, dim=0).shape)
    # print(torch.sum(gated_outputs, dim=1).shape)
    # print(torch.sum(gated_outputs, dim=2).shape)
    # print(torch.sum(gated_outputs, dim=3).shape)

    return output

LSTM + MoE Model

In [11]:
class LSTM_MoE(nn.Module):
    """LSTM model with a MoE layer in the middle"""
    def __init__(self, input_dim, hidden_dim, expert_dim, num_experts, output_dim):
        super(LSTM_MoE, self).__init__()

        self.lstm1 = nn.LSTM(input_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.moe = MoE_Layer(hidden_dim*2, expert_dim, num_experts)
        self.lstm2 = nn.LSTM(expert_dim, hidden_dim, batch_first=True, bidirectional=True)

        self.fc = nn.Linear(hidden_dim*2, output_dim)

    def forward(self, x):
        lstm_out1, _ = self.lstm1(x)        #   (batch size, 124, hidden_dim)
        moe_out = self.moe(lstm_out1)       #   (batch size, 124, expert_dim) <-- Best expert selected
        lstm_out2, _ = self.lstm2(moe_out)  #   (batch size, 124, hidden_dim)

        logits = self.fc(lstm_out2)
        probabilities = F.log_softmax(logits, dim=-1)

        return probabilities


Only LSTM Model

In [12]:
# Define the NER model
class NERModel(nn.Module):
    """Simple Stacked BiLSTM Model"""
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(NERModel, self).__init__()

        self.lstm1 = nn.LSTM(input_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.lstm2 = nn.LSTM(hidden_dim*2, hidden_dim, batch_first=True, bidirectional=True)

        self.fc = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, x):
        # print("in", x.shape)
        lstm_out1, _ = self.lstm1(x)
        # print("l1", lstm_out1.shape)
        lstm_out2, _ = self.lstm2(lstm_out1)
        # print("l2", lstm_out2.shape)

        logits = self.fc(lstm_out2)
        # print("op", logits.shape)
        probabilities = F.log_softmax(logits, dim=1)

        return probabilities


# Training

In [13]:
# Define hyperparameters
input_dim = 100  # Example input size
hidden_dim = 128
expert_dim = 64  # Dimension of the expert
num_experts = 8  # Number of experts in the MoE layer
output_dim = 9  # Example output size


In [14]:
# Instantiate the model
model = LSTM_MoE(input_dim, hidden_dim, expert_dim, num_experts, output_dim).to(device)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)


# Convert padded data to PyTorch DataLoader
train_data = TensorDataset(train_sentences_padded, train_labels_padded)
train_loader = DataLoader(train_data, batch_size=64, shuffle=True)

# Training loop
epochs = 10
for epoch in range(epochs):
    model.train()
    epoch_loss = 0
    total = 0
    correct = 0
    for batch_sentences, batch_labels in tqdm(train_loader, desc=f'Epoch {epoch + 1}/{epochs}', leave=False):
        optimizer.zero_grad()
        batch_sentences = batch_sentences.to(device)
        batch_labels = batch_labels.to(device)
        logits = model(batch_sentences)
        loss = criterion(logits.view(-1, output_dim), batch_labels.view(-1))
        loss.backward()
        optimizer.step()

        top_class = torch.argmax(logits, dim=-1)
        # Find accuracy
        total += top_class.flatten().size(0)
        correct += (top_class.flatten() == batch_labels.flatten()).sum().item()
        epoch_loss += loss.item()

    print(f'\nEpoch {epoch + 1}/{epochs}, Loss: {epoch_loss / len(train_loader):.4f}, Accuracy: {(100 * correct / total):.4f}%')

print('Training finished!')





Epoch 1/10, Loss: 0.1437, Accuracy: 97.4405%





Epoch 2/10, Loss: 0.0486, Accuracy: 98.4981%





Epoch 3/10, Loss: 0.0333, Accuracy: 98.9176%





Epoch 4/10, Loss: 0.0255, Accuracy: 99.2025%





Epoch 5/10, Loss: 0.0199, Accuracy: 99.3843%





Epoch 6/10, Loss: 0.0146, Accuracy: 99.5876%





Epoch 7/10, Loss: 0.0111, Accuracy: 99.6906%





Epoch 8/10, Loss: 0.0092, Accuracy: 99.7451%





Epoch 9/10, Loss: 0.0076, Accuracy: 99.7832%


                                                              


Epoch 10/10, Loss: 0.0065, Accuracy: 99.8170%
Training finished!




In [15]:
# Convert padded data to PyTorch DataLoader
test_data = TensorDataset(test_sentences_padded, test_labels_padded)
test_loader = DataLoader(test_data, batch_size=64, shuffle=True)

model.eval()
with torch.no_grad():
    test_loss = 0
    total = 0
    correct = 0
    preds = []
    trues = []
    for batch_sentences, batch_labels in tqdm(test_loader, desc=f'Testing...', leave=False):
        batch_sentences = batch_sentences.to(device)
        batch_labels = batch_labels.to(device)
        logits = model(batch_sentences)
        loss = criterion(logits.view(-1, output_dim), batch_labels.view(-1))

        top_class = torch.argmax(logits, dim=-1)
        # Find accuracy
        total += top_class.flatten().size(0)
        correct += (top_class.flatten() == batch_labels.flatten()).sum().item()
        epoch_loss += loss.item()
        for i in top_class.flatten().cpu().detach().numpy():
            preds.append(i)
        for i in batch_labels.flatten().cpu().detach().numpy():
            trues.append(i)

    print(f'\nTest Loss: {epoch_loss / len(test_loader):.4f}, Test Accuracy: {(100 * correct / total):.4f}%')
    preds = np.array(preds)
    trues = np.array(trues)

print('Testing finished!')




Test Loss: 0.0418, Test Accuracy: 99.5971%
Testing finished!


In [16]:
ac_sc = accuracy_score(trues, preds)

bac_sc = balanced_accuracy_score(trues, preds)

f1_sc_micro = f1_score(trues, preds, average='micro')
f1_sc_macro = f1_score(trues, preds, average='macro')
f1_sc_weighted = f1_score(trues, preds, average='weighted')

pr_sc_micro = precision_score(trues, preds, average='micro')
pr_sc_macro = precision_score(trues, preds, average='macro')
pr_sc_weighted = precision_score(trues, preds, average='weighted')

re_sc_micro = recall_score(trues, preds, average='micro')
re_sc_macro = recall_score(trues, preds, average='macro')
re_sc_weighted = recall_score(trues, preds, average='weighted')


In [17]:
print(f"Accuracy = {ac_sc:.5f}\n")

print(f"Balanced Accuracy = {bac_sc:.5f}\n")

print(f"F1 Score Macro = {f1_sc_macro:.5f}")
print(f"F1 Score Micro = {f1_sc_micro:.5f}")
print(f"F1 Score Weighted = {f1_sc_weighted:.5f}\n")

print(f"Precision Macro = {pr_sc_macro:.5f}")
print(f"Precision Micro = {pr_sc_micro:.5f}")
print(f"Precision Weighted = {pr_sc_weighted:.5f}\n")

print(f"Recall Macro = {re_sc_macro:.5f}")
print(f"Recall Micro = {re_sc_micro:.5f}")
print(f"Recall Weighted = {re_sc_weighted:.5f}\n")

Accuracy = 0.99597

Balanced Accuracy = 0.82213

F1 Score Macro = 0.82079
F1 Score Micro = 0.99597
F1 Score Weighted = 0.99597

Precision Macro = 0.82006
Precision Micro = 0.99597
Precision Weighted = 0.99598

Recall Macro = 0.82213
Recall Micro = 0.99597
Recall Weighted = 0.99597



In [18]:
# Instantiate the model
model = NERModel(input_dim, hidden_dim, output_dim).to(device)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)


# Convert padded data to PyTorch DataLoader
train_data = TensorDataset(train_sentences_padded, train_labels_padded)
train_loader = DataLoader(train_data, batch_size=64, shuffle=True)

# Training loop
epochs = 10
for epoch in range(epochs):
    model.train()
    epoch_loss = 0
    total = 0
    correct = 0
    for batch_sentences, batch_labels in tqdm(train_loader, desc=f'Epoch {epoch + 1}/{epochs}', leave=False):
        optimizer.zero_grad()
        batch_sentences = batch_sentences.to(device)
        batch_labels = batch_labels.to(device)
        logits = model(batch_sentences)
        loss = criterion(logits.view(-1, output_dim), batch_labels.view(-1))
        loss.backward()
        optimizer.step()

        top_class = torch.argmax(logits, dim=-1)
        # Find accuracy
        total += top_class.flatten().size(0)
        correct += (top_class.flatten() == batch_labels.flatten()).sum().item()
        epoch_loss += loss.item()

    print(f'\nEpoch {epoch + 1}/{epochs}, Loss: {epoch_loss / len(train_loader):.4f}, Accuracy: {(100 * correct / total):.4f}%')

print('Training finished!')





Epoch 1/10, Loss: 0.3152, Accuracy: 94.9127%





Epoch 2/10, Loss: 0.0668, Accuracy: 97.8581%





Epoch 3/10, Loss: 0.0506, Accuracy: 98.3534%





Epoch 4/10, Loss: 0.0411, Accuracy: 98.6218%





Epoch 5/10, Loss: 0.0358, Accuracy: 98.7458%





Epoch 6/10, Loss: 0.0330, Accuracy: 98.8187%





Epoch 7/10, Loss: 0.0311, Accuracy: 98.8667%





Epoch 8/10, Loss: 0.0294, Accuracy: 98.9060%





Epoch 9/10, Loss: 0.0282, Accuracy: 98.9278%


                                                              


Epoch 10/10, Loss: 0.0271, Accuracy: 98.9592%
Training finished!




In [19]:
# Convert padded data to PyTorch DataLoader
test_data = TensorDataset(test_sentences_padded, test_labels_padded)
test_loader = DataLoader(test_data, batch_size=64, shuffle=True)

model.eval()
with torch.no_grad():
    test_loss = 0
    total = 0
    correct = 0
    preds = []
    trues = []
    for batch_sentences, batch_labels in tqdm(test_loader, desc=f'Testing...', leave=False):
        batch_sentences = batch_sentences.to(device)
        batch_labels = batch_labels.to(device)
        logits = model(batch_sentences)
        loss = criterion(logits.view(-1, output_dim), batch_labels.view(-1))

        top_class = torch.argmax(logits, dim=-1)
        # Find accuracy
        total += top_class.flatten().size(0)
        correct += (top_class.flatten() == batch_labels.flatten()).sum().item()
        epoch_loss += loss.item()
        for i in top_class.flatten().cpu().detach().numpy():
            preds.append(i)
        for i in batch_labels.flatten().cpu().detach().numpy():
            trues.append(i)

    print(f'\nTest Loss: {epoch_loss / len(test_loader):.4f}, Test Accuracy: {(100 * correct / total):.4f}%')
    preds = np.array(preds)
    trues = np.array(trues)

print('Testing finished!')

                                                           


Test Loss: 0.1412, Test Accuracy: 98.8362%
Testing finished!




In [20]:
ac_sc = accuracy_score(trues, preds)

bac_sc = balanced_accuracy_score(trues, preds)

f1_sc_micro = f1_score(trues, preds, average='micro')
f1_sc_macro = f1_score(trues, preds, average='macro')
f1_sc_weighted = f1_score(trues, preds, average='weighted')

pr_sc_micro = precision_score(trues, preds, average='micro')
pr_sc_macro = precision_score(trues, preds, average='macro')
pr_sc_weighted = precision_score(trues, preds, average='weighted')

re_sc_micro = recall_score(trues, preds, average='micro')
re_sc_macro = recall_score(trues, preds, average='macro')
re_sc_weighted = recall_score(trues, preds, average='weighted')


In [21]:
print(f"Accuracy = {ac_sc:.5f}\n")

print(f"Balanced Accuracy = {bac_sc:.5f}\n")

print(f"F1 Score Macro = {f1_sc_macro:.5f}")
print(f"F1 Score Micro = {f1_sc_micro:.5f}")
print(f"F1 Score Weighted = {f1_sc_weighted:.5f}\n")

print(f"Precision Macro = {pr_sc_macro:.5f}")
print(f"Precision Micro = {pr_sc_micro:.5f}")
print(f"Precision Weighted = {pr_sc_weighted:.5f}\n")

print(f"Recall Macro = {re_sc_macro:.5f}")
print(f"Recall Micro = {re_sc_micro:.5f}")
print(f"Recall Weighted = {re_sc_weighted:.5f}\n")

Accuracy = 0.98836

Balanced Accuracy = 0.73907

F1 Score Macro = 0.62527
F1 Score Micro = 0.98836
F1 Score Weighted = 0.98924

Precision Macro = 0.58051
Precision Micro = 0.98836
Precision Weighted = 0.99135

Recall Macro = 0.73907
Recall Micro = 0.98836
Recall Weighted = 0.98836

