# 1.1 Word Embedding

In [1]:
import gensim.downloader

In [2]:
# Download the embeddings
w2v = gensim.downloader.load('word2vec-google-news-300')

# 1.2 Data

In [3]:
from datasets import Dataset, DatasetDict
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# read data
def read_conll_file(file_path):
    with open(file_path, "r") as f:
        content = f.read().strip()
        sentences = content.split("\n\n")
        data = []
        for sentence in sentences:
            tokens = sentence.split("\n")
            token_data = []
            for token in tokens:
                token_data.append(token.split())
            data.append(token_data)
    return data


train_data = read_conll_file("/mnt/lustre/yuxin/SC4002_G06/datasets/CoNLL2003/eng.train")
validation_data = read_conll_file("/mnt/lustre/yuxin/SC4002_G06/datasets/CoNLL2003/eng.testa")
test_data = read_conll_file("/mnt/lustre/yuxin/SC4002_G06/datasets/CoNLL2003/eng.testb")

In [5]:
# prepare data
def convert_to_dataset(data, label_map):
    formatted_data = {"tokens": [], "ner_tags": []}
    for sentence in data:
        tokens = [token_data[0] for token_data in sentence]
        ner_tags = [label_map[token_data[3]] for token_data in sentence]
        formatted_data["tokens"].append(tokens)
        formatted_data["ner_tags"].append(ner_tags)
    return Dataset.from_dict(formatted_data)


label_list = sorted(list(set([token_data[3] for sentence in train_data for token_data in sentence])))
label_map = {label: i for i, label in enumerate(label_list)}

train_dataset = convert_to_dataset(train_data, label_map)
validation_dataset = convert_to_dataset(validation_data, label_map)
test_dataset = convert_to_dataset(test_data, label_map)

# Dataset Preparation

## Tokenization

1. `<PAD>` Token:

This token is typically initialized to a zero vector because it's meant to be a neutral padding value that doesn't interfere with computation

2. `<UNK>` Token:
- Zero Vector: Similar to the `<PAD>` token, you can initialize it to a zero vector.
- Average Vector: Initialize it as the average of all word vectors in your pretrained embeddings. This gives it a kind of "average" representation of the langua
- Random Vector: Randomly initialize it, which might add some noise and robustness to the embeddi

For many tasks, initializing the `<UNK>` token as the average of all word vectors works well. It makes the `<UNK>` token have a representation that is, on average, similar to any random word from the vocabulary, which can be beneficial since the `<UNK>` token is used for words that aren't in the training vocabulary but could be anywhere in the semantic space.ngs.s.

In [6]:
# Out-of-vocabulary (OOV) words
# 1. can be replaced with a special token, such as "<OOV>" or "<UNK>".
# 2. can be ignored.

word2idx = w2v.key_to_index
print(f"whether <UNK> in w2v: {'<UNK>' in word2idx}") # False
print(f"whether <PAD> in w2v: {'<PAD>' in word2idx}") # False

# Define a vocabulary array by appending '<UNK>' and '<PAD>' tokens
#voc = list(word2idx.keys())
#voc.extend(['<UNK>', '<PAD>'])

# Add '<UNK>' and '<PAD>' tokens to the vocabulary index
word2idx['<UNK>'] = len(word2idx)
word2idx['<PAD>'] = len(word2idx)

print(f"word2idx['<UNK>']: {word2idx['<UNK>']}")
print(f"word2idx['<PAD>']: {word2idx['<PAD>']}")

whether <UNK> in w2v: False
whether <PAD> in w2v: False
word2idx['<UNK>']: 3000000
word2idx['<PAD>']: 3000001


In [7]:
print("init shape", w2v.vectors.shape)

# add the '<UNK>' word to the vocabulary of the Word2Vec model 
# initialize it with the average of all word vectors int he pretrained embeddings.
unk_vector = np.mean(w2v.vectors, axis=0)
w2v.vectors = np.vstack([w2v.vectors, unk_vector])
print("after insert UNK: ", w2v.vectors.shape)

# add the '<PAD>' word to the vocabulary of the Word2Vec model 
# initialize it with a row of zeros in the vectors matrix.
w2v.vectors = np.vstack([w2v.vectors, np.zeros(w2v.vectors[0].shape)])
print("after insert PAD: ", w2v.vectors.shape)

init shape (3000000, 300)
after insert UNK:  (3000001, 300)
after insert PAD:  (3000002, 300)


In [8]:
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader

# Map words to Indices
def sentence_to_indices(sentence, vocab):
    return [vocab.get(word, vocab.get('<UNK>')) for word in sentence]

tag2idx = {
    'B-LOC': 0,
    'B-MISC': 1,
    'B-ORG': 2,
    'I-LOC': 3,
    'I-MISC': 4,
    'I-ORG': 5,
    'I-PER': 6,
    'O': 7,
    'PAD': 8
}

class NERDataset(Dataset):
    def __init__(self, sentences, tags, vocab):
        self.sentences = [torch.tensor(sentence_to_indices(sentence, vocab)) for sentence in sentences]
        self.tags = [torch.tensor(tag) for tag in tags]
        
    def __len__(self):
        return len(self.sentences)
    
    def __getitem__(self, idx):
        return self.sentences[idx], self.tags[idx]

def collate_fn(batch):
    sentences, tags = zip(*batch)
    sentences_padded = pad_sequence(sentences, batch_first=True, padding_value=word2idx['<PAD>'])
    tags_padded = pad_sequence(tags, batch_first=True, padding_value=tag2idx['PAD'])  # Assuming 'O' is the padding value for tags
    return sentences_padded, tags_padded

# Create PyTorch datasets and data loaders
train_dataset = NERDataset(train_dataset['tokens'], train_dataset['ner_tags'], word2idx)
validation_dataset = NERDataset(validation_dataset['tokens'], validation_dataset['ner_tags'], word2idx)
test_dataset = NERDataset(test_dataset['tokens'], test_dataset['ner_tags'], word2idx)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
validation_loader = DataLoader(validation_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)

# 1.3 Model

In [9]:
# just to check the padding text token and tags
for batch in train_loader:
    print(len(batch[0]))
    print("sentence: ", batch[0][0])
    print("tagss: ", batch[1][0])
    break

32
sentence:  tensor([   2400, 3000000,     321, 3000000,    2612,    2119,       1,     778,
              1, 3000000,   10130,     517, 3000000,     196,      11,     850,
              8, 3000000,    5628,   24091, 3000000, 3000001, 3000001, 3000001,
        3000001, 3000001, 3000001, 3000001, 3000001, 3000001, 3000001, 3000001,
        3000001, 3000001, 3000001, 3000001, 3000001, 3000001, 3000001])
tagss:  tensor([6, 7, 7, 7, 7, 7, 7, 3, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8,
        8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8])


## Define metrics

In [14]:
from seqeval.metrics import f1_score
from seqeval.metrics import classification_report
from seqeval.scheme import IOB1

In [2]:
y_true = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
y_pred = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
f1_score(y_true, y_pred)

0.5

In [3]:
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

        MISC       0.00      0.00      0.00         1
         PER       1.00      1.00      1.00         1

   micro avg       0.50      0.50      0.50         2
   macro avg       0.50      0.50      0.50         2
weighted avg       0.50      0.50      0.50         2



# Models

In [9]:
import torch
import torch.nn as nn

embedding_matrix = torch.FloatTensor(w2v.vectors)

class LinearNERModel(nn.Module):
    def __init__(self, embedding_dim, output_dim):
        super(LinearNERModel, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, padding_idx=word2idx['<PAD>'], freeze=True)
        self.fc = nn.Linear(embedding_dim, output_dim)
        
    def forward(self, x):
        x = self.embedding(x)
        tag_space = self.fc(x)
        tag_scores = torch.log_softmax(tag_space, dim=-1)
        return tag_scores


class FeedforwardNERModel(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, output_dim):
        super(FeedforwardNERModel, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, padding_idx=word2idx['<PAD>'], freeze=True)
        self.fc1 = nn.Linear(embedding_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, x):
        x = self.embedding(x)
        x = self.fc1(x)
        x = self.relu(x)
        tag_space = self.fc2(x)
        tag_scores = torch.log_softmax(tag_space, dim=-1)
        return tag_scores


class DeepFeedforwardNERModel(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, output_dim, dropout_rate=0.5):
        super(DeepFeedforwardNERModel, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, padding_idx=word2idx['<PAD>'], freeze=True)
        self.fc1 = nn.Linear(embedding_dim, hidden_dim)
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(dropout_rate)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(dropout_rate)
        self.fc3 = nn.Linear(hidden_dim, hidden_dim)
        self.relu3 = nn.ReLU()
        self.dropout3 = nn.Dropout(dropout_rate)
        self.fc4 = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, x):
        x = self.embedding(x)
        x = self.fc1(x)
        x = self.relu1(x)
        x = self.dropout1(x)
        x = self.fc2(x)
        x = self.relu2(x)
        x = self.dropout2(x)
        x = self.fc3(x)
        x = self.relu3(x)
        x = self.dropout3(x)
        tag_space = self.fc4(x)
        return tag_space


class RNNNERModel(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, output_dim):
        super(RNNNERModel, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, padding_idx=word2idx['<PAD>'], freeze=True)
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, x):
        x = self.embedding(x)
        rnn_out, _ = self.rnn(x)
        tag_space = self.fc(rnn_out)
        tag_scores = torch.log_softmax(tag_space, dim=-1)
        return tag_scores


class LSTMNERModel(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, output_dim):
        super(LSTMNERModel, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix,  padding_idx=word2idx['<PAD>'], freeze=True)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, x):
        x = self.embedding(x)
        lstm_out, _ = self.lstm(x)
        tag_space = self.fc(lstm_out)
        tag_scores = torch.log_softmax(tag_space, dim=-1)
        return tag_scores


class BiLSTMNERModel(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, output_dim):
        super(BiLSTMNERModel, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, padding_idx=word2idx['<PAD>'], freeze=True)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim*2, output_dim)
        
    def forward(self, x):
        x = self.embedding(x)
        lstm_out, _ = self.lstm(x)
        tag_space = self.fc(lstm_out)
        tag_scores = torch.log_softmax(tag_space, dim=-1)
        return tag_scores


class TransformerEncNERModel(nn.Module):
    def __init__(self, embedding_dim, nhead, num_encoder_layers, output_dim):
        super(TransformerEncNERModel, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, padding_idx=word2idx['<PAD>'], freeze=True)
        self.transformer_encoder = nn.TransformerEncoder(nn.TransformerEncoderLayer(d_model=embedding_dim, nhead=nhead), num_layers=num_encoder_layers)
        self.fc = nn.Linear(embedding_dim, output_dim)
        
    def forward(self, x):
        x = self.embedding(x)
        transformer_out = self.transformer_encoder(x)
        tag_space = self.fc(transformer_out)
        tag_scores = torch.log_softmax(tag_space, dim=-1)
        return tag_scores

In [67]:
from torchcrf import CRF

class LSTMCRFNERModel(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, output_dim):
        super(LSTMCRFNERModel, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, padding_idx=word2idx['<PAD>'], freeze=True)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.crf = CRF(output_dim, batch_first=True)
        
        # manually set the transition scores for transitions to the <PAD> token
        PAD_TAG_INDEX = tag2idx['PAD']
        self.crf.transitions.data[:, PAD_TAG_INDEX] = -10000.0
        self.crf.transitions.data[PAD_TAG_INDEX, :] = -10000.0
        
    def forward(self, x, tags=None):
        x = self.embedding(x)
        lstm_out, _ = self.lstm(x)
        emissions = self.fc(lstm_out)
        if tags is not None:
            loss = -self.crf(emissions, tags)
            return loss
        else:
            predictions = self.crf.decode(emissions)
            return predictions

# Compile

In [11]:
# Hyperparameters
EMBEDDING_DIM = 300
HIDDEN_DIM = 150
# OUTPUT_DIM = len(label_list)  # Number of unique tags/labels
VOCAB_SIZE = len(word2idx)
TAGSET_SIZE = len(tag2idx)
DROPOUT_RATE = 0.5 
NHEAD = 6
NUM_ENCODER_LAYERS = 3


model_linear = LinearNERModel(EMBEDDING_DIM, TAGSET_SIZE)
model_ff = FeedforwardNERModel(EMBEDDING_DIM, HIDDEN_DIM, TAGSET_SIZE)
model_deep_ff = DeepFeedforwardNERModel(EMBEDDING_DIM, HIDDEN_DIM, TAGSET_SIZE, dropout_rate=DROPOUT_RATE)
model_rnn = RNNNERModel(EMBEDDING_DIM, HIDDEN_DIM, TAGSET_SIZE)
model_lstm = LSTMNERModel(EMBEDDING_DIM, HIDDEN_DIM, TAGSET_SIZE)
model_bilstm = BiLSTMNERModel(EMBEDDING_DIM, HIDDEN_DIM, TAGSET_SIZE)
model_transformer = TransformerEncNERModel(EMBEDDING_DIM, NHEAD, NUM_ENCODER_LAYERS, TAGSET_SIZE)
model_lstm_crf = LSTMCRFNERModel(EMBEDDING_DIM, HIDDEN_DIM, TAGSET_SIZE)

loss_function = nn.NLLLoss(ignore_index=tag2idx['PAD']) #  ignore the padding label during training
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Train

In [12]:
def idx_to_tags(indices, idx2tag):
    return [idx2tag[idx] for idx in indices]

idx2tag = {v: k for k, v in tag2idx.items()}
idx2tag

{0: 'B-LOC',
 1: 'B-MISC',
 2: 'B-ORG',
 3: 'I-LOC',
 4: 'I-MISC',
 5: 'I-ORG',
 6: 'I-PER',
 7: 'O',
 8: 'PAD'}

In [15]:
# Assuming you've created dataloaders for training and validation data
num_epochs = 50
patience = 5  # Number of epochs to wait for improvement before stopping

# Training loop
best_f1 = 0.0
epochs_no_improve = 0
for epoch in range(num_epochs):  # Number of epochs
    total_loss = 0
    for sentences, tags in train_loader:
        model.zero_grad()
        tag_scores = model(sentences)
        loss = loss_function(tag_scores.view(-1, TAGSET_SIZE), tags.view(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss}")

    if (epoch+1) % 1 == 0:
        print("evaluate.....")
        # Evaluate on the validation dataset
        # Placeholder to store true and predicted tags
        y_true = [] # true tags
        y_pred = [] # predicted tags
        
        # Evaluate the model on the validation dataset
        model.eval()  # Set the model to evaluation mode
        with torch.no_grad():
            for sentences, tags in validation_loader:
                tag_scores = model(sentences)
                predictions = tag_scores.argmax(dim=-1).tolist()
                
                # Convert index to tags
                # Note: filtering out padding tokens
                for sentence, true_seq, pred_seq in zip(sentences, tags.tolist(), predictions):
                    valid_length = (sentence != word2idx['<PAD>']).sum().item()
                    true_tags = [idx2tag[idx] for idx in true_seq[:valid_length]]
                    pred_tags = [idx2tag[idx] for idx in pred_seq[:valid_length]]
                    y_true.append(true_tags)
                    y_pred.append(pred_tags)
        
        # Compute F1 score
        f1 = f1_score(y_true, y_pred, mode='strict', scheme=IOB1)
        print(f"F1 Score (Epoch {epoch+1}): {f1}")
        
        # Early stopping based on F1 score
        if f1 > best_f1:
            best_f1 = f1
            epochs_no_improve = 0
            # Save the best model
            # best_model = model.state_dict()
        else:
            epochs_no_improve += 1
            if epochs_no_improve == patience:
                print("Early stopping triggered!")
                report = classification_report(y_true, y_pred, mode='strict', scheme=IOB1)
                print(f"Classification Report (Epoch {epoch+1}):\n{report}")
                break

Epoch 1, Loss: 49.96592352539301
evaluate.....
F1 Score (Epoch 1): 0.8334320277472296
Epoch 2, Loss: 39.020609095692635
evaluate.....
F1 Score (Epoch 2): 0.8494859261756279
Epoch 3, Loss: 31.90361682511866
evaluate.....
F1 Score (Epoch 3): 0.8663960764417385
Epoch 4, Loss: 26.3493274115026
evaluate.....
F1 Score (Epoch 4): 0.8721255833686891
Epoch 5, Loss: 21.5887668132782
evaluate.....
F1 Score (Epoch 5): 0.8758579781374459
Epoch 6, Loss: 17.58028271794319
evaluate.....
F1 Score (Epoch 6): 0.8775025449609772
Epoch 7, Loss: 13.400151531212032
evaluate.....
F1 Score (Epoch 7): 0.8802638254693049
Epoch 8, Loss: 10.22610445600003
evaluate.....
F1 Score (Epoch 8): 0.8818497501482172
Epoch 9, Loss: 7.748088906053454
evaluate.....
F1 Score (Epoch 9): 0.8806996688460559
Epoch 10, Loss: 5.589669164852239
evaluate.....
F1 Score (Epoch 10): 0.8711193153213628
Epoch 11, Loss: 4.21074063499691
evaluate.....
F1 Score (Epoch 11): 0.8768662976010736
Epoch 12, Loss: 3.132900462136604
evaluate.....
F1 

In [70]:
# Assuming you've created dataloaders for training and validation data
num_epochs = 50
patience = 5  # Number of epochs to wait for improvement before stopping

# Training loop for LSTMCRF Model
best_f1 = 0.0
epochs_no_improve = 0
for epoch in range(num_epochs):  # Number of epochs
    total_loss = 0
    for sentences, tags in train_loader:
        model.zero_grad()
        loss = model(sentences, tags)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss}")

    if (epoch+1) % 1 == 0:
        print("evaluate.....")
        # Evaluate on the validation dataset
        # Placeholder to store true and predicted tags
        y_true = [] # true tags
        y_pred = [] # predicted tags
        
        # Evaluate the model on the validation dataset
        model.eval()  # Set the model to evaluation mode
        with torch.no_grad():
            for sentences, tags in validation_loader:
                predictions = model(sentences)
                
                # Convert index to tags
                # Note: filtering out padding tokens
                for sentence, true_seq, pred_seq in zip(sentences, tags.tolist(), predictions):
                    valid_length = (sentence != word2idx['<PAD>']).sum().item()
                    true_tags = [idx2tag[idx] for idx in true_seq[:valid_length]]
                    pred_tags = [idx2tag[idx] for idx in pred_seq[:valid_length]]
                    y_true.append(true_tags)
                    y_pred.append(pred_tags)
        
        # Compute F1 score
        f1 = f1_score(y_true, y_pred, mode='strict', scheme=IOB1)
        print(f"F1 Score (Epoch {epoch+1}): {f1}")
        
        # Early stopping based on F1 score
        if f1 > best_f1:
            best_f1 = f1
            epochs_no_improve = 0
            # Save the best model
            # best_model = model.state_dict()
        else:
            epochs_no_improve += 1
            if epochs_no_improve == patience:
                print("Early stopping triggered!")
                report = classification_report(y_true, y_pred, mode='strict', scheme=IOB1)
                print(f"Classification Report (Epoch {epoch+1}):\n{report}")
                break

Epoch 1, Loss: 4128197860.75
evaluate.....
F1 Score (Epoch 1): 0.1656613211711049
Epoch 2, Loss: 4103683419.25
evaluate.....
F1 Score (Epoch 2): 0.1832657788071801
Epoch 3, Loss: 4028151970.875
evaluate.....
F1 Score (Epoch 3): 0.2119224187107815
Epoch 4, Loss: 3943554633.75
evaluate.....
F1 Score (Epoch 4): 0.2415200562983814
Epoch 5, Loss: 3912660472.0
evaluate.....
F1 Score (Epoch 5): 0.26759972008397487
Epoch 6, Loss: 3895783905.0
evaluate.....
F1 Score (Epoch 6): 0.28936761640027797
Epoch 7, Loss: 3776580132.75
evaluate.....
F1 Score (Epoch 7): 0.4296314617858027
Epoch 8, Loss: 3730692909.75
evaluate.....
F1 Score (Epoch 8): 0.5568013539242648
Epoch 9, Loss: 3656534283.5
evaluate.....
F1 Score (Epoch 9): 0.5853127474267618
Epoch 10, Loss: 3628299506.25
evaluate.....
F1 Score (Epoch 10): 0.6143416404086699
Epoch 11, Loss: 3536362421.25
evaluate.....
F1 Score (Epoch 11): 0.6292304783512952
Epoch 12, Loss: 3499862614.0
evaluate.....
F1 Score (Epoch 12): 0.6543994022043713
Epoch 13, L

# Test

In [16]:
# Placeholder to store true and predicted tags for the test set
y_true_test = []
y_pred_test = []

# Evaluate the model on the test dataset
model.eval()  # Set the model to evaluation mode
with torch.no_grad():
    for sentences, tags in test_loader:
        tag_scores = model(sentences)
        predictions = tag_scores.argmax(dim=-1).tolist()
        
        # Convert index to tags
        # Note: filtering out padding tokens
        for sentence, true_seq, pred_seq in zip(sentences, tags.tolist(), predictions):
            valid_length = (sentence != word2idx['<PAD>']).sum().item()
            true_tags = [idx2tag[idx] for idx in true_seq[:valid_length]]
            pred_tags = [idx2tag[idx] for idx in pred_seq[:valid_length]]
            y_true_test.append(true_tags)
            y_pred_test.append(pred_tags)

# Compute F1 score for the test set
f1_test = f1_score(y_true_test, y_pred_test, mode='strict', scheme=IOB1)
report_test = classification_report(y_true_test, y_pred_test, mode='strict', scheme=IOB1)

print("F1 Score on Test Set:", f1_test)
print("Classification Report on Test Set:\n", report_test)

F1 Score on Test Set: 0.8147759103641457
Classification Report on Test Set:
               precision    recall  f1-score   support

         LOC       0.87      0.88      0.87      1668
        MISC       0.70      0.68      0.69       702
         ORG       0.75      0.79      0.77      1661
         PER       0.85      0.87      0.86      1617

   micro avg       0.81      0.82      0.81      5648
   macro avg       0.79      0.80      0.80      5648
weighted avg       0.81      0.82      0.81      5648



In [72]:
# Placeholder to store true and predicted tags for the test set
y_true_test = []
y_pred_test = []

# Evaluate the model on the test dataset
model.eval()  # Set the model to evaluation mode
with torch.no_grad():
    for sentences, tags in test_loader:
        predictions = model(sentences) # for LSTMCRF Model
        
        # Convert index to tags
        # Note: filtering out padding tokens
        for sentence, true_seq, pred_seq in zip(sentences, tags.tolist(), predictions):
            valid_length = (sentence != word2idx['<PAD>']).sum().item()
            true_tags = [idx2tag[idx] for idx in true_seq[:valid_length]]
            pred_tags = [idx2tag[idx] for idx in pred_seq[:valid_length]]
            y_true_test.append(true_tags)
            y_pred_test.append(pred_tags)

# Compute F1 score for the test set
f1_test = f1_score(y_true_test, y_pred_test, mode='strict', scheme=IOB1)
report_test = classification_report(y_true_test, y_pred_test, mode='strict', scheme=IOB1)

print("F1 Score on Test Set:", f1_test)
print("Classification Report on Test Set:\n", report_test)

F1 Score on Test Set: 0.7044890162368672
Classification Report on Test Set:
               precision    recall  f1-score   support

         LOC       0.78      0.83      0.80      1668
        MISC       0.88      0.45      0.59       702
         ORG       0.69      0.51      0.58      1661
         PER       0.78      0.71      0.75      1617

   micro avg       0.76      0.65      0.70      5648
   macro avg       0.78      0.62      0.68      5648
weighted avg       0.77      0.65      0.70      5648



# Inference

In [17]:
def infer(sentence):
    # Tokenize the sentence
    tokens = sentence.split()

    # Convert tokens to indices
    token_indices = torch.tensor([sentence_to_indices(tokens, word2idx)])

    # Get predictions from the model
    model.eval()
    with torch.no_grad():
        tag_scores = model(token_indices)
        predictions = tag_scores.argmax(dim=-1).tolist()[0]

    # Convert index to tags
    predicted_tags = idx_to_tags(predictions, {v: k for k, v in tag2idx.items()})

    """
    # Display the results
    for token, tag in zip(tokens, predicted_tags):
        print(f"{token}: {tag}")
    """
    # Prepare aligned output
    token_line = ""
    tag_line = ""
    for token, tag in zip(tokens, predicted_tags):
        space_padding = max(len(token), len(tag)) + 2  # +2 to add some space between words for better readability
        token_line += token.ljust(space_padding)
        tag_line += tag.ljust(space_padding)

    # Display the results
    print(token_line)
    print(tag_line)


# Example usage:
sentence = "EU rejects German call to boycott British lamb ."
sentence = "Barack Obama was born in Hawaii and worked as the President of the United States."
infer(sentence)

sentence = "Jiang Yuxin was born in Shenyang and is now a student in Nanyang Technological University."
infer(sentence)


Barack  Obama  was  born  in  Hawaii  and  worked  as  the  President  of  the  United  States.  
I-PER   I-PER  O    O     O   I-LOC   O    O       O   O    O          O   O    I-ORG   O        
Jiang  Yuxin  was  born  in  Shenyang  and  is  now  a  student  in  Nanyang  Technological  University.  
I-PER  O      O    O     O   I-LOC     O    O   O    O  O        O   I-ORG    I-ORG          O            
