# 1.1 Word Embedding

In [1]:
import gensim.downloader

# Download the embeddings
w2v = gensim.downloader.load('word2vec-google-news-300')

# 1.2 Data
process: https://wandb.ai/mostafaibrahim17/ml-articles/reports/Named-Entity-Recognition-With-HuggingFace-Using-PyTorch-and-W-B--Vmlldzo0NDgzODA2

In [2]:
# read data
from datasets import Dataset, DatasetDict

def read_conll_file(file_path):
    with open(file_path, "r") as f:
        content = f.read().strip()
        sentences = content.split("\n\n")
        data = []
        for sentence in sentences:
            tokens = sentence.split("\n")
            token_data = []
            for token in tokens:
                token_data.append(token.split())
            data.append(token_data)
    return data


train_data = read_conll_file("/mnt/lustre/yuxin/SC4002_G06/datasets/CoNLL2003/eng.train")
validation_data = read_conll_file("/mnt/lustre/yuxin/SC4002_G06/datasets/CoNLL2003/eng.testa")
test_data = read_conll_file("/mnt/lustre/yuxin/SC4002_G06/datasets/CoNLL2003/eng.testb")

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# prepare data
def convert_to_dataset(data, label_map):
    formatted_data = {"tokens": [], "ner_tags": []}
    for sentence in data:
        tokens = [token_data[0] for token_data in sentence]
        ner_tags = [label_map[token_data[3]] for token_data in sentence]
        formatted_data["tokens"].append(tokens)
        formatted_data["ner_tags"].append(ner_tags)
    return Dataset.from_dict(formatted_data)


label_list = sorted(list(set([token_data[3] for sentence in train_data for token_data in sentence])))
label_map = {label: i for i, label in enumerate(label_list)}

train_dataset = convert_to_dataset(train_data, label_map)
validation_dataset = convert_to_dataset(validation_data, label_map)
test_dataset = convert_to_dataset(test_data, label_map)

In [5]:
label_map

{'B-LOC': 0,
 'B-MISC': 1,
 'B-ORG': 2,
 'I-LOC': 3,
 'I-MISC': 4,
 'I-ORG': 5,
 'I-PER': 6,
 'O': 7}

## Question 1.2
(a) Describe the size (number of sentences) of the training, development and test file for CoNLL2003.
Specify the complete set of all possible word labels based on the tagging scheme (IO, BIO,
etc.) you chos

(b) Choose an example sentence from the training set of CoNLL2003 that has at least two named
entities with more than one word. Explain how to form complete named entities from the label
for each word, and list all the named entities in this sentence.e.

In [6]:
# (a)
print("Dataset Sizes:")
print(f"Training:\t{train_dataset.num_rows} sentences")
print(f"Development:\t{validation_dataset.num_rows} sentences")
print(f"Test:\t\t{test_dataset.num_rows} sentences")

print("=======================================================================")
print("All Possible Word Labels (BIO):\n", label_list)


Dataset Sizes:
Training:	14987 sentences
Development:	3466 sentences
Test:		3684 sentences
All Possible Word Labels (BIO):
 ['B-LOC', 'B-MISC', 'B-ORG', 'I-LOC', 'I-MISC', 'I-ORG', 'I-PER', 'O']


!! (b) means finding the sentence that contains at least two distinct named entities, and each of those entities consists of more than one word.
=> but seems in training dataset, there isn't this kind of sentence

In [9]:
from tqdm import tqdm
lists2 = [0,1,2] # ['B-LOC', 'B-MISC', 'B-ORG']

def has_at_least_two_common_elements(list1, list2=[0,1,2]):
    common_elements = [value for value in list1 if value in list2]
    return len(common_elements) >= 2

for i in tqdm(range(len(train_dataset))):
    tokens = train_dataset['tokens'][i]
    ner_tags = train_dataset['ner_tags'][i]

    if has_at_least_two_common_elements(ner_tags):
        print(i)
        print(tokens)
        print(ner_tags)
        print([label_list[tag] for tag in ner_tags])
        break

  0%|                                               | 14/14987 [00:02<53:22,  4.68it/s]


KeyboardInterrupt: 

In [None]:
# [label_list[tag] for tag in train_dataset[5969]['ner_tags']]

In [7]:
# (b)
def form_complete_ne(dataset, i):
    # define sets of tags
    begin_tags = {'B-LOC', 'B-ORG', 'B-MISC'}
    inside_tags = {'I-ORG', 'I-LOC', 'I-PER', 'I-MISC'}
    outside_tags = {'O'}

    words = []
    word = []
    entities = []
    entity = []

    tokens = dataset['tokens'][i]
    ner_tags = dataset['ner_tags'][i]

    for token, tag in zip(tokens, ner_tags):
        tag = label_list[tag]

        if (tag in begin_tags or tag in outside_tags) and word:
            words.append(' '.join(word))
            entities.append(' '.join(entity))
            word = []
            entity = []

        if tag in begin_tags or tag in inside_tags:
            word.append(token)
            entity.append(tag)

    if word:
        words.append(' '.join(word))
        entities.append(' '.join(entity))

    return words, entities

form_complete_ne(train_dataset, 5969)

(['Swiss', 'Grand Prix', 'World Cup'],
 ['I-MISC', 'B-MISC I-MISC', 'B-MISC I-MISC'])

# 1.3 Model

1. `<PAD>` Token:

This token is typically initialized to a zero vector because it's meant to be a neutral padding value that doesn't interfere with computation

2. `<UNK>` Token:
- Zero Vector: Similar to the <PAD> token, you can initialize it to a zero vector.
- Average Vector: Initialize it as the average of all word vectors in your pretrained embeddings. This gives it a kind of "average" representation of the language.
- Random Vector: Randomly initialize it, which might add some noise and robustness to the embeddi

For many tasks, initializing the <UNK> token as the average of all word vectors works well. It makes the <UNK> token have a representation that is, on average, similar to any random word from the vocabulary, which can be beneficial since the <UNK> token is used for words that aren't in the training vocabulary but could be anywhere in the semantic space.ngs.s.

In [4]:
import numpy as np

# Out-of-vocabulary (OOV) words
# 1. can be replaced with a special token, such as "<OOV>" or "<UNK>".
# 2. can be ignored.

word2idx = w2v.key_to_index
print(f"whether <UNK> in w2v: {'<UNK>' in word2idx}") # False
print(f"whether <PAD> in w2v: {'<PAD>' in word2idx}") # False

# Add '<UNK>' and '<PAD>' tokens to the vocabulary index
word2idx['<UNK>'] = len(word2idx)
word2idx['<PAD>'] = len(word2idx)

print(f"word2idx['<UNK>']: {word2idx['<UNK>']}")
print(f"word2idx['<PAD>']: {word2idx['<PAD>']}")

# add the '<UNK>' word to the vocabulary of the Word2Vec model
# initialize it with the average of all word vectors in the pretrained embeddings.
unk_vector = np.mean(w2v.vectors, axis=0)
w2v.vectors = np.vstack([w2v.vectors, unk_vector])
print("after insert UNK: ", w2v.vectors.shape)

# add the '<PAD>' word to the vocabulary of the Word2Vec model
# initialize it with a row of zeros in the vectors matrix.
w2v.vectors = np.vstack([w2v.vectors, np.zeros(w2v.vectors[0].shape)])
print("after insert UNK: ", w2v.vectors.shape)


whether <UNK> in w2v: False
whether <PAD> in w2v: False
word2idx['<UNK>']: 3000000
word2idx['<PAD>']: 3000001
after insert UNK:  (3000001, 300)
after insert UNK:  (3000002, 300)


# Tokenize Dataset

In [5]:
# Map words to Indices
def sentence_to_indices(sentence, vocab):
    return [vocab.get(word, vocab.get('<UNK>')) for word in sentence]

tag2idx = {
    'B-LOC': 0,
    'B-MISC': 1,
    'B-ORG': 2,
    'I-LOC': 3,
    'I-MISC': 4,
    'I-ORG': 5,
    'I-PER': 6,
    'O': 7,
    'PAD': 8
}

In [6]:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader

class NERDataset(Dataset):
    def __init__(self, sentences, tags, vocab):
        self.sentences = [torch.tensor(sentence_to_indices(sentence, vocab)) for sentence in sentences]
        self.tags = [torch.tensor(tag) for tag in tags]

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        return self.sentences[idx], self.tags[idx]

# Create PyTorch datasets and data loaders
train_dataset = NERDataset(train_dataset['tokens'], train_dataset['ner_tags'], word2idx)
validation_dataset = NERDataset(validation_dataset['tokens'], validation_dataset['ner_tags'], word2idx)
test_dataset = NERDataset(test_dataset['tokens'], test_dataset['ner_tags'], word2idx)

def collate_fn(batch):
    sentences, tags = zip(*batch)
    sentences_padded = pad_sequence(sentences, batch_first=True, padding_value=word2idx['<PAD>'])
    tags_padded = pad_sequence(tags, batch_first=True, padding_value=tag2idx['PAD'])
    return sentences_padded, tags_padded

# Model

In [7]:
embedding_matrix = torch.FloatTensor(w2v.vectors)

class BiLSTMNERModel(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, num_of_layers, output_dim):
        super(BiLSTMNERModel, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, padding_idx=word2idx['<PAD>'], freeze=True)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_of_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim*2, output_dim)

    def forward(self, x):
        x = self.embedding(x)
        lstm_out, _ = self.lstm(x)
        tag_space = self.fc(lstm_out)
        tag_scores = torch.log_softmax(tag_space, dim=-1)
        return tag_scores

# Utilities

In [8]:
# Hyperparameters
EMBEDDING_DIM = w2v[0].shape[0]
print(f"EMBEDDING_DIM: {EMBEDDING_DIM}")
HIDDEN_DIM = 150
VOCAB_SIZE = len(word2idx)
print(f"VOCAB_SIZE: {VOCAB_SIZE}")
TAGSET_SIZE = len(tag2idx)
print(f"TAGSET_SIZE: {TAGSET_SIZE}")
MAX_EPOCHS = 50

idx2tag = {v: k for k, v in tag2idx.items()}
def idx_to_tags(indices):
    return [idx2tag[idx] for idx in indices]

class EarlyStopper:
    def __init__(self, patience=5):
        self.patience = patience
        self.counter = 0
        self.max_f1 = 0

    def early_stop(self, f1):
        if f1 > self.max_f1:
            self.max_f1 = f1
            self.counter = 0
        else:
            self.counter += 1
            if self.counter >= self.patience:
                return True
        return False


EMBEDDING_DIM: 300
VOCAB_SIZE: 3000002
TAGSET_SIZE: 9


# Test device

In [9]:
device = torch.device('cuda:0')
device

device(type='cuda', index=0)

# Hyperparamter tuning
https://necromuralist.github.io/Neurotic-Networking/posts/nlp/ner-evaluating-the-model/index.html

In [10]:
def evaluate(model, validation_loader, device):
    # print(next(model.parameters()).device) 
    print("evaluate.....")
    # Evaluate on the validation dataset
    # Placeholder to store true and predicted tags
    y_true = [] # true tags
    y_pred = [] # predicted tags
        
    # Evaluate the model on the validation dataset
    model.eval()  # Set the model to evaluation mode
    with torch.no_grad():
        for sentences, tags in validation_loader:
            # Move the data to the GPU
            sentences, tags = sentences.to(device), tags.to(device)
            tag_scores = model(sentences)
            # print(tag_scores.device)
            predictions = tag_scores.argmax(dim=-1).tolist()
            # print(predictions)
                
            # Convert index to tags
            # Note: filtering out padding tokens
            for sentence, true_seq, pred_seq in zip(sentences, tags.tolist(), predictions):
                valid_length = (sentence != word2idx['<PAD>']).sum().item()
                true_tags = [idx2tag[idx] for idx in true_seq[:valid_length]]
                pred_tags = [idx2tag[idx] for idx in pred_seq[:valid_length]]
                y_true.append(true_tags)
                y_pred.append(pred_tags)
        
    # Compute F1 score
    f1 = f1_score(y_true, y_pred)
    return f1

In [None]:
import itertools
from torch.optim import SGD, Adagrad, Adam, RMSprop
from sklearn.model_selection import ParameterGrid
from seqeval.metrics import f1_score
from seqeval.metrics import classification_report
from seqeval.scheme import IOB1
import gc  # Python's garbage collector interface
import json
import os

    
# Define the range of hyperparameters
param_grid = {
    'learning_rate': [0.1, 0.01, 0.001, 0.0001],
    'batch_size': [16, 32],
    'optimizer': [SGD, Adagrad, Adam, RMSprop],
    'hidden_dim': [128, 256, 512],
    'num_layers': [1, 2, 3],
}

# Check for a GPU
# device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device = "cpu"
print(f"Running on device: {device}")

# Function to save the best model to CPU and clear memory
def save_and_clear_best_model(current_best_model, best_state):
    # Save the model state dictionary
    torch.save(current_best_model.state_dict(), best_state)
    # Delete model and empty cache
    del current_best_model
    torch.cuda.empty_cache()

best_model_state = 'best_model_state.pth'

# placeholder for the best model and best F1 score
best_model = None
best_f1_score = 0
best_params = None

# Load existing results if they exist
if os.path.isfile('hyperparameter_tuning_results.json'):
    with open('hyperparameter_tuning_results.json', 'r') as f:
        results = json.load(f)
    existing_combinations = {(result['learning_rate'], result['batch_size'], result['optimizer'], result['hidden_dim'], result['num_layers']) for result in results}
    print(f"Resuming from {len(results)} existing results.")
else:
    results = []
    existing_combinations = set()
    
# Iterate over all combinations
for params in list(ParameterGrid(param_grid)):
    # Skip if this combination has already been evaluated
    if (params['learning_rate'], params['batch_size'], params['optimizer'].__name__, params['hidden_dim'], params['num_layers']) in existing_combinations:
        print("Skipping already evaluated combination:", params)
        continue
        
    # set up model
    model = BiLSTMNERModel(EMBEDDING_DIM, params['hidden_dim'], params['num_layers'], TAGSET_SIZE).to(device)
    loss_function = nn.NLLLoss(ignore_index=tag2idx['PAD']).to(device)
    optimizer = params['optimizer'](model.parameters(), lr=params['learning_rate'])
    
    # Initialize dataloaders
    train_loader = DataLoader(train_dataset, params['batch_size'], shuffle=True, collate_fn=collate_fn)
    validation_loader = DataLoader(validation_dataset, params['batch_size'], shuffle=False, collate_fn=collate_fn)

    # Initialize early stopper
    early_stopper = EarlyStopper()
    
    # Training loop
    print("================================================\n", params)
    for epoch in range(MAX_EPOCHS):
        total_loss = 0
        model.train()  # Make sure the model is in training mode
        for sentences, tags in train_loader:
            sentences, tags = sentences.to(device), tags.to(device)  # Move data to GPU
            model.zero_grad()
            tag_scores = model(sentences)
            loss = loss_function(tag_scores.view(-1, TAGSET_SIZE), tags.view(-1))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}, Loss: {total_loss}")
        
        # Evaluate on validation set
        f1_validation = evaluate(model, validation_loader, device)  # You need to implement this function
        print(f"F1 Score (Epoch {epoch+1}): {f1_validation}")
        
        # Early stopping check
        if early_stopper.early_stop(f1_validation):
            print(f"Stopping early at epoch {epoch+1}")
            break
            
    # Record the results
    new_result = {
        'learning_rate': params['learning_rate'],
        'batch_size': params['batch_size'],
        'optimizer': params['optimizer'].__name__,
        'hidden_dim': params['hidden_dim'],
        'num_layers': params['num_layers'],
        'final_epoch': epoch,
        'f1_score': f1_validation
    }
    results.append(new_result)
    # Save the results to the JSON file after each iteration
    with open('hyperparameter_tuning_results.json', 'w') as f:
        json.dump(results, f, indent=4)
        
    # Keep track of the best model
    if f1_validation > best_f1_score:
        best_f1_score = f1_validation
        best_params = params
        print(f"New best F1 score: {best_f1_score}")
        print(f"Best hyperparameters: {best_params}")
        
    # Clean up after each model is evaluated to free GPU memory
    del model
    gc.collect()  # Clean up garbage
    torch.cuda.empty_cache()  # Clear the GPU cache
    print("================================================")
    
# sort results by best F1 score
results = sorted(results, key=lambda x: x['f1_score'], reverse=True)

# Print the best hyperparameters
print("Best hyperparameters:", results[0])

Running on device: cpu
Resuming from 127 existing results.
Skipping already evaluated combination: {'batch_size': 16, 'hidden_dim': 128, 'learning_rate': 0.1, 'num_layers': 1, 'optimizer': <class 'torch.optim.sgd.SGD'>}
Skipping already evaluated combination: {'batch_size': 16, 'hidden_dim': 128, 'learning_rate': 0.1, 'num_layers': 1, 'optimizer': <class 'torch.optim.adagrad.Adagrad'>}
Skipping already evaluated combination: {'batch_size': 16, 'hidden_dim': 128, 'learning_rate': 0.1, 'num_layers': 1, 'optimizer': <class 'torch.optim.adam.Adam'>}
Skipping already evaluated combination: {'batch_size': 16, 'hidden_dim': 128, 'learning_rate': 0.1, 'num_layers': 1, 'optimizer': <class 'torch.optim.rmsprop.RMSprop'>}
Skipping already evaluated combination: {'batch_size': 16, 'hidden_dim': 128, 'learning_rate': 0.1, 'num_layers': 2, 'optimizer': <class 'torch.optim.sgd.SGD'>}
Skipping already evaluated combination: {'batch_size': 16, 'hidden_dim': 128, 'learning_rate': 0.1, 'num_layers': 2, '

In [1]:
import json

# Save the results to a JSON file
with open('hyperparameter_tuning_results.json', 'w') as f:
    json.dump(results, f, indent=4)

# Save the best model state to a file
# torch.save(best_model_state, 'best_model.pth')

NameError: name 'results' is not defined

# Test the best model

In [None]:
# Placeholder to store true and predicted tags for the test set
y_true_test = []
y_pred_test = []

# Evaluate the model on the test dataset
best_model.eval()  # Set the model to evaluation mode
with torch.no_grad():
    for sentences, tags in test_loader:
        predictions = best_model(sentences)
        
        # Convert index to tags
        # Note: filtering out padding tokens
        for sentence, true_seq, pred_seq in zip(sentences, tags.tolist(), predictions):
            valid_length = (sentence != word2idx['<PAD>']).sum().item()
            true_tags = [idx2tag[idx] for idx in true_seq[:valid_length]]
            pred_tags = [idx2tag[idx] for idx in pred_seq[:valid_length]]
            y_true_test.append(true_tags)
            y_pred_test.append(pred_tags)

# Compute F1 score for the test set
f1_test = f1_score(y_true_test, y_pred_test)
report_test = classification_report(y_true_test, y_pred_test)

print("F1 Score on Test Set:", f1_test)
print("Classification Report on Test Set:\n", report_test)

# Final Training

In [32]:
# Final training
from torch.utils.data import DataLoader, ConcatDataset

model = BiLSTMNERModel(EMBEDDING_DIM, optimal_hidden_dim, optimal_num_of_layers, TAGSET_SIZE)
loss_function = nn.NLLLoss(ignore_index=tag2idx['PAD'])
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
print("Final Training")

merged_train_dataset = ConcatDataset([train_dataset, validation_dataset])
merged_train_loader = DataLoader(merged_train_dataset, optimal_batch_size, shuffle=True, collate_fn=collate_fn)
early_stopper = EarlyStopper()

for epoch in range(MAX_EPOCHS):
    total_loss = 0
    model.train()
    for sentences, tags in merged_train_loader:
        tag_scores = model(sentences)
        loss = loss_function(tag_scores.view(-1, TAGSET_SIZE), tags.view(-1))
        total_loss += loss.item()
        # Backpropagation
        model.zero_grad()
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}, Loss: {total_loss}")
    f1_test = test(model, optimal_batch_size)
    if early_stopper.early_stop(f1_test):
        break
print("F1 Score on Test Set:", f1_test)

Final Training
Epoch 1, Loss: 211.2243566084653
F1 Score on Test Set: 0.8203576830678887
Epoch 2, Loss: 85.49815228581429
F1 Score on Test Set: 0.8310655592860929
Epoch 3, Loss: 63.95651657227427
F1 Score on Test Set: 0.8478935698447895
Epoch 4, Loss: 49.66351771284826
F1 Score on Test Set: 0.8482000354672814
Epoch 5, Loss: 38.52881736995187
F1 Score on Test Set: 0.8559714795008913
Epoch 6, Loss: 31.395666539494414
F1 Score on Test Set: 0.8584579976985041
Epoch 7, Loss: 24.271331545722205
F1 Score on Test Set: 0.8539925306775743
Epoch 8, Loss: 19.308925223827828
F1 Score on Test Set: 0.8639305026150165
Epoch 9, Loss: 17.019748357968638
F1 Score on Test Set: 0.8573200992555832
Epoch 10, Loss: 13.347994626405125
F1 Score on Test Set: 0.8584507042253522
Epoch 11, Loss: 12.555585913782124
F1 Score on Test Set: 0.8533427912750488
Epoch 12, Loss: 11.178530612298346
F1 Score on Test Set: 0.8586966211999643
Epoch 13, Loss: 9.511184494767804
F1 Score on Test Set: 0.8604405286343613
F1 Score on 

# Inference

In [None]:
def infer(sentence):
    # Tokenize the sentence
    tokens = sentence.split()

    # Convert tokens to indices
    token_indices = torch.tensor([sentence_to_indices(tokens, word2idx)])

    # Get predictions from the model
    model.eval()
    with torch.no_grad():
        tag_scores = model(token_indices)
        predictions = tag_scores.argmax(dim=-1).tolist()[0]

    # Convert index to tags
    predicted_tags = idx_to_tags(predictions, {v: k for k, v in tag2idx.items()})

    """
    # Display the results
    for token, tag in zip(tokens, predicted_tags):
        print(f"{token}: {tag}")
    """
    # Prepare aligned output
    token_line = ""
    tag_line = ""
    for token, tag in zip(tokens, predicted_tags):
        space_padding = max(len(token), len(tag)) + 2  # +2 to add some space between words for better readability
        token_line += token.ljust(space_padding)
        tag_line += tag.ljust(space_padding)

    # Display the results
    print(token_line)
    print(tag_line)


# Example usage:
sentence = "EU rejects German call to boycott British lamb ."
sentence = "Barack Obama was born in Hawaii and worked as the President of the United States."
infer(sentence)

sentence = "Jiang Yuxin was born in Shenyang and is now a student in Nanyang Technological University."
infer(sentence)

# Analysis
e.g. f1 score per class: https://medium.com/illuin/named-entity-recognition-with-bilstm-cnns-632ba83d3d41
## data report
https://github.com/senadkurtisi/pytorch-NER