# 1.1 Word Embedding

In [1]:
import gensim.downloader
w2v = gensim.downloader.load('word2vec-google-news-300')


## Question 1.1
use cosine similarity to find the most similar 
word to each of these worsds

In [5]:
words = ["student", "Apple", "apple"]

# Print the header
print("Word\t\tMost similar word\tCosine similarity")
print("=======================================================================")

for word in words:
    # Use the downloaded vectors as usual:
    most_similar = w2v.most_similar(positive=[word], topn=1)[0]
    print("{:<15}\t{:<15}\t\t{:.4f}".format(word, most_similar[0], most_similar[1]))


Word		Most similar word	Cosine similarity
student        	students       		0.7295
Apple          	Apple_AAPL     		0.7457
apple          	apples         		0.7204


# 1.2 Data
process: https://wandb.ai/mostafaibrahim17/ml-articles/reports/Named-Entity-Recognition-With-HuggingFace-Using-PyTorch-and-W-B--Vmlldzo0NDgzODA2

In [7]:
# read data

from datasets import Dataset, DatasetDict

def read_conll_file(file_path):
    with open(file_path, "r") as f:
        content = f.read().strip()
        sentences = content.split("\n\n")
        data = []
        for sentence in sentences:
            tokens = sentence.split("\n")
            token_data = []
            for token in tokens:
                token_data.append(token.split())
            data.append(token_data)
    return data


train_data = read_conll_file("/content/drive/MyDrive/CoNLL2003/eng.train")
validation_data = read_conll_file("/content/drive/MyDrive/CoNLL2003/eng.testa")
test_data = read_conll_file("/content/drive/MyDrive/CoNLL2003/eng.testb")


In [9]:
# prepare data
def convert_to_dataset(data, label_map):
    formatted_data = {"tokens": [], "ner_tags": []}
    for sentence in data:
        tokens = [token_data[0] for token_data in sentence]
        ner_tags = [label_map[token_data[3]] for token_data in sentence]
        formatted_data["tokens"].append(tokens)
        formatted_data["ner_tags"].append(ner_tags)
    return Dataset.from_dict(formatted_data)


label_list = sorted(list(set([token_data[3] for sentence in train_data for token_data in sentence])))
label_map = {label: i for i, label in enumerate(label_list)}

train_dataset = convert_to_dataset(train_data, label_map)
validation_dataset = convert_to_dataset(validation_data, label_map)
test_dataset = convert_to_dataset(test_data, label_map)


In [41]:
label_map


{'B-LOC': 0,
 'B-MISC': 1,
 'B-ORG': 2,
 'I-LOC': 3,
 'I-MISC': 4,
 'I-ORG': 5,
 'I-PER': 6,
 'O': 7}

## Question 1.2
(a) Describe the size (number of sentences) of the training, development and test file for CoNLL2003.
Specify the complete set of all possible word labels based on the tagging scheme (IO, BIO,
etc.) you chos

(b) Choose an example sentence from the training set of CoNLL2003 that has at least two named
entities with more than one word. Explain how to form complete named entities from the label
for each word, and list all the named entities in this sentence.e.

In [10]:
# (a)
print("Dataset Sizes:")
print(f"Training:\t{train_dataset.num_rows} sentences")
print(f"Development:\t{validation_dataset.num_rows} sentences")
print(f"Test:\t\t{test_dataset.num_rows} sentences")

print("=======================================================================")
print("All Possible Word Labels (BIO):\n", label_list)


Dataset Sizes:
Training:	14987 sentences
Development:	3466 sentences
Test:		3684 sentences
All Possible Word Labels (BIO):
 ['B-LOC', 'B-MISC', 'B-ORG', 'I-LOC', 'I-MISC', 'I-ORG', 'I-PER', 'O']


!! (b) means finding the sentence that contains at least two distinct named entities, and each of those entities consists of more than one word.
=> but seems in training dataset, there isn't this kind of sentence

In [85]:
from tqdm import tqdm
lists2 = [0,1,2] # ['B-LOC', 'B-MISC', 'B-ORG']

def has_at_least_two_common_elements(list1, list2=[0,1,2]):
    common_elements = [value for value in list1 if value in list2]
    return len(common_elements) >= 2

for i in tqdm(range(len(train_dataset))):
    tokens = train_dataset['tokens'][i]
    ner_tags = train_dataset['ner_tags'][i]

    if has_at_least_two_common_elements(ner_tags):
        print(i)
        print(tokens)
        print(ner_tags)
        print([label_list[tag] for tag in ner_tags])
        break


 40%|███████████████████▉                              | 5969/14987 [19:27<29:24,  5.11it/s]

5969
['Swiss', 'Grand', 'Prix', 'World', 'Cup', 'cycling', 'race', 'on', 'Sunday', ':']
[4, 1, 4, 1, 4, 7, 7, 7, 7, 7]





In [11]:
[label_list[tag] for tag in train_dataset[5969]['ner_tags']]


['I-MISC', 'B-MISC', 'I-MISC', 'B-MISC', 'I-MISC', 'O', 'O', 'O', 'O', 'O']

In [12]:
# (b)
def form_complete_ne(dataset, i):
    # define sets of tags
    begin_tags = {'B-LOC', 'B-ORG', 'B-MISC'}
    inside_tags = {'I-ORG', 'I-LOC', 'I-PER', 'I-MISC'}
    outside_tags = {'O'}

    words = []
    word = []
    entities = []
    entity = []

    tokens = dataset['tokens'][i]
    ner_tags = dataset['ner_tags'][i]

    for token, tag in zip(tokens, ner_tags):
        tag = label_list[tag]

        if (tag in begin_tags or tag in outside_tags) and word:
            words.append(' '.join(word))
            entities.append(' '.join(entity))
            word = []
            entity = []

        if tag in begin_tags or tag in inside_tags:
            word.append(token)
            entity.append(tag)

    if word:
        words.append(' '.join(word))
        entities.append(' '.join(entity))

    return words, entities

form_complete_ne(train_dataset, 5969)


(['Swiss', 'Grand Prix', 'World Cup'],
 ['I-MISC', 'B-MISC I-MISC', 'B-MISC I-MISC'])

# 1.3 Model

1. `<PAD>` Token:

This token is typically initialized to a zero vector because it's meant to be a neutral padding value that doesn't interfere with computation

2. `<UNK>` Token:
- Zero Vector: Similar to the <PAD> token, you can initialize it to a zero vector.
- Average Vector: Initialize it as the average of all word vectors in your pretrained embeddings. This gives it a kind of "average" representation of the language.
- Random Vector: Randomly initialize it, which might add some noise and robustness to the embeddi

For many tasks, initializing the <UNK> token as the average of all word vectors works well. It makes the <UNK> token have a representation that is, on average, similar to any random word from the vocabulary, which can be beneficial since the <UNK> token is used for words that aren't in the training vocabulary but could be anywhere in the semantic space.ngs.s.

In [13]:
# prepare dataset
datasets = DatasetDict({
    "train": train_dataset,
    "validation": validation_dataset,
    "test": test_dataset,
})


In [20]:
import numpy as np

# Out-of-vocabulary (OOV) words
# 1. can be replaced with a special token, such as "<OOV>" or "<UNK>".
# 2. can be ignored.

word2idx = w2v.key_to_index
print(f"whether <UNK> in w2v: {'<UNK>' in word2idx}") # False
print(f"whether <PAD> in w2v: {'<PAD>' in word2idx}") # False

# Add '<UNK>' and '<PAD>' tokens to the vocabulary index
word2idx['<UNK>'] = len(word2idx)
word2idx['<PAD>'] = len(word2idx)

print(f"word2idx['<UNK>']: {word2idx['<UNK>']}")
print(f"word2idx['<PAD>']: {word2idx['<PAD>']}")

# add the '<UNK>' word to the vocabulary of the Word2Vec model
# initialize it with the average of all word vectors in the pretrained embeddings.
unk_vector = np.mean(w2v.vectors, axis=0)
w2v.vectors = np.vstack([w2v.vectors, unk_vector])
print("after insert UNK: ", w2v.vectors.shape)

# add the '<PAD>' word to the vocabulary of the Word2Vec model
# initialize it with a row of zeros in the vectors matrix.
w2v.vectors = np.vstack([w2v.vectors, np.zeros(w2v.vectors[0].shape)])
print("after insert UNK: ", w2v.vectors.shape)


after insert UNK:  (3000001, 300)
after insert UNK:  (3000002, 300)


# Tokenize Dataset

In [42]:
# Map words to Indices
def sentence_to_indices(sentence, vocab):
    return [vocab.get(word, vocab.get('<UNK>')) for word in sentence]

tag2idx = {
    'B-LOC': 0,
    'B-MISC': 1,
    'B-ORG': 2,
    'I-LOC': 3,
    'I-MISC': 4,
    'I-ORG': 5,
    'I-PER': 6,
    'O': 7,
    'PAD': 8
}


In [44]:
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader

class NERDataset(Dataset):
    def __init__(self, sentences, tags, vocab):
        self.sentences = [torch.tensor(sentence_to_indices(sentence, vocab)) for sentence in sentences]
        self.tags = [torch.tensor(tag) for tag in tags]

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        return self.sentences[idx], self.tags[idx]

# Create PyTorch datasets and data loaders
train_dataset = NERDataset(train_dataset['tokens'], train_dataset['ner_tags'], word2idx)
validation_dataset = NERDataset(validation_dataset['tokens'], validation_dataset['ner_tags'], word2idx)
test_dataset = NERDataset(test_dataset['tokens'], test_dataset['ner_tags'], word2idx)


def collate_fn(batch):
    sentences, tags = zip(*batch)
    sentences_padded = pad_sequence(sentences, batch_first=True, padding_value=word2idx['<PAD>'])
    tags_padded = pad_sequence(tags, batch_first=True, padding_value=tag2idx['PAD'])
    return sentences_padded, tags_padded


# Model

In [53]:
embedding_matrix = torch.FloatTensor(w2v.vectors)

class LSTMNERModel(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, output_dim):
        super(LSTMNERModel, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=True)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = self.embedding(x)
        lstm_out, _ = self.lstm(x)
        tag_space = self.fc(lstm_out)
        tag_scores = torch.log_softmax(tag_space, dim=-1)
        return tag_scores


# Compile

In [64]:
# Hyperparameters
EMBEDDING_DIM = w2v[0].shape[0]
HIDDEN_DIM = 150
VOCAB_SIZE = len(word2idx)
TAGSET_SIZE = len(tag2idx)
MAX_EPOCHS = 200

model = LSTMNERModel(EMBEDDING_DIM, HIDDEN_DIM, TAGSET_SIZE)
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
early_stopper = EarlyStopper()

class EarlyStopper:
    def __init__(self, patience=5):
        self.patience = patience
        self.counter = 0
        self.max_f1 = 0

    def early_stop(self, f1):
        if f1 > self.max_f1:
            self.max_f1 = f1
            self.counter = 0
        elif f1 < (self.max_f1):
            self.counter += 1
            if self.counter >= self.patience:
                return True
        return False


# Train
https://necromuralist.github.io/Neurotic-Networking/posts/nlp/ner-evaluating-the-model/index.html

In [None]:
from seqeval.metrics import f1_score
from seqeval.metrics import classification_report
from seqeval.scheme import IOB1

def idx_to_tags(indices):
    idx2tag = {v: k for k, v in tag2idx.items()}
    return [idx2tag[idx] for idx in indices]

def find_optimal_batch_size(parameters):
    f1_score_dev = []
    for batch_size in parameters:
        model = LSTMNERModel(EMBEDDING_DIM, HIDDEN_DIM, TAGSET_SIZE)
        loss_function = nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
        print("Batch Size: ", batch_size)
        train_loader = DataLoader(train_dataset, batch_size, shuffle=True, collate_fn=collate_fn)
        validation_loader = DataLoader(validation_dataset, batch_size, shuffle=False, collate_fn=collate_fn)
        early_stopper = EarlyStopper()

        for epoch in range(MAX_EPOCHS):
            total_loss = 0
            model.train()
            for sentences, tags in train_loader:
                tag_scores = model(sentences)
                loss = loss_function(tag_scores.view(-1, TAGSET_SIZE), tags.view(-1))
                total_loss += loss.item()
                # Backpropagation
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            print(f"Epoch {epoch+1}, Loss: {total_loss}")

            model.eval()
            y_true = []
            y_pred = []
            with torch.no_grad():
                for sentences, tags in validation_loader:
                    tag_scores = model(sentences)
                    predictions = tag_scores.argmax(dim=-1).tolist()
                    # Convert index to tags
                    tag_seqs = [idx_to_tags(seq) for seq in tags.tolist()]
                    pred_seqs = [idx_to_tags(seq) for seq in predictions]
                    y_true.extend(tag_seqs)
                    y_pred.extend(pred_seqs)
            # Compute F1 score
            # Remove <PAD> label from true and predicted labels
            # filtered_y_true = []
            # filtered_y_pred = []
            # for true_seq, pred_seq in zip(y_true, y_pred):
            #     filtered_true_seq = [label for label in true_seq if label != 'PAD']
            #     filtered_pred_seq = [label for label in pred_seq if label != 'PAD']
            #     shortest_len = min(len(filtered_true_seq), len(filtered_pred_seq))
            #     short_true_seq = filtered_true_seq[:shortest_len]
            #     short_pred_seq = filtered_pred_seq[:shortest_len]
            #     filtered_y_true.append(short_true_seq)
            #     filtered_y_pred.append(short_pred_seq)
            #f1 = f1_score(filtered_y_true, filtered_y_pred, mode='strict', scheme=IOB1)
            f1 = f1_score(y_true, y_pred)
            print("F1 Score:", f1)
            if early_stopper.early_stop(f1):
                f1_score_dev.append(f1)
                break
    return f1_score_dev

batch_sizes = [32, 64, 128, 256]
f1_score_dev = find_optimal_batch_size(batch_sizes)
optimal_batch_size = batch_sizes[np.argmax(f1_score_dev)]
f1_score_dev


In [None]:
# find optimal hidden dimension
def find_optimal_hidden_dim(parameters):
    f1_score_hidden = []
    for hidden_dim in parameters:
        model = LSTMNERModel(EMBEDDING_DIM, hidden_dim, TAGSET_SIZE)
        loss_function = nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
        print("Hidden Dimension: ", hidden_dim)
        train_loader = DataLoader(train_dataset, optimal_batch_size, shuffle=True, collate_fn=collate_fn)
        validation_loader = DataLoader(validation_dataset, optimal_batch_size, shuffle=False, collate_fn=collate_fn)
        early_stopper = EarlyStopper()

        for epoch in range(MAX_EPOCHS):
            total_loss = 0
            model.train()
            for sentences, tags in train_loader:
                tag_scores = model(sentences)
                loss = loss_function(tag_scores.view(-1, TAGSET_SIZE), tags.view(-1))
                total_loss += loss.item()
                # Backpropagation
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            print(f"Epoch {epoch+1}, Loss: {total_loss}")

            model.eval()
            y_true = []
            y_pred = []
            with torch.no_grad():
                for sentences, tags in validation_loader:
                    tag_scores = model(sentences)
                    predictions = tag_scores.argmax(dim=-1).tolist()
                    # Convert index to tags
                    tag_seqs = [idx_to_tags(seq) for seq in tags.tolist()]
                    pred_seqs = [idx_to_tags(seq) for seq in predictions]
                    y_true.extend(tag_seqs)
                    y_pred.extend(pred_seqs)
            # Compute F1 score
            # Remove <PAD> label from true and predicted labels
            # filtered_y_true = []
            # filtered_y_pred = []
            # for true_seq, pred_seq in zip(y_true, y_pred):
            #     filtered_true_seq = [label for label in true_seq if label != 'PAD']
            #     filtered_pred_seq = [label for label in pred_seq if label != 'PAD']
            #     shortest_len = min(len(filtered_true_seq), len(filtered_pred_seq))
            #     short_true_seq = filtered_true_seq[:shortest_len]
            #     short_pred_seq = filtered_pred_seq[:shortest_len]
            #     filtered_y_true.append(short_true_seq)
            #     filtered_y_pred.append(short_pred_seq)
            f1 = f1_score(y_true, y_pred)
            print("F1 Score:", f1)
            if early_stopper.early_stop(f1):
                f1_score_hidden.append(f1)
                break
    return f1_score_hidden

hidden_dims = [64, 128, 256, 512]
f1_score_hidden = find_optimal_hidden_dim(hidden_dims)
optimal_hidden_dim = hidden_dims[np.argmax(f1_score_hidden)]
f1_score_hidden


# test

In [67]:
# Placeholder to store true and predicted tags for the test set
y_true_test = []
y_pred_test = []
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)

# Evaluate the model on the test dataset
model.eval()  # Set the model to evaluation mode
with torch.no_grad():
    for sentences, tags in test_loader:
        tag_scores = model(sentences)
        predictions = tag_scores.argmax(dim=-1).tolist()

        # Convert index to tags
        idx2tag = {v: k for k, v in tag2idx.items()}
        tag_seqs = [idx_to_tags(seq, idx2tag) for seq in tags.tolist()]
        pred_seqs = [idx_to_tags(seq, idx2tag) for seq in predictions]

        y_true_test.extend(tag_seqs)
        y_pred_test.extend(pred_seqs)

# Compute F1 score for the test set
f1_test = f1_score(y_true_test, y_pred_test)
# TODO: debugg KeyError: 'P'
# report_test = classification_report(y_true_test, y_pred_test, mode='strict', scheme=IOB1)

print("F1 Score on Test Set:", f1_test)
# print("Classification Report on Test Set:\n", report_test)


F1 Score on Test Set: 0.8473496707330238


# Inference

In [73]:
def infer(sentence):
    # Tokenize the sentence
    tokens = sentence.split()

    # Convert tokens to indices
    token_indices = torch.tensor([sentence_to_indices(tokens, word2idx)])

    # Get predictions from the model
    model.eval()
    with torch.no_grad():
        tag_scores = model(token_indices)
        predictions = tag_scores.argmax(dim=-1).tolist()[0]

    # Convert index to tags
    predicted_tags = idx_to_tags(predictions, {v: k for k, v in tag2idx.items()})

    """
    # Display the results
    for token, tag in zip(tokens, predicted_tags):
        print(f"{token}: {tag}")
    """
    # Prepare aligned output
    token_line = ""
    tag_line = ""
    for token, tag in zip(tokens, predicted_tags):
        space_padding = max(len(token), len(tag)) + 2  # +2 to add some space between words for better readability
        token_line += token.ljust(space_padding)
        tag_line += tag.ljust(space_padding)

    # Display the results
    print(token_line)
    print(tag_line)


# Example usage:
sentence = "EU rejects German call to boycott British lamb ."
sentence = "Barack Obama was born in Hawaii and worked as the President of the United States."
infer(sentence)

sentence = "Jiang Yuxin was born in Shenyang and is now a student in Nanyang Technological University."
infer(sentence)


Barack  Obama  was  born  in  Hawaii  and  worked  as  the  President  of  the  United  States.  
I-PER   I-PER  O    O     O   I-LOC   O    O       O   O    O          O   O    I-LOC   O        
Jiang  Yuxin  was  born  in  Shenyang  and  is  now  a  student  in  Nanyang  Technological  University.  
I-PER  O      O    O     O   I-LOC     O    O   O    O  O        O   I-LOC    I-ORG          O            


# Analysis
e.g. f1 score per class: https://medium.com/illuin/named-entity-recognition-with-bilstm-cnns-632ba83d3d41
## data report
https://github.com/senadkurtisi/pytorch-NER