# 1.1 Word Embedding

In [1]:
import gensim.downloader

In [2]:
# Show all available models in gensim-data
print(list(gensim.downloader.info()['models'].keys()))

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


In [3]:
# Download the embeddings
w2v = gensim.downloader.load('word2vec-google-news-300')

In [4]:
# retrieve the vector for 'computer'
print(f"embedding dim: {w2v['computer'].shape}")
w2v['computer'] 

embedding dim: (300,)


array([ 1.07421875e-01, -2.01171875e-01,  1.23046875e-01,  2.11914062e-01,
       -9.13085938e-02,  2.16796875e-01, -1.31835938e-01,  8.30078125e-02,
        2.02148438e-01,  4.78515625e-02,  3.66210938e-02, -2.45361328e-02,
        2.39257812e-02, -1.60156250e-01, -2.61230469e-02,  9.71679688e-02,
       -6.34765625e-02,  1.84570312e-01,  1.70898438e-01, -1.63085938e-01,
       -1.09375000e-01,  1.49414062e-01, -4.65393066e-04,  9.61914062e-02,
        1.68945312e-01,  2.60925293e-03,  8.93554688e-02,  6.49414062e-02,
        3.56445312e-02, -6.93359375e-02, -1.46484375e-01, -1.21093750e-01,
       -2.27539062e-01,  2.45361328e-02, -1.24511719e-01, -3.18359375e-01,
       -2.20703125e-01,  1.30859375e-01,  3.66210938e-02, -3.63769531e-02,
       -1.13281250e-01,  1.95312500e-01,  9.76562500e-02,  1.26953125e-01,
        6.59179688e-02,  6.93359375e-02,  1.02539062e-02,  1.75781250e-01,
       -1.68945312e-01,  1.21307373e-03, -2.98828125e-01, -1.15234375e-01,
        5.66406250e-02, -

## Question 1.1
use cosine similarity to find the most similar 
word to each of these worsds

In [5]:
words = ["student", "Apple", "apple"]

# Print the header
print("Word\t\tMost similar word\tCosine similarity")
print("=======================================================================")

for word in words:
    # Use the downloaded vectors as usual:
    most_similar = w2v.most_similar(positive=[word], topn=1)[0]
    print("{:<15}\t{:<15}\t\t{:.4f}".format(word, most_similar[0], most_similar[1]))


Word		Most similar word	Cosine similarity
student        	students       		0.7295
Apple          	Apple_AAPL     		0.7457
apple          	apples         		0.7204


# 1.2 Data
process: https://wandb.ai/mostafaibrahim17/ml-articles/reports/Named-Entity-Recognition-With-HuggingFace-Using-PyTorch-and-W-B--Vmlldzo0NDgzODA2

In [6]:
from datasets import Dataset, DatasetDict

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# read data
def read_conll_file(file_path):
    with open(file_path, "r") as f:
        content = f.read().strip()
        sentences = content.split("\n\n")
        data = []
        for sentence in sentences:
            tokens = sentence.split("\n")
            token_data = []
            for token in tokens:
                token_data.append(token.split())
            data.append(token_data)
    return data


train_data = read_conll_file("/mnt/lustre/yuxin/SC4002_G06/datasets/CoNLL2003/eng.train")
validation_data = read_conll_file("/mnt/lustre/yuxin/SC4002_G06/datasets/CoNLL2003/eng.testa")
test_data = read_conll_file("/mnt/lustre/yuxin/SC4002_G06/datasets/CoNLL2003/eng.testb")

In [8]:
validation_data

[[['CRICKET', 'NNP', 'I-NP', 'O'],
  ['-', ':', 'O', 'O'],
  ['LEICESTERSHIRE', 'NNP', 'I-NP', 'I-ORG'],
  ['TAKE', 'NNP', 'I-NP', 'O'],
  ['OVER', 'IN', 'I-PP', 'O'],
  ['AT', 'NNP', 'I-NP', 'O'],
  ['TOP', 'NNP', 'I-NP', 'O'],
  ['AFTER', 'NNP', 'I-NP', 'O'],
  ['INNINGS', 'NNP', 'I-NP', 'O'],
  ['VICTORY', 'NN', 'I-NP', 'O'],
  ['.', '.', 'O', 'O']],
 [['LONDON', 'NNP', 'I-NP', 'I-LOC'], ['1996-08-30', 'CD', 'I-NP', 'O']],
 [['West', 'NNP', 'I-NP', 'I-MISC'],
  ['Indian', 'NNP', 'I-NP', 'I-MISC'],
  ['all-rounder', 'NN', 'I-NP', 'O'],
  ['Phil', 'NNP', 'I-NP', 'I-PER'],
  ['Simmons', 'NNP', 'I-NP', 'I-PER'],
  ['took', 'VBD', 'I-VP', 'O'],
  ['four', 'CD', 'I-NP', 'O'],
  ['for', 'IN', 'I-PP', 'O'],
  ['38', 'CD', 'I-NP', 'O'],
  ['on', 'IN', 'I-PP', 'O'],
  ['Friday', 'NNP', 'I-NP', 'O'],
  ['as', 'IN', 'I-PP', 'O'],
  ['Leicestershire', 'NNP', 'I-NP', 'I-ORG'],
  ['beat', 'VBD', 'I-VP', 'O'],
  ['Somerset', 'NNP', 'I-NP', 'I-ORG'],
  ['by', 'IN', 'I-PP', 'O'],
  ['an', 'DT', 'I-NP

In [9]:
# prepare data
def convert_to_dataset(data, label_map):
    formatted_data = {"tokens": [], "ner_tags": []}
    for sentence in data:
        tokens = [token_data[0] for token_data in sentence]
        ner_tags = [label_map[token_data[3]] for token_data in sentence]
        formatted_data["tokens"].append(tokens)
        formatted_data["ner_tags"].append(ner_tags)
    return Dataset.from_dict(formatted_data)


label_list = sorted(list(set([token_data[3] for sentence in train_data for token_data in sentence])))
label_map = {label: i for i, label in enumerate(label_list)}

train_dataset = convert_to_dataset(train_data, label_map)
validation_dataset = convert_to_dataset(validation_data, label_map)
test_dataset = convert_to_dataset(test_data, label_map)

In [41]:
label_map

{'B-LOC': 0,
 'B-MISC': 1,
 'B-ORG': 2,
 'I-LOC': 3,
 'I-MISC': 4,
 'I-ORG': 5,
 'I-PER': 6,
 'O': 7}

## Question 1.2
(a) Describe the size (number of sentences) of the training, development and test file for CoNLL2003.
Specify the complete set of all possible word labels based on the tagging scheme (IO, BIO,
etc.) you chos

(b) Choose an example sentence from the training set of CoNLL2003 that has at least two named
entities with more than one word. Explain how to form complete named entities from the label
for each word, and list all the named entities in this sentence.e.

In [10]:
# (a)
print("Dataset Sizes:")
print(f"Training:\t{train_dataset.num_rows} sentences")
print(f"Development:\t{validation_dataset.num_rows} sentences")
print(f"Test:\t\t{test_dataset.num_rows} sentences")

print("=======================================================================")
print("All Possible Word Labels (BIO):\n", label_list)

Dataset Sizes:
Training:	14987 sentences
Development:	3466 sentences
Test:		3684 sentences
All Possible Word Labels (BIO):
 ['B-LOC', 'B-MISC', 'B-ORG', 'I-LOC', 'I-MISC', 'I-ORG', 'I-PER', 'O']


!! (b) means finding the sentence that contains at least two distinct named entities, and each of those entities consists of more than one word.
=> but seems in training dataset, there isn't this kind of sentence

In [85]:
from tqdm import tqdm
lists2 = [0,1,2] # ['B-LOC', 'B-MISC', 'B-ORG']

def has_at_least_two_distinct_common_elements(list1, list2=[0,1,2]):
    common_elements = set(list1).intersection(list2)
    return len(common_elements) >= 2

def has_at_least_two_common_elements(list1, list2=[0,1,2]):
    common_elements = [value for value in list1 if value in list2]
    return len(common_elements) >= 2
    
for i in tqdm(range(len(train_dataset))):
    tokens = train_dataset['tokens'][i]
    ner_tags = train_dataset['ner_tags'][i]

    if has_at_least_two_common_elements(ner_tags):
        print(i)
        print(tokens)
        print(ner_tags)
        print([label_list[tag] for tag in ner_tags])
        break

 40%|███████████████████▉                              | 5969/14987 [19:27<29:24,  5.11it/s]

5969
['Swiss', 'Grand', 'Prix', 'World', 'Cup', 'cycling', 'race', 'on', 'Sunday', ':']
[4, 1, 4, 1, 4, 7, 7, 7, 7, 7]





In [11]:
[label_list[tag] for tag in train_dataset[5969]['ner_tags']]

['I-MISC', 'B-MISC', 'I-MISC', 'B-MISC', 'I-MISC', 'O', 'O', 'O', 'O', 'O']

In [12]:
# (b)
def form_complete_ne(dataset, i):
    # define sets of tags
    begin_tags = {'B-LOC', 'B-ORG', 'B-MISC'}
    inside_tags = {'I-ORG', 'I-LOC', 'I-PER', 'I-MISC'}
    outside_tags = {'O'}
    
    words = []
    word = []
    entities = []
    entity = []
    
    tokens = dataset['tokens'][i]
    ner_tags = dataset['ner_tags'][i]
    
    for token, tag in zip(tokens, ner_tags):
        tag = label_list[tag]
        
        if (tag in begin_tags or tag in outside_tags) and word:
            words.append(' '.join(word))
            entities.append(' '.join(entity))
            word = []
            entity = []
            
        if tag in begin_tags or tag in inside_tags:
            word.append(token)
            entity.append(tag)
            
    if word:
        words.append(' '.join(word))
        entities.append(' ', join(entity))

    return words, entities

form_complete_ne(train_dataset, 5969)

(['Swiss', 'Grand Prix', 'World Cup'],
 ['I-MISC', 'B-MISC I-MISC', 'B-MISC I-MISC'])

Explanation:
Using the BIO tagging scheme:

Named entities begin with a 'B-' prefix (although it seems 'B-' tags are not consistently present in this dataset).
The continuation of a named entity uses the 'I-' prefix.
From the selected sentence, the named entities are formed as follo- ws:

EU has the label I-ORG, indicating it is an organization. The named entity is-  "EU".
German has the label I-MISC, indicating it's a miscellaneous named entity. The named entity is "G- erman".
British has the label I-MISC, indicating it's another miscellaneous named entity. The named entity is "
British".
Question 1.2 (b) Answers:
The example sentence "EU rejects German call to boycott British lamb ." contains the following named entities:

"EU" (Organization)
"German" (Miscellaneous)
"British" (Miscellaneous)
To form complete named entities from the label for each word, we identify words labeled with 'B-' as the beginning of a named entity, and any subsequent 'I-' labels as a continuation of the same entity. If a 'B-' label is not present (as seems to be the case in some instances in this dataset), an 'I-' label at the beginning of a sequence is treated as a single-womed entity.






In [13]:
# prepare dataset
datasets = DatasetDict({
    "train": train_dataset,
    "validation": validation_dataset,
    "test": test_dataset,
})

In [40]:
train_dataset[0]

{'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'ner_tags': [5, 7, 4, 7, 7, 7, 4, 7, 7]}

# 1.3 Model

In [14]:
import numpy as np

In [15]:
train_dataset[0]['tokens']

['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']

In [16]:
'of' in w2v

False

1. <PAD> Token:

This token is typically initialized to a zero vector because it's meant to be a neutral padding value that doesn't interfere with computation

2. <UNK> Token:
- Zero Vector: Similar to the <PAD> token, you can initialize it to a zero vector.
- .
Average Vector: Initialize it as the average of all word vectors in your pretrained embeddings. This gives it a kind of "average" representation of the langua
- e.
Random Vector: Randomly initialize it, which might add some noise and robustness to the embeddi

For many tasks, initializing the <UNK> token as the average of all word vectors works well. It makes the <UNK> token have a representation that is, on average, similar to any random word from the vocabulary, which can be beneficial since the <UNK> token is used for words that aren't in the training vocabulary but could be anywhere in the semantic space.ngs.s.

In [17]:
# Out-of-vocabulary (OOV) words
# 1. can be replaced with a special token, such as "<OOV>" or "<UNK>".
# 2. can be ignored.

word2idx = w2v.key_to_index
print(f"whether <UNK> in w2v: {'<UNK>' in word2idx}") # False
print(f"whether <PAD> in w2v: {'<PAD>' in word2idx}") # False

# Define a vocabulary array by appending '<UNK>' and '<PAD>' tokens
#voc = list(word2idx.keys())
#voc.extend(['<UNK>', '<PAD>'])

# Add '<UNK>' and '<PAD>' tokens to the vocabulary index
word2idx['<UNK>'] = len(word2idx)
word2idx['<PAD>'] = len(word2idx)

print(f"word2idx['<UNK>']: {word2idx['<UNK>']}")
print(f"word2idx['<PAD>']: {word2idx['<PAD>']}")

whether <UNK> in w2v: False
whether <PAD> in w2v: False
word2idx['<UNK>']: 3000000
word2idx['<PAD>']: 3000001


In [157]:
print(w2v.vectors.shape)
w2v['computer'].shape

(3000000, 300)


(300,)

In [20]:
# add the '<UNK>' word to the vocabulary of the Word2Vec model 
# initialize it with the average of all word vectors int he pretrained embeddings.
unk_vector = np.mean(w2v.vectors, axis=0)
w2v.vectors = np.vstack([w2v.vectors, unk_vector])
print("after insert UNK: ", w2v.vectors.shape)

# add the '<PAD>' word to the vocabulary of the Word2Vec model 
# initialize it with a row of zeros in the vectors matrix.
w2v.vectors = np.vstack([w2v.vectors, np.zeros(w2v.vectors[0].shape)])
print("after insert UNK: ", w2v.vectors.shape)

after insert UNK:  (3000001, 300)
after insert UNK:  (3000002, 300)


In [21]:
w2v['<UNK>']

array([-2.25728611e-04, -1.01306627e-03, -1.04773864e-02,  3.14636230e-02,
        1.94908399e-03, -4.83569540e-02,  8.71036202e-03, -4.18831371e-02,
        2.51461826e-02,  4.58311923e-02,  1.26088806e-03, -1.75505376e-03,
       -2.50608157e-02,  4.20754105e-02, -4.06269804e-02,  2.94472519e-02,
        1.80458277e-02,  3.59841287e-02, -2.66128872e-02, -7.92622752e-03,
        2.70865131e-02,  4.24901256e-03,  5.27169220e-02,  1.93178728e-02,
        5.10774646e-03, -2.76226867e-02, -3.38717587e-02,  4.39997353e-02,
        1.32414782e-02, -4.49229591e-02, -3.05306613e-02, -4.27834578e-02,
        4.64673946e-03, -2.74249297e-02,  3.94795975e-03, -7.11372793e-02,
        3.72331291e-02,  7.20562087e-03,  3.08779557e-03,  8.44953489e-03,
       -8.02358240e-03, -2.38431450e-02,  4.02598381e-02,  5.37732914e-02,
       -4.63994108e-02, -7.93176889e-02, -4.39509153e-02, -1.62945110e-02,
        7.95037951e-03,  4.17706594e-02, -6.89574629e-02,  3.71620990e-02,
       -1.33652124e-03, -

In [23]:
w2v['<PAD>']

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.

## Use Pre-trained Word Embeddings in Pytorch
https://clay-atlas.com/us/blog/2021/08/06/pytorch-en-use-nn-embedding-load-gensim-pre-trained-weights/
https://keras.io/examples/nlp/pretrained_word_embeddings/

In addition, when actually using the nn.Embedding model layer, you still have to pay attention to the so-called “unknown word”, which will not automatically help processing in nn.Embedding, so we need to manually read the weights Add a vector of unknown words to see if you want to fill in an average vector or a zero vector, and then when encoding the vocabulary, the vocabulary that is not in the pre-training vocabulary is numbered as the number of the unknown word.

In [24]:
# vectorize the text data
# Convert the preprocessed text data to a vector representation using the Word2Vec model.
import torch
import torch.nn as nn

weights = torch.FloatTensor(w2v.vectors)

# Build nn.Embedding() layer
embedding = nn.Embedding.from_pretrained(weights)
# embedding = nn.Embedding.from_pretrained(embedding_matrix, padding_idx=vocab.get('<PAD>', None), freeze=True)
embedding.requires_grad = False

In [25]:
# Query
query = '<UNK>'
query_id = torch.tensor(w2v.key_to_index['<UNK>'])
print(query_id)
gensim_vector = torch.tensor(w2v[query])
embedding_vector = embedding(query_id)

print(gensim_vector==embedding_vector)

tensor(3000000)
tensor([True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True

In [36]:
def sentence_to_indices(sentence, vocab):
    return [vocab.get(word, vocab.get('<UNK>')) for word in sentence]

In [37]:
# Map words to Indices

train_sentences_indices = [sentence_to_indices(sentence, word2idx) for sentence in [train_dataset['tokens']]] 

train_sentences_indices

[[1611, 11500, 1760, 315, 3000000, 8059, 882, 18927, 3000000]]

In [38]:
train_dataset[0]

{'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'ner_tags': [5, 7, 4, 7, 7, 7, 4, 7, 7]}

# Tokenize Dataset

In [42]:
tag2idx = {
    'B-LOC': 0,
    'B-MISC': 1,
    'B-ORG': 2,
    'I-LOC': 3,
    'I-MISC': 4,
    'I-ORG': 5,
    'I-PER': 6,
    'O': 7,
    'PAD': 8
}

In [44]:
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader

class NERDataset(Dataset):
    def __init__(self, sentences, tags, vocab):
        self.sentences = [torch.tensor(sentence_to_indices(sentence, vocab)) for sentence in sentences]
        self.tags = [torch.tensor(tag) for tag in tags]
        
    def __len__(self):
        return len(self.sentences)
    
    def __getitem__(self, idx):
        return self.sentences[idx], self.tags[idx]

def collate_fn(batch):
    sentences, tags = zip(*batch)
    sentences_padded = pad_sequence(sentences, batch_first=True, padding_value=word2idx['<PAD>'])
    tags_padded = pad_sequence(tags, batch_first=True, padding_value=tag2idx['PAD'])  # Assuming 'O' is the padding value for tags
    return sentences_padded, tags_padded

# Create PyTorch datasets and data loaders
train_dataset = NERDataset(train_dataset['tokens'], train_dataset['ner_tags'], word2idx)
validation_dataset = NERDataset(validation_dataset['tokens'], validation_dataset['ner_tags'], word2idx)
test_dataset = NERDataset(test_dataset['tokens'], test_dataset['ner_tags'], word2idx)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
validation_loader = DataLoader(validation_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)


## Define metrics

In [57]:
from seqeval.metrics import f1_score
from seqeval.metrics import classification_report
from seqeval.scheme import IOB1

In [58]:
y_true = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
y_pred = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
f1_score(y_true, y_pred)

0.5

In [59]:
classification_report(y_true, y_pred)

'              precision    recall  f1-score   support\n\n        MISC       0.00      0.00      0.00         1\n         PER       1.00      1.00      1.00         1\n\n   micro avg       0.50      0.50      0.50         2\n   macro avg       0.50      0.50      0.50         2\nweighted avg       0.50      0.50      0.50         2\n'

In [60]:
classification_report(y_true, y_pred, mode='strict', scheme=IOB1)

'              precision    recall  f1-score   support\n\n        MISC       0.00      0.00      0.00         1\n         PER       1.00      1.00      1.00         1\n\n   micro avg       0.50      0.50      0.50         2\n   macro avg       0.50      0.50      0.50         2\nweighted avg       0.50      0.50      0.50         2\n'

In [176]:
def compute_metrics(eval_prediction):
    predictions, labels = eval_prediction
    predictions = np.argmax(predictions, axis=2)


    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]


    return {
        "precision": precision_score(true_labels, true_predictions),
        "recall": recall_score(true_labels, true_predictions),
        "f1": f1_score(true_labels, true_predictions),
        "classification_report": classification_report(true_labels, true_predictions),
    }


# Model

In [53]:
embedding_matrix = torch.FloatTensor(w2v.vectors)

class LSTMNERModel(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, output_dim):
        super(LSTMNERModel, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix,  padding_idx=word2idx['<PAD>'], freeze=True)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, x):
        x = self.embedding(x)
        lstm_out, _ = self.lstm(x)
        tag_space = self.fc(lstm_out)
        tag_scores = torch.log_softmax(tag_space, dim=-1)
        return tag_scores

# Compile

In [64]:
# Hyperparameters
EMBEDDING_DIM = 300
HIDDEN_DIM = 150
# OUTPUT_DIM = len(label_list)  # Number of unique tags/labels
VOCAB_SIZE = len(word2idx)
TAGSET_SIZE = len(tag2idx)

model = LSTMNERModel(EMBEDDING_DIM, HIDDEN_DIM, TAGSET_SIZE)
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Train

In [55]:
def idx_to_tags(indices, idx2tag):
    return [idx2tag[idx] for idx in indices]

In [65]:
# Assuming you've created dataloaders for training and validation data
num_epochs = 10

# Training loop
for epoch in range(num_epochs):  # Number of epochs
    total_loss = 0
    for sentences, tags in train_loader:
        model.zero_grad()
        tag_scores = model(sentences)
        loss = loss_function(tag_scores.view(-1, TAGSET_SIZE), tags.view(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss}")

    if (epoch+1) % 2 == 0:
        print("evaluate.....")
        # Evaluate on the validation dataset
        # Placeholder to store true and predicted tags
        y_true = [] # true tags
        y_pred = [] # predicted tags
        
        # Evaluate the model on the validation dataset
        model.eval()  # Set the model to evaluation mode
        with torch.no_grad():
            for sentences, tags in validation_loader:
                tag_scores = model(sentences)
                predictions = tag_scores.argmax(dim=-1).tolist()
                
                # Convert index to tags
                tag_seqs = [idx_to_tags(seq, {v: k for k, v in tag2idx.items()}) for seq in tags.tolist()]
                pred_seqs = [idx_to_tags(seq, {v: k for k, v in tag2idx.items()}) for seq in predictions]
                
                y_true.extend(tag_seqs)
                y_pred.extend(pred_seqs)
        
        # Compute F1 score
        f1 = f1_score(y_true, y_pred)
        #report = classification_report(y_true, y_pred, mode='strict', scheme=IOB1)
        print("F1 Score:", f1)
        #print("Classification Report:\n", report)

Epoch 1, Loss: 103.0610625371337
Epoch 2, Loss: 24.09125162754208
evaluate.....
F1 Score: 0.8455458235419782
Epoch 3, Loss: 19.553296024911106
Epoch 4, Loss: 17.323238398879766
evaluate.....
F1 Score: 0.8677491720969982
Epoch 5, Loss: 15.539738575927913
Epoch 6, Loss: 14.351475215516984
evaluate.....
F1 Score: 0.8778352991407375
Epoch 7, Loss: 12.992547077592462
Epoch 8, Loss: 11.906782491132617
evaluate.....
F1 Score: 0.8859827052418063
Epoch 9, Loss: 11.12106039468199
Epoch 10, Loss: 9.972066201502457
evaluate.....
F1 Score: 0.8865050418365158


# test

In [67]:
# Placeholder to store true and predicted tags for the test set
y_true_test = []
y_pred_test = []

# Evaluate the model on the test dataset
model.eval()  # Set the model to evaluation mode
with torch.no_grad():
    for sentences, tags in test_loader:
        tag_scores = model(sentences)
        predictions = tag_scores.argmax(dim=-1).tolist()
        
        # Convert index to tags
        tag_seqs = [idx_to_tags(seq, {v: k for k, v in tag2idx.items()}) for seq in tags.tolist()]
        pred_seqs = [idx_to_tags(seq, {v: k for k, v in tag2idx.items()}) for seq in predictions]
        
        y_true_test.extend(tag_seqs)
        y_pred_test.extend(pred_seqs)

# Compute F1 score for the test set
f1_test = f1_score(y_true_test, y_pred_test)
# TODO: debugg KeyError: 'P'
# report_test = classification_report(y_true_test, y_pred_test, mode='strict', scheme=IOB1)

print("F1 Score on Test Set:", f1_test)
# print("Classification Report on Test Set:\n", report_test)

F1 Score on Test Set: 0.8473496707330238


# Inference

In [73]:
def infer(sentence):
    # Tokenize the sentence
    tokens = sentence.split()

    # Convert tokens to indices
    token_indices = torch.tensor([sentence_to_indices(tokens, word2idx)])

    # Get predictions from the model
    model.eval()
    with torch.no_grad():
        tag_scores = model(token_indices)
        predictions = tag_scores.argmax(dim=-1).tolist()[0]

    # Convert index to tags
    predicted_tags = idx_to_tags(predictions, {v: k for k, v in tag2idx.items()})

    """
    # Display the results
    for token, tag in zip(tokens, predicted_tags):
        print(f"{token}: {tag}")
    """
    # Prepare aligned output
    token_line = ""
    tag_line = ""
    for token, tag in zip(tokens, predicted_tags):
        space_padding = max(len(token), len(tag)) + 2  # +2 to add some space between words for better readability
        token_line += token.ljust(space_padding)
        tag_line += tag.ljust(space_padding)

    # Display the results
    print(token_line)
    print(tag_line)


# Example usage:
sentence = "EU rejects German call to boycott British lamb ."
sentence = "Barack Obama was born in Hawaii and worked as the President of the United States."
infer(sentence)

sentence = "Jiang Yuxin was born in Shenyang and is now a student in Nanyang Technological University."
infer(sentence)


Barack  Obama  was  born  in  Hawaii  and  worked  as  the  President  of  the  United  States.  
I-PER   I-PER  O    O     O   I-LOC   O    O       O   O    O          O   O    I-LOC   O        
Jiang  Yuxin  was  born  in  Shenyang  and  is  now  a  student  in  Nanyang  Technological  University.  
I-PER  O      O    O     O   I-LOC     O    O   O    O  O        O   I-LOC    I-ORG          O            


# Analysis
e.g. f1 score per class: https://medium.com/illuin/named-entity-recognition-with-bilstm-cnns-632ba83d3d41
## data report
https://github.com/senadkurtisi/pytorch-NER