In [1]:
from datasets import load_dataset
from collections import Counter
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torchcrf
import torch.optim as optim
from tqdm import tqdm
from torch.utils.tensorboard import SummaryWriter 
from sklearn.metrics import classification_report
import pickle

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
#### CONLL 2003 NER DATASET ####
# Load the CoNLL-2003 dataset from the Hugging Face hub into `dataset`.
# NOTE(review): trust_remote_code=True presumably allows the dataset repository's own
# loading script to run locally - confirm the source is trusted before re-running.
dataset = load_dataset("eriktks/conll2003", trust_remote_code=True)

def build_vocab(dataset, min_freq=1):
    """ 
    Build a word-to-index vocabulary from the training split.

    Indices 0 and 1 are reserved for "<PAD>" and "<UNK>"; every other word is
    numbered in order of first appearance. Reserved symbols keep their index
    even when the corpus contains a token with the same spelling.
    
    Args:
        dataset: mapping with a 'train' split whose examples expose a "tokens" sequence
        min_freq: minimum frequency of a word to be included in the vocabulary
    Returns:
        word_vocab: a dictionary with words as keys and unique indices as values
    """
    word_freq = Counter()
    
    for example in dataset['train']:
        word_freq.update(example["tokens"])  # Counting word occurrences
    
    # Assigning the indices to words (PAD=0, UNK=1)
    word_vocab = {"<PAD>": 0, "<UNK>": 1}
    for word, freq in word_freq.items():
        # Skip tokens that collide with a reserved symbol: re-assigning one would move
        # it off its reserved index and hand the same index to the next new word.
        if freq >= min_freq and word not in word_vocab:
            word_vocab[word] = len(word_vocab)
    
    return word_vocab

word_vocab = build_vocab(dataset)

# NER labels: BIO tag -> integer id, numbered in the order listed
label_vocab = {
    tag: tag_id
    for tag_id, tag in enumerate(
        ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']
    )
}
def encode_sentence(tokens, vocab, max_len=128):
    """
    Encode a sentence of tokens into indices using a vocabulary.
    Truncate to the maximum length if longer, or pad with the vocabulary's
    "<PAD>" index if shorter.
    
    Args:
        tokens (List[str]): List of tokens in the sentence
        vocab (Dict[str, int]): Mapping from token to index; must contain
            "<PAD>" and "<UNK>"
        max_len (int): Length of the output sequence
    Returns:
        indices (torch.Tensor): 1-D long tensor of `max_len` token indices
    """
    
    # Unknown words fall back to the "<UNK>" index
    indices = [vocab.get(word, vocab["<UNK>"]) for word in tokens]  
    indices = indices[:max_len] # Truncating
    # Pad with the id from the vocabulary that was passed in, not a module-level one,
    # so the function gives the same answer regardless of notebook state.
    indices += [vocab["<PAD>"]] * (max_len - len(indices)) # Padding
    return torch.tensor(indices, dtype=torch.long) 

def encode_labels(labels, max_len=128):
    """ 
    Turn a sequence of integer NER label ids into a fixed-length tensor.
    Sequences longer than `max_len` are cut; shorter ones are filled with -100.
    
    Args:
        labels (List[int]): Label ids for one sentence (left unmodified)
        max_len (int): Length of the output sequence
    Returns:
        indices (torch.Tensor): 1-D long tensor of label ids
    """
    
    clipped = labels[:max_len]  # slicing yields a new list
    filler = [-100] * (max_len - len(clipped))
    return torch.tensor(clipped + filler, dtype=torch.long)

def prepare_dataset(dataset, word_vocab, max_len=128):
    """ 
    Encode every example of a split into fixed-length tensors.
    
    Args:
        dataset: iterable of examples exposing "tokens" and "ner_tags"
        word_vocab: a dictionary with words as keys and indices as values
        max_len: length every encoded sequence is truncated/padded to
    Returns:
        data: a list of (token index tensor, label index tensor) tuples, one per example
    """
    return [
        (
            encode_sentence(example["tokens"], word_vocab, max_len),
            encode_labels(example["ner_tags"], max_len),
        )
        for example in dataset
    ]

# Encode all three splits with the same vocabulary
train_data, val_data, test_data = (
    prepare_dataset(dataset[split_name], word_vocab)
    for split_name in ("train", "validation", "test")
)

# dataset class
class NERDataset(Dataset):
    """Thin Dataset wrapper around a pre-encoded list of (token ids, label ids) pairs."""

    def __init__(self, data):
        # Keep a reference to the already-encoded examples
        self.data = data

    def __len__(self):
        """Number of stored examples."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the stored (token indices, label indices) pair at `index`."""
        item = self.data[index]
        return item

# Wrap each encoded split in a Dataset
train_dataset, val_dataset, test_dataset = map(
    NERDataset, (train_data, val_data, test_data)
)

BATCH_SIZE = 32

# Only the training loader shuffles; validation and test keep dataset order
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=BATCH_SIZE)

In [None]:
dataset = load_dataset("DFKI-SLT/few-nerd", "supervised", trust_remote_code=True)

# entity labels: names come from the dataset schema; build both lookup directions
label_list = dataset['train'].features['ner_tags'].feature.names
label2id = dict(zip(label_list, range(len(label_list))))
id2label = {position: name for name, position in label2id.items()}

# Building the vocabulary
def build_vocab(dataset, min_freq=1):
    """
    Build a word-to-index vocabulary from the training split.

    "<PAD>" and "<UNK>" always keep indices 0 and 1; remaining words are numbered
    in order of first appearance.

    Args:
        dataset: mapping with a 'train' split whose examples expose a "tokens" sequence
        min_freq: minimum number of occurrences for a word to be kept
    Returns:
        word_vocab: a dictionary with words as keys and unique indices as values
    """
    word_freq = Counter()
    
    for example in dataset['train']:
        word_freq.update(example["tokens"])
    
    word_vocab = {"<PAD>": 0, "<UNK>": 1}
    for word, count in word_freq.items():
        # A corpus token spelled like a reserved symbol must not overwrite it; doing so
        # would also give the following new word the very same index.
        if count >= min_freq and word not in word_vocab:
            word_vocab[word] = len(word_vocab)
    
    return word_vocab

# Rebuild the vocabulary from the Few-NERD training split. This rebinds `word_vocab`,
# replacing the vocabulary built from CoNLL-2003 earlier in the notebook.
word_vocab = build_vocab(dataset)

class NERDataset(Dataset):
    """
    Dataset that encodes examples lazily, one item at a time.

    Each item is a pair of 1-D tensors of length `max_length`: token indices
    (padded with the "<PAD>" index) and label indices (padded with -100).
    Label ids are translated through the module-level `label_list` and then
    through `label2id`.
    """

    def __init__(self, dataset_split, word_vocab, label2id, max_length=128):
        self.max_length = max_length
        self.label2id = label2id
        self.word_vocab = word_vocab
        self.dataset = dataset_split

    def __len__(self):
        """Number of examples in the wrapped split."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Encode example `idx` and return (input_ids, label_ids) tensors."""
        record = self.dataset[idx]

        # Converting tokens to indices
        input_ids = [
            self.word_vocab.get(tok, self.word_vocab["<UNK>"])
            for tok in record['tokens']
        ]
        label_ids = [self.label2id[label_list[tag]] for tag in record['ner_tags']]

        # Padding / truncation, driven by the number of tokens
        shortfall = self.max_length - len(input_ids)
        if shortfall > 0:
            input_ids = input_ids + [self.word_vocab["<PAD>"]] * shortfall
            label_ids = label_ids + [-100] * shortfall  # -100 for ignored tokens in loss function
        else:
            input_ids = input_ids[:self.max_length]
            label_ids = label_ids[:self.max_length]

        return torch.tensor(input_ids), torch.tensor(label_ids)

# DataLoaders
train_dataset, val_dataset, test_dataset = (
    NERDataset(dataset[split_name], word_vocab, label2id)
    for split_name in ('train', 'validation', 'test')
)

# Only the training loader shuffles
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=32)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=32)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=32)

# Report how many sentences each split holds
print(f"Train dataset size: {len(train_dataset)} sentences")
print(f"Validation dataset size: {len(val_dataset)} sentences")
print(f"Test dataset size: {len(test_dataset)} sentences")

Train dataset size: 131767 sentences
Validation dataset size: 18824 sentences
Test dataset size: 37648 sentences


In [3]:
# model class
class BiLSTM_CRF(nn.Module):
    """
    Sequence tagger: embedding -> 2-layer bidirectional LSTM -> linear emission
    layer -> CRF.

    Args:
        vocab_size: number of rows in the embedding table
        label_size: number of tags scored by the emission layer and the CRF
        embedding_dim: size of each word embedding
        hidden_dim: target width of the concatenated forward/backward LSTM output;
            each direction gets hidden_dim // 2 units
        dropout: dropout value passed to nn.LSTM
    """

    def __init__(self, vocab_size, label_size, embedding_dim=100, hidden_dim=128, dropout=0.5):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        units_per_direction = hidden_dim // 2
        self.lstm = nn.LSTM(embedding_dim, units_per_direction, num_layers=2, 
                            bidirectional=True, batch_first=True, dropout=dropout)
        
        # Size the projection from what the LSTM actually emits (both directions
        # concatenated). For even hidden_dim this equals hidden_dim, so existing
        # checkpoints still load; for odd hidden_dim it avoids a shape mismatch.
        self.fc = nn.Linear(2 * units_per_direction, label_size)
        self.crf = torchcrf.CRF(label_size, batch_first=True)

    def forward(self, x, mask):
        """Return (emissions, mask); the mask is passed through untouched."""
        x = self.embedding(x)
        x, _ = self.lstm(x)
        emissions = self.fc(x)
        return emissions, mask
    
    def loss(self, x, tags, mask):
        """Negative CRF log-likelihood of `tags` (reduction='mean') over unmasked positions."""
        emissions, mask = self.forward(x, mask)
        # -100 marks padded labels; swap in a valid tag id (0) so the CRF can index
        # its transition table. Callers mask those positions out.
        tags = torch.where(tags == -100, torch.tensor(0, device=tags.device), tags)

        # Coerce to bool so integer 0/1 masks are accepted as well.
        return -self.crf(emissions, tags, mask=mask.bool(), reduction='mean')

    def predict(self, x, mask):
        """Return the CRF-decoded tag sequences, one list of tag ids per sentence."""
        emissions, mask = self.forward(x, mask)
        return self.crf.decode(emissions, mask.bool())


In [5]:
# Use CUDA when it is available, otherwise run on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# TensorBoard
writer = SummaryWriter("runs/ner_training")  # Logs will be saved in this directory

# Model sizes come from the current module-level `word_vocab` and `id2label`,
# i.e. whichever dataset cell assigned them last.
model = BiLSTM_CRF(vocab_size=len(word_vocab), label_size=len(id2label)).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-4)

num_epochs = 20  

for epoch in range(num_epochs):
    model.train() # training mode
    total_train_loss = 0

    # tqdm progress bar
    for tokens, labels in tqdm(train_loader, desc=f"Epoch {epoch+1} Training"):
        tokens, labels = tokens.to(device), labels.to(device)
        # True wherever the token id differs from the <PAD> id
        mask = tokens != word_vocab["<PAD>"]

        optimizer.zero_grad()
        loss = model.loss(tokens, labels, mask)  # loss computation
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()  # training loss

    # Mean of the per-batch losses (every batch weighted equally)
    avg_train_loss = total_train_loss / len(train_loader) 
    writer.add_scalar("Loss/Train", avg_train_loss, epoch) 

    # Validation
    model.eval()  # evaluation mode
    total_val_loss = 0
    correct_val = 0
    total_val = 0

    with torch.no_grad():  # No gradient updates for validation
        for tokens, labels in tqdm(val_loader, desc=f"Epoch {epoch+1} Validation"):
            tokens, labels = tokens.to(device), labels.to(device)
            mask = tokens != word_vocab["<PAD>"]

            # Note: loss() and predict() each call forward(), so every validation
            # batch goes through the network twice.
            loss = model.loss(tokens, labels, mask)  # validation loss
            total_val_loss += loss.item()

            # predictions
            predictions = model.predict(tokens, mask)
            # Boolean indexing flattens the unmasked labels into one 1-D tensor
            labels_masked = labels[mask]

            # Concatenate the per-sentence prediction lists into one 1-D tensor
            predictions_masked = torch.cat([torch.tensor(p, dtype=torch.long, device=device) for p in predictions])

            # Defensive guard: if the two flattened tensors differ in length, both are
            # silently trimmed to the shorter one before comparison.
            # NOTE(review): a mismatch here would presumably indicate a masking bug - consider failing loudly instead.
            if predictions_masked.shape != labels_masked.shape:
                min_len = min(predictions_masked.shape[0], labels_masked.shape[0])
                predictions_masked = predictions_masked[:min_len]
                labels_masked = labels_masked[:min_len]

            # Token-level accuracy bookkeeping over unmasked positions
            correct_val += (predictions_masked == labels_masked).sum().item()
            total_val += len(labels_masked)

    avg_val_loss = total_val_loss / len(val_loader) 
    # Guard against division by zero when no tokens were evaluated
    val_accuracy = correct_val / total_val if total_val > 0 else 0

    writer.add_scalar("Loss/Validation", avg_val_loss, epoch) 
    writer.add_scalar("Accuracy/Validation", val_accuracy, epoch) 

    print(f"\nEpoch {epoch+1}:")
    print(f"Training Loss: {avg_train_loss:.4f}")
    print(f"Validation Loss: {avg_val_loss:.4f} | Validation Accuracy: {val_accuracy:.4f}")

# Close TensorBoard writer
writer.close()

Epoch 1 Training: 100%|██████████| 4118/4118 [09:26<00:00,  7.27it/s]
Epoch 1 Validation: 100%|██████████| 589/589 [00:44<00:00, 13.28it/s]



Epoch 1:
Training Loss: 8.0979
Validation Loss: 4.2975 | Validation Accuracy: 0.9199


Epoch 2 Training: 100%|██████████| 4118/4118 [09:37<00:00,  7.14it/s]
Epoch 2 Validation: 100%|██████████| 589/589 [00:43<00:00, 13.57it/s]



Epoch 2:
Training Loss: 3.7337
Validation Loss: 3.4408 | Validation Accuracy: 0.9303


Epoch 3 Training: 100%|██████████| 4118/4118 [28:26<00:00,  2.41it/s]    
Epoch 3 Validation: 100%|██████████| 589/589 [04:45<00:00,  2.06it/s]  



Epoch 3:
Training Loss: 3.0383
Validation Loss: 3.3269 | Validation Accuracy: 0.9318


Epoch 4 Training: 100%|██████████| 4118/4118 [1:00:14<00:00,  1.14it/s]   
Epoch 4 Validation: 100%|██████████| 589/589 [00:43<00:00, 13.65it/s]



Epoch 4:
Training Loss: 2.7520
Validation Loss: 3.1599 | Validation Accuracy: 0.9347


Epoch 5 Training: 100%|██████████| 4118/4118 [1:12:44<00:00,  1.06s/it]    
Epoch 5 Validation: 100%|██████████| 589/589 [00:43<00:00, 13.62it/s]



Epoch 5:
Training Loss: 2.5562
Validation Loss: 3.1171 | Validation Accuracy: 0.9356


Epoch 6 Training: 100%|██████████| 4118/4118 [1:30:41<00:00,  1.32s/it]     
Epoch 6 Validation: 100%|██████████| 589/589 [00:43<00:00, 13.68it/s]



Epoch 6:
Training Loss: 2.4283
Validation Loss: 3.0936 | Validation Accuracy: 0.9356


Epoch 7 Training: 100%|██████████| 4118/4118 [42:14<00:00,  1.63it/s]     
Epoch 7 Validation: 100%|██████████| 589/589 [00:45<00:00, 12.87it/s]



Epoch 7:
Training Loss: 2.3115
Validation Loss: 3.0721 | Validation Accuracy: 0.9362


Epoch 8 Training: 100%|██████████| 4118/4118 [1:21:11<00:00,  1.18s/it]     
Epoch 8 Validation: 100%|██████████| 589/589 [00:43<00:00, 13.64it/s]



Epoch 8:
Training Loss: 2.2120
Validation Loss: 3.1128 | Validation Accuracy: 0.9361


Epoch 9 Training: 100%|██████████| 4118/4118 [1:12:53<00:00,  1.06s/it]   
Epoch 9 Validation: 100%|██████████| 589/589 [00:43<00:00, 13.48it/s]



Epoch 9:
Training Loss: 2.1183
Validation Loss: 3.1005 | Validation Accuracy: 0.9363


Epoch 10 Training: 100%|██████████| 4118/4118 [44:36<00:00,  1.54it/s]    
Epoch 10 Validation: 100%|██████████| 589/589 [16:15<00:00,  1.66s/it]   



Epoch 10:
Training Loss: 2.0400
Validation Loss: 3.1363 | Validation Accuracy: 0.9376


Epoch 11 Training: 100%|██████████| 4118/4118 [53:38<00:00,  1.28it/s]    
Epoch 11 Validation: 100%|██████████| 589/589 [00:43<00:00, 13.41it/s]



Epoch 11:
Training Loss: 1.9638
Validation Loss: 3.1123 | Validation Accuracy: 0.9365


Epoch 12 Training: 100%|██████████| 4118/4118 [09:30<00:00,  7.21it/s]
Epoch 12 Validation: 100%|██████████| 589/589 [00:43<00:00, 13.47it/s]



Epoch 12:
Training Loss: 1.8947
Validation Loss: 3.2090 | Validation Accuracy: 0.9338


Epoch 13 Training: 100%|██████████| 4118/4118 [09:29<00:00,  7.23it/s]
Epoch 13 Validation: 100%|██████████| 589/589 [09:19<00:00,  1.05it/s]  



Epoch 13:
Training Loss: 1.8340
Validation Loss: 3.2056 | Validation Accuracy: 0.9364


Epoch 14 Training: 100%|██████████| 4118/4118 [55:22<00:00,  1.24it/s]     
Epoch 14 Validation: 100%|██████████| 589/589 [00:46<00:00, 12.66it/s]



Epoch 14:
Training Loss: 1.7756
Validation Loss: 3.1878 | Validation Accuracy: 0.9361


Epoch 15 Training: 100%|██████████| 4118/4118 [09:46<00:00,  7.02it/s]
Epoch 15 Validation: 100%|██████████| 589/589 [00:43<00:00, 13.43it/s]



Epoch 15:
Training Loss: 1.7334
Validation Loss: 3.3160 | Validation Accuracy: 0.9361


Epoch 16 Training: 100%|██████████| 4118/4118 [10:16<00:00,  6.68it/s]
Epoch 16 Validation: 100%|██████████| 589/589 [00:45<00:00, 13.00it/s]



Epoch 16:
Training Loss: 1.6788
Validation Loss: 3.3292 | Validation Accuracy: 0.9361


Epoch 17 Training: 100%|██████████| 4118/4118 [09:40<00:00,  7.09it/s]
Epoch 17 Validation: 100%|██████████| 589/589 [00:47<00:00, 12.47it/s]



Epoch 17:
Training Loss: 1.6300
Validation Loss: 3.3182 | Validation Accuracy: 0.9352


Epoch 18 Training: 100%|██████████| 4118/4118 [09:26<00:00,  7.27it/s]
Epoch 18 Validation: 100%|██████████| 589/589 [00:44<00:00, 13.21it/s]



Epoch 18:
Training Loss: 1.5967
Validation Loss: 3.3523 | Validation Accuracy: 0.9350


Epoch 19 Training: 100%|██████████| 4118/4118 [09:53<00:00,  6.94it/s]
Epoch 19 Validation: 100%|██████████| 589/589 [00:45<00:00, 13.02it/s]



Epoch 19:
Training Loss: 1.5470
Validation Loss: 3.4157 | Validation Accuracy: 0.9333


Epoch 20 Training: 100%|██████████| 4118/4118 [10:02<00:00,  6.83it/s]
Epoch 20 Validation: 100%|██████████| 589/589 [00:46<00:00, 12.60it/s]


Epoch 20:
Training Loss: 1.5071
Validation Loss: 3.4432 | Validation Accuracy: 0.9345





In [8]:
# Launch TensorBoard over the "runs" log directory via a shell command; it keeps
# running in the foreground until it is interrupted.
!tensorboard --logdir=runs

TensorFlow installation not found - running with reduced feature set.
Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
TensorBoard 2.18.0 at http://localhost:6006/ (Press CTRL+C to quit)
^C


In [None]:
model.eval()

# Flat lists of gold and predicted label ids over every unmasked test token
true_labels = []
predicted_labels = []

with torch.no_grad():  # No gradient updates for testing
    for tokens, labels in tqdm(test_loader, desc="Testing"):
        tokens, labels = tokens.to(device), labels.to(device)
        mask = tokens != word_vocab["<PAD>"]

        predictions = model.predict(tokens, mask)

        # predictions holds one list per sentence, already cut to the unmasked length,
        # so only the gold labels need the mask applied.
        for label_row, mask_row, pred_seq in zip(labels, mask, predictions):
            true_labels.extend(label_row[mask_row].cpu().tolist())
            predicted_labels.extend(pred_seq)

# label indices to actual class names
# id2label already maps index -> name; inverting it (as before) produced a
# name -> index dict under an index-to-name variable.
idx_to_label = dict(id2label)

Testing: 100%|██████████| 1177/1177 [00:46<00:00, 25.46it/s]


In [10]:
import pandas as pd

# Per-class precision/recall/F1/support for the collected test labels, shown as a table
# (one row per integer label id, plus the summary rows the report adds).
pd.DataFrame(classification_report(true_labels, predicted_labels, output_dict=True)).T

Unnamed: 0,precision,recall,f1-score,support
0,0.970682,0.977903,0.974279,727795.0
1,0.780423,0.771687,0.77603,12150.0
2,0.67114,0.741578,0.704603,13892.0
3,0.697083,0.733021,0.714601,12162.0
4,0.816825,0.831975,0.82433,40976.0
5,0.788814,0.740906,0.76411,46103.0
6,0.759426,0.614216,0.679146,17839.0
7,0.883026,0.88528,0.884151,37221.0
8,0.735641,0.619359,0.67251,12697.0
accuracy,0.93429,0.93429,0.93429,0.93429


In [6]:
model_path = "bilstm_crf_ner_few-nerd10.pth"
vocab_path = "vocab_few-nerd10.pkl"

# saving: weights first, then the lookup tables needed to rebuild the model
torch.save(model.state_dict(), model_path)

# Note: the "label_vocab" entry stores id2label (index -> label name)
vocab_bundle = {"word_vocab": word_vocab, "label_vocab": id2label}
with open(vocab_path, "wb") as vocab_file:
    pickle.dump(vocab_bundle, vocab_file)

In [7]:
# NOTE: pickle.load can execute arbitrary code while loading; only open vocabulary
# files that were written by this notebook.
with open(vocab_path, "rb") as f:
    vocab_data = pickle.load(f)
    
word_vocab = vocab_data["word_vocab"]
label_vocab = vocab_data["label_vocab"]  # saved from id2label: index -> label name

# Rebuild the architecture with matching sizes, then restore the trained weights
loaded_model = BiLSTM_CRF(vocab_size=len(word_vocab), label_size=len(label_vocab)).to(device)
# weights_only=True restricts unpickling to tensors and plain containers, which is all
# a state_dict needs, and avoids the torch.load warning. model_path is the checkpoint
# path defined alongside vocab_path.
loaded_model.load_state_dict(torch.load(model_path, map_location=device, weights_only=True))
loaded_model.eval() 

  loaded_model.load_state_dict(torch.load("bilstm_crf_ner_few-nerd10.pth", map_location=device))


BiLSTM_CRF(
  (embedding): Embedding(171197, 100)
  (lstm): LSTM(100, 64, num_layers=2, batch_first=True, dropout=0.5, bidirectional=True)
  (fc): Linear(in_features=128, out_features=9, bias=True)
  (crf): CRF(num_tags=9)
)

In [None]:
### CONLL 2003 NER PREDICTION ###
def predict_ner(sentence, model, word_vocab, label_vocab):
    """ 
    Predict NER entities in a sentence using a trained model.

    The sentence is split on whitespace, tagged with `model.predict`, and
    consecutive B-/I- tags of the same type are merged into one entity string.
    The model is switched to evaluation mode as a side effect.
    
    Args:
        sentence (str): Input sentence
        model: Trained NER model exposing predict(token_ids, mask), which returns
            one list of label indices per sentence
        word_vocab (Dict[str, int]): Vocabulary for words; must contain "<UNK>"
        label_vocab (Dict[str, int]): Mapping from BIO label name to label index
        
    Returns:
        entities (Dict[str, List[str]]): Entity type (the tag without its B-/I-
            prefix) mapped to the entity strings found, in order of appearance.
            Empty when the sentence contains no tokens.
    """
    tokens = sentence.split()
    if not tokens:
        # Nothing to tag; also avoids feeding a zero-length sequence to the model
        return {}

    # Inference must run with dropout disabled
    model.eval()

    # Build the inputs on the model's own device rather than a module-level `device`
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    # words to indices 
    token_ids = [word_vocab.get(word, word_vocab["<UNK>"]) for word in tokens]
    token_tensor = torch.tensor([token_ids], dtype=torch.long, device=model_device)

    # mask for valid tokens (no padding is added, so every position is valid)
    mask = torch.ones_like(token_tensor, dtype=torch.bool) 

    with torch.no_grad():
        predictions = model.predict(token_tensor, mask) # predictions

    # predictions to labels
    idx_to_label = {idx: label for label, idx in label_vocab.items()}
    predicted_labels = [idx_to_label[pred] for pred in predictions[0]]

    # Group BIO tags into entities keyed by type, e.g. "B-PER" "I-PER" -> "PER"
    entities = {}
    current_entity = []
    current_type = None

    for word, label in zip(tokens, predicted_labels):
        if label.startswith("B-"):  # beginning of an entity
            if current_entity:  
                entities.setdefault(current_type, []).append(" ".join(current_entity))
            current_type = label[2:]  
            current_entity = [word]  

        elif label.startswith("I-") and current_type == label[2:]:  
            # continuation of the entity currently being built
            current_entity.append(word)

        else:  # "O", or an I- tag that does not continue the open entity
            if current_entity:
                entities.setdefault(current_type, []).append(" ".join(current_entity))
                current_entity = []
                current_type = None

    # last entity handling
    if current_entity:
        entities.setdefault(current_type, []).append(" ".join(current_entity))

    return entities 


In [None]:
### FEW-NERD PREDICTION ###
# prediction but without merging the entities
def infer(model, sentence, word_vocab, id2label, max_length=128):
    """
    Tag every whitespace-separated token of `sentence`.

    The input is truncated/padded to `max_length`, matching how NERDataset
    prepares every example. When the model exposes `predict` the tags come from
    it (the CRF-decoded path used for validation and testing); otherwise the
    function falls back to an argmax over the model's output scores.

    Args:
        model: trained tagger; switched to evaluation mode as a side effect
        sentence (str): input sentence
        word_vocab (Dict[str, int]): word -> index, with "<PAD>" and "<UNK>"
        id2label (Dict[int, str]): label index -> label name
        max_length (int): sequence length fed to the model
    Returns:
        List[Tuple[str, str]]: (token, label name) pairs for at most
        `max_length` tokens; empty for a sentence without tokens.
    """
    model.eval()
    tokens = sentence.split()
    if not tokens:
        return []

    input_ids = [word_vocab.get(token, word_vocab["<UNK>"]) for token in tokens][:max_length]
    real_length = len(input_ids)
    input_ids += [word_vocab["<PAD>"]] * (max_length - real_length)

    # Put the inputs on the same device as the model's weights
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")
    input_tensor = torch.tensor([input_ids], dtype=torch.long, device=model_device)

    # Length-based boolean mask: stays contiguous even if a real token happens to
    # map to the <PAD> index.
    mask = torch.zeros_like(input_tensor, dtype=torch.bool)
    mask[:, :real_length] = True

    with torch.no_grad():
        if hasattr(model, "predict"):
            predicted_labels = model.predict(input_tensor, mask)[0]
        else:
            output = model(input_tensor, mask)
            if isinstance(output, tuple):
                output = output[0]  # Extract logits from tuple
            predicted_labels = torch.argmax(output, dim=2).squeeze(0).tolist()

    entity_predictions = [id2label[label] for label in predicted_labels[:len(tokens)]]

    return list(zip(tokens, entity_predictions))


# Example usage: tag one sentence with the reloaded model and show the pairs

sentence = "The capital of Nicaragua is Managua"
token_tags = infer(loaded_model, sentence, word_vocab, id2label)
print(token_tags)

[('The', 'O'), ('capital', 'O'), ('of', 'O'), ('Nicaragua', 'location'), ('is', 'O'), ('Managua', 'location')]


In [None]:
### FEW-NERD PREDICTION ###
# prediction but with merging the consecutive entities
def infer(model, sentence, word_vocab, id2label, max_length=128):
    """
    Tag `sentence` and merge runs of consecutive tokens that share a label.

    The input is truncated/padded to `max_length`, matching how NERDataset
    prepares every example. When the model exposes `predict` the tags come from
    it (the CRF-decoded path used for validation and testing); otherwise the
    function falls back to an argmax over the model's output scores.

    Args:
        model: trained tagger; switched to evaluation mode as a side effect
        sentence (str): input sentence
        word_vocab (Dict[str, int]): word -> index, with "<PAD>" and "<UNK>"
        id2label (Dict[int, str]): label index -> label name
        max_length (int): sequence length fed to the model
    Returns:
        List[Tuple[str, str]]: (entity text, label name) pairs in sentence order;
        tokens labelled 'O' are dropped. Empty for a sentence without tokens.
    """
    model.eval()
    tokens = sentence.split()
    if not tokens:
        return []

    input_ids = [word_vocab.get(token, word_vocab["<UNK>"]) for token in tokens][:max_length]
    real_length = len(input_ids)
    input_ids += [word_vocab["<PAD>"]] * (max_length - real_length)

    # Put the inputs on the same device as the model's weights
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")
    input_tensor = torch.tensor([input_ids], dtype=torch.long, device=model_device)

    # Length-based boolean mask: stays contiguous even if a real token happens to
    # map to the <PAD> index.
    mask = torch.zeros_like(input_tensor, dtype=torch.bool)
    mask[:, :real_length] = True

    with torch.no_grad():
        if hasattr(model, "predict"):
            predicted_labels = model.predict(input_tensor, mask)[0]
        else:
            output = model(input_tensor, mask)
            if isinstance(output, tuple):
                output = output[0]  # Extract logits from tuple
            predicted_labels = torch.argmax(output, dim=2).squeeze(0).tolist()

    entity_predictions = [id2label[label] for label in predicted_labels[:len(tokens)]]
    
    # Merge consecutive entities of the same type
    # (labels carry no B-/I- prefix here, so adjacent same-type tokens always fuse)
    merged_entities = []
    current_entity = None
    for token, entity in zip(tokens, entity_predictions):
        if entity != 'O':
            if current_entity and current_entity[1] == entity:
                current_entity[0] += " " + token
            else:
                if current_entity:
                    merged_entities.append(tuple(current_entity))
                current_entity = [token, entity]
        else:
            if current_entity:
                merged_entities.append(tuple(current_entity))
                current_entity = None
    if current_entity:
        merged_entities.append(tuple(current_entity))
    
    return merged_entities

# Rebuild the model and restore the saved weights before running the example
model = BiLSTM_CRF(vocab_size=len(word_vocab), label_size=len(label_vocab)).to(device)
# weights_only=True limits unpickling to tensors and plain containers (all a state_dict
# needs) and avoids the torch.load warning.
model.load_state_dict(torch.load("bilstm_crf_ner_few-nerd10.pth", map_location=device, weights_only=True))
model.eval() 
# Example usage
sentence = "Barack Obama was the president of United States of America"
print(infer(model, sentence, word_vocab, id2label))

[('Barack Obama', 'person'), ('United', 'organization'), ('States', 'location'), ('America', 'location')]


  model.load_state_dict(torch.load("bilstm_crf_ner_few-nerd10.pth", map_location=device))
