# Named Entity Recognition with BERT.

In [1]:
import torch as T
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from transformers import AutoTokenizer, BertModel

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import preprocessing
from sklearn import model_selection

from typing import List, Dict, Tuple, Optional
import warnings
from tqdm import tqdm
warnings.simplefilter('ignore')

## Dataset

In [2]:
df = pd.read_csv('/kaggle/input/entity-annotated-corpus/ner_dataset.csv', encoding='latin-1')
df.head(10)

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O
5,,through,IN,O
6,,London,NNP,B-geo
7,,to,TO,O
8,,protest,VB,O
9,,the,DT,O


In [3]:
df['Sentence #'] = df['Sentence #'].fillna(method='ffill')
df.head(10)

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O
5,Sentence: 1,through,IN,O
6,Sentence: 1,London,NNP,B-geo
7,Sentence: 1,to,TO,O
8,Sentence: 1,protest,VB,O
9,Sentence: 1,the,DT,O


In [4]:
class Config:
    max_len = 128
    epochs = 20
    base_model = "bert-base-uncased"
    tokenizer = AutoTokenizer.from_pretrained(base_model, do_lower_case=True)
    train_batch_size = 32
    valid_batch_size = 8
    device = T.device("cuda" if T.has_cuda else "mps" if T.has_mps else "cpu")

config = Config()

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [5]:
config.tokenizer("hello", add_special_tokens=False)

{'input_ids': [7592], 'token_type_ids': [0], 'attention_mask': [1]}

In [6]:
config.tokenizer("[PAD]", add_special_tokens=False)

{'input_ids': [0], 'token_type_ids': [0], 'attention_mask': [1]}

In [7]:
config.tokenizer("[CLS]", add_special_tokens=False)

{'input_ids': [101], 'token_type_ids': [0], 'attention_mask': [1]}

In [8]:
config.tokenizer("[SEP]", add_special_tokens=False)

{'input_ids': [102], 'token_type_ids': [0], 'attention_mask': [1]}

In [9]:
class NERDataset(Dataset):
    def __init__(self, sentences: List[List[str]], pos: List[List[int]], tags: List[List[int]]) -> None:
        super(NERDataset, self).__init__()
        self.sentences = sentences
        self._pos = pos
        self._tags = tags
    
    def __len__(self) -> int:
        return len(self.sentences)
    
    def __getitem__(self, ix) -> Dict[str, T.Tensor]:
        text = self.sentences[ix]
        pos = self._pos[ix]
        tags = self._tags[ix]
        
        ids = []
        target_pos = []
        target_tags = []
        
        for i, s in enumerate(text):
            inputs = config.tokenizer(s, add_special_tokens=False)
            
            input_len = len(inputs['input_ids'])
            
            ids.extend(inputs['input_ids'])
            target_pos.extend([pos[i]] * input_len)
            target_tags.extend([tags[i]] * input_len)
            
        # adding the [CLS] and [SEP] tokens.
        ids = ids[:config.max_len - 2] # because we also need to add the [CLS] and [SEP] tokens
        target_pos = target_pos[:config.max_len - 2]
        target_tags = target_tags[:config.max_len - 2]
        
        ids = [101] + ids + [102]
        target_pos = [0] + target_pos + [0]
        target_tags = [0] + target_tags + [0]
        
        attention_mask = [1] * len(ids)
        token_type_ids = [0] * len(ids)
        
        # add padding.
        padding_len = config.max_len - len(ids)
        ids = ids + [0] * padding_len
        target_pos = target_pos + [0] * padding_len
        target_tags = target_tags + [0] * padding_len
        
        # building attention masks and token_type_ids
        attention_mask = attention_mask + [0] * padding_len
        token_type_ids = token_type_ids + [0] * padding_len
        
        return {
            'input_ids': T.tensor(ids, dtype=T.long),
            'attention_mask': T.tensor(attention_mask, dtype=T.long),
            'token_type_ids': T.tensor(token_type_ids, dtype=T.long),
            'target_pos': T.tensor(target_pos, dtype=T.long),
            'target_tag': T.tensor(target_tags, dtype=T.long)
        }

### Encoding the POS and Tags

In [10]:
pos_encoder = preprocessing.LabelEncoder()
tag_encoder = preprocessing.LabelEncoder()

df['POS'] = pos_encoder.fit_transform(df['POS'])
df['Tag'] = tag_encoder.fit_transform(df['Tag'])

In [11]:
df

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,19,16
1,Sentence: 1,of,10,16
2,Sentence: 1,demonstrators,19,16
3,Sentence: 1,have,35,16
4,Sentence: 1,marched,34,16
...,...,...,...,...
1048570,Sentence: 47959,they,22,16
1048571,Sentence: 47959,responded,32,16
1048572,Sentence: 47959,to,29,16
1048573,Sentence: 47959,the,7,16


## Grouping the words

In [12]:
sentences = df.groupby('Sentence #')['Word'].apply(list).values

In [13]:
pos = df.groupby('Sentence #')['POS'].apply(list).values
tags = df.groupby('Sentence #')['Tag'].apply(list).values

In [14]:
for ix, s in enumerate(sentences):
    print(f'#{ix+1}\n')
    print(f's: {s}\n')
    print(f'pos: {pos[ix]}\n')
    print(f'tags: {tags[ix]}\n')
    
    if ix == 2:
        break
    print(f'\n\n')

#1

s: ['Thousands', 'of', 'demonstrators', 'have', 'marched', 'through', 'London', 'to', 'protest', 'the', 'war', 'in', 'Iraq', 'and', 'demand', 'the', 'withdrawal', 'of', 'British', 'troops', 'from', 'that', 'country', '.']

pos: [19, 10, 19, 35, 34, 10, 17, 29, 31, 7, 16, 10, 17, 5, 31, 7, 16, 10, 11, 19, 10, 7, 16, 2]

tags: [16, 16, 16, 16, 16, 16, 2, 16, 16, 16, 16, 16, 2, 16, 16, 16, 16, 16, 3, 16, 16, 16, 16, 16]




#2

s: ['Iranian', 'officials', 'say', 'they', 'expect', 'to', 'get', 'access', 'to', 'sealed', 'sensitive', 'parts', 'of', 'the', 'plant', 'Wednesday', ',', 'after', 'an', 'IAEA', 'surveillance', 'system', 'begins', 'functioning', '.']

pos: [11, 19, 35, 22, 35, 29, 31, 16, 29, 11, 11, 19, 10, 7, 16, 17, 1, 10, 7, 17, 16, 16, 36, 33, 2]

tags: [3, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 7, 16, 16, 16, 5, 16, 16, 16, 16, 16]




#3

s: ['Helicopter', 'gunships', 'Saturday', 'pounded', 'militant', 'hideouts', 'in', 'the', 'Orakzai', 'tribal', 'region

### Splitting data into train and validation

In [15]:
train_sentence, valid_sentence, train_pos, valid_pos, train_tags, valid_tags = \
model_selection.train_test_split(sentences, pos, tags, test_size=0.1, random_state=42)

## Build the datasets and dataloaders

In [16]:
train_dataset = NERDataset(
    sentences=train_sentence, 
    pos=train_pos, 
    tags=train_tags
)

valid_dataset = NERDataset(
    sentences=valid_sentence, 
    pos=valid_pos, 
    tags=valid_tags
)

In [17]:
trainloader = DataLoader(
    dataset=train_dataset, 
    batch_size=config.train_batch_size, 
    shuffle=True, 
    drop_last=True
)

validloader = DataLoader(
    dataset=valid_dataset, 
    batch_size=config.valid_batch_size, 
    drop_last=True
)

In [18]:
batch = next(iter(trainloader))

for k, v in batch.items():
    print(f'{k}: {v.shape}')

input_ids: torch.Size([32, 128])
attention_mask: torch.Size([32, 128])
token_type_ids: torch.Size([32, 128])
target_pos: torch.Size([32, 128])
target_tag: torch.Size([32, 128])


In [19]:
batch = next(iter(validloader))

for k, v in batch.items():
    print(f'{k}: {v.shape}')

input_ids: torch.Size([8, 128])
attention_mask: torch.Size([8, 128])
token_type_ids: torch.Size([8, 128])
target_pos: torch.Size([8, 128])
target_tag: torch.Size([8, 128])


## Model

In [20]:
n_pos = len(pos_encoder.classes_)
n_tag = len(tag_encoder.classes_)

n_pos, n_tag

(42, 17)

In [21]:
class NERModel(nn.Module):
    def __init__(self, n_pos, n_tag):
        super(NERModel, self).__init__()
        self.bert = BertModel.from_pretrained(config.base_model)
        self.pos_drop = nn.Dropout(p=0.3)
        self.tag_drop = nn.Dropout(p=0.3)
        self.pos_linear = nn.Linear(768, n_pos)
        self.tag_linear = nn.Linear(768, n_tag)
    
    def forward(self, input_ids, attention_mask, token_type_ids, target_pos, target_tag):
        bert_out = self.bert(
            input_ids=input_ids, 
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        ).last_hidden_state
        
        pos_out = self.pos_drop(bert_out)
        tag_out = self.pos_drop(bert_out)
        
        pos_out = self.pos_linear(pos_out)
        tag_out = self.tag_linear(tag_out)
        
        return pos_out, tag_out

### Testing

In [22]:
model = NERModel(n_pos=n_pos, n_tag=n_tag)
pos_out, tag_out = model.forward(**batch)

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [23]:
pos_out.shape, tag_out.shape

(torch.Size([8, 128, 42]), torch.Size([8, 128, 17]))

In [24]:
batch['target_pos'].shape, batch['target_tag'].shape

(torch.Size([8, 128]), torch.Size([8, 128]))

## Training

### Loss Function

In [25]:
def loss_fn(prediction, target, masks, n_classes):
    lfn = nn.CrossEntropyLoss()
    '''We don't want to calculate the loss for the PAD tokens.'''
    logits = prediction.view(-1, n_classes)
    
    masks = masks.view(-1)
    
    target = target.view(-1)
    target = T.where(
        masks == 1, 
        target, 
        T.tensor(lfn.ignore_index).type_as(target)
    )
    
    return lfn(logits, target)

In [26]:
loss_fn(pos_out, batch['target_pos'], batch['attention_mask'], n_pos)

tensor(3.8552, grad_fn=<NllLossBackward0>)

### Training and Eval functions

In [27]:
def train_fn(dataloader, model, optimizer, scheduler):
    model.train()
    
    running_loss = 0.0
    for batch in tqdm(dataloader):
        for k, v in batch.items():
            batch[k] = v.to(config.device)
        
        pos_out, tag_out = model.forward(**batch)
        
        pos_loss = loss_fn(pos_out, batch['target_pos'], batch['attention_mask'], n_pos)
        tag_loss = loss_fn(tag_out, batch['target_tag'], batch['attention_mask'], n_tag)
        
        total_loss = (pos_loss + tag_loss) / 2
        
        optimizer.zero_grad()
        total_loss.backward()
        optimizer.step()
        scheduler.step()
        
        running_loss += total_loss.item()
    
    return running_loss / len(dataloader)


def eval_fn(dataloader, model):
    model.eval()
    
    running_loss = 0.0
    for batch in dataloader:
        for k, v in batch.items():
            batch[k] = v.to(config.device)
        
        pos_out, tag_out = model.forward(**batch)
        
        pos_loss = loss_fn(pos_out, batch['target_pos'], batch['attention_mask'], n_pos)
        tag_loss = loss_fn(tag_out, batch['target_tag'], batch['attention_mask'], n_tag)
        
        total_loss = (pos_loss + tag_loss) / 2
        running_loss += total_loss.item()
    
    return running_loss / len(dataloader)

## Train the model

In [28]:
model = NERModel(n_pos=n_pos, n_tag=n_tag).to(config.device)
optimizer = optim.Adam(model.parameters())
scheduler = optim.lr_scheduler.StepLR(optimizer=optimizer, step_size=5, gamma=0.1)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [29]:
config.device

device(type='cuda')

In [30]:
for epoch in range(config.epochs):
    train_loss = train_fn(trainloader, model, optimizer, scheduler)
    valid_loss = eval_fn(validloader, model)
    
    print(f'Epoch #{epoch+1} / {config.epochs} -- train loss: {train_loss:.3f}; valid loss: {valid_loss:.3f}')

100%|██████████| 1348/1348 [09:09<00:00,  2.45it/s]


Epoch #1 / 20 -- train loss: 2.055; valid loss: 2.030


100%|██████████| 1348/1348 [09:07<00:00,  2.46it/s]


Epoch #2 / 20 -- train loss: 2.051; valid loss: 2.030


100%|██████████| 1348/1348 [09:08<00:00,  2.46it/s]


Epoch #3 / 20 -- train loss: 2.051; valid loss: 2.030


100%|██████████| 1348/1348 [09:08<00:00,  2.46it/s]


Epoch #4 / 20 -- train loss: 2.052; valid loss: 2.030


100%|██████████| 1348/1348 [09:08<00:00,  2.46it/s]


Epoch #5 / 20 -- train loss: 2.052; valid loss: 2.030


100%|██████████| 1348/1348 [09:07<00:00,  2.46it/s]


Epoch #6 / 20 -- train loss: 2.051; valid loss: 2.030


100%|██████████| 1348/1348 [09:08<00:00,  2.46it/s]


Epoch #7 / 20 -- train loss: 2.052; valid loss: 2.030


100%|██████████| 1348/1348 [09:07<00:00,  2.46it/s]


Epoch #8 / 20 -- train loss: 2.051; valid loss: 2.030


100%|██████████| 1348/1348 [09:10<00:00,  2.45it/s]


Epoch #9 / 20 -- train loss: 2.051; valid loss: 2.030


100%|██████████| 1348/1348 [09:09<00:00,  2.45it/s]


Epoch #10 / 20 -- train loss: 2.052; valid loss: 2.030


100%|██████████| 1348/1348 [09:10<00:00,  2.45it/s]


Epoch #11 / 20 -- train loss: 2.051; valid loss: 2.030


100%|██████████| 1348/1348 [09:08<00:00,  2.46it/s]


Epoch #12 / 20 -- train loss: 2.052; valid loss: 2.030


100%|██████████| 1348/1348 [09:09<00:00,  2.45it/s]


Epoch #13 / 20 -- train loss: 2.051; valid loss: 2.030


100%|██████████| 1348/1348 [09:09<00:00,  2.45it/s]


Epoch #14 / 20 -- train loss: 2.051; valid loss: 2.030


100%|██████████| 1348/1348 [09:09<00:00,  2.45it/s]


Epoch #15 / 20 -- train loss: 2.051; valid loss: 2.030


100%|██████████| 1348/1348 [09:11<00:00,  2.45it/s]


Epoch #16 / 20 -- train loss: 2.051; valid loss: 2.030


100%|██████████| 1348/1348 [09:11<00:00,  2.45it/s]


Epoch #17 / 20 -- train loss: 2.051; valid loss: 2.030


100%|██████████| 1348/1348 [09:11<00:00,  2.44it/s]


Epoch #18 / 20 -- train loss: 2.051; valid loss: 2.030


100%|██████████| 1348/1348 [09:11<00:00,  2.44it/s]


Epoch #19 / 20 -- train loss: 2.051; valid loss: 2.030


100%|██████████| 1348/1348 [09:11<00:00,  2.44it/s]


Epoch #20 / 20 -- train loss: 2.051; valid loss: 2.030


## Evaluation

In [31]:
sentence = "My name is Sarah and I live in London"
tokenized = config.tokenizer.encode(sentence)
tokenized

[101, 2026, 2171, 2003, 4532, 1998, 1045, 2444, 1999, 2414, 102]

In [32]:
eval_dataset = NERDataset(
    sentences=[sentence.split()],
    pos = [[0] * len(sentence.split())],
    tags = [[0] * len(sentence.split())]
)

In [33]:
batch = eval_dataset[0]

for k, v in batch.items():
    batch[k] = v.to(config.device).unsqueeze(0)

batch

{'input_ids': tensor([[ 101, 2026, 2171, 2003, 4532, 1998, 1045, 2444, 1999, 2414,  102,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0]], device='cuda:0'),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [34]:
pos_out, tag_out = model(**batch)
pos_out.shape, tag_out.shape

(torch.Size([1, 128, 42]), torch.Size([1, 128, 17]))

In [35]:
pos_enc = T.argmax(pos_out, dim=2).cpu().numpy().reshape(-1)
tag_enc = T.argmax(tag_out, dim=2).cpu().numpy().reshape(-1)

In [36]:
pos_encoder.inverse_transform(pos_enc)[:len(sentence.split())]

array(['NNP', 'NNP', 'NNP', 'NNP', 'NNP', 'NNP', 'NNP', 'NNP', 'NNP'],
      dtype=object)

In [37]:
tag_encoder.inverse_transform(tag_enc)[:len(sentence.split())]

array(['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], dtype=object)