## Named Entity Recognition (NER) using BERT

In [None]:
import torch #pytorch library

import random
import numpy as np
import pandas as pd

SEED = 1234  # single seed shared by every random number generator seeded below

random.seed(SEED)  # seed Python's built-in random module
np.random.seed(SEED)  # seed NumPy's global random state
torch.manual_seed(SEED)  # seed PyTorch's random number generator
torch.backends.cudnn.deterministic = True  # request deterministic cuDNN behaviour

In [None]:
# Pinned to the release shown in this cell's install log; %pip installs into the
# environment of the running kernel.
%pip install transformers==3.1.0

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/ae/05/c8c55b600308dc04e95100dc8ad8a244dd800fe75dfafcf1d6348c6f6209/transformers-3.1.0-py3-none-any.whl (884kB)
[K     |████████████████████████████████| 890kB 5.4MB/s 
Collecting tokenizers==0.8.1.rc2
[?25l  Downloading https://files.pythonhosted.org/packages/80/83/8b9fccb9e48eeb575ee19179e2bdde0ee9a1904f97de5f02d19016b8804f/tokenizers-0.8.1rc2-cp36-cp36m-manylinux1_x86_64.whl (3.0MB)
[K     |████████████████████████████████| 3.0MB 17.0MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 21.8MB/s 
[?25hCollecting sentencepiece!=0.1.92
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB

In [None]:
import transformers
from transformers import BertTokenizer
# Load the pre-trained tokenizer for the 'bert-base-uncased' checkpoint.
# It is kept in a module-level constant because EntityDataset and the inference
# cell both read TOKENIZER directly.
TOKENIZER = BertTokenizer.from_pretrained('bert-base-uncased',do_lower_case=True)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [None]:
# number of entries in the tokenizer's vocabulary
len(TOKENIZER.vocab)

30522

In [None]:
from sklearn import preprocessing

# Location of the NER corpus (one token per row) on the mounted Google Drive.
DATA_PATH = "/content/drive/My Drive/Colab Notebooks/ner_dataset.csv"

df = pd.read_csv(DATA_PATH, encoding="latin-1")
# Propagate each sentence id down to the rows that follow it so every token row
# carries its sentence. Series.ffill() replaces the deprecated
# fillna(method="ffill") spelling.
df["Sentence #"] = df["Sentence #"].ffill()

# Map the categorical POS / Tag strings to integer class ids. The fitted
# encoders are kept so predictions can be decoded with inverse_transform.
# Plain column assignment replaces the column (and its dtype) outright.
enc_pos = preprocessing.LabelEncoder()
enc_tag = preprocessing.LabelEncoder()
df["POS"] = enc_pos.fit_transform(df["POS"])
df["Tag"] = enc_tag.fit_transform(df["Tag"])

# Group once, then collect per-sentence parallel lists of words, POS ids and
# tag ids. All three arrays share the same (sorted-by-key) sentence order.
by_sentence = df.groupby("Sentence #")
sentences = by_sentence["Word"].apply(list).values
pos = by_sentence["POS"].apply(list).values
tag = by_sentence["Tag"].apply(list).values

In [None]:
# inspect the frame after forward-filling sentence ids and label-encoding POS / Tag
df

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,19,16
1,Sentence: 1,of,10,16
2,Sentence: 1,demonstrators,19,16
3,Sentence: 1,have,35,16
4,Sentence: 1,marched,34,16
...,...,...,...,...
1048570,Sentence: 47959,they,22,16
1048571,Sentence: 47959,responded,32,16
1048572,Sentence: 47959,to,29,16
1048573,Sentence: 47959,the,7,16


In [None]:
# first sentence: its words, encoded POS ids and encoded tag ids (three parallel lists)
print(sentences[0],pos[0],tag[0])

['Thousands', 'of', 'demonstrators', 'have', 'marched', 'through', 'London', 'to', 'protest', 'the', 'war', 'in', 'Iraq', 'and', 'demand', 'the', 'withdrawal', 'of', 'British', 'troops', 'from', 'that', 'country', '.'] [19, 10, 19, 35, 34, 10, 17, 29, 31, 7, 16, 10, 17, 5, 31, 7, 16, 10, 11, 19, 10, 7, 16, 2] [16, 16, 16, 16, 16, 16, 2, 16, 16, 16, 16, 16, 2, 16, 16, 16, 16, 16, 3, 16, 16, 16, 16, 16]


In [None]:
# decode the first sentence's integer POS ids back to their original string labels
enc_pos.inverse_transform(pos[0])

array(['NNS', 'IN', 'NNS', 'VBP', 'VBN', 'IN', 'NNP', 'TO', 'VB', 'DT',
       'NN', 'IN', 'NNP', 'CC', 'VB', 'DT', 'NN', 'IN', 'JJ', 'NNS', 'IN',
       'DT', 'NN', '.'], dtype=object)

In [None]:
# decode the first sentence's integer tag ids back to their original string labels
enc_tag.inverse_transform(tag[0])

array(['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O',
       'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-gpe', 'O', 'O', 'O', 'O', 'O'],
      dtype=object)

In [None]:
# Count the distinct POS and tag classes learned by the label encoders;
# these sizes define the width of the model's two output layers.
num_pos, num_tag = len(enc_pos.classes_), len(enc_tag.classes_)
print("number of pos classes {}".format(num_pos))
print("number of tag classes {}".format(num_tag))

number of pos classes 42
number of tag classes 17


In [None]:
# hyperparameters
MAX_LEN = 128  # fixed token length of every example, including the two special tokens
TRAIN_BATCH_SIZE = 32  # examples per training batch
VALID_BATCH_SIZE = 8  # examples per validation batch
EPOCHS = 10  # number of passes over the training set

In [None]:
# Hold out 10% of the sentences for validation. The three parallel arrays are
# split together so words, POS ids and tag ids stay aligned per sentence.
from sklearn import model_selection

(
    train_sentences, test_sentences,
    train_pos, test_pos,
    train_tag, test_tag,
) = model_selection.train_test_split(
    sentences, pos, tag, random_state=42, test_size=0.1
)

In [None]:
class EntityDataset:
    """Map-style dataset turning word-level sentences into fixed-length BERT inputs.

    Every word is tokenized on its own; the word's POS id and tag id are
    repeated once per produced sub-token so labels stay aligned with input ids.

    Args:
        texts: sequence of sentences, each a list of word strings.
        pos: per-word POS ids, parallel to ``texts``.
        tags: per-word tag ids, parallel to ``texts``.
        tokenizer: optional tokenizer exposing ``encode``, ``cls_token_id`` and
            ``sep_token_id``. Defaults to the module-level ``TOKENIZER``.
        max_len: optional fixed output length. Defaults to the module-level
            ``MAX_LEN``.
    """

    def __init__(self, texts, pos, tags, tokenizer=None, max_len=None):
        self.texts = texts
        self.pos = pos
        self.tags = tags
        # None means "use the notebook-level default", resolved at access time
        # so existing three-argument callers behave exactly as before.
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of sentences."""
        return len(self.texts)

    def __getitem__(self, item):
        """Return the tensors for sentence ``item``.

        Returns:
            dict with keys ``ids``, ``mask``, ``token_type_ids``, ``target_pos``
            and ``target_tag``; each value is a 1-D ``torch.long`` tensor of
            length ``max_len``.
        """
        tokenizer = TOKENIZER if self.tokenizer is None else self.tokenizer
        max_len = MAX_LEN if self.max_len is None else self.max_len

        text = self.texts[item]
        pos = self.pos[item]
        tags = self.tags[item]

        ids = []
        target_pos = []
        target_tag = []
        for i, word in enumerate(text):
            # Tokenize one word at a time so its label can be copied onto
            # every sub-token it produces.
            pieces = tokenizer.encode(word, add_special_tokens=False)
            ids.extend(pieces)
            target_pos.extend([pos[i]] * len(pieces))
            target_tag.extend([tags[i]] * len(pieces))

        # Reserve two positions for the start/end special tokens, truncating
        # sentences that are too long.
        room = max_len - 2
        ids = [tokenizer.cls_token_id] + ids[:room] + [tokenizer.sep_token_id]
        # NOTE(review): the special-token positions get label 0 while their
        # mask is 1, so any loss restricted to mask == 1 still scores them as
        # class 0.
        target_pos = [0] + target_pos[:room] + [0]
        target_tag = [0] + target_tag[:room] + [0]

        # 1 marks a real token, 0 marks padding.
        mask = [1] * len(ids)

        padding = [0] * (max_len - len(ids))
        ids = ids + padding
        mask = mask + padding
        target_pos = target_pos + padding
        target_tag = target_tag + padding
        # Single-segment input: every position belongs to segment 0.
        token_type_ids = [0] * len(ids)

        return {
            "ids": torch.tensor(ids, dtype=torch.long),
            "mask": torch.tensor(mask, dtype=torch.long),
            "token_type_ids": torch.tensor(token_type_ids, dtype=torch.long),
            "target_pos": torch.tensor(target_pos, dtype=torch.long),
            "target_tag": torch.tensor(target_tag, dtype=torch.long),
        }

In [None]:
# Build the train / validation datasets and their loaders.
train_dataset = EntityDataset(
    texts=train_sentences, pos=train_pos, tags=train_tag
)
# shuffle=True reshuffles the training examples every epoch so the model does
# not see the batches in the same fixed order on each pass.
train_data_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=4
)

valid_dataset = EntityDataset(
    texts=test_sentences, pos=test_pos, tags=test_tag
)
# Validation order is left unshuffled.
valid_data_loader = torch.utils.data.DataLoader(
    valid_dataset, batch_size=VALID_BATCH_SIZE, num_workers=1
)

In [None]:
# number of training sentences
len(train_dataset)

43163

In [None]:
# number of validation sentences
len(valid_dataset)

4796

In [None]:
# run on the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## Building the model

In [None]:
from transformers import BertModel
# Load the pre-trained 'bert-base-uncased' encoder. EntityModel reads this
# module-level object directly, so every EntityModel instance wraps this same encoder.
bert = BertModel.from_pretrained('bert-base-uncased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




In [None]:
def loss_fn(output, target, mask, num_labels):
    """Cross-entropy computed over the non-padded token positions only.

    Labels at positions whose ``mask`` value is not 1 are swapped for the
    criterion's ``ignore_index``, so those positions add nothing to the loss.

    Args:
        output: logits that can be viewed as ``(-1, num_labels)``.
        target: integer class ids, one per logit row once flattened.
        mask: attention mask with one entry per target; 1 marks a real token.
        num_labels: size of the class dimension of ``output``.

    Returns:
        Scalar tensor holding the mean cross-entropy over positions with mask == 1.
    """
    criterion = nn.CrossEntropyLoss()
    # True where the token is real (not padding).
    keep = mask.view(-1) == 1
    flat_logits = output.view(-1, num_labels)
    # Overwrite padded labels with ignore_index (-100 by default) so the
    # criterion skips them.
    flat_labels = target.view(-1).masked_fill(~keep, criterion.ignore_index)
    return criterion(flat_logits, flat_labels)


In [None]:
import torch.nn as nn

In [None]:
class EntityModel(nn.Module):
    """BERT encoder with two token-level classification heads (NER tag and POS).

    Args:
        num_tag: number of NER tag classes (width of the tag head).
        num_pos: number of POS classes (width of the POS head).
    """

    def __init__(self, num_tag, num_pos):
        super().__init__()
        self.num_tag = num_tag
        self.num_pos = num_pos
        # Module-level pre-trained encoder; it is trained together with the heads.
        self.bert = bert
        # Read the encoder width from its config instead of hard-coding 768.
        hidden_size = self.bert.config.hidden_size
        self.bert_drop_1 = nn.Dropout(0.3)
        self.bert_drop_2 = nn.Dropout(0.3)
        self.out_tag = nn.Linear(hidden_size, self.num_tag)
        self.out_pos = nn.Linear(hidden_size, self.num_pos)

    def forward(self, ids, mask, token_type_ids, target_pos, target_tag):
        """Compute per-token logits for both heads and the combined loss.

        Returns:
            ``(tag_logits, pos_logits, loss)`` where ``loss`` is the mean of the
            tag loss and the POS loss.
        """
        encoder_out = self.bert(ids, attention_mask=mask, token_type_ids=token_type_ids)
        # Index rather than tuple-unpack: element 0 is the per-token hidden
        # states whether the encoder returns a plain tuple or a ModelOutput
        # (unpacking a ModelOutput would yield its keys, not tensors).
        o1 = encoder_out[0]
        # Each head gets its own dropout over the shared encoder output.
        bo_tag = self.bert_drop_1(o1)
        bo_pos = self.bert_drop_2(o1)
        # Project hidden states to class logits.
        tag = self.out_tag(bo_tag)
        pos = self.out_pos(bo_pos)
        # Per-head losses, restricted by loss_fn to positions where mask == 1.
        loss_tag = loss_fn(tag, target_tag, mask, self.num_tag)
        loss_pos = loss_fn(pos, target_pos, mask, self.num_pos)
        # The training objective is the average of the two task losses.
        loss = (loss_tag + loss_pos) / 2

        return tag, pos, loss

In [None]:
# create the model and move its parameters to the selected device
model = EntityModel(num_tag=num_tag, num_pos=num_pos)
model.to(device)

EntityModel(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)


In [None]:
from tqdm import tqdm
def train_fn(data_loader, model, optimizer, device, scheduler):
    """Run one training epoch and return the mean loss per batch.

    Args:
        data_loader: iterable of batches, each a dict of tensors accepted by
            ``model`` as keyword arguments.
        model: callable returning a 3-tuple whose last element is the loss.
        optimizer: optimizer stepped once per batch.
        device: device every batch tensor is moved to.
        scheduler: learning-rate scheduler stepped once per batch.

    Returns:
        Sum of the batch losses divided by ``len(data_loader)``.
    """
    model.train()
    batch_losses = []
    # tqdm renders a progress bar over the batches of this epoch
    for batch in tqdm(data_loader, total=len(data_loader)):
        # move every tensor of the batch onto the target device
        for key, tensor in batch.items():
            batch[key] = tensor.to(device)
        # clear gradients left over from the previous step
        optimizer.zero_grad()
        _, _, loss = model(**batch)
        loss.backward()
        # update the weights, then advance the learning-rate schedule
        optimizer.step()
        scheduler.step()
        batch_losses.append(loss.item())
    return sum(batch_losses) / len(data_loader)

In [None]:
def eval_fn(data_loader, model, device):
    """Evaluate the model on ``data_loader`` and return the mean loss per batch.

    Args:
        data_loader: iterable of batches, each a dict of tensors accepted by
            ``model`` as keyword arguments.
        model: callable returning a 3-tuple whose last element is the loss.
        device: device every batch tensor is moved to.

    Returns:
        Sum of the batch losses divided by ``len(data_loader)``.
    """
    model.eval()
    final_loss = 0
    # No gradients are needed for evaluation; disabling autograd avoids
    # building and holding a computation graph for every batch.
    with torch.no_grad():
        for data in tqdm(data_loader, total=len(data_loader)):
            for k, v in data.items():
                data[k] = v.to(device)
            _, _, loss = model(**data)
            final_loss += loss.item()
    return final_loss / len(data_loader)

In [None]:
# (name, parameter) pairs for every model parameter; the names are used to
# decide which parameters receive weight decay
param_optimizer = list(model.named_parameters())

In [None]:
# parameters whose name contains any of these substrings are excluded from weight decay
no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]

In [None]:
# Two optimizer parameter groups: first every parameter whose name matches no
# entry of no_decay (weight decay 0.001), then the matching ones (no decay).
optimizer_parameters = [
    {
        "params": [
            p
            for n, p in param_optimizer
            if any(nd in n for nd in no_decay) == exempt
        ],
        "weight_decay": decay,
    }
    for exempt, decay in ((False, 0.001), (True, 0.0))
]

In [None]:
# The scheduler is stepped once per batch, so the total is batches-per-epoch
# times epochs. len(train_data_loader) counts the final partial batch, which
# len(train_sentences) / TRAIN_BATCH_SIZE truncated away (13488 vs 13490 steps).
num_train_steps = len(train_data_loader) * EPOCHS

In [None]:
# transformers' own AdamW is deprecated in favour of the PyTorch implementation,
# so the optimizer class comes from torch.optim.
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup
# eps=1e-6 matches the default of the transformers AdamW this replaces
# (torch's own default is 1e-8). Weight decay is set per parameter group.
optimizer = AdamW(optimizer_parameters, lr=3e-5, eps=1e-6)
# Linear schedule: the learning rate rises from 0 to the optimizer's lr over the
# warmup steps (none here), then decays linearly to 0 at num_train_steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=num_train_steps
)

In [None]:
# Train for EPOCHS epochs, writing a checkpoint whenever the validation loss
# reaches a new minimum.
best_loss = np.inf
for epoch in range(EPOCHS):
    train_loss = train_fn(train_data_loader, model, optimizer, device, scheduler)
    test_loss = eval_fn(valid_data_loader, model, device)
    print(f"Train Loss = {train_loss} Valid Loss = {test_loss}")
    if not test_loss < best_loss:
        # no improvement on the validation set: keep the previous checkpoint
        continue
    torch.save(model.state_dict(),'model_ner_bert.pt')
    best_loss = test_loss

100%|██████████| 1349/1349 [16:08<00:00,  1.39it/s]
100%|██████████| 600/600 [00:43<00:00, 13.72it/s]


Train Loss = 0.22828843980044503 Valid Loss = 0.10512676869208613


100%|██████████| 1349/1349 [16:13<00:00,  1.39it/s]
100%|██████████| 600/600 [00:43<00:00, 13.68it/s]


Train Loss = 0.09406686915820665 Valid Loss = 0.09437256455576669


100%|██████████| 1349/1349 [16:14<00:00,  1.38it/s]
100%|██████████| 600/600 [00:43<00:00, 13.69it/s]


Train Loss = 0.07393012213499305 Valid Loss = 0.09256284738114724


100%|██████████| 1349/1349 [16:14<00:00,  1.38it/s]
100%|██████████| 600/600 [00:43<00:00, 13.69it/s]
  0%|          | 0/1349 [00:00<?, ?it/s]

Train Loss = 0.06023657892183784 Valid Loss = 0.0953140419938912


100%|██████████| 1349/1349 [16:14<00:00,  1.38it/s]
100%|██████████| 600/600 [00:44<00:00, 13.60it/s]
  0%|          | 0/1349 [00:00<?, ?it/s]

Train Loss = 0.04904235566823327 Valid Loss = 0.10056962445921575


100%|██████████| 1349/1349 [16:13<00:00,  1.39it/s]
100%|██████████| 600/600 [00:43<00:00, 13.64it/s]
  0%|          | 0/1349 [00:00<?, ?it/s]

Train Loss = 0.04054407985277579 Valid Loss = 0.10475413876896103


100%|██████████| 1349/1349 [16:12<00:00,  1.39it/s]
100%|██████████| 600/600 [00:43<00:00, 13.69it/s]
  0%|          | 0/1349 [00:00<?, ?it/s]

Train Loss = 0.033801730649008675 Valid Loss = 0.10976771191461011


100%|██████████| 1349/1349 [16:11<00:00,  1.39it/s]
100%|██████████| 600/600 [00:43<00:00, 13.68it/s]
  0%|          | 0/1349 [00:00<?, ?it/s]

Train Loss = 0.028820146838295584 Valid Loss = 0.11395437344753494


100%|██████████| 1349/1349 [16:12<00:00,  1.39it/s]
100%|██████████| 600/600 [00:43<00:00, 13.65it/s]
  0%|          | 0/1349 [00:00<?, ?it/s]

Train Loss = 0.024981357529345178 Valid Loss = 0.11776338476144398


100%|██████████| 1349/1349 [16:12<00:00,  1.39it/s]
100%|██████████| 600/600 [00:43<00:00, 13.67it/s]

Train Loss = 0.022838145327043917 Valid Loss = 0.11790074969292619





In [None]:
# Predict tags for one example sentence with the best saved checkpoint.
sentence = 'Jim bought 300 shares of Acme Corp in 2006'
# Token ids of the whole sentence, special tokens included; used below only
# to know how many output positions to decode.
tokenized_sentence = TOKENIZER.encode(sentence)
# Split into words, the input format EntityDataset expects.
words = sentence.split()
print(words)
print(tokenized_sentence)
# The labels are placeholders: only the logits are used at inference time.
test_dataset = EntityDataset(
    texts=[words],
    pos=[[0] * len(words)],
    tags=[[0] * len(words)],
)

# Use the GPU only when one is present, so the cell also runs on CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = EntityModel(num_tag=num_tag, num_pos=num_pos)
# map_location lets a checkpoint saved on GPU load on a CPU-only machine.
model.load_state_dict(torch.load('model_ner_bert.pt', map_location=device))
model.to(device)
# Evaluation mode turns dropout off; without it predictions are randomly perturbed.
model.eval()
with torch.no_grad():
    data = test_dataset[0]
    for k, v in data.items():
        # add a leading batch dimension of size 1
        data[k] = v.to(device).unsqueeze(0)
    # Separate names keep the module-level `tag` / `pos` label arrays intact.
    tag_logits, pos_logits, _ = model(**data)
    # Most likely class per position, decoded to labels and trimmed to the
    # real (non-padded) tokens.
    print(enc_tag.inverse_transform(tag_logits.argmax(2).cpu().numpy().reshape(-1))[:len(tokenized_sentence)])
    print(enc_pos.inverse_transform(pos_logits.argmax(2).cpu().numpy().reshape(-1))[:len(tokenized_sentence)])

['Jim', 'bought', '300', 'shares', 'of', 'Acme', 'Corp', 'in', '2006']
[101, 3958, 4149, 3998, 6661, 1997, 9353, 4168, 13058, 1999, 2294, 102]
['B-art' 'B-per' 'O' 'O' 'O' 'O' 'B-org' 'B-org' 'I-org' 'O' 'B-tim'
 'B-art']
['$' 'NNP' 'VBD' 'CD' 'NNS' 'IN' 'NNP' 'NNP' 'NNP' 'IN' 'CD' '$']
