# Building Named Entity Recognition 
## RNN-based implementation using word-level features
Named Entity Recognition (NER), also called named entity resolution, tags sub-parts of a sentence with a definite class. Such a sub-part can be a single word or a combination of several words occurring together. NER is one of the hot topics in the field of NLP and has many powerful practical use cases; some of them are given below:

1. Building efficient search engines by extracting key terms from the text.
2. Suggesting reading content on the basis of the entities mentioned in the literature; similarly, suggesting products based on their descriptions.
3. Keeping an eye on the market by parsing feeds from Twitter.



## Importing requirements


In [None]:
import pandas as pd
import chakin
import matplotlib.pyplot as plt
from torchtext import data
import nltk
import json
from torchtext import vocab
from tqdm import tqdm
import torch
from torch import nn
import random
import torchtext
import traceback
from tensorboardX import SummaryWriter
from torch.autograd import Variable
import re
import pandas as pd
import os
import numpy as np
import sys
import torch.nn.functional as F
import random
import tarfile
import urllib
from torchtext import data
import datetime
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
import torch
import numpy as np
import json

In [None]:
# Reproducibility setup: one fixed seed for the CPU and CUDA generators, and
# deterministic cuDNN kernels. `device` prefers the GPU when one is available.
SEED = 1234
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
torch.backends.cudnn.deterministic = True
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)

## Preprocessing
1. Preprocessing data
2. Defining character set
3. Constructing data iterator


In [None]:
class Preprocess:
    """Parse a column-formatted corpus and build context windows around each word.

    Args:
        embeddings_file: Path to a text file with one embedding per line:
            a token followed by its float components, separated by single spaces.
        data_file: Iterable of corpus lines. Each non-empty line is split on
            tabs; the first column is the word and the last column is the label.
        sliding_window: Number of context words taken on EACH side of the
            target word, so every window holds ``2 * sliding_window + 1`` words.

    Attributes populated by the constructor:
        words, features, labels: per-token columns read from ``data_file``.
        embed_dict: token -> list of floats.
        unique_labels: sorted list of distinct labels (stable class indices).
        label_2_idx: label -> index into ``unique_labels``.
        dataset, target: context windows and the label of each window's centre word.
    """

    def __init__(self, embeddings_file, data_file, sliding_window = 5):
        self.data_file = data_file
        self.sliding_window = sliding_window
        self._splitted_fields(self.data_file)
        self._load_embeddings(embeddings_file)
        self._label_descretization()
        self._make_sliding_data()

    def _load_embeddings(self,embeddings_file):
        """Read ``embeddings_file`` into ``self.embed_dict`` and return it."""
        self.embed_dict = {}
        # Stream the file line by line and close it deterministically; the
        # explicit encoding avoids platform-dependent decoding of non-ASCII tokens.
        with open(embeddings_file, "r", encoding="utf-8") as file_pointer:
            for line in file_pointer:
                parts = line.rstrip("\n").split(" ")
                if parts == [""]:
                    continue  # skip blank lines instead of storing an empty key
                self.embed_dict[parts[0]] = [float(value.strip()) for value in parts[1:]]
        return self.embed_dict

    def _label_descretization(self):
        """Derive the label inventory and a label -> index lookup."""
        # Sorting makes the class order reproducible across runs; iterating a
        # bare set of strings depends on per-process hash randomisation.
        self.unique_labels = sorted(set(self.labels))
        self.label_2_idx = {label: index for index, label in enumerate(self.unique_labels)}

    def _splitted_fields(self,data_file):
        """Split every non-empty line of ``data_file`` into word/feature/label lists."""
        self.words = []
        self.features = []
        self.labels = []
        for each_line in data_file:
            if each_line == "":
                continue  # blank lines separate sentences and carry no token
            columns = each_line.split("\t")
            self.words.append(columns[0])
            # NOTE(review): features duplicates the word column, exactly as the
            # original code did - confirm whether another column was intended.
            self.features.append(columns[0])
            self.labels.append(columns[-1])

    def _make_sliding_data(self):
        """Build one window per word that has a full context on both sides."""
        self.dataset = []
        self.target = []
        half = self.sliding_window
        for centre in range(half, len(self.words) - half):
            # +1 so the slice also includes the `half`-th word to the right;
            # without it the window is asymmetric and one word short.
            self.dataset.append(self.words[centre - half:centre + half + 1])
            self.target.append(self.labels[centre])

In [None]:
class data_loader_word_based:
    """Turn context windows and their labels into batches of float tensors.

    Tokens are lower-cased and looked up in ``embed_dict``; a token without an
    embedding becomes an all-zero vector of length ``embed_dim``. Labels are
    one-hot encoded by their position in ``unique_labels``.

    Args:
        embed_dict: Mapping token -> list of floats.
        unique_labels: List of labels; a label's index is its class id.
        dataset: List of context windows (each a list of tokens).
        target: List with one label per window.
        batch_size: Number of windows per yielded batch.
        embed_dim: Length of the zero vector used for unknown tokens.
    """

    def __init__(self, embed_dict,unique_labels, dataset, target, batch_size,embed_dim = 100):
        # Kept for backward compatibility; nothing in this class fills them.
        self.words = []
        self.labels = []
        self.features = []
        self.dataset = dataset
        self.unique_labels = unique_labels
        self.target = target
        self.embed_dict = embed_dict
        self.embed_dim = embed_dim
        self.batch_size = batch_size
        # The constructor used to call self.data_iterator() here. That only
        # created a generator object and threw it away, so it was dropped.

    def _label_vectorizer(self, label_batch):
        """One-hot encode ``label_batch``.

        Raises:
            ValueError: if a label is not present in ``unique_labels``.
        """
        self.one_hot_labels = []
        for each_label in label_batch:
            one_hot = [0] * len(self.unique_labels)
            one_hot[self.unique_labels.index(each_label)] = 1
            self.one_hot_labels.append(one_hot)
        return self.one_hot_labels

    def _data_vectorize(self, data_batch):
        """Replace every token of every window in ``data_batch`` by its embedding."""
        self.vectorised_dataset = []
        for window in data_batch:
            vectors = []
            for token in window:
                # Only a missing key means "unknown token"; any other failure
                # should surface instead of silently becoming a zero vector.
                vector = self.embed_dict.get(str(token).lower())
                if vector is None:
                    vector = [0] * self.embed_dim
                vectors.append(vector)
            self.vectorised_dataset.append(vectors)
        return self.vectorised_dataset

    def data_iterator(self, stride=1):
        """Yield ``(features, one_hot_targets)`` float tensors on ``device``.

        Every batch holds exactly ``batch_size`` consecutive windows. Batch
        start offsets advance by ``stride``; the default of 1 keeps the
        original behaviour in which consecutive batches overlap by
        ``batch_size - 1`` windows. Pass ``stride=self.batch_size`` for
        non-overlapping batches.

        Relies on the module-level ``device`` defined elsewhere in this file.
        """
        # "+ 1" so the last full batch (ending at the final window) is produced.
        last_start = len(self.dataset) - self.batch_size + 1
        for start in range(0, last_start, stride):
            stop = start + self.batch_size
            batch_data = self._data_vectorize(self.dataset[start:stop])
            batch_targets = self._label_vectorizer(self.target[start:stop])
            yield (
                torch.tensor(np.array(batch_data), dtype=torch.float).to(device),
                torch.tensor(np.array(batch_targets), dtype=torch.float).to(device),
            )
            

**Downloading embeddings:**
Pre-trained embeddings are available and can easily be used in our model. We will be using the GloVe 6B vectors; this notebook loads the 100-dimensional file (`glove.6B.100d.txt`).

In [None]:
import zipfile  # required for the extraction below; the import cell does not provide it

# Download and unpack the GloVe archive once; skipped when the zip already exists.
# NOTE(review): the archive is extracted into ../embeddings/ while the loading
# cell reads ../embeddings/glove.6B/glove.6B.100d.txt - confirm the archive layout.
embed_exists = os.path.isfile('../embeddings/glove.6B.zip')
if not embed_exists:
    print("Downloading Glove embeddings, if not downloaded properly, then delete the `../embeddings/glove.6B.zip")
    chakin.search(lang='English')
    chakin.download(number=16, save_dir='../embeddings')
    # The context manager closes the archive even if extraction fails.
    with zipfile.ZipFile("../embeddings/glove.6B.zip", 'r') as zip_ref:
        zip_ref.extractall("../embeddings/")

**Loading data and embeddings**

In [None]:
# Read both corpus splits as lists of lines (newline characters removed).
# `with` closes each file handle as soon as its content has been read.
with open("data/CONLL2003/train.txt") as train_handle:
    train_file = train_handle.read().splitlines()
with open("data/CONLL2003/test.txt") as test_handle:
    test_file = test_handle.read().splitlines()

embeddings_file = '../embeddings/glove.6B/glove.6B.100d.txt'
# NOTE(review): the Preprocess calls in the following cells pass
# sliding_window = 2 literally and do not read this variable.
sliding_window = 5
batch_size = 128

In [None]:
# Preprocess the training split (embeddings, labels, context windows) with sliding_window=2.
PT = Preprocess(
    embeddings_file=embeddings_file,
    data_file=train_file,
    sliding_window=2,
)

In [None]:
# Preprocess the test split with the same embeddings file and sliding_window=2.
PTest = Preprocess(
    embeddings_file=embeddings_file,
    data_file=test_file,
    sliding_window=2,
)

In [None]:
# Batch loader over the training windows and their labels.
DLWB_train = data_loader_word_based(
    embed_dict=PT.embed_dict,
    unique_labels=PT.unique_labels,
    dataset=PT.dataset,
    target=PT.target,
    batch_size=batch_size,
    embed_dim=100,
)

In [None]:
# Batch loader over the test windows. Labels are encoded with the TRAINING
# label list (PT.unique_labels), the same list whose length sets the model's class_num.
DLWB_test = data_loader_word_based(
    embed_dict=PTest.embed_dict,
    unique_labels=PT.unique_labels,
    dataset=PTest.dataset,
    target=PTest.target,
    batch_size=batch_size,
    embed_dim=100,
)

# RNN Model 

Each word token is embedded with an n-dimensional GloVe vector. To predict whether a given token is a named entity, we need its context. Context here means the surrounding words: 2 words before and 2 words after the target word. So there will be 5 words in each input, and each word has a 100-dimensional GloVe embedding. If a batch of 32 windows is taken, the resulting input has shape [32, 5, 100]. The target is a one-hot encoded vector such as [0,1,0,0,0,0,0,0,0], so the final batch of targets has shape [32, 9].



Figure. Showing how the word-based features are generated: 1) the features are generated taking a context window of 2, and 2) the labels are converted into one-hot vectors.
This input representation will now be processed with the LSTM network.

In [None]:
class RNNAttentionModel(torch.nn.Module):
    """Single-layer LSTM with dot-product attention over its outputs.

    Args:
        batch_size: Stored on the instance for callers that read it. The
            forward pass no longer depends on it, so any batch size (and
            ``None``) works.
        class_num: Number of output classes.
        hidden_size: LSTM hidden state size.
        embed_size: Size of each input token vector.
    """

    def __init__(self,batch_size, class_num, hidden_size, embed_size):
        super().__init__()
        self.batch_size = batch_size
        self.class_num = class_num
        self.hidden_size = hidden_size
        self.embed_size = embed_size

        self.lstm = nn.LSTM(self.embed_size, self.hidden_size)
        self.label = nn.Linear(self.hidden_size, self.class_num)

    def attention_net(self, lstm_output, final_state):
        """Weight the LSTM outputs by their similarity to the final hidden state.

        Args:
            lstm_output: (batch_size, num_seq, hidden_size)
            final_state: (1, batch_size, hidden_size)

        Returns:
            Tensor of shape (batch_size, hidden_size): the attention-weighted
            sum of ``lstm_output`` over the sequence axis.
        """
        hidden = final_state.squeeze(0)
        # Dot product of every time step with the final hidden state.
        attn_weights = torch.bmm(lstm_output, hidden.unsqueeze(2)).squeeze(2)
        soft_attn_weights = F.softmax(attn_weights, 1)
        new_hidden_state = torch.bmm(lstm_output.transpose(1, 2), soft_attn_weights.unsqueeze(2)).squeeze(2)
        return new_hidden_state

    def forward(self, input_sentences):
        """Return class probabilities of shape (batch_size, class_num).

        Args:
            input_sentences: Float tensor of shape (batch_size, num_seq, embed_size).
        """
        # nn.LSTM (batch_first=False) expects (num_seq, batch_size, embed_size).
        sequence = input_sentences.permute(1, 0, 2)

        # With no initial state given, nn.LSTM starts from zeros sized to the
        # actual batch and placed on the input's device. The explicit zero
        # tensors used before were tied to self.batch_size and a global device.
        output, (final_hidden_state, _) = self.lstm(sequence)  # final_hidden_state.size() = (1, batch_size, hidden_size)
        output = output.permute(1, 0, 2)  # output.size() = (batch_size, num_seq, hidden_size)

        attn_output = self.attention_net(output, final_hidden_state)
        logits = self.label(attn_output)

        return torch.softmax(logits, dim=1)


**Constructing model object**

In [None]:
# One output class per training label; the model is moved to `device` right away.
model = RNNAttentionModel(
    batch_size,
    class_num=len(PT.unique_labels),
    hidden_size=256,
    embed_size=100,
).to(device)

**Supporting Functions**

In [None]:
def binary_accuracy(preds, y):
    """
    Fraction of rows whose highest-scoring class in `preds` equals the
    highest-scoring class in `y` (8 correct out of 10 gives 0.8, NOT 8).
    """
    predicted_classes = preds.argmax(dim=1)
    true_classes = y.argmax(dim=1)
    # Cast the boolean matches to float so the ratio below is a float division.
    hits = predicted_classes.eq(true_classes).float()
    return hits.sum() / hits.shape[0]

In [None]:
def test_accuracy_calculator(model,test_iterator, writer,test_iteration):
    """Evaluate ``model`` on ``test_iterator`` and return its mean batch accuracy.

    Batches whose first dimension differs from the module-level ``batch_size``
    are skipped. Every 100th batch index logs its accuracy to ``writer`` under
    'Test/Accuracy'.

    Args:
        model: Module mapping a feature batch to per-class scores.
        test_iterator: Iterable of ``(features, one_hot_targets)`` pairs.
        writer: Object exposing ``add_scalar(tag, value, step)``.
        test_iteration: Running step counter, advanced once per batch seen.

    Returns:
        ``(mean_accuracy, test_iteration)``. ``mean_accuracy`` is NaN when no
        batch was scored (this used to raise ZeroDivisionError).
    """
    epoch_acc = []
    was_training = model.training
    model.eval()
    try:
        # Evaluation needs no gradients; skipping the autograd graph saves
        # memory and time.
        with torch.no_grad():
            for i, batch in enumerate(test_iterator):
                feature, target = batch[0], batch[1]
                if feature.shape[0] ==  batch_size:
                    predictions = model(feature.to(device))
                    acc = binary_accuracy(predictions.type(torch.FloatTensor), target.type(torch.FloatTensor))
                    epoch_acc.append(acc.item())
                    if i % 100 == 0:
                        writer.add_scalar('Test/Accuracy',acc.item(), test_iteration)
                test_iteration = test_iteration + 1
    finally:
        # Leave the module in whatever mode the caller had it in.
        model.train(was_training)

    if not epoch_acc:
        return float("nan"), test_iteration
    return  sum(epoch_acc) / len(epoch_acc),test_iteration

**Defining optimizer and loss function**

In [None]:
# Mean-squared-error loss (the targets used in this notebook are one-hot
# vectors) and SGD with momentum over all model parameters.
criterion = nn.MSELoss().to(device)
optimizer = torch.optim.SGD(model.parameters(), momentum=0.9, lr=0.1)

# Training

In [None]:
def train(model, iterator, optimizer, criterion, writer,train_iteration):
    """Run one pass over `iterator`, updating `model` after every full batch.

    Batches whose first dimension is not the module-level `batch_size` are
    skipped. Every 100th batch index logs accuracy and loss to `writer`.

    Returns:
        (model, mean loss, mean accuracy, train_iteration), where
        train_iteration has been advanced once per processed batch.
    """
    model.train()
    losses = []
    accuracies = []

    for step, batch in enumerate(iterator):
        feature = batch[0]
        target = batch[1]
        if feature.shape[0] != batch_size:
            continue

        optimizer.zero_grad()
        predictions = model(feature.to(device))
        # Loss and accuracy are both computed on FloatTensor copies.
        float_predictions = predictions.type(torch.FloatTensor)
        float_target = target.type(torch.FloatTensor)

        loss = criterion(float_predictions, float_target)
        loss.backward()
        optimizer.step()

        loss_value = loss.item()
        acc_value = binary_accuracy(float_predictions, float_target).item()
        losses.append(loss_value)
        accuracies.append(acc_value)

        if step % 100 == 0:
            writer.add_scalar('Train/Accuracy', acc_value, train_iteration)
            writer.add_scalar('Train/loss', loss_value, train_iteration)
        train_iteration += 1

    mean_loss = sum(losses) / len(losses)
    mean_acc = sum(accuracies) / len(accuracies)
    return model, mean_loss, mean_acc, train_iteration

In [None]:
# Training driver: 100 epochs, halving the learning rate every 10 epochs and
# recording the per-epoch train loss, train accuracy and test accuracy.
epochs  = 100
train_iteration  = 0
test_iteration  = 0
loss = []
accuracy = []
test_accuracy = []
writer = SummaryWriter()
try:
    for i in tqdm(range(epochs)):
        if (i != 0 and i%10 == 0 ):
            for param_group in optimizer.param_groups:
                param_group['lr'] = param_group['lr']/2
            # Reports the rate of the last parameter group visited above.
            print(" === New Learning rate : ", param_group['lr'], " === ")

        model, epoch_loss, epoch_acc,train_iteration = train(model, DLWB_train.data_iterator(), optimizer, criterion, writer,train_iteration)

        test_acc, test_iteration = test_accuracy_calculator(model, DLWB_test.data_iterator(), writer,test_iteration)
        accuracy.append(epoch_acc)
        loss.append(epoch_loss)
        test_accuracy.append(test_acc)
finally:
    # Flush buffered events and release the log file, even when training is
    # interrupted or fails part-way through.
    writer.close()

# Performance

The accuracy reaches about 87%, and the loss also decreases considerably.


![](figures/NER_RNN_train.png)

Figure: Decrease in loss and increase in accuracy on the training data when the model is trained on the NER task with word-level features.

The performance of the model on the test data is also notable: the accuracy reaches 87% here as well. This also means our implementation generalizes well to unseen data.

![](figures/NER_RNN_test_acc.png)

Figure: Increase in accuracy on the test data when the model is trained on the NER task with word-level features.