# Building Named Entity Recognition 
## CNN based implementation based on character level features
Named Entity Recognition (also called named entity resolution), NER for short, tags sub-parts of a sentence with a definite class. Such a sub-part can be a single word or a combination of several words occurring together. NER is one of the hot topics in the field of NLP and has many powerful practical use cases, some of which are given below:

1. Building efficient search engines by extracting key terms from the text.
2. Suggesting reading content on the basis of the entities mentioned in the literature; similarly, suggesting products based on the product description.
3. Keeping an eye on the market by parsing feeds from Twitter.



## Importing requirements


In [None]:
import pandas as pd
import chakin
import matplotlib.pyplot as plt
from torchtext import data
import nltk
import json
from torchtext import vocab
from tqdm import tqdm
import torch
from torch import nn
import random
import torchtext
import traceback
from tensorboardX import SummaryWriter
from torch.autograd import Variable
import re
import pandas as pd
import os
import numpy as np
import sys
import torch.nn.functional as F
import random
import tarfile
import urllib
from torchtext import data
import datetime
import torch
import numpy as np
import json

In [None]:
# Reproducibility: seed the CPU and CUDA random number generators with one
# fixed value and ask cuDNN to use deterministic algorithms.
SEED = 1234
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Run on the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

## Preprocessing
1. Preprocessing data
2. Defining character set 
3. Constructing data iterator


In [None]:
class Preprocess:
    """Turn tab-separated token lines into fixed-size context windows.

    Every non-empty line of ``data_file`` is split on tab characters; the
    first column is taken as the word and the last column as its label.
    Each word that has ``sliding_window`` neighbours on both sides becomes
    one sample: the ``2 * sliding_window + 1`` surrounding words (target in
    the middle) paired with the label of the target word.

    Because empty lines are dropped before windowing, a window may span the
    position where an empty line used to be.

    Args:
        data_file: Iterable of text lines (already stripped of newlines).
        sliding_window: Number of context words taken on each side of the
            target word.

    Attributes:
        words: First column of every non-empty line.
        features: Also the first column of every non-empty line (see the
            review note in ``_splitted_fields``).
        labels: Last column of every non-empty line.
        unique_labels: Sorted list of the distinct labels, so that a label
            keeps the same index from run to run.
        label_2_idx: Mapping from each label to its index in
            ``unique_labels``.
        dataset: List of context windows, each a list of words.
        target: Label of the centre word of the matching ``dataset`` entry.
    """

    def __init__(self, data_file, sliding_window=5):
        self.data_file = data_file
        self.sliding_window = sliding_window
        self._splitted_fields(self.data_file)
        self._label_descretization()
        self._make_sliding_data()

    def _label_descretization(self):
        """Build the label vocabulary and the label -> index lookup."""
        # Sorting makes the index of each label deterministic; iterating a
        # plain set of strings can yield a different order on every run.
        self.unique_labels = sorted(set(self.labels))
        self.label_2_idx = {
            label: index for index, label in enumerate(self.unique_labels)
        }

    def _splitted_fields(self, data_file):
        """Split every non-empty line into word, feature and label columns."""
        self.words = []
        self.features = []
        self.labels = []
        for each_line in data_file:
            if each_line == "":
                continue
            columns = each_line.split("\t")
            self.words.append(columns[0])
            # NOTE(review): this stores the word column a second time. If the
            # intermediate columns were meant to be kept as features, the
            # index needs changing - confirm against the data format.
            self.features.append(columns[0])
            self.labels.append(columns[-1])

    def _make_sliding_data(self):
        """Create one (context window, label) pair per eligible target word."""
        self.dataset = []
        self.target = []
        half = self.sliding_window
        # The last word that still has `half` right-hand neighbours sits at
        # index len(words) - half - 1, so the exclusive upper bound is
        # len(words) - half.
        for centre in range(half, len(self.words) - half):
            self.dataset.append(self.words[centre - half:centre + half + 1])
            self.target.append(self.labels[centre])


In [None]:
# Characters that get their own one-hot column: lowercase letters, digits,
# then punctuation and symbols. Any other character is left unencoded.
CHARSET = list(
    "abcdefghijklmnopqrstuvwxyz"
    "0123456789"
    ",;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}"
)
# Number of character positions encoded per word.
MAX_WORD_LEN = 10

In [None]:
class data_loader_charcter_based:
    """Yield batches of character-level one-hot features and one-hot labels.

    Relies on the module-level ``CHARSET`` (characters that get a one-hot
    column), ``MAX_WORD_LEN`` (character positions encoded per word) and
    ``device`` (where the yielded tensors are placed).

    Args:
        unique_labels: List of all labels; a label's position in this list
            is the position of the 1 in its one-hot vector.
        dataset: List of samples, each a list of context words.
        target: Label of each sample, aligned with ``dataset``.
        batch_size: Number of samples per yielded batch.
        embed_dim: Stored on the instance; not used by this class.
    """

    def __init__(self, unique_labels, dataset, target, batch_size, embed_dim=100):
        self.words = []
        self.labels = []
        self.features = []
        self.dataset = dataset
        self.unique_labels = unique_labels
        self.target = target
        self.embed_dim = embed_dim
        self.batch_size = batch_size

    def character_one_hot_encoder(self, word):
        """Encode one word as a ``[MAX_WORD_LEN, len(CHARSET)]`` matrix.

        The word is lower-cased and cut to ``MAX_WORD_LEN`` characters. Row
        ``i`` holds the one-hot vector of character ``i``. Rows for
        characters outside ``CHARSET`` and rows past the end of a short word
        stay all-zero, so padding cannot be confused with the digit ``0``.
        """
        encoded = np.zeros([MAX_WORD_LEN, len(CHARSET)])
        for position, char in enumerate(word.lower()[:MAX_WORD_LEN]):
            try:
                column = CHARSET.index(char)
            except ValueError:
                # Character has no column in CHARSET: leave its row empty.
                continue
            encoded[position][column] = 1
        return encoded

    def context_onehot_encoder(self, context_words):
        """Stack the encodings of all words of one context window."""
        return np.array(
            [self.character_one_hot_encoder(word) for word in context_words]
        )

    def _label_vectorizer(self, label_batch):
        """Return (and store in ``one_hot_labels``) one-hot label vectors.

        Raises:
            ValueError: If a label is not contained in ``unique_labels``.
        """
        self.one_hot_labels = []
        for each_label in label_batch:
            one_hot = [0] * len(self.unique_labels)
            one_hot[self.unique_labels.index(each_label)] = 1
            self.one_hot_labels.append(one_hot)
        return self.one_hot_labels

    def _data_vectorize(self, data_batch):
        """Encode every context window of a batch and stack the results."""
        self.vectorised_dataset = [
            self.context_onehot_encoder(each_dataset)
            for each_dataset in data_batch
        ]
        return np.array(self.vectorised_dataset)

    def data_iterator(self):
        """Yield ``(features, targets)`` float tensors, one pair per batch.

        Only full batches are produced; a trailing remainder smaller than
        ``batch_size`` is dropped. Batches are consecutive, non-overlapping
        slices of the dataset.
        """
        full_batches = len(self.dataset) // self.batch_size
        for batch_index in range(full_batches):
            # Convert the batch number into a sample offset; slicing by the
            # batch number itself would make successive batches overlap.
            start = batch_index * self.batch_size
            stop = start + self.batch_size
            batch_data = self._data_vectorize(self.dataset[start:stop])
            batch_targets = self._label_vectorizer(self.target[start:stop])
            features = torch.tensor(batch_data).type(torch.FloatTensor).to(device)
            targets = torch.tensor(np.array(batch_targets)).type(torch.FloatTensor).to(device)
            yield features, targets
            

**Loading data and embeddings**

In [None]:
# Read both splits into lists of lines; the context managers make sure the
# file handles are closed again once the text has been read.
with open("data/CONLL2003/train.txt") as _fh:
    train_file = _fh.read().splitlines()
with open("data/CONLL2003/test.txt") as _fh:
    test_file = _fh.read().splitlines()
# NOTE(review): the Preprocess calls visible in this notebook pass
# sliding_window=2 literally and do not read this variable.
sliding_window = 5
batch_size = 128

In [None]:
# Window the training lines with two context words on each side of the target.
PT = Preprocess(data_file=train_file, sliding_window=2)

In [None]:
# Window the test lines with two context words on each side of the target.
PTest = Preprocess(data_file=test_file, sliding_window=2)

In [None]:
# Batch provider over the training windows and their labels.
DLWB_train = data_loader_charcter_based(
    unique_labels=PT.unique_labels,
    dataset=PT.dataset,
    target=PT.target,
    batch_size=batch_size,
    embed_dim=100,
)

**Inspecting data shape**

In [None]:
# Pull the first batch from a fresh iterator and show the shape of its
# feature tensor (element 0 of the yielded pair).
next(DLWB_train.data_iterator())[0].shape

In [None]:
# Batch provider over the test windows. The label list comes from the
# training split (PT), so label indices match the ones used for training.
DLWB_test = data_loader_charcter_based(
    unique_labels=PT.unique_labels,
    dataset=PTest.dataset,
    target=PTest.target,
    batch_size=batch_size,
    embed_dim=100,
)

# CNN Model 

Let's assume the maximum length of a word is 10 characters and our vocabulary has 68 characters. Each word can then be represented as a one-hot encoded matrix of shape [10, 68]. That is for a single word; if we consider a window of 2 words before and 2 words after the target word, including the target word itself, the input size becomes [5, 10, 68]. Processing such inputs in batches of 128 gives a final size of [128, 5, 10, 68]. This is the input to the convolutional layers.

![](figures/NER_CNN.png)

Figure: How the character-based features are generated. 1) shows the features generated with a context window of 2, and 2) shows the labels converted into one-hot vectors.

The model accepts a [128, 5, 10, 68]-dimensional input, where 128 is the batch size, 5 is the target word plus its context words, 10 is the maximum number of characters in a word and 68 is the number of unique characters considered. In other terms, if we compare the input to an image, 128 is the batch size and 5 is similar to the number of channels of an image of dimension 10*68. The input passes through a series of convolution operations, and the number of channels is increased from 5 to 40. The resulting tensor is passed to a linear layer, whose output is converted to probabilities by applying a softmax layer.



In [None]:
class CNNmodel(torch.nn.Module):
    """Three stacked 3x3 convolutions followed by a linear softmax classifier.

    The input is treated like an image batch of shape
    ``[batch, in_channels, word_len, charset_size]``. Each convolution uses
    no padding and stride 1, so it trims 2 from both spatial dimensions.
    After three of them the feature map is
    ``[batch, 40, word_len - 6, charset_size - 6]``, which is flattened and
    fed to the linear layer.

    Args:
        batch_size: Stored on the instance for reference; the forward pass
            takes the batch size from the input tensor itself.
        class_num: Number of output classes.
        in_channels: Number of input channels (target word plus context
            words).
        word_len: Number of character positions per word.
        charset_size: Number of one-hot columns per character.

    Raises:
        ValueError: If ``word_len`` or ``charset_size`` is too small to
            survive three unpadded 3x3 convolutions.
    """

    def __init__(self, batch_size, class_num, in_channels=5, word_len=10, charset_size=68):
        super(CNNmodel, self).__init__()
        if word_len <= 6 or charset_size <= 6:
            raise ValueError("word_len and charset_size must both be greater than 6")
        self.batch_size = batch_size
        self.class_num = class_num
        self.conv1 = nn.Conv2d(in_channels=in_channels, out_channels=10, kernel_size=3, stride=1)
        self.conv2 = nn.Conv2d(in_channels=10, out_channels=20, kernel_size=3, stride=1)
        self.conv3 = nn.Conv2d(in_channels=20, out_channels=40, kernel_size=3, stride=1)

        # With the defaults this is 40 * 4 * 62.
        flat_features = 40 * (word_len - 6) * (charset_size - 6)
        self.linear1 = nn.Linear(in_features=flat_features, out_features=self.class_num)

    def forward(self, input):
        """Return per-class probabilities of shape ``[batch, class_num]``."""
        conv1_out = self.conv1(input)
        conv2_out = self.conv2(conv1_out)
        conv3_out = self.conv3(conv2_out)
        # Flatten using the actual batch dimension so that a batch whose size
        # differs from the configured batch_size still works.
        flattened = conv3_out.view(conv3_out.size(0), -1)
        linear1_out = self.linear1(flattened)
        return torch.softmax(linear1_out, dim=1)
            

**Constructing model object**

In [None]:
# One output unit per training label; the module is then placed on `device`.
model = CNNmodel(batch_size=batch_size, class_num=len(PT.unique_labels)).to(device)

**Supporting Functions**

In [None]:
def binary_accuracy(preds, y):
    """
    Fraction of rows whose highest-scoring column in `preds` is also the
    highest-scoring column in `y` (e.g. 8 of 10 matching rows gives 0.8).
    Both arguments are compared along dim=1; the result is a scalar tensor.
    """
    predicted_class = preds.argmax(dim=1)
    expected_class = y.argmax(dim=1)
    hits = predicted_class.eq(expected_class).float()  # float so the ratio is fractional
    return hits.sum() / len(hits)

In [None]:
def test_accuracy_calculator(model,test_iterator, writer,test_iteration):
    """Measure the mean batch accuracy of `model` over `test_iterator`.

    Only batches whose first dimension equals the module-level `batch_size`
    are scored. Every 100th batch index, the accuracy of that batch is
    logged to `writer` under 'Test/Accuracy'. The step counter
    `test_iteration` advances once per batch drawn from the iterator.

    The model is switched to evaluation mode and gradient tracking is turned
    off for the duration of the call; the previous training/eval mode is
    restored before returning.

    Returns:
        A tuple (mean accuracy over the scored batches, updated
        test_iteration). The mean is NaN when no batch was scored.
    """
    epoch_acc = []
    was_training = model.training
    model.eval()
    try:
        # No parameter update happens here, so skip building autograd graphs.
        with torch.no_grad():
            for i, batch in enumerate(test_iterator):
                feature, target = batch[0], batch[1]
                if feature.shape[0] == batch_size:
                    predictions = model(feature.to(device)).float()
                    # Compare on the device the predictions live on instead
                    # of copying both tensors to the CPU.
                    target = target.to(predictions.device).float()
                    acc = binary_accuracy(predictions, target)
                    epoch_acc.append(acc.item())
                    if i % 100 == 0:
                        writer.add_scalar('Test/Accuracy',acc.item(), test_iteration)
                test_iteration = test_iteration + 1
    finally:
        model.train(was_training)
    if not epoch_acc:
        # Nothing was scored: report "undefined" rather than dividing by zero.
        return float("nan"), test_iteration
    return sum(epoch_acc) / len(epoch_acc), test_iteration

**Defining optimizer and loss function**

In [None]:
# Mean squared error between the model output and the one-hot targets.
criterion = nn.MSELoss().to(device)
# Stochastic gradient descent with momentum over all model parameters.
optimizer = torch.optim.SGD(params=model.parameters(), lr=0.1, momentum=0.9)

# Training

In [None]:
def train(model, iterator, optimizer, criterion, writer,train_iteration):
    """Train `model` for one pass over `iterator`.

    Only batches whose first dimension equals the module-level `batch_size`
    are used. For each of them the gradients are reset, the loss is
    back-propagated and the optimizer takes one step. Every 100th batch
    index, the accuracy and loss of that batch are logged to `writer` under
    'Train/Accuracy' and 'Train/loss'. The step counter `train_iteration`
    advances once per batch that was trained on.

    Returns:
        A tuple (model, mean loss, mean accuracy, updated train_iteration).
        Both means are NaN when no batch was trained on.
    """
    epoch_loss = []
    epoch_acc = []
    model.train()

    for i, batch in enumerate(iterator):
        feature, target = batch[0], batch[1]
        if feature.shape[0] != batch_size:
            continue
        optimizer.zero_grad()
        predictions = model(feature.to(device)).float()
        # Keep the loss computation on the device of the predictions instead
        # of copying both tensors to the CPU on every step.
        target = target.to(predictions.device).float()
        loss = criterion(predictions, target)
        loss.backward()
        optimizer.step()
        # Accuracy is a metric only; detach so it does not touch the graph.
        acc = binary_accuracy(predictions.detach(), target)
        epoch_loss.append(loss.item())
        epoch_acc.append(acc.item())
        if i % 100 == 0:
            writer.add_scalar('Train/Accuracy',acc.item(), train_iteration)
            writer.add_scalar('Train/loss',loss.item(), train_iteration)
        train_iteration = train_iteration + 1

    if not epoch_loss:
        # Nothing was trained on: report "undefined" rather than dividing by zero.
        return model, float("nan"), float("nan"), train_iteration
    return model, sum(epoch_loss) / len(epoch_loss), sum(epoch_acc) / len(epoch_acc), train_iteration

In [None]:
# Training driver: run `epochs` passes over the training data, evaluate on
# the test data after each pass and keep the per-epoch statistics.
epochs = 10
train_iteration = 0
test_iteration = 0
loss = []
accuracy = []
test_accuracy = []
writer = SummaryWriter()
try:
    for i in tqdm(range(epochs)):
        # Halve the learning rate at every 10th epoch (never at epoch 0).
        if i != 0 and i % 10 == 0:
            for param_group in optimizer.param_groups:
                param_group['lr'] = param_group['lr']/2
            print(" === New Learning rate : ", param_group['lr'], " === ")

        # A fresh generator is needed for every epoch.
        model, epoch_loss, epoch_acc, train_iteration = train(model, DLWB_train.data_iterator(), optimizer, criterion, writer, train_iteration)

        test_acc, test_iteration = test_accuracy_calculator(model, DLWB_test.data_iterator(), writer, test_iteration)
        accuracy.append(epoch_acc)
        loss.append(epoch_loss)
        test_accuracy.append(test_acc)
finally:
    # Flush and close the event file even if training is interrupted, so
    # the scalars logged so far are written to disk.
    writer.close()

# Performance

The accuracy reaches about 85%, and the loss also decreases considerably.

![](figures/NER_CNN_Train.png)

Figure: Decrease in loss and increase in accuracy on the training data when the model is trained on the NER task using character-level features.

The performance of the model on the test data is also notable: the accuracy reaches 85% here as well. This also means our implementation generalizes well to unseen data. 

![](figures/NER-CNN_test_acc.png)

Figure: Increase in accuracy on the test data when the model is trained on the NER task using character-level features.