# Natural Language Processing

## Window Classifier for NER

This will be based on our L3 lecture.

## 1. Load data

CoNLL-2002 Shared Task: Language-Independent Named Entity Recognition <br>
https://www.clips.uantwerpen.be/conll2002/ner/

In [1]:
import nltk
nltk.download('conll2002')

[nltk_data] Downloading package conll2002 to
[nltk_data]     /Users/chaklam/nltk_data...
[nltk_data]   Package conll2002 is already up-to-date!


True

In [2]:
corpus = nltk.corpus.conll2002.iob_sents()

In [3]:
data = []
for cor in corpus:
    sent, _, tag = list(zip(*cor))  #zip will help split them
    data.append([sent, tag])

In [4]:
print(len(data))
print(data[0])  #B-LOC stands for beginning of a location, I-LOC stands for inner entity

35651
[('Sao', 'Paulo', '(', 'Brasil', ')', ',', '23', 'may', '(', 'EFECOM', ')', '.'), ('B-LOC', 'I-LOC', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'O')]


## 2. Tokenization

Since the dataset is already tokenized, our life is easy.  Just skip!

## 3. Numericalization

In [5]:
flatten = lambda l: [item for sublist in l for item in sublist]

sents,tags = list(zip(*data))
vocab  = list(set(flatten(sents)))
tagset = list(set(flatten(tags)))

In [6]:
vocab[0], tagset[0]

('Grape', 'I-ORG')

In [7]:
len(vocab), len(tagset)

(65459, 9)

In [8]:
tagset

['I-ORG', 'I-PER', 'B-LOC', 'O', 'B-PER', 'I-LOC', 'B-ORG', 'I-MISC', 'B-MISC']

Note that we need to build separate id for vocab and tags for prediction

In [9]:
word2index={'<UNK>' : 0, '<DUMMY>' : 1} # dummy token is for start or end of sentence, like padding
for vo in vocab:
    if word2index.get(vo) is None:
        word2index[vo] = len(word2index)  #basically the id is the current length run until now...
index2word = {v:k for k, v in word2index.items()}

tag2index = {}
for tag in tagset:
    if tag2index.get(tag) is None:
        tag2index[tag] = len(tag2index)
index2tag={v:k for k, v in tag2index.items()}

In [10]:
tag2index

{'I-ORG': 0,
 'I-PER': 1,
 'B-LOC': 2,
 'O': 3,
 'B-PER': 4,
 'I-LOC': 5,
 'B-ORG': 6,
 'I-MISC': 7,
 'B-MISC': 8}

## 4. Prepare window data

<img src="figures/ner_win.png" width="400">

In [11]:
window_size = 2
windows = []

In [12]:
for sample in data:
    #sample = [(text1, text2, ..), (tag1, tag2, ..)]
    dummy       = ['<DUMMY>'] * window_size  #for padding
    text        = sample[0]    
    padded_text = dummy + list(sample[0]) + dummy  #padding so we can move the whole text
    window      = list(nltk.ngrams(padded_text, window_size * 2 + 1))  #*2 + 1 will include before and after the center word
    
    #given a window of five words, predict the tag of the middle word
    windows.extend([[list(window[i]), sample[1][i]] for i in range(len(sample[0]))])

In [13]:
windows[0]

[['<DUMMY>', '<DUMMY>', 'Sao', 'Paulo', '('], 'B-LOC']

In [14]:
len(windows)

678377

In [15]:
#too much for my cpu
# windows = windows[:100000]

In [16]:
import random
random.shuffle(windows)

train_data = windows[:int(len(windows) * 0.9)]
test_data  = windows[int(len(windows) * 0.9):]

## 5. Modeling

<img src="figures/ner_model.png" width="600">

In [17]:
import torch
import torch.nn as nn

class WindowClassifier(nn.Module): 
    def __init__(self, vocab_size, embedding_size, window_size, hidden_size, output_size):

        super(WindowClassifier, self).__init__()
        
        self.embed = nn.Embedding(vocab_size, embedding_size)
        self.h_layer1 = nn.Linear(embedding_size * (window_size * 2 + 1), hidden_size)
        self.h_layer2 = nn.Linear(hidden_size, hidden_size)
        self.o_layer  = nn.Linear(hidden_size, output_size)
        self.relu     = nn.ReLU()
        self.dropout  = nn.Dropout(0.3)
        
    def forward(self, inputs): 
        #inputs = 
        embeds = self.embed(inputs) # BxWxD
        concated = embeds.view(-1, embeds.size(1)*embeds.size(2)) # Bx(W*D)
        h0 = self.relu(self.h_layer1(concated))
        h0 = self.dropout(h0)
        h1 = self.relu(self.h_layer2(h0))
        h1 = self.dropout(h1)
        out = self.o_layer(h1)
        return out

## 6. Training 

It takes for a while if you use just cpu.

In [18]:
#we chose this number to save time....
batch_size  = 2
embed_size  = 4 # x (WINDOW_SIZE*2+1) = 20  #increasing this will increase the f1score
hidden_size = 8  #increasing this will increase the f1score
num_epochs  = 5

In [19]:
import torch.optim as optim

model = WindowClassifier(len(word2index), embed_size, window_size, hidden_size, len(tag2index))
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [20]:
#yield is better than return
#because it does not take memory space
def getBatch(batch_size, train_data):
    random.shuffle(train_data)
    sindex = 0
    eindex = batch_size
    while eindex < len(train_data):
        batch = train_data[sindex: eindex]
        temp = eindex
        eindex = eindex + batch_size
        sindex = temp
        yield batch
    
    if eindex >= len(train_data):
        batch = train_data[sindex:]
        yield batch

In [21]:
def prepare_sequence(seq, word2index):
    idxs = list(map(lambda w: word2index[w] if word2index.get(w) is not None else word2index["<UNK>"], seq))
    return torch.LongTensor(idxs)

def prepare_tag(tag,tag2index):
    return torch.LongTensor([tag2index[tag]])

In [22]:
import numpy as np

model.train()

for epoch in range(num_epochs):
    losses = []
    for i, batch in enumerate(getBatch(batch_size, train_data)):
        
        x,y=list(zip(*batch)) 
        #x: (['leerde', 'Luis', 'Barragán', 'door', 'die'], [another batch], ..)
        #y: ('I-PER', 'B-LOC')
        
        inputs  = torch.cat([prepare_sequence(sent, word2index).view(1, -1) for sent in x])
        #inputs = (batch_size, window_size)

        targets = torch.cat([prepare_tag(tag, tag2index) for tag in y])
        #targets = (batch_size)
        
        model.zero_grad()
        
        preds = model(inputs)
        loss  = criterion(preds, targets)
        losses.append(loss.item())
        loss.backward()
        optimizer.step()
            
    print(f"Epoch: {epoch + 1} | Batch: {i:5.0f} | loss: {np.mean(losses):.6f} ")

Epoch: 1 | Batch: 305269 | loss: 0.459799 
Epoch: 2 | Batch: 305269 | loss: 0.377058 
Epoch: 3 | Batch: 305269 | loss: 0.353781 
Epoch: 4 | Batch: 305269 | loss: 0.343317 
Epoch: 5 | Batch: 305269 | loss: 0.335064 


## 7. Test 

In [23]:
for_f1_score = []

In [24]:
accuracy = 0

model.eval()
for test in test_data:
    x, y = test[0], test[1]
    input_ = prepare_sequence(x, word2index).view(1, -1)
    #input_ = [[[18381, 33735, 59988, 48073, 33735]]
    #input_ : (1, window_size * 2 + 1)
    
    preds = model(input_)
    #preds : (1, label_size, i.e., number of tags)

    i = model(input_).max(1)[1]  #max on first dimension, then take the index which is the 1th index element returned by max
    pred = index2tag[i.item()]    
    
    for_f1_score.append([pred, y])
    if pred == y:
        accuracy += 1

print(accuracy/len(test_data) * 100)

90.23408708983166


This high score is because most of labels are 'O' tag. So we need to measure f1 score.

### f1-score

In [25]:
y_pred, y_test = list(zip(*for_f1_score))

In [26]:
set(y_pred)

{'B-LOC', 'B-ORG', 'B-PER', 'O'}

In [27]:
set(y_test)

{'B-LOC', 'B-MISC', 'B-ORG', 'B-PER', 'I-LOC', 'I-MISC', 'I-ORG', 'I-PER', 'O'}

In [28]:
sorted_labels = sorted(
    list(set(y_test) - {'O'}),
    key=lambda name: (name[1:], name[0])
)

In [29]:
sorted_labels

['B-LOC', 'I-LOC', 'B-MISC', 'I-MISC', 'B-ORG', 'I-ORG', 'B-PER', 'I-PER']

In [30]:
from sklearn import metrics
     
print(metrics.classification_report(y_test, y_pred, labels=sorted_labels, digits=3))

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

       B-LOC      1.000     0.002     0.003      1149
       I-LOC      0.000     0.000     0.000       315
      B-MISC      0.000     0.000     0.000       849
      I-MISC      0.000     0.000     0.000       617
       B-ORG      0.502     0.461     0.481      1480
       I-ORG      0.000     0.000     0.000       943
       B-PER      0.364     0.267     0.308      1294
       I-PER      0.000     0.000     0.000       907

   micro avg      0.446     0.136     0.209      7554
   macro avg      0.233     0.091     0.099      7554
weighted avg      0.313     0.136     0.147      7554



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
