## Loading Data

In [1]:
import nltk
nltk.__version__

'3.7'

In [2]:
nltk.download('conll2002') # downloading the dataset

[nltk_data] Downloading package conll2002 to /root/nltk_data...
[nltk_data]   Unzipping corpora/conll2002.zip.


True

In [3]:
corpus = nltk.corpus.conll2002.iob_sents()

In [4]:
data = []

for cor in corpus:
    # print(cor)  # B-LOC = beginning of a location, I-LOC = inside of a location entity, 0 means non-entity
    # print(list(zip(*cor)))
    sen, _, tag = list(zip(*cor))
    data.append([sen, tag])

In [5]:
data[9999]

[('ERC',
  'CONVOCA',
  'CONFERENCIA',
  'PRESENTARSE',
  'COMO',
  'PARTIDO',
  'DE',
  'GOBIERNO',
  'Barcelona',
  '.'),
 ('B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'B-LOC', 'O')]

## Tokenization

In [6]:
# we don't need to do it

## Numericalization

In [7]:
flatten = lambda l: [item for sublist in l for item in sublist]

sents, tags = list(zip(*data))
vocab = list(set(flatten(sents)))
tagset = list(set(flatten(tags)))

In [10]:
vocab[89:98]

['medemens',
 'contravalor',
 'Estética',
 'Asturiana',
 'bouwen',
 'évolué',
 'Schedels',
 'sepa',
 'duurde']

In [8]:
len(vocab)

65459

In [9]:
tagset

['B-ORG', 'B-PER', 'B-LOC', 'B-MISC', 'O', 'I-MISC', 'I-ORG', 'I-PER', 'I-LOC']

In [11]:
# creating word2index library
word2index={'<UNK>': 0, '<DUMMY>': 1}  # DUMMY facilitates to move the windows.....

# looping each vocab
for v in vocab:
    # if that vocab does not exist yet in the word2index
    if word2index.get(v) is None:
        # the index of this vocab is basically the current len of word2indx
        word2index[v] = len(word2index)

# creating the index2word
index2word = {v:k for k, v in word2index.items()}

tag2index = {}

# doing this the same for tagset
# looping each vocab

for t in tagset:
    # if that vocab does not exist yet in the word2index
    if tag2index.get(t) is None:
        # the index of this vocab is basically the current len of word2indx
        tag2index[t] = len(tag2index)

# creating the index2word
index2tag = {v:k for k, v in tag2index.items()}

In [12]:
tag2index

{'B-ORG': 0,
 'B-PER': 1,
 'B-LOC': 2,
 'B-MISC': 3,
 'O': 4,
 'I-MISC': 5,
 'I-ORG': 6,
 'I-PER': 7,
 'I-LOC': 8}

In [13]:
index2tag

{0: 'B-ORG',
 1: 'B-PER',
 2: 'B-LOC',
 3: 'B-MISC',
 4: 'O',
 5: 'I-MISC',
 6: 'I-ORG',
 7: 'I-PER',
 8: 'I-LOC'}

## Preparing Window Data

In [14]:
for sample in data:
    print(sample[1])
    break

('B-LOC', 'I-LOC', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'O')


In [16]:
ws = 2
windows = []

for sample in data:
    dummy = ['<DUMMY>'] * ws
    text  = sample[0]
    fulltext = dummy + list(text) + dummy
    print(fulltext)

    window = list(nltk.ngrams(fulltext, ws * 2 + 1))
    
    windows.extend([[list(window[i]), sample[1][i]] for i in range(len(sample[0]))])   

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
['<DUMMY>', '<DUMMY>', 'In', 'New', 'York', 'of', 'Londen', 'gaat', 'het', 'tot', '700.000', 'frank', 'per', 'set', 'van', 'twee', 'uur', '.', '<DUMMY>', '<DUMMY>']
['<DUMMY>', '<DUMMY>', 'Klopt', ',', 'zegt', 'Stijn', 'Snick', '.', '<DUMMY>', '<DUMMY>']
['<DUMMY>', '<DUMMY>', 'Peter-Jan', 'Bogaert', '<DUMMY>', '<DUMMY>']
['<DUMMY>', '<DUMMY>', 'Je', 'hoeft', 'er', 'geen', 'vraag', 'over', 'te', 'stellen', '.', '<DUMMY>', '<DUMMY>']
['<DUMMY>', '<DUMMY>', 'Danst', 'mee', 'op', 'het', 'ritme', 'en', 'de', 'brede', 'smile', 'is', 'niet', 'van', 'zijn', 'gezicht', 'te', 'branden', '.', '<DUMMY>', '<DUMMY>']
['<DUMMY>', '<DUMMY>', 'Van', 'onze', 'verslaggever', '<DUMMY>', '<DUMMY>']
['<DUMMY>', '<DUMMY>', 'Als', 'die', 'opgetrokken', 'worden', 'tot', 'ongeveer', '148', 'bpm', '(', 'beats', 'per', 'minuut', ')', 'en', 'je', 'staat', 'te', 'dansen', ',', 'dan', 'is', 'dat', 'het', 'einde', '.', '<DUMMY>', '<DUMMY>']
['<DUMMY>',

In [17]:
windows[0]

[['<DUMMY>', '<DUMMY>', 'Sao', 'Paulo', '('], 'B-LOC']

In [18]:
len(windows)

678377

In [19]:
windows = windows[:50]

In [20]:
import random
random.shuffle(windows)

train = windows[:int(len(windows) * 0.9)]
test  = windows[int(len(windows) * 0.9):]

In [21]:
len(train), len(test)

(45, 5)

## Modeling

In [22]:
import torch
import torch.nn as nn

class WinNER(nn.Module):
    
    def __init__(self, voc_size, emb_size, hid_size, window_size, output_size):
        super(WinNER, self).__init__()
        self.embed = nn.Embedding(voc_size, emb_size) # embedding the inputs
        self.h1 = nn.Linear(window_size * emb_size, hid_size)
        self.h2 = nn.Linear(hid_size, output_size)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)
        
    def forward(self, inputs):
        # inputs = (batch_size, window_size * 2 + 1)
        
        input_embed = self.embed(inputs)
        # input_embed = (batch_size, window_size * 2 + 1, emb_size)
                
        concats  = input_embed.reshape(-1, input_embed.shape[1] * input_embed.shape[2])
        # concats = (batch_size, window_size * 2 + 1 * emb_size)  ===> 5d, e.g., 20
        
        h = self.dropout(self.relu(self.h1(concats)))
        h2 = self.dropout(self.relu(self.h2(h)))
        
        return h2

### Testing the Model

In [23]:
batch_size = 2
inputs = torch.randint(0, len(vocab), (batch_size, 5))
inputs

tensor([[22801, 22657, 13422, 26981, 17239],
        [ 7985, 56630, 29630,  9619, 21311]])

In [24]:
voc_size = len(vocab)
emb_size = 4
hid_size = 8
window_size = ws * 2 + 1
output_size = len(tagset)
model = WinNER(voc_size, emb_size, hid_size, window_size, output_size)

In [25]:
something = model(inputs)
something[0]

tensor([0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.0131, 0.6515, 0.0000],
       grad_fn=<SelectBackward0>)

## Training

In [26]:
voc_size = len(vocab)
emb_size = 4
hid_size = 8
window_size  = ws * 2 + 1
num_epochs   = 5
batch_size   = 2
output_size = len(tagset)

model = WinNER(voc_size, emb_size, hid_size, window_size, output_size)

In [27]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr = 0.001)

In [28]:
train[0]

[['de', 'nuevas', 'líneas', 'en', 'el'], 'O']

In [29]:
def getBatch(batch_size, train):

    random.shuffle(train)
    s = 0
    e = batch_size
    
    while e < len(train):
        batch = train[s:e]
        temp = e
        e = e + batch_size
        s = temp
        yield batch   # what is difference between yield and return (yield is MUCH more efficient than return)
    
    if e > len(train):
        batch = train[s:]
        yield batch

In [31]:
# utility function to convert out batch to tensor

def prepare_sequence(seq, word2index):
    idxs = list(map(lambda w: word2index[w] if word2index.get(w) is not None else word2index["<UNK>"], seq))
    return torch.LongTensor(idxs)

def prepare_tag(tag,tag2index):
    return torch.LongTensor([tag2index[tag]])

In [33]:
import numpy as np

# making sure that model uses dropout and any normalization
model.train()

# looping each epoch
for epoch in range(num_epochs):

    # looping each batch
    for i, batch in enumerate(getBatch(batch_size, train)):
        
        x, y = list(zip(*batch))
        
        inputs  = torch.cat([prepare_sequence(sent, word2index).reshape(1, -1) for sent in x])
        # (batch_size, 5)
        
        targets = torch.cat([prepare_tag(tag, tag2index) for tag in y])
        # (batch_size)
    
        # predicting
        preds = model(inputs)
        # (batch_size, len(tag_size))
                        
        # getting the loss
        loss = criterion(preds, targets)
        
        # zero grad
        model.zero_grad()

        # backpropagation
        loss.backward()
        
        # updating parameters
        optimizer.step()
        
    
    print(f"Epoch: {epoch + 1} | Batch: {i:5.0f} | loss: {loss.item()}")

Epoch: 1 | Batch:    22 | loss: 1.7048304080963135
Epoch: 2 | Batch:    22 | loss: 2.296099901199341
Epoch: 3 | Batch:    22 | loss: 1.789602518081665
Epoch: 4 | Batch:    22 | loss: 0.859798789024353
Epoch: 5 | Batch:    22 | loss: 2.1972246170043945
