In [40]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import nltk
import random
import numpy as np
from collections import Counter
flatten = lambda l: [item for sublist in l for item in sublist]

from sklearn_crfsuite import metrics
random.seed(1024)
nltk.download('conll2002')

[nltk_data] Downloading package conll2002 to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package conll2002 is already up-to-date!


True

In [49]:
def getBatch(batch_size, train_data):
    random.shuffle(train_data)
    sindex = 0
    eindex = batch_size
    while eindex < len(train_data):
        batch = train_data[sindex: eindex]
        temp = eindex
        eindex = eindex + batch_size
        sindex = temp
        yield batch
    
    if eindex >= len(train_data):
        batch = train_data[sindex:]
        yield batch

In [50]:
FloatTensor = torch.FloatTensor
LongTensor =  torch.LongTensor
ByteTensor =  torch.ByteTensor

In [51]:
def prepare_sequence(seq, word2index):
    idxs = list(map(lambda w: word2index[w] if word2index.get(w) is not None else word2index["<UNK>"], seq))
    return Variable(LongTensor(idxs))

def prepare_word(word, word2index):
    return Variable(LongTensor([word2index[word]]) if word2index.get(word) is not None else LongTensor([word2index["<UNK>"]]))

def prepare_tag(tag,tag2index):
    return Variable(LongTensor([tag2index[tag]]))

## Data load and Preprocessing

In [52]:
corpus = nltk.corpus.conll2002.iob_sents()

In [53]:
data = []
for cor in corpus:
    sent, _, tag = list(zip(*cor))
    data.append([sent, tag])

In [54]:
print(len(data))
print(data[0])

35651
[('Sao', 'Paulo', '(', 'Brasil', ')', ',', '23', 'may', '(', 'EFECOM', ')', '.'), ('B-LOC', 'I-LOC', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'O')]


## Build Vocab

In [55]:
sents,tags = list(zip(*data))
vocab = list(set(flatten(sents)))
tagset = list(set(flatten(tags)))

In [56]:
word2index={'<UNK>' : 0, '<DUMMY>' : 1} # dummy token is for start or end of sentence
for vo in vocab:
    if word2index.get(vo) is None:
        word2index[vo] = len(word2index)
index2word = {v:k for k, v in word2index.items()}

tag2index = {}
for tag in tagset:
    if tag2index.get(tag) is None:
        tag2index[tag] = len(tag2index)
index2tag={v:k for k, v in tag2index.items()}

In [57]:
index2tag

{0: 'B-ORG',
 1: 'I-PER',
 2: 'I-LOC',
 3: 'B-MISC',
 4: 'I-ORG',
 5: 'B-LOC',
 6: 'I-MISC',
 7: 'O',
 8: 'B-PER'}

## Prepare data

![](https://user-images.githubusercontent.com/36406676/54126778-aef46900-444b-11e9-9fae-60de322d6ec0.jpg)

In [58]:
WINDOW_SIZE = 2
windows = []

In [59]:
for sample in data:
    dummy =['DUMMY'] * WINDOW_SIZE
    window = list(nltk.ngrams(dummy + list(sample[0]) + dummy, WINDOW_SIZE * 2 + 1))
    windows.extend([[list(window[i]), sample[1][i]] for i in range(len(sample[0]))])

In [60]:
windows[0] ## Sao 개체명을 예측해야됨

[['DUMMY', 'DUMMY', 'Sao', 'Paulo', '('], 'B-LOC']

In [61]:
len(windows)

678377

In [62]:
random.shuffle(windows)

train_data = windows[:int(len(windows) * 0.9)] # 610539
test_data = windows[int(len(windows) * 0.9):]  # 67838

![](https://user-images.githubusercontent.com/36406676/54128044-8326b280-444e-11e9-803f-b482e0702673.jpg)

In [75]:
class WindowClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_size, window_size, hidden_size, output_size):
        super(WindowClassifier,self).__init__()
        
        self.embed = nn.Embedding(vocab_size, embedding_size)
        self.h_layer1 = nn.Linear(embedding_size * (window_size * 2 + 1), hidden_size)
        self.h_layer2 = nn.Linear(hidden_size, hidden_size)
        self.o_layer = nn.Linear(hidden_size, output_size)
        self.relu = nn.ReLU()
        self.softmax = nn.LogSoftmax(dim = 1)
        self.dropout = nn.Dropout(0.3)
        
    def forward(self, inputs, is_training = False):
        embeds = self.embed(inputs) #BXWXD
        concated = embeds.view(-1, embeds.size(1)*embeds.size(2)) # BX(WXD)
        h0 = self.relu(self.h_layer1(concated))
        if is_training:
            h0 = self.dropout(h0)
        h1 = self.relu(self.h_layer2(h0))
        if is_training:
            h1 = self.dropout(h1)
            
        out = self.softmax(self.o_layer(h1))
        return out

In [76]:
BATCH_SIZE = 128
EMBEDDING_SIZE = 50 # x (WINDOW_SIZE*2+1) = 250
HIDDEN_SIZE = 300
EPOCH = 3
LEARNING_RATE = 0.001

## Training

In [77]:
model = WindowClassifier(len(word2index), EMBEDDING_SIZE, WINDOW_SIZE, HIDDEN_SIZE, len(tag2index))
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [79]:
for epoch in range(EPOCH):
    losses = []
    for i,batch in enumerate(getBatch(BATCH_SIZE, train_data)):
        x,y=list(zip(*batch))
        inputs = torch.cat([prepare_sequence(sent, word2index).view(1, -1) for sent in x])
        targets = torch.cat([prepare_tag(tag, tag2index) for tag in y])
        model.zero_grad()
        preds = model(inputs, is_training=True)
        loss = loss_function(preds, targets)
        losses.append(loss.data)
        loss.backward()
        optimizer.step()

        if i % 1000 == 0:
            print("[%d/%d] mean_loss : %0.2f" %(epoch, EPOCH, np.mean(losses)))
            losses = []

[0/3] mean_loss : 2.13
[0/3] mean_loss : 0.47
[0/3] mean_loss : 0.37
[0/3] mean_loss : 0.32
[0/3] mean_loss : 0.28
[1/3] mean_loss : 0.20
[1/3] mean_loss : 0.22
[1/3] mean_loss : 0.21
[1/3] mean_loss : 0.20
[1/3] mean_loss : 0.19
[2/3] mean_loss : 0.31
[2/3] mean_loss : 0.15
[2/3] mean_loss : 0.14
[2/3] mean_loss : 0.14
[2/3] mean_loss : 0.14


## Test

In [80]:
for_f1_score = []

In [81]:
accuracy = 0
for test in test_data:
    x, y = test[0], test[1]
    input_ = prepare_sequence(x, word2index).view(1, -1)

    i = model(input_).max(1)[1]
    pred = index2tag[i.data.tolist()[0]]
    for_f1_score.append([pred, y])
    if pred == y:
        accuracy += 1

print(accuracy/len(test_data) * 100)

95.80028892361213


## Print confusion matrix

In [82]:
y_pred, y_test = list(zip(*for_f1_score))

In [83]:
sorted_labels = sorted(
    list(set(y_test) - {'O'}),
    key=lambda name: (name[1:], name[0])
)

In [84]:
sorted_labels

['B-LOC', 'I-LOC', 'B-MISC', 'I-MISC', 'B-ORG', 'I-ORG', 'B-PER', 'I-PER']

In [85]:
y_pred = [[y] for y in y_pred] # this is because sklearn_crfsuite.metrics function flatten inputs
y_test = [[y] for y in y_test]

In [86]:
print(metrics.flat_classification_report(
    y_test, y_pred, labels = sorted_labels, digits=3
))

             precision    recall  f1-score   support

      B-LOC      0.760     0.697     0.727      1136
      I-LOC      0.686     0.438     0.534       320
     B-MISC      0.750     0.442     0.556       801
     I-MISC      0.604     0.418     0.494       646
      B-ORG      0.770     0.696     0.731      1343
      I-ORG      0.764     0.682     0.720       917
      B-PER      0.809     0.753     0.780      1304
      I-PER      0.877     0.830     0.853       961

avg / total      0.768     0.659     0.706      7428

