In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import nltk
import random
import numpy as np
from collections import Counter

from sklearn.metrics import classification_report

# to make a simple list out of list of lists 
flatten = lambda l: [item for sublist in l for item in sublist] 
# which means:  for sublist in I: 
#                  for item in sublist:
#                      flatten.append(item) 
random.seed(1024)

In [2]:
USE_CUDA = torch.cuda.is_available()
# gpus = [0]
# torch.cuda.set_device(gpus[0])

FloatTensor = torch.cuda.FloatTensor if USE_CUDA else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if USE_CUDA else torch.LongTensor
ByteTensor = torch.cuda.ByteTensor if USE_CUDA else torch.ByteTensor

In [4]:
def getBatch(batch_size, train_data):
    random.shuffle(train_data)
    sindex = 0
    eindex = batch_size
    while eindex < len(train_data):
        batch = train_data[sindex: eindex]
        # eindex -> sindex 로 
        temp = eindex
        eindex = eindex + batch_size
        sindex = temp
        yield batch
        
    if eindex >= len(train_data):
        batch = train_data[sindex:]
        yield batch

## Data load and Preprocessing

In [5]:
# four types of named entities: 
# persons, locations, organizations and names of miscellaneous entities that do not belong to the previous three groups.
corpus = nltk.corpus.conll2002.iob_sents()

In [6]:
# The first item on each line is a word and the second the named entity tag.
# B denotes the first item of a phrase and an I any non-initial word

data = []
for cor in corpus:                  
    sent, _, tag = list(zip(*cor))  # sent, _, tag에 cor의 i번째 원소들을 묶어서 리턴 
    data.append([sent, tag])        

In [7]:
cor

[('Maar', 'Conj', 'O'),
 ('we', 'Pron', 'O'),
 ('verwachtten', 'V', 'O'),
 ('eigenlijk', 'Adj', 'O'),
 ('iets', 'Pron', 'O'),
 ("extra's", 'N', 'O'),
 ('uit', 'Prep', 'O'),
 ('de', 'Art', 'O'),
 ('wonderoogst', 'N', 'O'),
 ('1997', 'Num', 'O'),
 ('.', 'Punc', 'O')]

In [18]:
sent

('Maar',
 'we',
 'verwachtten',
 'eigenlijk',
 'iets',
 "extra's",
 'uit',
 'de',
 'wonderoogst',
 '1997',
 '.')

In [17]:
# 품사 
_

('Conj', 'Pron', 'V', 'Adj', 'Pron', 'N', 'Prep', 'Art', 'N', 'Num', 'Punc')

In [27]:
tag

('O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O')

In [8]:
print(len(data))
print(data[0])

35651
[('Sao', 'Paulo', '(', 'Brasil', ')', ',', '23', 'may', '(', 'EFECOM', ')', '.'), ('B-LOC', 'I-LOC', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'O')]


### Build Vocab

In [9]:
sents, tags = list(zip(*data))    # data의 i번째 원소들을 묶어서 반환
vocab = list(set(flatten(sents))) # set(): 중복 값 제거
tagset = list(set(flatten(tags))) 

In [10]:
word2index = {'<UNK>': 0, '<DUMMY>': 1}  # dummy token is for start or end of sentence

# word2index - word : index 형태 
for vo in vocab:
    if word2index.get(vo) is None:
        word2index[vo] = len(word2index)

# index2word - index : word 형태로 변경
index2word = {v:k for k, v in word2index.items()}

# tag2index - tag : index 형태
tag2index = {}
for tag in tagset:
    if tag2index.get(tag) is None:
        tag2index[tag] = len(tag2index)

# index2tag - index : tag 형태로 변경 
index2tag = {v:k for k, v in tag2index.items()}

### Prepare Data

![](https://nbviewer.jupyter.org/github/DSKSD/DeepNLP-models-Pytorch/blob/master/images/04.window-data.png)

In [21]:
WINDOW_SIZE = 2
windows = []

for sample in data:  # data에는 sent, tag의 데이터가 있음 
    dummy = ['<DUMMY>'] * WINDOW_SIZE
    # 시작과 끝에 dummy넣어주고, (WINDOW_SIZE * 2 + 1)길이의 window를 생성
    window = list(nltk.ngrams(dummy+list(sample[0])+dummy, WINDOW_SIZE*2+1))  
    windows.extend([[list(window[i]), sample[1][i]] for i in range(len(sample[0]))])  # window에 tag추가해서 windows생성 

In [23]:
window

[('<DUMMY>', '<DUMMY>', 'Maar', 'we', 'verwachtten'),
 ('<DUMMY>', 'Maar', 'we', 'verwachtten', 'eigenlijk'),
 ('Maar', 'we', 'verwachtten', 'eigenlijk', 'iets'),
 ('we', 'verwachtten', 'eigenlijk', 'iets', "extra's"),
 ('verwachtten', 'eigenlijk', 'iets', "extra's", 'uit'),
 ('eigenlijk', 'iets', "extra's", 'uit', 'de'),
 ('iets', "extra's", 'uit', 'de', 'wonderoogst'),
 ("extra's", 'uit', 'de', 'wonderoogst', '1997'),
 ('uit', 'de', 'wonderoogst', '1997', '.'),
 ('de', 'wonderoogst', '1997', '.', '<DUMMY>'),
 ('wonderoogst', '1997', '.', '<DUMMY>', '<DUMMY>')]

In [25]:
windows[0]  

[['<DUMMY>', '<DUMMY>', 'Sao', 'Paulo', '('], 'B-LOC']

In [13]:
len(windows)

678377

In [14]:
random.shuffle(windows)  # windows 목록들의 순서 섞이도록 

train_data = windows[:int(len(windows) * 0.9)]  # 처음부터 int(len(windows)*0.9)의 index까지의 train_data 생성 
test_data = windows[int(len(windows) * 0.9):]   # 인덱스 int(len(windows)*0.9)부터 끝까지 test_data

## Modeling

![](https://nbviewer.jupyter.org/github/DSKSD/DeepNLP-models-Pytorch/blob/master/images/04.window-classifier-architecture.png)

In [28]:
class WindowClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_size, window_size, hidden_size, output_size):
        super(WindowClassifier, self).__init__()  
        self.embed = nn.Embedding(vocab_size, embedding_size)  # nn.Embedding(embedding할 사이즈, 각 embedding vector의 사이즈)
        self.h_layer1 = nn.Linear(embedding_size*(window_size*2+1), hidden_size)
        self.h_layer2 = nn.Linear(hidden_size, hidden_size)
        self.o_layer = nn.Linear(hidden_size, output_size)
        
        self.relu = nn.ReLU()
        self.softmax = nn.LogSoftmax(dim=1)
        self.dropout = nn.Dropout(0.3)  # probability of an element to be zeroed = 0.3
        
    def forward(self, inputs, is_training=False): 
        embeds = self.embed(inputs)  # Batch x Window x D
        concated = embeds.view(-1, embeds.size(1)*embeds.size(2))  # B x (W * D), row는 그대로 
        h0 = self.relu(self.h_layer1(concated))
        if is_training:
            h0 = self.dropout(h0)
        h1 = self.relu(self.h_layer2(h0))
        if is_training:
            h1 = self.dropout(h1)
        out = self.softmax(self.o_layer(h1))
        return out 

In [16]:
BATCH_SIZE = 128
EMBEDDING_SIZE = 50
HIDDEN_SIZE = 300
EPOCH = 3
LEARNING_RATE = 0.001

## Training

In [17]:
# __init__(self, vocab_size, embedding_size, window_size, hidden_size, output_size)
model = WindowClassifier(len(word2index), EMBEDDING_SIZE, WINDOW_SIZE, HIDDEN_SIZE, len(tag2index))

if USE_CUDA:
    model = model.cuda()

loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [51]:
def prepare_sequence(seq, word2index):
    idxs = list(map(lambda w: word2index[w] if word2index.get(w) is not None else word2index["<UNK>"], 
                    seq))
    return Variable(LongTensor(idxs))

def prepare_word(word, word2index):
    return Variable(LongTensor([word2index[word]]) if word2index.get(word) is not None else LongTensor([word2index["<UNK>"]]))

def prepare_tag(tag,tag2index):
    return Variable(LongTensor([tag2index[tag]]))

In [18]:
for epoch in range(EPOCH):
    losses = []
    for i, batch in enumerate(getBatch(BATCH_SIZE, train_data)):
        x, y = list(zip(*batch))  # batch의 i번째 원소들끼리 묶어서 x와 y(=label)에 각각 반환
        inputs = torch.cat([prepare_sequence(sent, word2index).view(1, -1) for sent in x]) # x의 개수만큼의 sent에 대해서 
        targets = torch.cat([prepare_tag(tag, tag2index) for tag in y])                    # y의 개수만큼의 tag에 대해서 
       
        model.zero_grad()
        preds = model(inputs, is_training=True)
        loss = loss_function(preds, targets)
        losses.append(loss.item())
        loss.backward()
        optimizer.step()  # update the parameters
        
        if i % 1000 == 0:
            print("[%d/%d] mean_loss : %0.2f" %(epoch+1, EPOCH, np.mean(losses)))
            losses = []

[1/3] mean_loss : 2.10
[1/3] mean_loss : 0.47
[1/3] mean_loss : 0.37
[1/3] mean_loss : 0.32
[1/3] mean_loss : 0.28
[2/3] mean_loss : 0.21
[2/3] mean_loss : 0.22
[2/3] mean_loss : 0.21
[2/3] mean_loss : 0.20
[2/3] mean_loss : 0.19
[3/3] mean_loss : 0.12
[3/3] mean_loss : 0.15
[3/3] mean_loss : 0.14
[3/3] mean_loss : 0.14
[3/3] mean_loss : 0.14


## Test

In [19]:
for_f1_score = [] 

accuracy = 0
for test in test_data:
    x, y = test[0], test[1]
    input_ = prepare_sequence(x, word2index).view(1, -1)
    
    i = model(input_).max(1)[1] 
    pred = index2tag[i.data.tolist()[0]]  # model의 output으로 나온 index i를 가지고 그에 해당하는 tag를 알아낸다. 
    for_f1_score.append([pred, y])
    if pred == y:
        accuracy += 1
    
print(accuracy/len(test_data)*100)

95.76491052212624


In [74]:
x 

['de', 'Madrid', ',', 'Juan', 'Carlos']

In [75]:
y

'O'

In [52]:
model(input_)

tensor([[-19.1951, -20.8160,  -0.0001, -21.8554, -10.6259, -19.3503, -17.5546,
          -9.9770, -21.2898]], grad_fn=<LogSoftmaxBackward>)

In [53]:
model(input_).max(1)[1]

tensor([2])

This high score is because most of labels are '0' tag. 
So we need to measure f1 score. 

### Print Confusion Matrix

In [22]:
y_pred, y_test = list(zip(*for_f1_score))

In [23]:
# '0' 제외 
sorted_labels = sorted(
    list(set(y_test) - {'O'}),
    key=lambda name: (name[1:], name[0])
)

In [24]:
sorted_labels

['B-LOC', 'I-LOC', 'B-MISC', 'I-MISC', 'B-ORG', 'I-ORG', 'B-PER', 'I-PER']

In [25]:
print(classification_report(y_test, y_pred, labels=sorted_labels, digits=3))

             precision    recall  f1-score   support

      B-LOC      0.799     0.686     0.738      1136
      I-LOC      0.651     0.478     0.551       320
     B-MISC      0.705     0.429     0.534       801
     I-MISC      0.640     0.358     0.459       646
      B-ORG      0.754     0.709     0.731      1343
      I-ORG      0.723     0.730     0.726       917
      B-PER      0.807     0.762     0.784      1304
      I-PER      0.894     0.806     0.848       961

avg / total      0.765     0.659     0.703      7428

