In [1]:
%matplotlib inline
import numpy as np
from matplotlib import pyplot as plt

import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import pickle

from utils import load_sentences_and_labels, get_normalized_sentences, \
        load_word2vec, build_token_embeddings_tensor, encode_sentences_and_labels

In [2]:
sentences, labels = load_sentences_and_labels('factRuEval-2016-master/devset')

# Normalized sentences are cached on disk because get_normalized_sentences is
# slow to recompute. Load the cache when it exists; otherwise build it once
# and store it, so the notebook also runs on a fresh checkout.
# NOTE(review): pickle.load executes arbitrary code from the file, so only use
# a cache file that was produced locally by this notebook.
normalized_sentences_cache_path = 'devset-normalized_sentences.pk'

try:
    with open(normalized_sentences_cache_path, 'rb') as normalized_sentences_dump:
        normalized_sentences = pickle.load(normalized_sentences_dump)
except FileNotFoundError:
    normalized_sentences = get_normalized_sentences(sentences)
    with open(normalized_sentences_cache_path, 'wb') as normalized_sentences_dump:
        pickle.dump(normalized_sentences, normalized_sentences_dump)

In [3]:
# Split the corpus 80/20 into train and validation parts using a fixed-seed
# permutation, then encode both parts. Codes and dimensions are built from the
# train part only and re-used for the validation part.
data_size = len(sentences)

# The cached normalized sentences must line up one-to-one with the raw ones,
# otherwise the shared permutation below would pair unrelated sentences.
assert len(normalized_sentences) == data_size, \
        'normalized sentences are out of sync with the loaded sentences'

train_data_size = int(data_size*0.8)
val_data_size = data_size - train_data_size

np.random.seed(seed=1)
data_permutation = np.random.permutation(data_size)
train_indices = data_permutation[:train_data_size]
val_indices = data_permutation[train_data_size:]

# NOTE(review): indexing with an index array assumes normalized_sentences is a
# numpy array like `sentences` -- confirm against get_normalized_sentences.
train_sentences = sentences[train_indices]
# Fixed: this used to slice `sentences`, so the normalized data was never used.
train_normalized_sentences = normalized_sentences[train_indices]
train_labels = labels[train_indices]

symbol_codes, token_codes, label_codes, sentence_dimension, token_dimension, \
        train_encoded_symbols, train_encoded_tokens, train_encoded_labels = \
        encode_sentences_and_labels(train_sentences, train_normalized_sentences,
                                    train_labels)

unique_symbols_count = len(symbol_codes)
unique_tokens_count = len(token_codes)
unique_labels_count = len(label_codes)

val_sentences = sentences[val_indices]
# Fixed: same copy-paste slip as for the train part.
val_normalized_sentences = normalized_sentences[val_indices]
val_labels = labels[val_indices]

val_encoded_symbols, val_encoded_tokens, val_encoded_labels = \
        encode_sentences_and_labels(val_sentences, val_normalized_sentences,
        val_labels, codes=(symbol_codes, token_codes, label_codes),
        dimensions=(sentence_dimension, token_dimension))

In [4]:
class NERTagger(nn.Module):
    """Sequence tagger combining symbol-level and token-level convolutions with an LSTM.

    Symbol codes are embedded, convolved and max-pooled in windows of
    ``token_dimension`` symbols, giving one feature vector per window. These
    vectors are concatenated with token embeddings, passed through two
    convolution blocks and finally through an LSTM whose hidden size equals
    the number of labels. The LSTM output is turned into log-probabilities.
    """

    def __init__(self, unique_symbols_count, unique_labels_count, token_dimension,
                 token_embeddings_tensor, symbol_embedding_dimension=64,
                 symbol_convolution_out_channels=128,
                 symbol_convolution_kernel_size=3,
                 token_convolution_out_channels=256,
                 token_convolution_kernel_size=5,
                 dropout_probability=0.5):
        """Build all layers.

        Args:
            unique_symbols_count: size of the symbol embedding vocabulary.
            unique_labels_count: number of labels; used as the LSTM hidden size.
            token_dimension: pooling window and stride over the symbol axis.
            token_embeddings_tensor: 2-D tensor (tokens, embedding size) used
                as the initial, trainable weight of the token embedding.
            symbol_embedding_dimension: size of each symbol embedding.
            symbol_convolution_out_channels: channels of the symbol convolution.
            symbol_convolution_kernel_size: odd kernel size of the symbol
                convolution.
            token_convolution_out_channels: channels of both token convolutions.
            token_convolution_kernel_size: odd kernel size of both token
                convolutions.
            dropout_probability: ``p`` for every dropout layer.

        Raises:
            ValueError: if a kernel size is even. Padding is ``kernel // 2``,
                which keeps the sequence length only for odd kernels, and the
                pooling/concatenation in ``forward`` relies on that.
        """

        super(NERTagger, self).__init__()

        #TODO: add residual connections

        for kernel_size in (symbol_convolution_kernel_size,
                            token_convolution_kernel_size):
            if kernel_size % 2 == 0:
                raise ValueError('convolution kernel sizes must be odd, got {0}'
                                 .format(kernel_size))

        # Length-preserving padding: 3 -> 1 and 5 -> 2, as in the defaults.
        symbol_convolution_padding = symbol_convolution_kernel_size//2
        token_convolution_padding = token_convolution_kernel_size//2

        self.symbol_embedding = nn.Embedding(unique_symbols_count,
                                             symbol_embedding_dimension)
        self.symbol_convolution = nn.Conv1d(symbol_embedding_dimension,
                                            symbol_convolution_out_channels,
                                            symbol_convolution_kernel_size,
                                            padding=symbol_convolution_padding)
        self.symbol_relu = nn.ReLU()
        self.symbol_dropout = nn.Dropout(dropout_probability)
        self.symbol_batchnorm = nn.BatchNorm1d(symbol_convolution_out_channels)
        self.symbol_pooling = nn.MaxPool1d(token_dimension,
                                           stride=token_dimension)

        unique_tokens_count, token_embedding_dimension = token_embeddings_tensor.shape

        self.token_embedding = nn.Embedding(unique_tokens_count,
                                            token_embedding_dimension)
        # Replace the random initial weight with the supplied embeddings;
        # wrapping them in nn.Parameter keeps them trainable.
        self.token_embedding.weight = nn.Parameter(token_embeddings_tensor)

        self.token_convolution1 = nn.Conv1d(token_embedding_dimension + \
                                            symbol_convolution_out_channels,
                                            token_convolution_out_channels,
                                            token_convolution_kernel_size,
                                            padding=token_convolution_padding)
        self.token_relu1 = nn.ReLU()
        self.token_dropout1 = nn.Dropout(dropout_probability)
        self.token_batchnorm1 = nn.BatchNorm1d(token_convolution_out_channels)

        self.token_convolution2 = nn.Conv1d(token_convolution_out_channels,
                                            token_convolution_out_channels,
                                            token_convolution_kernel_size,
                                            padding=token_convolution_padding)
        self.token_relu2 = nn.ReLU()
        self.token_dropout2 = nn.Dropout(dropout_probability)
        self.token_batchnorm2 = nn.BatchNorm1d(token_convolution_out_channels)

        self.lstm = nn.LSTM(token_convolution_out_channels, unique_labels_count)

    def forward(self, encoded_symbols, encoded_tokens):
        """Return per-position label log-probabilities.

        Args:
            encoded_symbols: 2-D integer tensor of symbol codes. Its second
                axis must be ``token_dimension`` times as long as that of
                ``encoded_tokens`` so the pooled symbol features can be
                concatenated with the token embeddings.
            encoded_tokens: 2-D integer tensor of token codes.

        Returns:
            Tensor with the two axes of ``encoded_tokens`` followed by a label
            axis, holding log-softmax values over that last axis.
        """

        # Conv1d/BatchNorm1d/MaxPool1d expect channels on axis 1, hence the
        # transposes around every convolutional stage.
        symbol_embeddings = self.symbol_embedding(encoded_symbols)
        symbol_embeddings = torch.transpose(symbol_embeddings, 1, 2)
        symbol_forwarded_data = self.symbol_convolution(symbol_embeddings)
        symbol_forwarded_data = self.symbol_relu(symbol_forwarded_data)
        symbol_forwarded_data = self.symbol_dropout(symbol_forwarded_data)
        symbol_forwarded_data = self.symbol_batchnorm(symbol_forwarded_data)
        # One pooled feature vector per window of token_dimension symbols.
        symbol_forwarded_data = self.symbol_pooling(symbol_forwarded_data)
        symbol_forwarded_data = torch.transpose(symbol_forwarded_data, 1, 2)

        token_embeddings = self.token_embedding(encoded_tokens)
        token_forwarded_data = torch.cat((symbol_forwarded_data, token_embeddings), dim=2)
        token_forwarded_data = torch.transpose(token_forwarded_data, 1, 2)

        token_forwarded_data = self.token_convolution1(token_forwarded_data)
        token_forwarded_data = self.token_relu1(token_forwarded_data)
        token_forwarded_data = self.token_dropout1(token_forwarded_data)
        token_forwarded_data = self.token_batchnorm1(token_forwarded_data)

        token_forwarded_data = self.token_convolution2(token_forwarded_data)
        token_forwarded_data = self.token_relu2(token_forwarded_data)
        token_forwarded_data = self.token_dropout2(token_forwarded_data)
        token_forwarded_data = self.token_batchnorm2(token_forwarded_data)

        # The LSTM is created with the default batch_first=False, so the
        # sequence axis goes first for the call and the batch axis is
        # restored afterwards.
        result_forwarded_data = torch.transpose(token_forwarded_data, 1, 2)
        result_forwarded_data = torch.transpose(result_forwarded_data, 0, 1)
        result_forwarded_data = self.lstm(result_forwarded_data)[0]
        result_forwarded_data = torch.transpose(result_forwarded_data, 0, 1)
        result_forwarded_data = F.log_softmax(result_forwarded_data, dim=2)

        return result_forwarded_data

In [None]:
# Third argument of build_token_embeddings_tensor; presumably the word2vec
# vector size -- TODO confirm against utils.
TOKEN_EMBEDDING_DIMENSION = 300
LEARNING_RATE = 0.002
# NOTE(review): optim.SGD's weight_decay is an L2 penalty coefficient and 0.95
# is unusually large for it -- confirm this is not meant to be a momentum or
# learning-rate decay value.
WEIGHT_DECAY = 0.95

word2vec = load_word2vec()
token_embeddings_tensor = build_token_embeddings_tensor(
        token_codes, word2vec, TOKEN_EMBEDDING_DIMENSION)

model = NERTagger(
        unique_symbols_count,
        unique_labels_count,
        token_dimension,
        token_embeddings_tensor)

# The model emits log-probabilities, which is the input NLLLoss expects.
loss_function = nn.NLLLoss()

# TODO (original author's note): try Adam / RMSProp -- reported as "works bad".
optimizer = optim.SGD(model.parameters(),
                      lr=LEARNING_RATE,
                      weight_decay=WEIGHT_DECAY)

In [None]:
# Training / validation loop. Each epoch runs a shuffled training pass followed
# by a validation pass and records the mean per-batch loss, recall, precision
# and F1 of both passes.
epochs_count = 100
batch_size = 200
EPSILON = 1e-10
# Word dropout for training batches: a token whose uniform [0, 1) draw exceeds
# this threshold is replaced by the '__unknown__' code.
token_keep_threshold = 0.7

loss_and_metrics_per_epoch = []

data_passes = [
    ('train', train_data_size, train_encoded_symbols, train_encoded_tokens, train_encoded_labels),
    ('val', val_data_size, val_encoded_symbols, val_encoded_tokens, val_encoded_labels),
]

# Only full batches are processed, so a pass smaller than one batch would run
# zero iterations and then divide by zero when averaging. Fail early instead.
for data_pass in data_passes:
    if data_pass[1]//batch_size == 0:
        raise ValueError("'{0}' pass has {1} samples, fewer than batch_size={2}".format(
                data_pass[0], data_pass[1], batch_size))

for epoch in range(epochs_count):

    train_data_permutation = np.random.permutation(train_data_size)

    # Fresh dicts every epoch, so the history entries appended below are not
    # overwritten by later epochs.
    data_pass_loss = {}
    data_pass_recall = {}
    data_pass_precision = {}
    data_pass_f1 = {}

    for pass_name, pass_size, pass_encoded_symbols, pass_encoded_tokens, pass_encoded_labels in \
            data_passes:

        is_train_pass = (pass_name == 'train')

        # train() enables dropout and batch statistics for batch norm; eval()
        # disables dropout and uses the running statistics.
        if is_train_pass:
            model.train()
        else:
            model.eval()

        batches_count = pass_size//batch_size

        data_pass_loss[pass_name] = 0.0
        data_pass_recall[pass_name] = 0.0
        data_pass_precision[pass_name] = 0.0
        data_pass_f1[pass_name] = 0.0

        for i in range(batches_count):

            batch_indices = slice(batch_size*i, batch_size*(i + 1))

            if is_train_pass:
                model.zero_grad()
                batch_indices = train_data_permutation[batch_indices]

            batch_encoded_symbols = autograd.Variable(torch.LongTensor(
                    pass_encoded_symbols[batch_indices]))

            batch_encoded_tokens = pass_encoded_tokens[batch_indices]

            if is_train_pass:
                # batch_indices is an index array here, so the indexing above
                # produced a copy and the stored training data is not modified.
                random_mask = np.random.uniform(size=batch_encoded_tokens.shape)
                batch_encoded_tokens[random_mask > token_keep_threshold] = \
                        token_codes['__unknown__']

            batch_encoded_tokens = autograd.Variable(torch.LongTensor(
                    batch_encoded_tokens))
            batch_encoded_labels_numpy = pass_encoded_labels[batch_indices]
            batch_labels_count = batch_encoded_labels_numpy.shape[0]*\
                    batch_encoded_labels_numpy.shape[1]
            batch_encoded_labels_numpy = batch_encoded_labels_numpy.ravel()
            batch_encoded_labels = autograd.Variable(torch.LongTensor(
                    batch_encoded_labels_numpy))

            # Flatten the batch and sequence axes so every position is one row
            # of label log-probabilities, matching the flattened labels.
            predicted_label_probs = model(batch_encoded_symbols, batch_encoded_tokens).view(
                    batch_labels_count, unique_labels_count)

            predicted_encoded_labels = predicted_label_probs.data.numpy().argmax(axis=1)

            # "relevant": true label is not 'none'; "selected": predicted label
            # is not 'none'.
            # NOTE(review): a hit only requires both to be non-'none'; the
            # predicted label is not compared with the true one -- confirm
            # this detection-style metric is intended.
            relevant_mask = (batch_encoded_labels_numpy != label_codes['none'])
            selected_mask = (predicted_encoded_labels != label_codes['none'])
            relevant_and_selected_count = np.logical_and(relevant_mask, selected_mask).sum()

            # EPSILON keeps the ratios defined when a denominator is zero.
            recall = relevant_and_selected_count/(relevant_mask.sum() + EPSILON)
            precision = relevant_and_selected_count/(selected_mask.sum() + EPSILON)
            f1 = 2*recall*precision/(recall + precision + EPSILON)

            data_pass_recall[pass_name] += recall
            data_pass_precision[pass_name] += precision
            data_pass_f1[pass_name] += f1

            loss = loss_function(predicted_label_probs, batch_encoded_labels)

            if is_train_pass:
                loss.backward()
                optimizer.step()

            # `loss.data[0]` only works on old PyTorch where the loss is a
            # one-element tensor; newer releases return a 0-dim tensor and
            # provide `.item()` instead.
            if hasattr(loss, 'item'):
                loss_value = loss.item()
            else:
                loss_value = loss.data[0]

            data_pass_loss[pass_name] += loss_value

        for data_pass_statistic in [data_pass_loss, data_pass_recall,
                                    data_pass_precision, data_pass_f1]:
            data_pass_statistic[pass_name] /= batches_count

    print('#{0} epoch:'.format(epoch + 1))
    print('\ttrain_nll={0:.3f} val_nll={1:.3f}'.format(data_pass_loss['train'],
                                               data_pass_loss['val']))
    print('\ttrain_recall={0:.3f} val_recall={1:.3f}'.format(data_pass_recall['train'],
                                                     data_pass_recall['val']))
    print('\ttrain_precision={0:.3f} val_precision={1:.3f}'.format(data_pass_precision['train'],
                                                           data_pass_precision['val']))
    print('\ttrain_f1={0:.3f} val_f1={1:.3f}'.format(data_pass_f1['train'], data_pass_f1['val']))

    loss_and_metrics_per_epoch.append((data_pass_loss, data_pass_recall, data_pass_precision,
                                       data_pass_f1))