In [21]:
from __future__ import print_function
from collections import OrderedDict
import os
import matplotlib.pyplot as plt
import torch
import copy
torch.manual_seed(0)

<torch._C.Generator at 0x7fdc06a12330>

In [22]:
class Parse():
    def __init__(self):
        self.dataset = 'conll'
        self.result_path = 'neural_ner/results'
        self.usemodel = 'CNN_BiLSTM_CRF'
        self.worddim = 100
        self.pretrnd = 'wordvectors/glove.6B.100d.txt'
        self.reload = 0
        self.checkpoint = '.'
        self.num_epochs = 10

opt=Parse()

In [23]:
a = torch.autograd.Variable(torch.randn(4)).cuda()

In [24]:
a[1]

Variable containing:
-0.2934
[torch.cuda.FloatTensor of size 1 (GPU 0)]

In [25]:
from __future__ import print_function
import os
import re
import numpy as np
np.random.seed(0)
import codecs
import random
random.seed(0)

START_TAG = '<START>'
STOP_TAG = '<STOP>'

def get_name(parameters):
    """
    Generate a model name from its parameters.
    """
    l = []
    for k, v in parameters.items():
        if type(v) is str and "/" in v:
            l.append((k, v[::-1][:v[::-1].index('/')][::-1]))
        else:
            l.append((k, v))
    name = ",".join(["%s=%s" % (k, str(v).replace(',', '')) for k, v in l])
    return "".join(i for i in name if i not in "\/:*?<>|")


def set_values(name, param, pretrained):
    """
    Initialize a network parameter with pretrained values.
    We check that sizes are compatible.
    """
    param_value = param.get_value()
    if pretrained.size != param_value.size:
        raise Exception(
            "Size mismatch for parameter %s. Expected %i, found %i."
            % (name, param_value.size, pretrained.size)
        )
    param.set_value(np.reshape(
        pretrained, param_value.shape).astype(np.float32))


def create_dico(item_list):
    """
    Create a dictionary of items from a list of list of items.
    """
    assert type(item_list) is list
    dico = {}
    for items in item_list:
        for item in items:
            if item not in dico:
                dico[item] = 1
            else:
                dico[item] += 1
    return dico


def create_mapping(dico):
    """
    Create a mapping (item to ID / ID to item) from a dictionary.
    Items are ordered by decreasing frequency.
    """
    sorted_items = sorted(dico.items(), key=lambda x: (-x[1], x[0]))
    id_to_item = {i: v[0] for i, v in enumerate(sorted_items)}
    item_to_id = {v: k for k, v in id_to_item.items()}
    return item_to_id, id_to_item


def zero_digits(s):
    """
    Replace every digit in a string by a zero.
    """
    return re.sub('\d', '0', s)


def iob2(tags):
    """
    Check that tags have a valid IOB format.
    Tags in IOB1 format are converted to IOB2.
    """
    for i, tag in enumerate(tags):
        if tag == 'O':
            continue
        split = tag.split('-')
        if len(split) != 2 or split[0] not in ['I', 'B']:
            return False
        if split[0] == 'B':
            continue
        elif i == 0 or tags[i - 1] == 'O':  # conversion IOB1 to IOB2
            tags[i] = 'B' + tag[1:]
        elif tags[i - 1][1:] == tag[1:]:
            continue
        else:  # conversion IOB1 to IOB2
            tags[i] = 'B' + tag[1:]
    return True


def iob_iobes(tags):
    """
    IOB -> IOBES
    """
    new_tags = []
    for i, tag in enumerate(tags):
        if tag == 'O':
            new_tags.append(tag)
        elif tag.split('-')[0] == 'B':
            if i + 1 != len(tags) and \
               tags[i + 1].split('-')[0] == 'I':
                new_tags.append(tag)
            else:
                new_tags.append(tag.replace('B-', 'S-'))
        elif tag.split('-')[0] == 'I':
            if i + 1 < len(tags) and \
                    tags[i + 1].split('-')[0] == 'I':
                new_tags.append(tag)
            else:
                new_tags.append(tag.replace('I-', 'E-'))
        else:
            raise Exception('Invalid IOB format!')
    return new_tags


def iobes_iob(tags):
    """
    IOBES -> IOB
    """
    new_tags = []
    for i, tag in enumerate(tags):
        if tag.split('-')[0] == 'B':
            new_tags.append(tag)
        elif tag.split('-')[0] == 'I':
            new_tags.append(tag)
        elif tag.split('-')[0] == 'S':
            new_tags.append(tag.replace('S-', 'B-'))
        elif tag.split('-')[0] == 'E':
            new_tags.append(tag.replace('E-', 'I-'))
        elif tag.split('-')[0] == 'O':
            new_tags.append(tag)
        else:
            raise Exception('Invalid format!')
    return new_tags


def insert_singletons(words, singletons, p=0.5):
    """
    Replace singletons by the unknown word with a probability p.
    """
    new_words = []
    for word in words:
        if word in singletons and np.random.uniform() < p:
            new_words.append(0)
        else:
            new_words.append(word)
    return new_words


def pad_word_chars(words):
    """
    Pad the characters of the words in a sentence.
    Input:
        - list of lists of ints (list of words, a word being a list of char indexes)
    Output:
        - padded list of lists of ints
        - padded list of lists of ints (where chars are reversed)
        - list of ints corresponding to the index of the last character of each word
    """
    max_length = max([len(word) for word in words])
    char_for = []
    char_rev = []
    char_pos = []
    for word in words:
        padding = [0] * (max_length - len(word))
        char_for.append(word + padding)
        char_rev.append(word[::-1] + padding)
        char_pos.append(len(word) - 1)
    return char_for, char_rev, char_pos


def create_input(data, parameters, add_label, singletons=None):
    """
    Take sentence data and return an input for
    the training or the evaluation function.
    """
    words = data['words']
    chars = data['chars']
    if singletons is not None:
        words = insert_singletons(words, singletons)
    if parameters['cap_dim']:
        caps = data['caps']
    char_for, char_rev, char_pos = pad_word_chars(chars)
    input = []
    if parameters['word_dim']:
        input.append(words)
    if parameters['char_dim']:
        input.append(char_for)
        if parameters['char_bidirect']:
            input.append(char_rev)
        input.append(char_pos)
    if parameters['cap_dim']:
        input.append(caps)
    if add_label:
        input.append(data['tags'])
    return input

def char_mapping(sentences):
    """
    Create a dictionary and mapping of characters, sorted by frequency.
    """
    chars = ["".join([w[0] for w in s]) for s in sentences]
    dico = create_dico(chars)
    dico['<PAD>'] = 10000000
    # dico[';'] = 0
    char_to_id, id_to_char = create_mapping(dico)
    print("Found %i unique characters" % len(dico))
    return dico, char_to_id, id_to_char


def tag_mapping(sentences):
    """
    Create a dictionary and a mapping of tags, sorted by frequency.
    """
    tags = [[word[-1] for word in s] for s in sentences]
    dico = create_dico(tags)
    dico[START_TAG] = -1
    dico[STOP_TAG] = -2
    tag_to_id, id_to_tag = create_mapping(dico)
    print("Found %i unique named entity tags" % len(dico))
    return dico, tag_to_id, id_to_tag


def cap_feature(s):
    """
    Capitalization feature:
    0 = low caps
    1 = all caps
    2 = first letter caps
    3 = one capital (not first letter)
    """
    if s.lower() == s:
        return 0
    elif s.upper() == s:
        return 1
    elif s[0].upper() == s[0]:
        return 2
    else:
        return 3


def prepare_sentence(str_words, word_to_id, char_to_id, lower=False):
    """
    Prepare a sentence for evaluation.
    """
    def f(x): return x.lower() if lower else x
    words = [word_to_id[f(w) if f(w) in word_to_id else '<UNK>']
             for w in str_words]
    chars = [[char_to_id[c] for c in w if c in char_to_id]
             for w in str_words]
    caps = [cap_feature(w) for w in str_words]
    return {
        'str_words': str_words,
        'words': words,
        'chars': chars,
        'caps': caps
    }


def prepare_dataset(sentences, word_to_id, char_to_id, tag_to_id, lower=True):
    """
    Prepare the dataset. Return a list of lists of dictionaries containing:
        - word indexes
        - word char indexes
        - tag indexes
    """
    def f(x): return x.lower() if lower else x
    data = []
    for s in sentences:
        str_words = [w[0] for w in s]
        words = [word_to_id[f(w) if f(w) in word_to_id else '<UNK>']
                 for w in str_words]
        # Skip characters that are not in the training set
        chars = [[char_to_id[c] for c in w if c in char_to_id]
                 for w in str_words]
        caps = [cap_feature(w) for w in str_words]
        tags = [tag_to_id[w[-1]] for w in s]
        data.append({
            'str_words': str_words,
            'words': words,
            'chars': chars,
            'caps': caps,
            'tags': tags,
        })
    return data


def augment_with_pretrained(dictionary, ext_emb_path, words):
    """
    Augment the dictionary with words that have a pretrained embedding.
    If `words` is None, we add every word that has a pretrained embedding
    to the dictionary, otherwise, we only add the words that are given by
    `words` (typically the words in the development and test sets.)
    """
    print('Loading pretrained embeddings from %s...' % ext_emb_path)
    assert os.path.isfile(ext_emb_path)

    # Load pretrained embeddings from file
    pretrained = set([
        line.rstrip().split()[0].strip()
        for line in codecs.open(ext_emb_path, 'r', 'utf-8')
        if len(ext_emb_path) > 0
    ])
    
    if words is None:
        for word in pretrained:
            if word not in dictionary:
                dictionary[word] = 0
    else:
        for word in words:
            if any(x in pretrained for x in [
                word,
                word.lower(),
                re.sub('\d', '0', word.lower())
            ]) and word not in dictionary:
                dictionary[word] = 0

    word_to_id, id_to_word = create_mapping(dictionary)
    return dictionary, word_to_id, id_to_word


def pad_seq(seq, max_length, PAD_token=0):
    
    seq += [PAD_token for i in range(max_length - len(seq))]
    return seq

def log_sum_exp(vec, dim=-1, keepdim = False):
    max_score, _ = vec.max(dim, keepdim=keepdim)
    if keepdim:
        stable_vec = vec - max_score
    else:
        stable_vec = vec - max_score.unsqueeze(dim)
    output = max_score + (stable_vec.exp().sum(dim, keepdim=keepdim)).log()
    return output

def create_batches(dataset, batch_size, order='keep', str_words=False, tag_padded= True):
    
        newdata = copy.deepcopy(dataset)
        if order=='sort':
            newdata.sort(key = lambda x:len(x['words']))
        elif order=='random':
            random.shuffle(newdata)
        
        newdata = np.array(newdata)  
        batches = []
        num_batches = np.ceil(len(dataset)/float(batch_size)).astype('int')
        
        for i in range(num_batches):
            batch_data = newdata[(i*batch_size):min(len(dataset),(i+1)*batch_size)]
            
            words_seqs = [itm['words'] for itm in batch_data]
            caps_seqs = [itm['caps'] for itm in batch_data]
            target_seqs = [itm['tags'] for itm in batch_data]
            chars_seqs = [itm['chars'] for itm in batch_data]
            str_words_seqs = [itm['str_words'] for itm in batch_data]
            
            seq_pairs = sorted(zip(words_seqs, caps_seqs, target_seqs, chars_seqs, str_words_seqs), 
                               key=lambda p: len(p[0]), reverse=True)
            
            words_seqs, caps_seqs, target_seqs, chars_seqs, str_words_seqs = zip(*seq_pairs)
            words_lengths = np.array([len(s) for s in words_seqs])
            
            words_padded = np.array([pad_seq(s, np.max(words_lengths)) for s in words_seqs])
            caps_padded = np.array([pad_seq(s, np.max(words_lengths)) for s in caps_seqs])
            
            if tag_padded:
                target_padded = np.array([pad_seq(s, np.max(words_lengths)) for s in target_seqs])
            else:
                target_padded = target_seqs
            
            words_mask = (words_padded!=0).astype('int')
            
            chars_pseqs = [pad_seq(s, max(words_lengths), []) for s in chars_seqs]
            chars_lengths = np.array([[len(s) for s in w] for w in chars_pseqs]).reshape(-1)
            chars_padded = np.array([[pad_seq(s, np.max(chars_lengths)) 
                                      for s in w] for w in chars_pseqs]).reshape(-1,np.max(chars_lengths))
    
            if str_words:
                outputdict = {'words':words_padded, 'caps':caps_padded, 'tags': target_padded, 
                              'chars': chars_padded, 'wordslen': words_lengths, 'charslen': chars_lengths,
                              'tagsmask':words_mask, 'str_words': str_words_seqs}
            else:
                outputdict = {'words':words_padded, 'caps':caps_padded, 'tags': target_padded, 
                              'chars': chars_padded, 'wordslen': words_lengths, 'charslen': chars_lengths,
                              'tagsmask':words_mask}
            
            batches.append(outputdict)
        
        return batches

In [26]:
import torch.nn as nn
from torch.nn import init

class Initializer(object):
    
    def __init__(self):
        pass
    
    def init_embedding(self, input_embedding):
        bias = np.sqrt(3.0 / input_embedding.size(1))
        nn.init.uniform(input_embedding, -bias, bias)
    
    def init_linear(self, input_linear):
        bias = np.sqrt(6.0 / (input_linear.weight.size(0) + input_linear.weight.size(1)))
        nn.init.uniform(input_linear.weight, -bias, bias)
        if input_linear.bias is not None:
            input_linear.bias.data.zero_()
    
    def init_lstm(self, input_lstm):
        for ind in range(0, input_lstm.num_layers):
            weight = eval('input_lstm.weight_ih_l' + str(ind))
            bias = np.sqrt(6.0 / (weight.size(0) / 4 + weight.size(1)))
            nn.init.uniform(weight, -bias, bias)
            weight = eval('input_lstm.weight_hh_l' + str(ind))
            bias = np.sqrt(6.0 / (weight.size(0) / 4 + weight.size(1)))
            nn.init.uniform(weight, -bias, bias)
        
        if input_lstm.bidirectional:
            for ind in range(0, input_lstm.num_layers):
                weight = eval('input_lstm.weight_ih_l' + str(ind) + '_reverse')
                bias = np.sqrt(6.0 / (weight.size(0) / 4 + weight.size(1)))
                nn.init.uniform(weight, -bias, bias)
                weight = eval('input_lstm.weight_hh_l' + str(ind) + '_reverse')
                bias = np.sqrt(6.0 / (weight.size(0) / 4 + weight.size(1)))
                nn.init.uniform(weight, -bias, bias)
        
        if input_lstm.bias:
            
            for ind in range(0, input_lstm.num_layers):
                weight = eval('input_lstm.bias_ih_l' + str(ind))
                weight.data.zero_()
                weight.data[input_lstm.hidden_size: 2 * input_lstm.hidden_size] = 1
                weight = eval('input_lstm.bias_hh_l' + str(ind))
                weight.data.zero_()
                weight.data[input_lstm.hidden_size: 2 * input_lstm.hidden_size] = 1
            
            if input_lstm.bidirectional:
                for ind in range(0, input_lstm.num_layers):
                    weight = eval('input_lstm.bias_ih_l' + str(ind) + '_reverse')
                    weight.data.zero_()
                    weight.data[input_lstm.hidden_size: 2 * input_lstm.hidden_size] = 1
                    weight = eval('input_lstm.bias_hh_l' + str(ind) + '_reverse')
                    weight.data.zero_()
                    weight.data[input_lstm.hidden_size: 2 * input_lstm.hidden_size] = 1

In [27]:
from __future__ import print_function
import os
from torch.autograd import Variable
import codecs
import cPickle

class Loader(object):
    
    def __init__(self):
        pass
    
    def pad_sequence_cnn(self, chars):
        d = {}
        chars_length = [len(c) for c in chars]
        chars_maxlen = max(chars_length)
        chars_mask = np.zeros((len(chars_length), chars_maxlen), dtype='int')
        for i, c in enumerate(chars):
            chars_mask[i, :chars_length[i]] = c
        return chars_mask, chars_length, d
    
    
    def pad_sequence_rnn(self, chars):
        chars_sorted = sorted(chars, key=lambda p: len(p), reverse=True)
        d = {}
        for i, ci in enumerate(chars):
            for j, cj in enumerate(chars_sorted):
                if ci == cj and not j in d and not i in d.values():
                    d[j] = i
                    continue
        chars_length = [len(c) for c in chars_sorted]
        chars_maxlen = max(chars_length)
        chars_mask = np.zeros((len(chars_sorted), char_maxlen), dtype='int')
        for i, c in enumerate(chars_sorted):
            chars_mask[i, :chars_length[i]] = c
        return chars_mask, chars_length, d
    
    def update_tag_scheme(self, sentences, tag_scheme):
        
        for i, s in enumerate(sentences):
            tags = [w[-1] for w in s]
            if not iob2(tags):
                s_str = '\n'.join(' '.join(w) for w in s)
                raise Exception('Sentences should be given in IOB format! ' +
                                'Please check sentence %i:\n%s' % (i, s_str))
            if tag_scheme == 'iob':
                for word, new_tag in zip(s, tags):
                    word[-1] = new_tag
            elif tag_scheme == 'iobes':
                new_tags = iob_iobes(tags)
                for word, new_tag in zip(s, new_tags):
                    word[-1] = new_tag
            else:
                raise Exception('Unknown tagging scheme!')
                
    def word_mapping(self, sentences, lower):
        
        words = [[x[0].lower() if lower else x[0] for x in s] for s in sentences]
        dico = create_dico(words)

        dico['<PAD>'] = 10000001
        dico['<UNK>'] = 10000000
        dico = {k:v for k,v in dico.items() if v>=3}
        word_to_id, id_to_word = create_mapping(dico)

        print("Found %i unique words (%i in total)" % (
            len(dico), sum(len(x) for x in words)
        ))
        return dico, word_to_id, id_to_word
    
    def load_conll_sentences(self, path, lower, zeros):
        
        sentences = []
        sentence = []
        for line in codecs.open(path, 'r', 'utf-8'):
            line = zero_digits(line.rstrip()) if zeros else line.rstrip()
            if not line:
                if len(sentence) > 0:
                    if 'DOCSTART' not in sentence[0][0]:
                        sentences.append(sentence)
                    sentence = []
            else:
                word = line.split()
                assert len(word) >= 2
                sentence.append(word)
        if len(sentence) > 0:
            if 'DOCSTART' not in sentence[0][0]:
                sentences.append(sentence)
        return sentences
    
    def load_conll(self, dataset ,parameters):
        
        zeros = parameters['zeros']
        lower = parameters['lower']
        word_dim = parameters['wrdim']
        pretrained = parameters['ptrnd']
        tag_scheme = parameters['tgsch']
        
        train_path = os.path.join(dataset,'eng.train')
        dev_path = os.path.join(dataset,'eng.testa')
        test_path = os.path.join(dataset,'eng.testb')
        test_train_path = os.path.join(dataset,'eng.train54019')
        
        train_sentences = self.load_conll_sentences(train_path, lower, zeros)
        dev_sentences = self.load_conll_sentences(dev_path, lower, zeros)
        test_sentences = self.load_conll_sentences(test_path, lower, zeros)
        test_train_sentences = self.load_conll_sentences(test_train_path, lower, zeros)
        
        self.update_tag_scheme(train_sentences, tag_scheme)
        self.update_tag_scheme(dev_sentences, tag_scheme)
        self.update_tag_scheme(test_sentences, tag_scheme)
        self.update_tag_scheme(test_train_sentences, tag_scheme)
        
        dico_words_train = self.word_mapping(train_sentences, lower)[0]
        
        all_embedding = 1
        dico_words, word_to_id, id_to_word = augment_with_pretrained(
                dico_words_train.copy(),
                pretrained,
                list(itertools.chain.from_iterable(
                    [[w[0] for w in s] for s in dev_sentences + test_sentences])
                ) if not all_embedding else None)

        dico_chars, char_to_id, id_to_char = char_mapping(train_sentences)
        dico_tags, tag_to_id, id_to_tag = tag_mapping(train_sentences)
        
        train_data = prepare_dataset(train_sentences, word_to_id, char_to_id, tag_to_id, lower)
        dev_data = prepare_dataset(dev_sentences, word_to_id, char_to_id, tag_to_id, lower)
        test_data = prepare_dataset(test_sentences, word_to_id, char_to_id, tag_to_id, lower)
        test_train_data = prepare_dataset(test_train_sentences, word_to_id, char_to_id, tag_to_id, lower)
        
        print("%i / %i / %i sentences in train / dev / test." % (
              len(train_data), len(dev_data), len(test_data)))
        
        mapping_file = os.path.join(dataset,'mappinghawa.pkl')
        
        if not os.path.isfile(mapping_file):
            all_word_embeds = {}
            for i, line in enumerate(codecs.open(pretrained, 'r', 'utf-8')):
                s = line.strip().split()
                if len(s) == word_dim + 1:
                    all_word_embeds[s[0]] = np.array([float(i) for i in s[1:]])

            word_embeds = np.random.uniform(-np.sqrt(0.06), np.sqrt(0.06), (len(word_to_id), word_dim))

            for w in word_to_id:
                if w in all_word_embeds:
                    word_embeds[word_to_id[w]] = all_word_embeds[w]
                elif w.lower() in all_word_embeds:
                    word_embeds[word_to_id[w]] = all_word_embeds[w.lower()]

            print('Loaded %i pretrained embeddings.' % len(all_word_embeds))

            with open(mapping_file, 'wb') as f:
                mappings = {
                    'word_to_id': word_to_id,
                    'tag_to_id': tag_to_id,
                    'id_to_tag': id_to_tag,
                    'char_to_id': char_to_id,
                    'parameters': parameters,
                    'word_embeds': word_embeds
                }
                cPickle.dump(mappings, f)
        else:
            mappings = cPickle.load(open(mapping_file,'rb'))
            
        return train_data, dev_data, test_data, test_train_data, mappings
        

In [28]:
parameters = OrderedDict()

parameters['model'] = opt.usemodel
parameters['wrdim'] = opt.worddim
parameters['ptrnd'] = opt.pretrnd

if opt.usemodel == 'CNN_BiLSTM_CRF':
    parameters['lower'] = 1
    parameters['zeros'] = 0
    parameters['cpdim'] = 0
    parameters['dpout'] = 0.5
    parameters['chdim'] = 25
    parameters['tgsch'] = 'iob'

    parameters['wldim'] = 200
    parameters['cldim'] = 25
    parameters['cnchl'] = 25
    
    parameters['lrate'] = 0.015
    
elif opt.usemodel == 'CNN_BiLSTM_CRF_MC':
    parameters['lower'] = 1
    parameters['zeros'] = 0
    parameters['cpdim'] = 0
    parameters['dpout'] = 0.5
    parameters['chdim'] = 25
    parameters['tgsch'] = 'iobes'

    parameters['wldim'] = 200
    parameters['cldim'] = 25
    parameters['cnchl'] = 25
    
    parameters['lrate'] = 0.015

elif opt.usemodel == 'CNN_CNN_LSTM':
    parameters['lower'] = 1
    parameters['zeros'] = 0
    parameters['cpdim'] = 0
    parameters['dpout'] = 0.5
    parameters['chdim'] = 25
    parameters['tgsch'] = 'iobes'
    
    parameters['w1chl'] = 400
    parameters['w2chl'] = 400
    parameters['cldim'] = 25
    parameters['cnchl'] = 50
    parameters['dchid'] = 50
    
    parameters['lrate'] = 0.01
    
else:
    raise NotImplementedError()

use_dataset = opt.dataset
dataset_path = os.path.join('datasets', use_dataset)
result_path = os.path.join(opt.result_path, use_dataset)
model_name = opt.usemodel
model_load = opt.reload
loader = Loader()

print('Model:', model_name)
print('Dataset:', use_dataset)

if not os.path.exists(result_path):
    os.makedirs(result_path)
    
if not os.path.exists(os.path.join(result_path,model_name)):
    os.makedirs(os.path.join(result_path,model_name))

if opt.dataset == 'conll':
    train_data, dev_data, test_data, test_train_data, mappings = loader.load_conll(dataset_path, parameters)

word_to_id = mappings['word_to_id']
tag_to_id = mappings['tag_to_id']
char_to_id = mappings['char_to_id']
word_embeds = mappings['word_embeds']

print('Load Complete')

Model: CNN_BiLSTM_CRF
Dataset: conll
Found 7518 unique words (203621 in total)
Loading pretrained embeddings from wordvectors/glove.6B.100d.txt...
Found 85 unique characters
Found 11 unique named entity tags
14041 / 3250 / 3453 sentences in train / dev / test.
Loaded 400000 pretrained embeddings.
Load Complete


In [32]:
train_data[1]

{'caps': [2, 2],
 'chars': [[50, 1, 3, 1, 7], [44, 9, 2, 12, 29, 21, 13, 7, 4]],
 'str_words': [u'Peter', u'Blackburn'],
 'tags': [2, 4],
 'words': [792, 1895]}

In [31]:
mappings['id_to_tag']

{0: u'O',
 1: u'B-LOC',
 2: u'B-PER',
 3: u'B-ORG',
 4: u'I-PER',
 5: u'I-ORG',
 6: u'B-MISC',
 7: u'I-LOC',
 8: u'I-MISC',
 9: '<START>',
 10: '<STOP>'}

In [203]:
class baseRNN(nn.Module):

    def __init__(self, vocab_size, hidden_size, input_dropout_p, output_dropout_p, n_layers, rnn_cell, max_len=25):
        super(baseRNN, self).__init__()
        
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.max_len = max_len
        
        self.input_dropout_p = input_dropout_p
        self.output_dropout_p = output_dropout_p
        
        if rnn_cell.lower() == 'lstm':
            self.rnn_cell = nn.LSTM
        elif rnn_cell.lower() == 'gru':
            self.rnn_cell = nn.GRU
        else:
            raise ValueError("Unsupported RNN Cell: {0}".format(rnn_cell))

        self.input_dropout = nn.Dropout(p=input_dropout_p)

    def forward(self, *args, **kwargs):
        raise NotImplementedError()
        
class CharEncoderCNN(nn.Module):
    
    def __init__(self, vocab_size, embedding_size ,out_channels, kernel_width, pad_width, 
                 input_dropout_p=0, output_dropout_p=0, in_channels=1):
        
        super(CharEncoderCNN, self).__init__()
        
        self.out_channels = out_channels
        self.input_dropout = nn.Dropout(input_dropout_p)
        self.output_dropout = nn.Dropout(output_dropout_p)
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.cnn = nn.Conv2d(in_channels, out_channels, kernel_size = (kernel_width, embedding_size),
                             padding = (pad_width,0))

    def forward(self, input_var, input_lengths=None):
        
        embedded = self.embedding(input_var).unsqueeze(1)
        embedded = self.input_dropout(embedded)
        output = self.cnn(embedded)
        output = nn.functional.max_pool2d(output, kernel_size=(output.size(2), 1))
        output = output.squeeze(3).squeeze(2)
        
        return output
    
class DecoderCRF(nn.Module):

    def __init__(self, input_dimension, tag_to_ix, input_dropout_p=0.5):
        
        super(DecoderCRF, self).__init__()
        
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)
        
        self.dropout = nn.Dropout(input_dropout_p)
        self.hidden2tag = nn.Linear(input_dimension, self.tagset_size)
        
        self.transitions = nn.Parameter(torch.zeros(self.tagset_size, self.tagset_size))
        self.transitions.data[tag_to_ix[START_TAG], :] = -10000
        self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000
    

    def viterbi_decode(self, feats, mask ,usecuda = True, score_only= False):
    
        batch_size, sequence_len, num_tags = feats.size()
        
        assert num_tags == self.tagset_size
        
        mask = mask.transpose(0, 1).contiguous()
        feats = feats.transpose(0, 1).contiguous()
        
        backpointers = []
        
        all_forward_vars = Variable(torch.Tensor(sequence_len, 
                                    batch_size, num_tags).fill_(0.)).cuda()
        
        init_vars = torch.Tensor(batch_size, num_tags).fill_(-10000.)
        init_vars[:,self.tag_to_ix[START_TAG]] = 0.
        if usecuda:
            forward_var = Variable(init_vars).cuda()
        else:
            forward_var = Variable(init_vars)
        
        for i in range(sequence_len):
            broadcast_forward = forward_var.view(batch_size, 1, num_tags)
            transition_scores = self.transitions.view(1, num_tags, num_tags)
            
            next_tag_var = broadcast_forward + transition_scores
            
            viterbivars_t, bptrs_t = torch.max(next_tag_var, dim=2)
            
            forward_var = viterbivars_t + feats[i]
            all_forward_vars[i,:,:] = forward_var

            bptrs_t = bptrs_t.squeeze().data.cpu().numpy()
            backpointers.append(bptrs_t)
        
        mask_sum = torch.sum(mask, dim = 0, keepdim =True) - 1
        mask_sum_ex = mask_sum.view(1, batch_size, 1).expand(1, batch_size, num_tags)
        final_forward_var = all_forward_vars.gather(0, mask_sum_ex).squeeze(0)
        
        terminal_var = final_forward_var + self.transitions[self.tag_to_ix[STOP_TAG]].view(1, num_tags)
        terminal_var.data[:,self.tag_to_ix[STOP_TAG]] = -10000.
        terminal_var.data[:,self.tag_to_ix[START_TAG]] = -10000.
        
        path_score, best_tag_id = torch.max(terminal_var, dim = 1)
                
        if score_only:
            return path_score
        
        n_mask_sum = mask_sum.squeeze().data.cpu().numpy() + 1
        best_tag_id = best_tag_id.data.cpu().numpy()
        decoded_tags = []
        for i in range(batch_size):
            best_path = [best_tag_id[i]]
            bp_list = reversed([itm[i] for itm in backpointers[:n_mask_sum[i]]])
            for bptrs_t in bp_list:
                best_tag_id[i] = bptrs_t[best_tag_id[i]]
                best_path.append(best_tag_id[i])
            start = best_path.pop()
            assert start == self.tag_to_ix[START_TAG]
            best_path.reverse()
            decoded_tags.append(best_path)
        
        return path_score, decoded_tags
    
    def crf_forward(self, feats, mask, usecuda=True):
        
        batch_size, sequence_length, num_tags = feats.size()
        
        mask = mask.float().transpose(0, 1).contiguous()
        feats = feats.transpose(0, 1).contiguous()
        
        init_alphas = torch.Tensor(batch_size, num_tags).fill_(-10000.)
        init_alphas[:,self.tag_to_ix[START_TAG]] = 0.
        if usecuda:
            forward_var = Variable(init_alphas).cuda()
        else:
            forward_var = Variable(init_alphas)
        
        for i in range(sequence_length):
            emit_score = feats[i].view(batch_size, num_tags, 1)
            transition_scores = self.transitions.view(1, num_tags, num_tags)
            broadcast_forward = forward_var.view(batch_size, 1, num_tags)
            tag_var = broadcast_forward + transition_scores + emit_score 
            
            forward_var = (log_sum_exp(tag_var, dim = 2) * mask[i].view(batch_size, 1) +
                            forward_var * (1 - mask[i]).view(batch_size, 1))
            
        terminal_var = (forward_var + (self.transitions[self.tag_to_ix[STOP_TAG]]).view(1, -1))
        alpha = log_sum_exp(terminal_var, dim = 1)
        
        return alpha
        
    
    def score_sentence(self, feats, tags, mask, usecuda=True):
                
        batch_size, sequence_length, num_tags = feats.size()
        
        feats = feats.transpose(0, 1).contiguous()
        tags = tags.transpose(0, 1).contiguous()
        mask = mask.float().transpose(0, 1).contiguous()
                
        broadcast_transitions = self.transitions.view(1, num_tags, num_tags).expand(batch_size, num_tags, num_tags)
        
        score = self.transitions[:,self.tag_to_ix[START_TAG]].index_select(0, tags[0])
        
        for i in range(sequence_length - 1):
            current_tag, next_tag = tags[i], tags[i+1]
            
            transition_score = (
                     broadcast_transitions
                    .gather(1, next_tag.view(batch_size, 1, 1).expand(batch_size, 1, num_tags))
                    .squeeze(1)
                    .gather(1, current_tag.view(batch_size, 1))
                    .squeeze(1)
                    )

            emit_score = feats[i].gather(1, current_tag.view(batch_size, 1)).squeeze(1)

            score = score + transition_score* mask[i + 1] + emit_score * mask[i]  
        last_tag_index = mask.sum(0).long() - 1

        last_tags = tags.gather(0, last_tag_index.view(1, batch_size).expand(sequence_length, batch_size))
        last_tags = last_tags[0]

        last_transition_score = self.transitions[self.tag_to_ix[STOP_TAG]].index_select(0, last_tags)
        
        last_inputs = feats[-1]                                     
        last_input_score = last_inputs.gather(1, last_tags.view(batch_size, 1))
        last_input_score = last_input_score.squeeze(1)
        
        score = score + last_transition_score + last_input_score * mask[-1]
        
        return score
    
    def decode(self, input_var, mask, usecuda=True):
        
        input_var = self.dropout(input_var)
        features = self.hidden2tag(input_var)
        score, tag_seq = self.viterbi_decode(features, mask, usecuda=usecuda)
        
        return score, tag_seq
    
    def forward(self, input_var, tags, mask=None, usecuda=True):
        
        if mask is None:
            mask = torch.autograd.Variable(torch.ones(*tags.size()).long())
        
        input_var = self.dropout(input_var)
        features = self.hidden2tag(input_var)
        forward_score = self.crf_forward(features, mask, usecuda=usecuda)
        ground_score = self.score_sentence(features, tags, mask, usecuda=usecuda)
        
        return forward_score-ground_score


class WordEncoderRNN(baseRNN):

    def __init__(self, vocab_size, embedding_size ,hidden_size, char_size, cap_size, input_dropout_p=0.5, 
                 output_dropout_p=0, n_layers=1, bidirectional=True, rnn_cell='lstm'):
        
        super(WordEncoderRNN, self).__init__(vocab_size, hidden_size, input_dropout_p, 
                                             output_dropout_p, n_layers, rnn_cell)

        self.embedding = nn.Embedding(vocab_size, embedding_size)
        
        augmented_embedding_size = embedding_size + char_size + cap_size
        self.rnn = self.rnn_cell(augmented_embedding_size, hidden_size, n_layers,
                                 bidirectional=bidirectional, dropout=output_dropout_p,
                                 batch_first=True)

    def forward(self, words, char_embedding, cap_embedding, input_lengths):
        
        embedded = self.embedding(words)
        if cap_embedding is not None:
            embedded = torch.cat((embedded,char_embedding,cap_embedding),2)  
        else:
            embedded = torch.cat((embedded,char_embedding),2)
    
        embedded = self.input_dropout(embedded)
        embedded = nn.utils.rnn.pack_padded_sequence(embedded, input_lengths, batch_first= True)
        output, _ = self.rnn(embedded)
        output, _ = nn.utils.rnn.pad_packed_sequence(output, batch_first= True)
        
        return output

class WordEncoderCNN(nn.Module):

    def __init__(self, vocab_size, embedding_size, char_size, kernel_width = 5, pad_width = 4, 
                 in_channels=1, out1_channels=800, out2_channels=800, cap_size=0, input_dropout_p=0.5, 
                 output_dropout_p=0):
        
        super(WordEncoderCNN, self).__init__()
        
        self.kernel_width = kernel_width
        self.out2_channels = out2_channels
        self.input_dropout = nn.Dropout(p=input_dropout_p)
        
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        new_embedding_size = embedding_size + char_size
        self.cnn1 = nn.Conv2d(in_channels, out1_channels, kernel_size=(kernel_width, new_embedding_size),
                             padding = (pad_width,0))
        self.cnn2 = nn.Conv2d(out1_channels, out2_channels, kernel_size=(kernel_width, 1),
                             padding = (pad_width,0))

    def forward(self, words, char_embedding, cap_embedding=None ,input_lengths=None):
        
        embedded = self.embedding(words)
        
        if cap_embedding:
            embedded = torch.cat((embedded,char_embedding,cap_embedding),2)  
        else:
            embedded = torch.cat((embedded,char_embedding),2)
        
        embedded1 = embedded.unsqueeze(1)
        embedded1 = self.input_dropout(embedded1)
                        
        output1 = self.cnn1(embedded1)
        output1 = nn.functional.max_pool2d(output1, kernel_size=(self.kernel_width, 1), stride = 1)
        
        output2 = self.cnn2(output1)
        output2 = nn.functional.max_pool2d(output2, kernel_size=(self.kernel_width, 1), stride = 1)
        output2 = output2.squeeze(3).transpose(1,2)
        
        return output2, embedded


class DecoderRNN(nn.Module):
    def __init__(self, input_size ,hidden_size, tag_size, tag_to_ix, input_dropout_p=0.5, 
                 output_dropout_p=0, n_layers=1):
        super(DecoderRNN, self).__init__()
        
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        
        self.input_dropout_p = input_dropout_p
        self.output_dropout_p = output_dropout_p
        
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)
        
        self.dropout = nn.Dropout(input_dropout_p)
        
        self.rnn = nn.LSTM(input_size + tag_size, hidden_size, n_layers, bidirectional=False)
        self.linear = nn.Linear(hidden_size, tag_size)
        self.ignore = -1
        self.lossfunc = nn.CrossEntropyLoss(ignore_index= self.ignore)
        
    def forward_step(self, input_var, prev_tag, hidden ,usecuda=True):
        
        prev_tag_onehot = torch.eye(self.tagset_size)
        prev_tag_onehot = prev_tag_onehot.index_select(0,torch.LongTensor(prev_tag))
        
        if usecuda:
            prev_tag_onehot = Variable(prev_tag_onehot).cuda()
        else:
            prev_tag_onehot = Variable(prev_tag_onehot)
        
        decoder_input = torch.cat([input_var, prev_tag_onehot],1).unsqueeze(0)
        output, hidden = self.rnn(decoder_input, hidden)
        output = self.linear(output.squeeze(0))
        output_tag = output.max(1)[1].data.cpu().numpy().tolist()

        return output, output_tag, hidden
        
    def forward(self, input_var, tags, mask, usecuda=True):
        
        batch_size, sequence_len, _ = input_var.size()
        
        input_var = self.dropout(input_var)
        
        input_var = input_var.transpose(0, 1).contiguous()
        
        tags = tags.transpose(0, 1).contiguous()
        mask = mask.float().transpose(0, 1).contiguous()
        
        maskedtags = tags.clone()
        maskedtags[mask==0] = -1
        
        loss = 0.0
        prev_tag = [self.tag_to_ix[START_TAG]]*batch_size
        hidden = None
        
        for i in range(sequence_len):
            output, prev_tag, hidden=self.forward_step(input_var[i], prev_tag, hidden, 
                                                       usecuda=usecuda)
            loss += self.lossfunc(output, maskedtags[i])
        return loss
    
    def decode(self, input_var, wordslen, usecuda=True):
        
        batch_size, sequence_len, _ = input_var.size()
        
        input_var = self.dropout(input_var)
        input_var = input_var.transpose(0, 1).contiguous()
        
        loss = 0.0
        prev_tag = [self.tag_to_ix[START_TAG]]*batch_size
        hidden = None
        
        tag_seq = []
        probs= []
        for i in range(sequence_len):
            output, prev_tag, hidden=self.forward_step(input_var[i], prev_tag, hidden, 
                                                       usecuda=usecuda)
            tag_seq.append(prev_tag)
            pb = nn.functional.softmax(output, dim = 1).data.cpu().numpy()
            probs.append(pb)
        
        probs = np.array(probs).transpose(1,0,2)
        
        tag_seq = np.array(tag_seq).transpose().tolist()
        tag_seq = [ts[:wordslen[i]] for i,ts in enumerate(tag_seq)]
        
        return probs, tag_seq

In [204]:
class CNN_CNN_LSTM(nn.Module):
    
    def __init__(self, word_vocab_size, word_embedding_dim, word_out1_channels, word_out2_channels,
                 char_vocab_size, char_embedding_dim, char_out_channels, decoder_hidden_units,
                 tag_to_id, cap_input_dim=4, cap_embedding_dim=0, pretrained=None):
        
        super(CNN_CNN_LSTM, self).__init__()
        
        self.word_vocab_size = word_vocab_size
        self.word_embedding_dim = word_embedding_dim
        self.word_out1_channels = word_out1_channels
        self.word_out2_channels = word_out2_channels
        
        self.char_vocab_size = char_vocab_size
        self.char_embedding_dim = char_embedding_dim
        self.char_out_channels = char_out_channels
        
        self.cap_input_dim = cap_input_dim
        self.cap_embedding_dim = cap_embedding_dim
        
        self.tag_to_ix = tag_to_id
        self.tagset_size = len(tag_to_id)
        
        self.initializer = Initializer()
        self.loader = Loader()
        
        if self.cap_input_dim and self.cap_embedding_dim:
            self.cap_embedder = nn.Embedding(self.cap_input_dim, self.cap_embedding_dim)
            self.initializer.init_embedding(self.cap_embedder.weight)
        
        self.char_encoder = CharEncoderCNN(char_vocab_size, char_embedding_dim, char_out_channels, 
                                           kernel_width=3, pad_width=1)
        
        self.initializer.init_embedding(self.char_encoder.embedding.weight)
        
        self.word_encoder = WordEncoderCNN(word_vocab_size, word_embedding_dim, char_out_channels,
                                           kernel_width = 3, pad_width = 2, input_dropout_p=0.5,
                                           out1_channels=word_out1_channels, out2_channels=word_out2_channels)
        
        if pretrained is not None:
            self.word_encoder.embedding.weight = nn.Parameter(torch.FloatTensor(pretrained))
        
        augmented_decoder_inp_size = (word_out2_channels + word_embedding_dim + 
                                      char_out_channels + cap_embedding_dim)
        self.decoder = DecoderRNN(augmented_decoder_inp_size, decoder_hidden_units, self.tagset_size, 
                                  self.tag_to_ix, input_dropout_p=0.5)
        
    def forward(self, words, tags, chars, caps, wordslen, charslen, tagsmask, usecuda=True):
        
        batch_size, max_len = words.size()
        
        cap_features = self.cap_embedder(caps) if self.cap_embedding_dim else None
        
        char_features = self.char_encoder(chars)
        char_features = char_features.view(batch_size, max_len, -1)
        
        word_features, word_input_feats = self.word_encoder(words, char_features, cap_features)
        
        new_word_features = torch.cat((word_features,word_input_feats),2)
        loss = self.decoder(new_word_features, tags, tagsmask, usecuda=usecuda)
        
        return loss
    
    def decode(self, words, chars, caps, wordslen, charslen, tagsmask, usecuda=True, 
               score_only = False):
        
        batch_size, max_len = words.size()
        
        cap_features = self.cap_embedder(caps) if self.cap_embedding_dim else None
        
        char_features = self.char_encoder(chars)
        char_features = char_features.view(batch_size, max_len, -1)
        
        word_features, word_input_feats = self.word_encoder(words, char_features, cap_features)
        
        new_word_features = torch.cat((word_features,word_input_feats),2)
        
        if score_only:
            score, _ = self.decoder.decode(new_word_features, wordslen, usecuda=usecuda)
            return score
        
        score, tag_seq = self.decoder.decode(new_word_features, wordslen, usecuda=usecuda)
        return score, tag_seq

In [205]:
if model_load:
    print ('Loading Saved Weights....................................................................')
    model_path = os.path.join(result_path, model_name, opt.checkpoint, 'modelweights')
    model = torch.load(model_path)
else:
    print('Building Model............................................................................')
    if (model_name == 'CNN_BiLSTM_CRF'):
        print ('CNN_BiLSTM_CRF')
        word_vocab_size = len(word_to_id)
        word_embedding_dim = parameters['wrdim']
        word_hidden_dim = parameters['wldim']
        char_vocab_size = len(char_to_id)
        char_embedding_dim = parameters['chdim']
        char_out_channels = parameters['cnchl']

        model = CNN_BiLSTM_CRF(word_vocab_size, word_embedding_dim, word_hidden_dim, char_vocab_size,
                               char_embedding_dim, char_out_channels, tag_to_id, pretrained = word_embeds,
                               cap_embedding_dim = 10)
        
    elif (model_name == 'CNN_BiLSTM_CRF_MC'):
        print ('CNN_BiLSTM_CRF_MC')
        word_vocab_size = len(word_to_id)
        word_embedding_dim = parameters['wrdim']
        word_hidden_dim = parameters['wldim']
        char_vocab_size = len(char_to_id)
        char_embedding_dim = parameters['chdim']
        char_out_channels = parameters['cnchl']

        model = CNN_BiLSTM_CRF_MC(word_vocab_size, word_embedding_dim, word_hidden_dim, char_vocab_size,
                               char_embedding_dim, char_out_channels, tag_to_id, pretrained = word_embeds)

    elif (model_name == 'CNN_CNN_LSTM'):
        print ('CNN_CNN_LSTM')
        word_vocab_size = len(word_to_id)
        word_embedding_dim = parameters['wrdim']
        word_out1_channels = parameters['w1chl']
        word_out2_channels = parameters['w2chl']
        char_vocab_size = len(char_to_id)
        char_embedding_dim = parameters['chdim']
        char_out_channels = parameters['cnchl']
        decoder_hidden_units = parameters['dchid']

        model = CNN_CNN_LSTM(word_vocab_size, word_embedding_dim, word_out1_channels, word_out2_channels,
                             char_vocab_size, char_embedding_dim, char_out_channels, decoder_hidden_units,
                             tag_to_id, pretrained = word_embeds)
    
    
model.cuda()
learning_rate = parameters['lrate']
print('Initial learning rate is: %s' %(learning_rate))
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9)

Building Model............................................................................
CNN_CNN_LSTM
Initial learning rate is: 0.01


In [206]:
import os
import codecs

class Evaluator(object):
    def __init__(self, result_path, model_name, mappings, usecuda=True):
        self.result_path = result_path
        self.model_name = model_name
        self.tag_to_id = mappings['tag_to_id']
        self.id_to_tag = mappings['id_to_tag']
        self.usecuda = usecuda

    def evaluate_conll(self, model, dataset, best_F, eval_script='./datasets/conll/conlleval',
                       checkpoint_folder='.', record_confmat = False, batch_size = 32):
        
        prediction = []
        save = False
        new_F = 0.0
        confusion_matrix = torch.zeros((len(self.tag_to_id) - 2, len(self.tag_to_id) - 2))
    
        data_batches = create_batches(dataset, batch_size = batch_size, str_words = True,
                                      tag_padded = False)

        for data in data_batches:

            words = data['words']
            chars = data['chars']
            caps = data['caps']
            mask = data['tagsmask']

            if self.usecuda:
                words = Variable(torch.LongTensor(words)).cuda()
                chars = Variable(torch.LongTensor(chars)).cuda()
                caps = Variable(torch.LongTensor(caps)).cuda()
                mask = Variable(torch.LongTensor(mask)).cuda()
            else:
                words = Variable(torch.LongTensor(words))
                chars = Variable(torch.LongTensor(chars))
                caps = Variable(torch.LongTensor(caps))
                mask = Variable(torch.LongTensor(mask))

            wordslen = data['wordslen']
            charslen = data['charslen']
            
            str_words = data['str_words']
            
            _, out = model.decode(words, chars, caps, wordslen, charslen, mask, usecuda = self.usecuda)
            print (out)
            assert False
            
            ground_truth_id = data['tags']
            predicted_id = out            
            
            for (swords, sground_truth_id, spredicted_id) in zip(str_words, ground_truth_id, predicted_id):
                for (word, true_id, pred_id) in zip(swords, sground_truth_id, spredicted_id):
                    line = ' '.join([word, self.id_to_tag[true_id], self.id_to_tag[pred_id]])
                    prediction.append(line)
                    confusion_matrix[true_id, pred_id] += 1
                prediction.append('')

        predf = os.path.join(self.result_path, self.model_name, checkpoint_folder ,'pred.txt')
        scoref = os.path.join(self.result_path, self.model_name, checkpoint_folder ,'score.txt')

        with open(predf, 'wb') as f:
            f.write('\n'.join(prediction))

        os.system('%s < %s > %s' % (eval_script, predf, scoref))

        eval_lines = [l.rstrip() for l in codecs.open(scoref, 'r', 'utf8')]

        for i, line in enumerate(eval_lines):
            print(line)
            if i == 1:
                new_F = float(line.strip().split()[-1])
                if new_F > best_F:
                    best_F = new_F
                    save = True
                    print('the best F is ', new_F)
        if record_confmat:
            print(("{: >2}{: >7}{: >7}%s{: >9}" % ("{: >7}" * confusion_matrix.size(0))).format(
                "ID", "NE", "Total",
                *([self.id_to_tag[i] for i in range(confusion_matrix.size(0))] + ["Percent"])
            ))
            for i in range(confusion_matrix.size(0)):
                print(("{: >2}{: >7}{: >7}%s{: >9}" % ("{: >7}" * confusion_matrix.size(0))).format(
                    str(i), self.id_to_tag[i], str(confusion_matrix[i].sum()),
                    *([confusion_matrix[i][j] for j in range(confusion_matrix.size(0))] +
                      ["%.3f" % (confusion_matrix[i][i] * 100. / max(1, confusion_matrix[i].sum()))])
                ))
            
        return best_F, new_F, save


In [207]:
from __future__ import print_function
import time
import sys
import os

class Trainer(object):
    
    def __init__(self, model, optimizer, result_path, model_name, usedataset, mappings, 
                 eval_every=1, usecuda = True):
        self.model = model
        self.optimizer = optimizer
        self.eval_every = eval_every
        self.model_name = os.path.join(result_path, model_name)
        self.usecuda = usecuda
        
        if usedataset=='conll':
            self.evaluator = Evaluator(result_path, model_name, mappings, usecuda).evaluate_conll
    
    def adjust_learning_rate(self, optimizer, lr):
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr
        
            
    def train_model(self, num_epochs, train_data, dev_data, test_train_data, test_data, learning_rate,
                    checkpoint_folder='.', eval_test_train=True, plot_every=1, adjust_lr=True,
                    batch_size = 2):
        
        losses = []
        loss = 0.0
        best_dev_F = -1.0
        best_test_F = -1.0
        best_train_F = -1.0
        all_F=[[0,0,0]]
        count = 0
        word_count = 0
        
        self.model.train(True)
        for epoch in range(1, num_epochs+1):
            t=time.time()
            
            #Random Batching Ensure
            train_batches = create_batches(train_data, batch_size= batch_size)
            #Random Permutation instead of Range.
            for i, index in enumerate(np.arange(len(train_batches[:5]))):
                
                data = train_batches[index]
                self.model.zero_grad()

                words = data['words']
                tags = data['tags']
                chars = data['chars']
                caps = data['caps']
                mask = data['tagsmask']
                
                if self.usecuda:
                    words = Variable(torch.LongTensor(words)).cuda()
                    chars = Variable(torch.LongTensor(chars)).cuda()
                    caps = Variable(torch.LongTensor(caps)).cuda()
                    mask = Variable(torch.LongTensor(mask)).cuda()
                    tags = Variable(torch.LongTensor(tags)).cuda()
                else:
                    words = Variable(torch.LongTensor(words))
                    chars = Variable(torch.LongTensor(chars))
                    caps = Variable(torch.LongTensor(caps))
                    mask = Variable(torch.LongTensor(mask))
                    tags = Variable(torch.LongTensor(tags))
                
                wordslen = data['wordslen']
                charslen = data['charslen']
                batch_score = self.model(words, tags, chars, caps, wordslen, charslen, mask,
                                         usecuda=self.usecuda)
                loss += np.mean(batch_score.data.cpu().numpy()/np.array(data['wordslen']))
                score = torch.sum(batch_score)
                score.backward()
                
                
                nn.utils.clip_grad_norm(self.model.parameters(), 5.0)
                self.optimizer.step()
                
                count += 1
                word_count += len(data['words'])
                
                if count % plot_every == 0:
                    loss /= plot_every
                    print(word_count, ': ', loss)
                    if losses == []:
                        losses.append(loss)
                    losses.append(loss)
                    loss = 0.0
                    
            if adjust_lr:
                self.adjust_learning_rate(self.optimizer, lr=learning_rate/(1+0.05*count/len(train_data)))
            
            if epoch%self.eval_every==0:
                
                self.model.train(False)
                
                if eval_test_train:
                    best_train_F, new_train_F, _ = self.evaluator(self.model, test_train_data, best_train_F, 
                                                                  checkpoint_folder=checkpoint_folder)
                else:
                    best_train_F, new_train_F, _ = 0, 0, 0
                best_dev_F, new_dev_F, save = self.evaluator(self.model, dev_data, best_dev_F,
                                                             checkpoint_folder=checkpoint_folder)
                if save:
                    torch.save(self.model, os.path.join(self.model_name, checkpoint_folder, 'modelweights'))
                best_test_F, new_test_F, _ = self.evaluator(self.model, test_data, best_test_F,
                                                            checkpoint_folder=checkpoint_folder)
                sys.stdout.flush()

                all_F.append([new_train_F, new_dev_F, new_test_F])
                
                self.model.train(True)

            print('*'*80)
            print('Epoch %d Complete: Time Taken %d' %(epoch ,time.time() - t))

        return losses, all_F

In [208]:
trainer = Trainer(model, optimizer, result_path, model_name, usedataset=opt.dataset, mappings= mappings) 
losses, all_F = trainer.train_model(opt.num_epochs, train_data, dev_data, test_train_data, test_data,
                                     learning_rate = learning_rate)

torch.Size([2, 19])
torch.Size([2, 19])
torch.Size([2, 19])
torch.Size([2, 19])
torch.Size([2, 19])
torch.Size([2, 19])
torch.Size([2, 19])
torch.Size([2, 19])
torch.Size([2, 19])
2 :  8.404297722710503
torch.Size([2, 19])
torch.Size([2, 19])
torch.Size([2, 19])
torch.Size([2, 19])
torch.Size([2, 19])
torch.Size([2, 19])
torch.Size([2, 19])
torch.Size([2, 19])
torch.Size([2, 19])
torch.Size([2, 19])
torch.Size([2, 19])
torch.Size([2, 19])
torch.Size([2, 19])
torch.Size([2, 19])
torch.Size([2, 19])
torch.Size([2, 19])
torch.Size([2, 19])
torch.Size([2, 19])
torch.Size([2, 19])
torch.Size([2, 19])
torch.Size([2, 19])
torch.Size([2, 19])
torch.Size([2, 19])
torch.Size([2, 19])
torch.Size([2, 19])
torch.Size([2, 19])
torch.Size([2, 19])
torch.Size([2, 19])
torch.Size([2, 19])
torch.Size([2, 19])
4 :  23.722637939453126
torch.Size([2, 19])
torch.Size([2, 19])
torch.Size([2, 19])
torch.Size([2, 19])
torch.Size([2, 19])
torch.Size([2, 19])
torch.Size([2, 19])
torch.Size([2, 19])
torch.Size([2

AssertionError: 

In [4]:
def zero_digits(s):
    """
    Replace every digit in a string by a zero.
    """
    return re.sub('\d', '0', s)

In [1]:
import re
import codecs
from neural_ner.util.loader import Loader

In [2]:
loader = Loader()

In [3]:
parameters = {}
parameters['zeros'] = 0
parameters['lower'] = 1
parameters['wrdim'] = 100
parameters['ptrnd'] = 'wordvectors/glove.6B.100d.txt'
parameters['tgsch'] = 'iobes'

In [4]:
train_data, dev_data, test_data, mappings = loader.load_ontonotes('datasets/ontonotes/',parameters)

Found 20676 unique words (1633660 in total)
Loading pretrained embeddings from wordvectors/glove.6B.100d.txt...
Found 115 unique characters
Found 75 unique named entity tags
82122 / 12678 / 8968 sentences in train / dev / test.


In [9]:
prediction = []
for data in train_data:
    for word in data['str_words']:
        prediction.append(word)
with open('hawa.txt', 'wb') as f:
    f.write('\n'.join(prediction).encode('utf-8'))

In [1]:
import torch

In [2]:
a= torch.randn(3,4,5)

In [3]:
a.transpose(1,2).size()

torch.Size([3, 5, 4])

In [1]:
import numpy as np

In [2]:
a = 

In [6]:
a.max(2).sum(1)

array([4.23408643, 3.74352882, 4.63904877])

In [7]:
import torch

In [12]:
a = torch.autograd.Variable(torch.randn(3,5)).cuda()

In [14]:
a.cpu().data.numpy() * np.random.randn(3,5,4).max(2)

array([[-0.6069078 , -2.66035119, -0.99708431,  3.21097201,  1.12540611],
       [ 0.11840909, -1.08448844, -0.24090727, -1.20281994, -0.29294011],
       [-0.2129307 ,  0.29792313, -0.45437451,  0.00458517, -0.5133353 ]])