In [None]:
import torch
torch.manual_seed(0)
import numpy as np
from collections import Counter
import time

from neural_ner.util import Trainer, Loader


In [None]:
from joblib import Parallel, delayed

In [None]:
class Parse():
    """Container for the run options read by the configuration cell below.

    Every option is a keyword argument whose default is the value this
    notebook was written with, so ``Parse()`` yields exactly the original
    settings while e.g. ``Parse(usemodel='CNN_CNN_LSTM')`` overrides one.

    Args:
        dataset: Dataset name; joined onto the dataset and result paths and
            used to pick the loader method.
        result_path: Root directory under which results are written.
        usemodel: Name of the model preset selecting the hyper-parameters.
        worddim: Stored as ``parameters['wrdim']``.
        pretrnd: Stored as ``parameters['ptrnd']``.
        reload: Copied to ``model_load`` by the configuration cell.
        num_epochs: Not read by any cell visible in this notebook.
    """

    def __init__(self, dataset='conll', result_path='neural_ner/results',
                 usemodel='CNN_BiLSTM_CRF', worddim=100,
                 pretrnd='wordvectors/glove.6B.100d.txt', reload=0,
                 num_epochs=10):
        self.dataset = dataset
        self.result_path = result_path
        self.usemodel = usemodel
        self.worddim = worddim
        self.pretrnd = pretrnd
        self.reload = reload
        self.num_epochs = num_epochs

opt=Parse()

In [None]:
from collections import OrderedDict
import os

# Hyper-parameter presets. The short keys are the names consumed by the
# loader/model code (not shown in this notebook). Lists of pairs are used so
# that the insertion order into the OrderedDict below is explicit.

# Settings shared by every supported model.
_COMMON_PARAMETERS = [
    ('lower', 1),
    ('zeros', 0),
    ('cpdim', 0),
    ('dpout', 0.5),
    ('chdim', 25),
    ('tgsch', 'iobes'),
]

# Settings shared by the two CNN_BiLSTM_CRF variants, which differ only in 'acqmd'.
_BILSTM_CRF_PARAMETERS = [
    ('wldim', 200),
    ('cldim', 25),
    ('cnchl', 25),
    ('lrate', 0.015),
]

_MODEL_PARAMETERS = {
    'CNN_BiLSTM_CRF': _BILSTM_CRF_PARAMETERS + [('acqmd', 'd')],
    'CNN_BiLSTM_CRF_MC': _BILSTM_CRF_PARAMETERS + [('acqmd', 'm')],
    'CNN_CNN_LSTM': [
        ('w1chl', 800),
        ('w2chl', 800),
        ('cldim', 25),
        ('cnchl', 50),
        ('dchid', 20),
        ('lrate', 0.001),
        ('acqmd', 'd'),
    ],
}

# Fail before touching the filesystem or loading data if the options name
# something this notebook cannot handle.
if opt.usemodel not in _MODEL_PARAMETERS:
    raise NotImplementedError('Unsupported model: %r' % (opt.usemodel,))
if opt.dataset != 'conll':
    # Previously an unknown dataset skipped loading and surfaced later as a
    # NameError on `mappings`.
    raise NotImplementedError('Unsupported dataset: %r' % (opt.dataset,))

parameters = OrderedDict()
parameters['model'] = opt.usemodel
parameters['wrdim'] = opt.worddim
parameters['ptrnd'] = opt.pretrnd
parameters.update(_COMMON_PARAMETERS)
parameters.update(_MODEL_PARAMETERS[opt.usemodel])

dataset_path = os.path.join('datasets', opt.dataset)
result_path = os.path.join(opt.result_path, opt.dataset)
model_name = opt.usemodel
model_load = opt.reload
init_percent = 2
acquire_percent = 2
acquire_method = 'random'
loader = Loader()

# os.makedirs creates the intermediate result_path/model_name directories too,
# so only the deepest directory needs to be requested.
checkpoint_dir = os.path.join(result_path, model_name, 'active_checkpoints', acquire_method)
if not os.path.exists(checkpoint_dir):
    os.makedirs(checkpoint_dir)

train_data, dev_data, test_data, test_train_data, mappings = loader.load_conll(dataset_path, parameters)

word_to_id = mappings['word_to_id']
tag_to_id = mappings['tag_to_id']
char_to_id = mappings['char_to_id']
word_embeds = mappings['word_embeds']

In [None]:
 model_path = '/home/ubuntu/Active-NLP/neural_ner/results/conll/CNN_BiLSTM_CRF_MC/active_checkpoint/mnlp/00004078/modelweights'

In [None]:
# Indices of sentences that have already been acquired; read by get_mnlp_mc.
train_index=set()

def get_mnlp_mc(dataset, model_path, decoder, num_tokens, nsamp=100, max_sentences=100):
    """Score sentences by repeated decoding and print how long it took.

    For each of the first ``max_sentences`` entries of ``dataset`` whose index
    is not in the module-level ``train_index``, the sentence is decoded
    ``nsamp`` times with the model loaded from ``model_path``. Two quantities
    are derived per sentence: the decode scores divided by the sentence
    length, and the number of times the most frequent decoded tag sequence
    occurred. Entries already in ``train_index`` get ``inf`` for both.

    NOTE(review): this variant only computes the scores and prints the
    timing. The token-budget selection that would consume them (and
    ``num_tokens``) is not performed here; a later cell redefines
    ``get_mnlp_mc`` with that step enabled and shadows this definition.

    Args:
        dataset: Sequence of dicts with keys 'words', 'tags', 'chars', 'caps'.
        model_path: Path of the serialized model passed to ``torch.load``.
        decoder: 'CRF' is the only implemented choice.
        num_tokens: Unused by this variant; kept for signature compatibility.
        nsamp: Number of decodes per sentence.
        max_sentences: Only ``dataset[:max_sentences]`` is scored; pass
            ``None`` to score every entry.

    Raises:
        NotImplementedError: if ``decoder`` is 'LSTM'.
        ValueError: if ``decoder`` is any other unrecognised value.
    """
    # Validate up front so a bad argument fails before the model is loaded
    # (an unknown decoder used to surface as an unbound `decoded` variable).
    if decoder == 'LSTM':
        raise NotImplementedError()
    if decoder != 'CRF':
        raise ValueError('Unknown decoder: %r' % (decoder,))

    # SECURITY: torch.load unpickles the file, which can execute arbitrary
    # code; only load checkpoints from a trusted source.
    # map_location returns the storage unchanged, i.e. tensors stay on the CPU.
    model = torch.load(model_path, map_location=lambda storage, loc: storage)
    # Training mode is kept on during decoding -- presumably so that dropout
    # makes the nsamp decodes differ; TODO confirm.
    model.train(True)
    tm = time.time()

    def get_metrics(j, data):
        """Return (length-normalised scores, count of the modal tag sequence) for entry j."""
        if j in train_index:
            # Already acquired: sentinel values.
            return np.ones(nsamp)*float('Inf'), float('Inf')

        sentence = data['words']
        decoded = [model.decode(sentence, data['tags'], data['chars'], data['caps'], usecuda=False)
                   for _ in range(nsamp)]
        # Each decode result is indexed as (score, tag sequence).
        score = np.array([itm[0].data[0] for itm in decoded])
        # Stringify each tag sequence so identical sequences can be counted.
        tag_seq_list = [str(itm[1]) for itm in decoded]
        return score/len(sentence), Counter(tag_seq_list).most_common(1)[0][1]

    loopoutput = [get_metrics(j, data) for j, data in enumerate(dataset[:max_sentences])]
    # Stacked per-sentence results; not consumed further in this variant.
    probs = np.array([litm[0] for litm in loopoutput])
    varsc = np.array([litm[1] for litm in loopoutput])

    print ('Acquisition took %d seconds.' %(time.time()-tm))

In [None]:
# Run the acquisition function defined in the previous cell on the training
# split; it prints the elapsed time when it finishes.
get_mnlp_mc(train_data, model_path, decoder='CRF', num_tokens=4000, nsamp=100)

In [None]:
# Scratch value: a small list whose str() form is inspected in the next cell.
a =[1,2,3]

In [None]:
# Scratch check: str() of a list gives a hashable string -- presumably what
# str(itm[1]) in get_mnlp_mc relies on to count identical tag sequences.
str(a)

In [None]:
# Reset the set of already-acquired sentence indices used by get_mnlp_mc.
train_index=set()

# Load the model once at module level so every decode call below shares it.
# SECURITY: torch.load unpickles the file, which can execute arbitrary code;
# only load checkpoints from a trusted source.
# map_location returns the storage unchanged, i.e. tensors stay on the CPU.
model = torch.load(model_path, map_location=lambda storage, loc: storage)
# Training mode is kept on during decoding -- presumably so that dropout makes
# repeated decodes of the same sentence differ; TODO confirm.
model.train(True)

def unwrap_decode(sentence, tags, chars, caps, usecuda=False):
    """Decode one sentence with the module-level ``model``.

    Plain-function wrapper around ``model.decode`` that is handed to
    ``joblib.delayed`` by ``get_mnlp_mc``.

    Args:
        sentence, tags, chars, caps: Forwarded positionally to ``model.decode``.
        usecuda: Forwarded to ``model.decode``. It was previously ignored and
            ``False`` was always passed; the default keeps that behaviour.

    Returns:
        Whatever ``model.decode`` returns.
    """
    return model.decode(sentence, tags, chars, caps, usecuda=usecuda)

def get_mnlp_mc(dataset, model_path, decoder, num_tokens, nsamp=100, max_sentences=10):
    """Score sentences by repeated decoding and acquire the least stable ones.

    Each of the first ``max_sentences`` entries of ``dataset`` whose index is
    not already in the module-level ``train_index`` is decoded ``nsamp`` times
    (in threads, through ``unwrap_decode``). Its score is the number of times
    the most frequent decoded tag sequence occurred. Sentences are then taken
    in ascending score order until at least ``num_tokens`` words have been
    collected, and their indices are added to ``train_index``.

    Unscored entries (already acquired, or beyond ``max_sentences``) keep a
    score of ``inf`` and are never selected, so fewer than ``num_tokens``
    tokens may be acquired when the scored sentences run out.

    Args:
        dataset: Sequence of dicts with keys 'words', 'tags', 'chars', 'caps'.
        model_path: Unused here; the module-level ``model`` is used instead.
            Kept for signature compatibility.
        decoder: 'CRF' is the only implemented choice.
        num_tokens: Token budget for this acquisition round.
        nsamp: Number of decodes per sentence.
        max_sentences: Only ``dataset[:max_sentences]`` is scored; pass
            ``None`` to score every entry.

    Raises:
        NotImplementedError: if ``decoder`` is 'LSTM'.
        ValueError: if ``decoder`` is any other unrecognised value.
    """
    # Validate up front (an unknown decoder used to surface as an unbound
    # `decoded` variable inside the loop).
    if decoder == 'LSTM':
        raise NotImplementedError()
    if decoder != 'CRF':
        raise ValueError('Unknown decoder: %r' % (decoder,))

    tm = time.time()
    # inf marks "not scored"; argsort below pushes those entries to the end.
    probs = np.ones((len(dataset),nsamp))*float('Inf')
    varsc = np.ones(len(dataset))*float('Inf')
    for j, data in enumerate(dataset[:max_sentences]):
        if j in train_index:
            continue
        sentence = data['words']
        tags = data['tags']
        chars = data['chars']
        caps = data['caps']
        decoded = Parallel(n_jobs=-1, backend="threading")(
            delayed(unwrap_decode)(sentence, tags, chars, caps, usecuda=False)
            for _ in range(nsamp))
        # Each decode result is indexed as (score, tag sequence).
        score = np.array([itm[0].data[0] for itm in decoded])
        # Stringify each tag sequence so identical sequences can be counted.
        tag_seq_list = [str(itm[1]) for itm in decoded]
        # Length-normalised scores; not used by the current ranking but kept
        # for the alternative ordering np.lexsort((varsc, mean of probs)).
        probs[j,:] = score/len(sentence)
        varsc[j] = Counter(tag_seq_list).most_common(1)[0][1]

    cur_tokens = 0
    cur_indices = set()
    for idx in np.argsort(varsc):
        # Stop once the budget is met, or when only unscored (inf) entries
        # remain -- the old unbounded loop would select those and eventually
        # index past the end of the ranking.
        if cur_tokens >= num_tokens or not np.isfinite(varsc[idx]):
            break
        cur_indices.add(int(idx))
        cur_tokens += len(dataset[idx]['words'])
    train_index.update(cur_indices)

    print ('Acquisition took %d seconds.' %(time.time()-tm))

In [None]:
# Scratch value: ten length-5 vectors in a plain Python list.
a=[np.zeros(5) for _ in range(10)]

In [None]:
# Scratch check: np.array over a list of equal-length vectors stacks them into
# a 2-D array -- presumably the behaviour the per-sentence score stacking in
# the first get_mnlp_mc definition relies on.
np.array(a).shape