In [3]:
# Colab setup: mount Google Drive and put the project directory on the import path.
import sys
from google.colab import drive
drive.mount('/content/drive')
# Presumably this directory holds the `utils` and `model` packages imported by the next cell — verify.
sys.path.append('/content/drive/MyDrive/lattcie ner/fyz-lattcie/fyz_lattice_NER')

# Replace the process arguments with a single empty entry so that the argparse
# `parse_args()` calls made later in this notebook see no command-line options.
sys.argv=['']
# Removes only this cell's binding of the name `sys`; the next cell imports it again.
del sys

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import time
import sys
import argparse
import random
import copy
import torch
import gc
import pickle
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
from utils.metric import get_ner_fmeasure
from model.bilstmcrf import BiLSTM_CRF as SeqModel
from utils.data import Data

# Seed Python's `random`, PyTorch and NumPy (in that order) with one shared value
# so that shuffling and initialisation are repeatable between runs.
seed_num = 100
for _seed_fn in (random.seed, torch.manual_seed, np.random.seed):
    _seed_fn(seed_num)
del _seed_fn


def data_initialization(data, gaz_file, train_file, dev_file, test_file):
    """Build the alphabets of ``data`` from the corpus files and the gazetteer file.

    The calls are issued in this order: ``build_alphabet`` for train, dev and
    test; ``build_gaz_file``; ``build_gaz_alphabet`` for train, dev and test;
    ``fix_alphabet``.  The same ``data`` object is returned.
    """
    corpus_files = (train_file, dev_file, test_file)

    for corpus in corpus_files:
        data.build_alphabet(corpus)
    data.build_gaz_file(gaz_file)

    # gaz_alphabet: the words of the train/dev/test files that are matched in the embedding
    for corpus in corpus_files:
        data.build_gaz_alphabet(corpus)

    data.fix_alphabet()
    return data


def predict_check(pred_variable, gold_variable, mask_variable):
    """
        Count how many unmasked positions were tagged correctly.

        input:
            pred_variable (batch_size, sent_len): predicted tag ids
            gold_variable (batch_size, sent_len): gold tag ids
            mask_variable (batch_size, sent_len): non-zero where a position is a real token
        output:
            (right_token, total_token): number of unmasked positions where the
            prediction equals the gold tag, and the sum of the mask
    """
    pred, gold, mask = (
        var.cpu().data.numpy()
        for var in (pred_variable, gold_variable, mask_variable)
    )
    matches = (pred == gold)
    # Multiplying by the mask zeroes out matches on padding positions.
    right_token = np.sum(matches * mask)
    total_token = mask.sum()
    return right_token, total_token


def recover_label(pred_variable, gold_variable, mask_variable, label_alphabet, word_recover):
    """
        Turn a batch of tag ids back into label sequences.

        input:
            pred_variable (batch_size, sent_len): predicted tag ids
            gold_variable (batch_size, sent_len): gold tag ids
            mask_variable (batch_size, sent_len): non-zero where a position is a real token
            label_alphabet: object whose get_instance(id) gives the label for an id
            word_recover: index used to reorder the batch rows before decoding
        output:
            (pred_label, gold_label): two lists with one list of labels per
            sentence; positions whose mask is 0 are left out
    """
    # Reorder the rows with word_recover before reading anything out of them.
    pred_tag = pred_variable[word_recover].cpu().data.numpy()
    gold_reordered = gold_variable[word_recover]
    seq_len = gold_reordered.size(1)
    gold_tag = gold_reordered.cpu().data.numpy()
    mask = mask_variable[word_recover].cpu().data.numpy()

    pred_label = []
    gold_label = []
    for row in range(mask.shape[0]):
        kept = [col for col in range(seq_len) if mask[row][col] != 0]
        pred = [label_alphabet.get_instance(pred_tag[row][col]) for col in kept]
        gold = [label_alphabet.get_instance(gold_tag[row][col]) for col in kept]
        assert(len(pred)==len(gold))
        pred_label.append(pred)
        gold_label.append(gold)
    return pred_label, gold_label


def save_data_setting(data, save_file):
    """Pickle ``data`` to the file ``save_file``.

    The object is written exactly as passed in; nothing is copied or stripped
    from it first, so any instance lists it holds are saved as well.
    """
    ## save data settings
    with open(save_file, mode='wb') as out_stream:
        pickle.dump(data, out_stream)
    print ("Data setting saved to file: ", save_file)


def load_data_setting(save_file):
    """Unpickle the object stored in ``save_file``, print its summary and return it.

    NOTE(review): unpickling can run arbitrary code, so only open files that
    come from a trusted source.
    """
    with open(save_file, mode='rb') as in_stream:
        data = pickle.load(in_stream)
    print ("Data setting loaded from file: ", save_file)
    data.show_data_summary()
    return data

def lr_decay(optimizer, epoch, decay_rate, init_lr):
    """Set every parameter group's learning rate to ``init_lr * (1 - decay_rate) ** epoch``.

    The new rate is printed and the same ``optimizer`` object is returned.
    """
    decay_factor = (1-decay_rate)**epoch
    lr = init_lr * decay_factor
    print (" Learning rate is setted as:", lr)
    for group in optimizer.param_groups:
        group['lr'] = lr
    return optimizer



def evaluate(data, model, name):
    """Decode one data split with ``model`` and score the predictions.

    Args:
        data: settings object; this function reads ``train_Ids`` / ``dev_Ids`` /
            ``test_Ids`` / ``raw_Ids`` (depending on ``name``), ``HP_gpu``,
            ``label_alphabet`` and ``tagScheme`` from it.
        model: the tagger; it is switched to eval mode and called once per batch.
        name: which split to decode: "train", "dev", "test" or "raw".

    Returns:
        ``(speed, acc, p, r, f, pred_results)`` where ``speed`` is instances
        per second, ``acc, p, r, f`` are whatever ``get_ner_fmeasure`` returns
        and ``pred_results`` holds one list of predicted labels per sentence.

    Raises:
        ValueError: if ``name`` is not one of the four split names.
    """
    split_attribute = {
        "train": "train_Ids",
        "dev": "dev_Ids",
        "test": "test_Ids",
        "raw": "raw_Ids",
    }
    if name not in split_attribute:
        # Previously this only printed a message and then failed further down
        # on the unbound `instances` variable.
        raise ValueError("Error: wrong evaluate name, %s" % (name,))
    instances = getattr(data, split_attribute[name])

    pred_results = []
    gold_results = []
    ## set model in eval model
    model.eval()
    batch_size = 10
    start_time = time.time()
    train_num = len(instances)
    # The `volatile` flag passed to batchify_with_label has no effect on
    # current PyTorch, so gradient tracking is switched off explicitly here.
    with torch.no_grad():
        for start in range(0, train_num, batch_size):
            instance = instances[start:start + batch_size]
            if not instance:
                continue
            gaz_list,batch_word, batch_biword, batch_wordlen, batch_wordrecover, batch_char, batch_charlen, batch_charrecover, batch_label, mask  = batchify_with_label(instance, data.HP_gpu, True)
            tag_seq = model(gaz_list,batch_word, batch_biword, batch_wordlen, batch_char, batch_charlen, batch_charrecover, mask)
            pred_label, gold_label = recover_label(tag_seq, batch_label, mask, data.label_alphabet, batch_wordrecover)
            pred_results += pred_label
            gold_results += gold_label
    decode_time = time.time() - start_time
    # Guard against a zero elapsed time (coarse clocks, empty split).
    speed = len(instances)/decode_time if decode_time > 0 else 0.0
    acc, p, r, f = get_ner_fmeasure(gold_results, pred_results, data.tagScheme)
    return speed, acc, p, r, f, pred_results


def batchify_with_label(input_batch_list, gpu, volatile_flag=False):
    """
        Pad a list of instances into tensors sorted by decreasing sentence length.

        input:
            input_batch_list: list of instances, each indexable as
                [words, biwords, chars, gaz, labels]
                words / biwords / labels: one id per position of the sentence
                chars: one list of char ids per position
                gaz: passed through untouched
            gpu: when true, the tensors listed in the final `if gpu` block are moved to CUDA
            volatile_flag: not used for the tensors any more; it is only
                appended as the last element of the returned gaz_list
        output (in this order):
            gaz_list: the gaz entries in the same (length-sorted) order as the
                tensors, followed by volatile_flag
            word_seq_tensor: (batch_size, max_sent_len) long, zero padded
            biword_seq_tensor: (batch_size, max_sent_len) long, zero padded
            word_seq_lengths: (batch_size,) sentence lengths, descending
            word_seq_recover: (batch_size,) index that restores the input order
            char_seq_tensor: (batch_size*max_sent_len, max_word_len) long, zero
                padded, rows sorted by decreasing char length
            char_seq_lengths: (batch_size*max_sent_len,) char lengths, descending
            char_seq_recover: (batch_size*max_sent_len,) index that undoes the char sort
            label_seq_tensor: (batch_size, max_sent_len) long, zero padded
            mask: (batch_size, max_sent_len) uint8, 1 on real positions
    """
    batch_size = len(input_batch_list)
    words = [sent[0] for sent in input_batch_list]
    biwords = [sent[1] for sent in input_batch_list]
    chars = [sent[2] for sent in input_batch_list]
    gazs = [sent[3] for sent in input_batch_list]
    labels = [sent[4] for sent in input_batch_list]

    sent_lengths = [len(seq) for seq in words]
    word_seq_lengths = torch.LongTensor(sent_lengths)
    max_seq_len = word_seq_lengths.max().item()

    # Plain typed tensors replace the deprecated Variable(..., volatile=...)
    # wrapper; the values and dtypes are the same as before.
    padded_shape = (batch_size, max_seq_len)
    word_seq_tensor = torch.zeros(padded_shape, dtype=torch.long)
    biword_seq_tensor = torch.zeros(padded_shape, dtype=torch.long)
    label_seq_tensor = torch.zeros(padded_shape, dtype=torch.long)
    # The mask stays uint8 (as `.byte()` produced before) so downstream code sees the same dtype.
    mask = torch.zeros(padded_shape, dtype=torch.uint8)

    for idx, (seq, biseq, label, seqlen) in enumerate(zip(words, biwords, labels, sent_lengths)):
        word_seq_tensor[idx, :seqlen] = torch.LongTensor(seq)
        biword_seq_tensor[idx, :seqlen] = torch.LongTensor(biseq)
        label_seq_tensor[idx, :seqlen] = torch.LongTensor(label)
        mask[idx, :seqlen] = 1

    # Sort the batch by decreasing sentence length and reorder every per-sentence tensor alike.
    word_seq_lengths, word_perm_idx = word_seq_lengths.sort(0, descending=True)
    word_seq_tensor = word_seq_tensor[word_perm_idx]
    biword_seq_tensor = biword_seq_tensor[word_perm_idx]
    label_seq_tensor = label_seq_tensor[word_perm_idx]
    mask = mask[word_perm_idx]

    ### deal with char
    # pad_chars (batch_size, max_seq_len): short sentences are padded with [0] "words"
    pad_chars = [chars[idx] + [[0]] * (max_seq_len-len(chars[idx])) for idx in range(len(chars))]
    length_list = [list(map(len, pad_char)) for pad_char in pad_chars]
    max_word_len = max(map(max, length_list))
    char_seq_tensor = torch.zeros((batch_size, max_seq_len, max_word_len), dtype=torch.long)
    char_seq_lengths = torch.LongTensor(length_list)
    for idx, (seq, seqlen) in enumerate(zip(pad_chars, length_list)):
        for idy, (word, wordlen) in enumerate(zip(seq, seqlen)):
            char_seq_tensor[idx, idy, :wordlen] = torch.LongTensor(word)
    # Apply the sentence order, flatten to one row per position, then sort the rows by char length.
    char_seq_tensor = char_seq_tensor[word_perm_idx].view(batch_size*max_seq_len,-1)
    char_seq_lengths = char_seq_lengths[word_perm_idx].view(batch_size*max_seq_len,)
    char_seq_lengths, char_perm_idx = char_seq_lengths.sort(0, descending=True)
    char_seq_tensor = char_seq_tensor[char_perm_idx]
    # Sorting a permutation ascending yields its inverse, i.e. the "recover" index.
    _, char_seq_recover = char_perm_idx.sort(0, descending=False)
    _, word_seq_recover = word_perm_idx.sort(0, descending=False)

    # The gaz entries follow the same length-sorted order as the tensors
    # (not the original input order); the flag rides along as the last element.
    gaz_list = [gazs[i] for i in word_perm_idx.tolist()]
    gaz_list.append(volatile_flag)
    if gpu:
        # NOTE(review): char_seq_lengths is not moved to the GPU here, as in the
        # original code — confirm that this is intended.
        word_seq_tensor = word_seq_tensor.cuda()
        biword_seq_tensor = biword_seq_tensor.cuda()
        word_seq_lengths = word_seq_lengths.cuda()
        word_seq_recover = word_seq_recover.cuda()
        label_seq_tensor = label_seq_tensor.cuda()
        char_seq_tensor = char_seq_tensor.cuda()
        char_seq_recover = char_seq_recover.cuda()
        mask = mask.cuda()
    return gaz_list, word_seq_tensor, biword_seq_tensor, word_seq_lengths, word_seq_recover, char_seq_tensor, char_seq_lengths, char_seq_recover, label_seq_tensor, mask


def train(data, save_model_dir,save_data_set, seg=True):
    """Train the sequence model and keep the weights that score best on dev.

    Args:
        data: settings object holding the hyper-parameters (``HP_lr``,
            ``HP_momentum``, ``HP_lr_decay``, ``HP_batch_size``, ``HP_gpu``)
            and the ``train_Ids`` instances; ``train_Ids`` is shuffled in place
            every epoch and ``HP_iteration`` is overwritten (see below).
        save_model_dir: despite the name, the file path that receives the
            ``state_dict`` whenever the dev score improves.
        save_data_set: file path to which ``data`` is pickled before training.
        seg: when true the dev F score selects the best model, otherwise the
            dev accuracy does.

    Returns:
        The model as it is after the last epoch (which is not necessarily the
        checkpoint saved to ``save_model_dir``).
    """
    print ("Training model...")
    data.show_data_summary()

    save_data_name = save_data_set
    save_data_setting(data, save_data_name)

    model = SeqModel(data)
    print ("finished built model.")
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = optim.SGD(parameters, lr=data.HP_lr, momentum=data.HP_momentum)
    best_dev = -1
    # Number of epochs.  NOTE: this overrides the value configured on `data`,
    # and it happens after `data` was pickled above, so the saved settings
    # still carry the old value.
    data.HP_iteration = 5
    ## start training data.HP_iteration
    for idx in range(data.HP_iteration):
        epoch_start = time.time()
        temp_start = epoch_start
        print("Epoch: %s/%s" %(idx,data.HP_iteration))
        optimizer = lr_decay(optimizer, idx, data.HP_lr_decay, data.HP_lr)
        sample_loss = 0
        batch_loss = 0
        total_loss = 0
        right_token = 0
        whole_token = 0
        random.shuffle(data.train_Ids)
        ## set model in train model
        model.train()
        model.zero_grad()
        # Instances per forward pass.  The optimizer steps whenever the number
        # of processed instances is a multiple of data.HP_batch_size.
        batch_size = 10
        train_num = len(data.train_Ids)
        total_batch = train_num//batch_size+1
        for batch_id in range(total_batch):
            start = batch_id*batch_size
            end = min((batch_id+1)*batch_size, train_num)
            instance = data.train_Ids[start:end]
            if not instance:
                continue
            gaz_list,  batch_word, batch_biword, batch_wordlen, batch_wordrecover, batch_char, batch_charlen, batch_charrecover, batch_label, mask  = batchify_with_label(instance, data.HP_gpu)
            loss, tag_seq = model.neg_log_likelihood_loss(gaz_list, batch_word, batch_biword, batch_wordlen, batch_char, batch_charlen, batch_charrecover, batch_label, mask)
            right, whole = predict_check(tag_seq, batch_label, mask)
            right_token += right
            whole_token += whole
            loss_value = loss.item()
            sample_loss += loss_value
            total_loss += loss_value
            batch_loss += loss

            if end%500 == 0:
                temp_time = time.time()
                temp_cost = temp_time - temp_start
                temp_start = temp_time
                print("Instance: %s; Time: %.2fs; loss: %.4f; acc: %s/%s=%.4f"%(end, temp_cost, sample_loss, right_token, whole_token,(right_token+0.)/whole_token))
                sys.stdout.flush()
                sample_loss = 0
            if end%data.HP_batch_size == 0:
                batch_loss.backward()
                optimizer.step()
                model.zero_grad()
                batch_loss = 0
        # batch_loss is reset to the int 0 after every optimizer step, so it is
        # still a tensor only when the last batch(es) never triggered a step
        # (train_num not a multiple of HP_batch_size).  Apply that pending
        # gradient instead of silently discarding it.
        if torch.is_tensor(batch_loss):
            batch_loss.backward()
            optimizer.step()
            model.zero_grad()
            batch_loss = 0
        temp_time = time.time()
        temp_cost = temp_time - temp_start
        print("     Instance: %s; Time: %.2fs; loss: %.4f; acc: %s/%s=%.4f"%(end, temp_cost, sample_loss, right_token, whole_token,(right_token+0.)/whole_token))
        epoch_finish = time.time()
        epoch_cost = epoch_finish - epoch_start
        print("Epoch: %s training finished. Time: %.2fs, speed: %.2fst/s,  total loss: %s"%(idx, epoch_cost, train_num/epoch_cost, total_loss))
        speed, acc, p, r, f, _ = evaluate(data, model, "dev")
        dev_finish = time.time()
        dev_cost = dev_finish - epoch_finish

        if seg:
            current_score = f
            print("Dev: time: %.2fs, speed: %.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f"%(dev_cost, speed, acc, p, r, f))
        else:
            current_score = acc
            print("Dev: time: %.2fs speed: %.2fst/s; acc: %.4f"%(dev_cost, speed, acc))

        if current_score > best_dev:
            if seg:
                print ("Exceed previous best f score:", best_dev)
            else:
                print ("Exceed previous best acc score:", best_dev)

            # A single file is overwritten each time the dev score improves.
            model_name = save_model_dir
            torch.save(model.state_dict(), model_name)
            best_dev = current_score
        # ## decode test
        speed, acc, p, r, f, _ = evaluate(data, model, "test")
        test_finish = time.time()
        test_cost = test_finish - dev_finish
        if seg:
            print("Test: time: %.2fs, speed: %.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f"%(test_cost, speed, acc, p, r, f))
        else:
            print("Test: time: %.2fs, speed: %.2fst/s; acc: %.4f"%(test_cost, speed, acc))
        gc.collect()
    return model


def load_model_decode(model_dir, data, name, gpu, seg=True):
    """Build a model, load saved weights into it and decode one data split.

    Args:
        model_dir: despite the name, the path of a file holding a ``state_dict``.
        data: settings object; its ``HP_gpu`` attribute is overwritten with ``gpu``.
        name: split to decode; passed on to ``evaluate``.
        gpu: stored on ``data.HP_gpu`` before the model is built.
        seg: when true precision / recall / F are printed besides the accuracy.

    Returns:
        The predicted label sequences returned by ``evaluate``.
    """
    data.HP_gpu = gpu
    print ("Load Model from file: ", model_dir)
    model = SeqModel(data)
    # Deserialize the checkpoint onto the CPU so that a file written on a GPU
    # machine also loads where CUDA is unavailable; load_state_dict then copies
    # the values into the model's own parameters on whatever device they are.
    model.load_state_dict(torch.load(model_dir, map_location='cpu'))

    print("Decode %s data ..."%(name))
    start_time = time.time()
    speed, acc, p, r, f, pred_results = evaluate(data, model, name)
    end_time = time.time()
    time_cost = end_time - start_time
    if seg:
        print("%s: time:%.2fs, speed:%.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f"%(name, time_cost, speed, acc, p, r, f))
    else:
        print("%s: time:%.2fs, speed:%.2fst/s; acc: %.4f"%(name, time_cost, speed, acc))
    return pred_results

def load_model_decode_with_model(model, data, name, gpu, seg=True):
    """Decode the ``name`` split with an already-built ``model`` and print its scores.

    ``data.HP_gpu`` is overwritten with ``gpu`` first.  With ``seg`` true the
    printed line carries accuracy, precision, recall and F; otherwise only the
    accuracy.  Returns the predicted label sequences produced by ``evaluate``.
    """
    data.HP_gpu = gpu

    print("Decode %s data ..."%(name))
    decode_start = time.time()
    speed, acc, p, r, f, pred_results = evaluate(data, model, name)
    time_cost = time.time() - decode_start
    if not seg:
        print("%s: time:%.2fs, speed:%.2fst/s; acc: %.4f"%(name, time_cost, speed, acc))
    else:
        print("%s: time:%.2fs, speed:%.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f"%(name, time_cost, speed, acc, p, r, f))
    return pred_results


In [3]:
# Driver cell: parse the options, print the configuration, then train, test or decode.
if __name__ == '__main__':
    # Every option has a default, so with an empty command line (the first cell
    # resets sys.argv) the run is fully determined by the defaults below.
    parser = argparse.ArgumentParser(description='Tuning with bi-directional LSTM-CRF')
    # NOTE(review): --embedding is declared but never read in this cell.
    parser.add_argument('--embedding',  help='Embedding for words', default='None')
    parser.add_argument('--status', choices=['train', 'test', 'decode'], help='update algorithm', default='train')
    #parser.add_argument('--savemodel', default="/content/drive/MyDrive/lattcie ner/fyz-lattcie/fyz_lattice_NER/model/2-222-saved_model2.lstmcrf")
    #parser.add_argument('--savedset', help='Dir of saved data setting', default="/content/drive/MyDrive/lattcie ner/fyz-lattcie/fyz_lattice_NER/ResumeNER/save-222.dset")
    parser.add_argument('--savemodel', default="/content/drive/MyDrive/lattcie ner/fyz-lattcie/cyx/lattice-saved_model.lstmcrf")
    parser.add_argument('--savedset', help='Dir of saved data setting', default="/content/drive/MyDrive/lattcie ner/fyz-lattcie/cyx/save-data.dset")
    #parser.add_argument('--train', default="/content/drive/MyDrive/lattcie ner/fyz-lattcie/fyz_lattice_NER/ResumeNER/train.char.bmes")
    #parser.add_argument('--dev', default="/content/drive/MyDrive/lattcie ner/fyz-lattcie/fyz_lattice_NER/ResumeNER/dev.char.bmes" )
    #parser.add_argument('--test', default="/content/drive/MyDrive/lattcie ner/fyz-lattcie/fyz_lattice_NER/ResumeNER/test.char.bmes")

    parser.add_argument('--train', default="/content/drive/MyDrive/lattcie ner/fyz-lattcie/cyx/train.txt")
    parser.add_argument('--dev', default="/content/drive/MyDrive/lattcie ner/fyz-lattcie/cyx/dev.txt" )
    parser.add_argument('--test', default="/content/drive/MyDrive/lattcie ner/fyz-lattcie/cyx/test.txt")

    #parser.add_argument('--train', default="test_data/fyz.train.embs")
    #parser.add_argument('--dev', default="test_data/fyz.dev.embs")
    #parser.add_argument('--test', default="test_data/fyz.test.embs")
    parser.add_argument('--seg', default="True") 
    # NOTE(review): --extendalphabet is declared but never read in this cell.
    parser.add_argument('--extendalphabet', default="True") 
    # --raw and --output have no default (None); the 'decode' branch below passes them on as they are.
    parser.add_argument('--raw') 
    #parser.add_argument('--loadmodel',default="/content/drive/MyDrive/lattcie ner/fyz-lattcie/fyz_lattice_NER/model/2-222-saved_model2.lstmcrf")
    parser.add_argument('--loadmodel',default="/content/drive/MyDrive/lattcie ner/fyz-lattcie/cyx/lattice-saved_model.lstmcrf")
    parser.add_argument('--output') 
    args = parser.parse_args()
   
    train_file = args.train
    dev_file = args.dev
    test_file = args.test
    raw_file = args.raw
    model_dir = args.loadmodel
    dset_dir = args.savedset
    output_file = args.output
    # --seg arrives as a string; only a case-insensitive "true" enables it.
    if args.seg.lower() == "true":
        seg = True 
    else:
        seg = False
    status = args.status.lower()

    save_model_dir = args.savemodel
    gpu = torch.cuda.is_available()

    # Pretrained embedding files: characters, character bigrams and gazetteer words.
    char_emb = "/content/drive/MyDrive/lattcie ner/fyz-lattcie/fyz_lattice_NER/data/gigaword_chn.all.a2b.uni.ite50.vec"
    #char_emb = "/content/drive/MyDrive/lattcie ner/TCM_NER-master/TCM_NER-master/data/sgns.sikuquanshu.vec"
    bichar_emb = '/content/drive/MyDrive/lattcie ner/TCM_NER-master/TCM_NER-master/data/gigaword_chn.all.a2b.bi.ite50.vec'
    #bichar_emb = "/content/drive/MyDrive/lattcie ner/TCM_NER-master/TCM_NER-master/data/sgns.sikuquanshu.bigram.vec"
    gaz_file = "/content/drive/MyDrive/lattcie ner/fyz-lattcie/fyz_lattice_NER/data/ctb.50d.vec"
    # gaz_file = None
    # char_emb = None
    #bichar_emb = None

    # Echo the effective configuration before any heavy work starts.
    print ("CuDNN:", torch.backends.cudnn.enabled)
    # gpu = False
    print ("GPU available:", gpu)
    print ("Status:", status)
    print ("Seg: ", seg)
    print ("Train file:", train_file)
    print ("Dev file:", dev_file)
    print ("Test file:", test_file)
    print ("Raw file:", raw_file)
    print ("Char emb:", char_emb)
    print ("Bichar emb:", bichar_emb)
    print ("Gaz file:",gaz_file)
    if status == 'train':
        print ("Model saved to:", save_model_dir)
    sys.stdout.flush()
    
    if status == 'train':
        # Build the data settings from scratch, load the pretrained embeddings and train.
        data = Data()
        data.HP_gpu = gpu
        data.HP_use_char = False
        data.HP_batch_size = 10
        data.use_bigram = False
        data.gaz_dropout = 0.5
        data.norm_gaz_emb = False
        data.HP_fix_gaz_emb = False
        data_initialization(data, gaz_file, train_file, dev_file, test_file)

        data.generate_instance_with_gaz(train_file,'train')
        data.generate_instance_with_gaz(dev_file,'dev')
        data.generate_instance_with_gaz(test_file,'test')

        data.build_word_pretrain_emb(char_emb)
        data.build_biword_pretrain_emb(bichar_emb)
        data.build_gaz_pretrain_emb(gaz_file)
        #data = load_data_setting(dset_dir)
        # `model` stays defined at notebook level and is reused by later cells.
        model = train(data, save_model_dir,dset_dir, seg)
    elif status == 'test':      
        # Reuse the pickled settings and score the saved model on dev, then on test.
        data = load_data_setting(dset_dir)
        data.generate_instance_with_gaz(dev_file,'dev')
        load_model_decode(model_dir, data , 'dev', gpu, seg)
        data.generate_instance_with_gaz(test_file,'test')
        load_model_decode(model_dir, data, 'test', gpu, seg)
    elif status == 'decode':       
        # Tag the raw file with the saved model and write the result to output_file.
        data = load_data_setting(dset_dir)
        data.generate_instance_with_gaz(raw_file,'raw')
        decode_results = load_model_decode(model_dir, data, 'raw', gpu, seg)
        data.write_decoded_results(output_file, decode_results, 'raw')
    else:
        print ("Invalid argument! Please use valid arguments! (train/test/decode)")


CuDNN: True
GPU available: True
Status: train
Seg:  True
Train file: /content/drive/MyDrive/lattcie ner/fyz-lattcie/cyx/train.txt
Dev file: /content/drive/MyDrive/lattcie ner/fyz-lattcie/cyx/dev.txt
Test file: /content/drive/MyDrive/lattcie ner/fyz-lattcie/cyx/test.txt
Raw file: None
Char emb: /content/drive/MyDrive/lattcie ner/fyz-lattcie/fyz_lattice_NER/data/gigaword_chn.all.a2b.uni.ite50.vec
Bichar emb: /content/drive/MyDrive/lattcie ner/TCM_NER-master/TCM_NER-master/data/gigaword_chn.all.a2b.bi.ite50.vec
Gaz file: /content/drive/MyDrive/lattcie ner/fyz-lattcie/fyz_lattice_NER/data/ctb.50d.vec
Model saved to: /content/drive/MyDrive/lattcie ner/fyz-lattcie/cyx/lattice-saved_model.lstmcrf
Load gaz file:  /content/drive/MyDrive/lattcie ner/fyz-lattcie/fyz_lattice_NER/data/ctb.50d.vec  total size: 704368
gaz alphabet size: 16084
gaz alphabet size: 17591
gaz alphabet size: 19303
build word pretrain emb...
Embedding:
     pretrain word:11327, prefect match:2933, case_match:0, oov:63, oov%

  init.orthogonal(self.weight_ih.data)
  init.orthogonal(self.alpha_weight_ih.data)
  init.constant(self.bias.data, val=0)
  init.constant(self.alpha_bias.data, val=0)
  init.orthogonal(self.weight_ih.data)
  init.constant(self.bias.data, val=0)


build LatticeLSTM...  backward , Fix emb: False  gaz drop: 0.5
load pretrain word emb... (19303, 50)
build batched crf...
finished built model.
Epoch: 0/5
 Learning rate is setted as: 0.015


  masked_cur_partition = cur_partition.masked_select(mask_idx)
  partition.masked_scatter_(mask_idx, masked_cur_partition)
  tg_energy = tg_energy.masked_select(mask.transpose(1,0))
  cur_bp.masked_fill_(mask[idx].view(batch_size, 1).expand(batch_size, tag_size), 0)


Instance: 500; Time: 24.08s; loss: 19336.7461; acc: 5557.0/10164.0=0.5467
Instance: 1000; Time: 20.88s; loss: 3985.9834; acc: 12626.0/19388.0=0.6512
Instance: 1500; Time: 26.42s; loss: 2948.6758; acc: 20960.0/29575.0=0.7087
Instance: 2000; Time: 23.54s; loss: 2361.9191; acc: 29333.0/39541.0=0.7418
Instance: 2500; Time: 22.03s; loss: 2181.5068; acc: 37478.0/49217.0=0.7615
Instance: 3000; Time: 23.15s; loss: 2184.5509; acc: 46065.0/59208.0=0.7780
Instance: 3500; Time: 24.91s; loss: 2333.4824; acc: 55140.0/69939.0=0.7884
Instance: 4000; Time: 22.07s; loss: 2649.3646; acc: 62970.0/79362.0=0.7935
Instance: 4500; Time: 21.80s; loss: 1425.0020; acc: 71499.0/88937.0=0.8039
Instance: 5000; Time: 25.02s; loss: 2133.5153; acc: 80793.0/99568.0=0.8114
Instance: 5500; Time: 22.49s; loss: 1562.7746; acc: 89581.0/109490.0=0.8182
Instance: 6000; Time: 24.35s; loss: 1557.4173; acc: 98897.0/119901.0=0.8248
Instance: 6500; Time: 21.05s; loss: 1208.1266; acc: 107176.0/129088.0=0.8303
Instance: 7000; Time: 

  word_var = autograd.Variable(torch.LongTensor(skip_input_[t][0]),volatile =  volatile_flag)


gold_num =  10552  pred_num =  10689  right_num =  8537
Dev: time: 103.12s, speed: 54.74st/s; acc: 0.9504, p: 0.7987, r: 0.8090, f: 0.8038
Exceed previous best f score: -1
gold_num =  15522  pred_num =  15776  right_num =  12604
Test: time: 151.81s, speed: 55.86st/s; acc: 0.9537, p: 0.7989, r: 0.8120, f: 0.8054
Epoch: 1/5
 Learning rate is setted as: 0.014249999999999999
Instance: 500; Time: 22.03s; loss: 732.0709; acc: 9117.0/9658.0=0.9440
Instance: 1000; Time: 22.81s; loss: 842.3843; acc: 18416.0/19628.0=0.9383
Instance: 1500; Time: 23.29s; loss: 710.5457; acc: 27994.0/29732.0=0.9415
Instance: 2000; Time: 22.92s; loss: 938.7327; acc: 37289.0/39699.0=0.9393
Instance: 2500; Time: 21.48s; loss: 653.0795; acc: 46260.0/49201.0=0.9402
Instance: 3000; Time: 23.07s; loss: 916.3172; acc: 55464.0/59078.0=0.9388
Instance: 3500; Time: 23.28s; loss: 796.9550; acc: 64879.0/69038.0=0.9398
Instance: 4000; Time: 20.64s; loss: 910.4890; acc: 73588.0/78409.0=0.9385
Instance: 4500; Time: 23.54s; loss: 8

In [7]:
# Save the whole `model` object (not just its state_dict) to Drive.
torch.save(model,"/content/drive/MyDrive/lattcie ner/fyz-lattcie/cyx/86%-228-test.model")

In [8]:
# Reload the saved model object and the pickled data settings, then decode
# TT.txt, which is loaded under the 'test' split name.
import torch
model_test=(torch.load("/content/drive/MyDrive/lattcie ner/fyz-lattcie/cyx/86%-228-test.model"))
data = load_data_setting("/content/drive/MyDrive/lattcie ner/fyz-lattcie/cyx/save-data.dset")
data.generate_instance_with_gaz('/content/drive/MyDrive/TT.txt','test')
gpu = torch.cuda.is_available()
# The returned predictions are the cell's last expression, so the notebook displays them.
load_model_decode_with_model(model_test, data, 'test', gpu, True)

Data setting loaded from file:  /content/drive/MyDrive/lattcie ner/fyz-lattcie/cyx/save-data.dset
DATA SUMMARY START:
     Tag          scheme: BIO
     MAX SENTENCE LENGTH: 250
     MAX   WORD   LENGTH: -1
     Number   normalized: True
     Use          bigram: False
     Word  alphabet size: 2997
     Biword alphabet size: 91748
     Char  alphabet size: 2997
     Gaz   alphabet size: 19303
     Label alphabet size: 5
     Word embedding size: 50
     Biword embedding size: 50
     Char embedding size: 30
     Gaz embedding size: 50
     Norm     word   emb: True
     Norm     biword emb: True
     Norm     gaz    emb: False
     Norm   gaz  dropout: 0.5
     Train instance number: 26182
     Dev   instance number: 5638
     Test  instance number: 8464
     Raw   instance number: 0
     Hyperpara  iteration: 100
     Hyperpara  batch size: 10
     Hyperpara          lr: 0.015
     Hyperpara    lr_decay: 0.05
     Hyperpara     HP_clip: 5.0
     Hyperpara    momentum: 0
     Hyperpar

  word_var = autograd.Variable(torch.LongTensor(skip_input_[t][0]),volatile =  volatile_flag)


gold_num =  0  pred_num =  3  right_num =  0
test: time:2.26s, speed:1.33st/s; acc: 0.6897, p: 0.0000, r: -1.0000, f: -1.0000


  cur_bp.masked_fill_(mask[idx].view(batch_size, 1).expand(batch_size, tag_size), 0)


[['O',
  'B-SYM',
  'E-SYM',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['B-SYM', 'I-SYM', 'I-SYM', 'E-SYM', 'O', 'O', 'O'],
 ['B-SYM', 'I-SYM', 'E-SYM', 'O', 'O', 'O', 'O']]

In [10]:
# Generate the 'test' instances from TT.txt again (same call as in the cell above).
data.generate_instance_with_gaz('/content/drive/MyDrive/TT.txt','test')

In [11]:
# Decode the 'test' split again with the reloaded model; relies on `model_test`, `data` and `gpu` from earlier cells.
load_model_decode_with_model(model_test, data, 'test', gpu, True)

Decode test data ...
gold_num =  0  pred_num =  6  right_num =  0
test: time:0.08s, speed:51.80st/s; acc: 0.6087, p: 0.0000, r: -1.0000, f: -1.0000


  word_var = autograd.Variable(torch.LongTensor(skip_input_[t][0]),volatile =  volatile_flag)
  cur_bp.masked_fill_(mask[idx].view(batch_size, 1).expand(batch_size, tag_size), 0)


[['O',
  'B-SYM',
  'E-SYM',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['B-SYM', 'I-SYM', 'I-SYM', 'E-SYM', 'O', 'O', 'O'],
 ['B-SYM', 'I-SYM', 'E-SYM', 'O', 'O', 'O', 'O'],
 ['O',
  'O',
  'O',
  'B-SYM',
  'E-SYM',
  'B-SYM',
  'E-SYM',
  'O',
  'B-SYM',
  'E-SYM',
  'E-SYM',
  'E-SYM',
  'E-SYM',
  'O',
  'O',
  'O',
  'O']]

In [None]:
# Rebind `data` to the settings pickled under ResumeNER/save-222.dset.
data = load_data_setting("/content/drive/MyDrive/lattcie ner/fyz-lattcie/fyz_lattice_NER/ResumeNER/save-222.dset")

Data setting loaded from file:  /content/drive/MyDrive/lattcie ner/fyz-lattcie/fyz_lattice_NER/ResumeNER/save-222.dset
DATA SUMMARY START:
     Tag          scheme: BIO
     MAX SENTENCE LENGTH: 250
     MAX   WORD   LENGTH: -1
     Number   normalized: True
     Use          bigram: False
     Word  alphabet size: 3153
     Biword alphabet size: 54022
     Char  alphabet size: 3153
     Gaz   alphabet size: 12449
     Label alphabet size: 15
     Word embedding size: 50
     Biword embedding size: 50
     Char embedding size: 30
     Gaz embedding size: 50
     Norm     word   emb: True
     Norm     biword emb: True
     Norm     gaz    emb: False
     Norm   gaz  dropout: 0.5
     Train instance number: 72016
     Dev   instance number: 24006
     Test  instance number: 24006
     Raw   instance number: 0
     Hyperpara  iteration: 100
     Hyperpara  batch size: 10
     Hyperpara          lr: 0.015
     Hyperpara    lr_decay: 0.05
     Hyperpara     HP_clip: 5.0
     Hyperpara    m

In [None]:
# Load a checkpoint file and copy it into the existing `model_test`.
# `model111` is passed to load_state_dict, so the file is expected to hold a state_dict.
model111 = torch.load("/content/drive/MyDrive/lattcie ner/fyz-lattcie/fyz_lattice_NER/model/2-222-saved_model2.lstmcrf")
model_test.load_state_dict(model111)

<All keys matched successfully>

In [None]:
# Show the class of `model` (defined by an earlier cell).
type(model)

model.bilstmcrf.BiLSTM_CRF

In [None]:
# Save the whole `model` object under model/223_test.model.
torch.save(model,"/content/drive/MyDrive/lattcie ner/fyz-lattcie/fyz_lattice_NER/model/223_test.model")

In [None]:
# Read the file written by the previous cell back into `model_test`.
model_test=(torch.load("/content/drive/MyDrive/lattcie ner/fyz-lattcie/fyz_lattice_NER/model/223_test.model"))

In [None]:
# Compare the reloaded object with the in-memory one using ==.
# NOTE(review): presumably this falls back to an identity comparison, so False
# would not mean the weights differ — confirm; the decode comparison further down is the real check.
model_test==model

False

In [None]:
# Show the class of the in-memory `model`.
type(model)

model.bilstmcrf.BiLSTM_CRF

In [None]:
# Show the class of the reloaded `model_test`.
type(model_test)

model.bilstmcrf.BiLSTM_CRF

In [None]:
# Decode the 'test' split with the in-memory `model`; uses `data`, `gpu` and `seg` from earlier cells.
pred_results = load_model_decode_with_model(model, data, 'test', gpu, seg)

Decode test data ...


  word_var = autograd.Variable(torch.LongTensor(skip_input_[t][0]),volatile =  volatile_flag)
  cur_bp.masked_fill_(mask[idx].view(batch_size, 1).expand(batch_size, tag_size), 0)


gold_num =  27377  pred_num =  27388  right_num =  26986
test: time:288.80s, speed:83.24st/s; acc: 0.9949, p: 0.9853, r: 0.9857, f: 0.9855


In [None]:
# Dump the predictions: one label per line, with an empty line after each sentence.
# The `with` block closes the file even if a write fails part-way through.
with open('pred_result.txt','w',encoding='utf-8') as f:
    for i in pred_results:
        for word in i:
            f.write(word+'\n')
        f.write('\n')

In [None]:
# Decode the same split with the reloaded `model_test` so the two prediction lists can be compared.
pred_results2 = load_model_decode_with_model(model_test, data, 'test', gpu, seg)

Decode test data ...


  word_var = autograd.Variable(torch.LongTensor(skip_input_[t][0]),volatile =  volatile_flag)
  cur_bp.masked_fill_(mask[idx].view(batch_size, 1).expand(batch_size, tag_size), 0)


gold_num =  27377  pred_num =  27388  right_num =  26986
test: time:285.32s, speed:84.25st/s; acc: 0.9949, p: 0.9853, r: 0.9857, f: 0.9855


In [None]:
# Check that the reloaded model produced exactly the same label lists as the in-memory model.
pred_results2==pred_results

True

In [None]:
# Dump the second set of predictions: one label per line, with an empty line after each sentence.
# The `with` block closes the file even if a write fails part-way through.
with open('pred_result2.txt','w',encoding='utf-8') as f:
    for i in pred_results2:
        for word in i:
            f.write(word+'\n')
        f.write('\n')

In [None]:
# Load the ResumeNER data settings into `data` again.
data = load_data_setting("/content/drive/MyDrive/lattcie ner/fyz-lattcie/fyz_lattice_NER/ResumeNER/save-222.dset")

Data setting loaded from file:  /content/drive/MyDrive/lattcie ner/fyz-lattcie/fyz_lattice_NER/ResumeNER/save-222.dset
DATA SUMMARY START:
     Tag          scheme: BIO
     MAX SENTENCE LENGTH: 250
     MAX   WORD   LENGTH: -1
     Number   normalized: True
     Use          bigram: False
     Word  alphabet size: 3153
     Biword alphabet size: 54022
     Char  alphabet size: 3153
     Gaz   alphabet size: 12449
     Label alphabet size: 15
     Word embedding size: 50
     Biword embedding size: 50
     Char embedding size: 30
     Gaz embedding size: 50
     Norm     word   emb: True
     Norm     biword emb: True
     Norm     gaz    emb: False
     Norm   gaz  dropout: 0.5
     Train instance number: 72016
     Dev   instance number: 24006
     Test  instance number: 24006
     Raw   instance number: 0
     Hyperpara  iteration: 100
     Hyperpara  batch size: 10
     Hyperpara          lr: 0.015
     Hyperpara    lr_decay: 0.05
     Hyperpara     HP_clip: 5.0
     Hyperpara    m

In [None]:
# Rebind `model` to the object stored in model/223_test.model.
model=(torch.load("/content/drive/MyDrive/lattcie ner/fyz-lattcie/fyz_lattice_NER/model/223_test.model"))

In [None]:
# True when CUDA is usable; the decode helpers store this flag on data.HP_gpu.
gpu = torch.cuda.is_available()

In [None]:
parser = argparse.ArgumentParser(description='Tuning with bi-directional LSTM-CRF')
parser.add_argument('--seg', default="True") 
args = parser.parse_args()
if args.seg.lower() == "true":
  seg = True 
else:
  seg = False

In [None]:
def predict(data, model, name):
    """Decode one data split with `model` and return the predicted labels.

    Args:
        data: data/settings object exposing `train_Ids`, `dev_Ids`,
            `test_Ids`, `raw_Ids`, `HP_gpu` and `label_alphabet`.
        model: sequence model; it is switched to eval mode and called as
            `model(gaz_list, word, biword, wordlen, char, charlen,
            charrecover, mask)`.
        name: split to decode -- "train", "dev", "test" or "raw".

    Returns:
        list: the predicted label sequences produced by `recover_label`,
        concatenated over all batches (one entry per instance).

    Raises:
        ValueError: if `name` is not one of the supported split names.
    """
    split_attrs = {
        "train": "train_Ids",
        "dev": "dev_Ids",
        "test": "test_Ids",
        "raw": "raw_Ids",
    }
    if name not in split_attrs:
        # The old code only printed this message and then crashed further
        # down on the unbound local `instances`; fail fast instead.
        raise ValueError("Error: wrong evaluate name, %s" % name)
    # Look up only the requested split so the other attributes are not touched.
    instances = getattr(data, split_attrs[name])

    pred_results = []
    ## set model in eval mode
    model.eval()
    batch_size = 10
    # The batching helper is still handed the legacy `volatile` flag (the
    # third argument), which current PyTorch ignores; disable autograd
    # explicitly so decoding does not build a graph.
    with torch.no_grad():
        # Stepping by batch_size never yields an empty slice, so no explicit
        # empty-batch guard is needed; the last batch may be shorter.
        for start in range(0, len(instances), batch_size):
            instance = instances[start:start + batch_size]
            (gaz_list, batch_word, batch_biword, batch_wordlen,
             batch_wordrecover, batch_char, batch_charlen,
             batch_charrecover, batch_label, mask) = batchify_with_label(
                instance, data.HP_gpu, True)
            tag_seq = model(gaz_list, batch_word, batch_biword, batch_wordlen,
                            batch_char, batch_charlen, batch_charrecover, mask)
            # Gold labels are returned as well but are not needed here.
            pred_label, _gold_label = recover_label(
                tag_seq, batch_label, mask, data.label_alphabet,
                batch_wordrecover)
            pred_results += pred_label
    return pred_results

In [None]:
def load_model_decode_with_model_predict(model, data, name, gpu, seg=True):
    """Decode the `name` split with an already-loaded model.

    Sets `data.HP_gpu` to `gpu` (mutating `data`), prints a progress line and
    delegates to `predict`.

    Args:
        model: loaded model, forwarded to `predict`.
        data: data/settings object, forwarded to `predict`.
        name: split name, forwarded to `predict`.
        gpu: value stored in `data.HP_gpu` before decoding.
        seg: accepted for signature compatibility; not used by this function.

    Returns:
        Whatever `predict(data, model, name)` returns.
    """
    data.HP_gpu = gpu
    print("Decode %s data ..."%(name))
    # The previous version also measured the elapsed time but never used or
    # reported it, so that dead bookkeeping has been dropped.
    return predict(data, model, name)

In [None]:
# Build the 'test' instances from TT.txt, then decode them with the loaded model.
# NOTE(review): presumably generate_instance_with_gaz stores the instances on
# `data` in place -- confirm in utils.data.
data.generate_instance_with_gaz('/content/drive/MyDrive/TT.txt','test')
# `a` holds the value returned by the decode helper for the 'test' split.
a=load_model_decode_with_model_predict(model, data, 'test', gpu, seg)

Decode test data ...


  word_var = autograd.Variable(torch.LongTensor(skip_input_[t][0]),volatile =  volatile_flag)
  cur_bp.masked_fill_(mask[idx].view(batch_size, 1).expand(batch_size, tag_size), 0)


In [None]:
# Display the decode result stored in `a`.
a

[['O',
  'B-sym',
  'E-sym',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-sym',
  'I-sym',
  'E-sym'],
 ['B-sym', 'I-sym', 'I-sym', 'E-sym', 'O', 'O', 'O'],
 ['B-dru', 'I-dru', 'E-dru', 'O', 'O', 'O', 'O']]

In [None]:
# Inspect the test instances currently stored on the data object.
data.test_Ids

[]

In [None]:
# Build the 'dev' instances from my_dev_set.txt, then run
# load_model_decode_with_model (defined elsewhere in this notebook) on them;
# the metrics it prints appear in the cell output.
data.generate_instance_with_gaz('/content/drive/MyDrive/lattcie ner/fyz-lattcie/fyz_lattice_NER/ResumeNER/my_dev_set.txt','dev')
load_model_decode_with_model(model, data , 'dev', gpu, seg)


Decode dev data ...


  word_var = autograd.Variable(torch.LongTensor(skip_input_[t][0]),volatile =  volatile_flag)
  cur_bp.masked_fill_(mask[idx].view(batch_size, 1).expand(batch_size, tag_size), 0)


gold_num =  27528  pred_num =  27538  right_num =  27159
dev: time:281.01s, speed:85.55st/s; acc: 0.9954, p: 0.9862, r: 0.9866, f: 0.9864
Decode test data ...
