In [1]:
import warnings
import pickle
import numpy as np 
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
from TorchCRF import CRF

In [2]:
class DefaultConfig:
    """Central place for the paths and hyper-parameters used by the notebook cells."""

    # ---- locations -------------------------------------------------------
    # Pickle file that holds the vocabularies and the train/test/valid arrays.
    pickle_path = '/Users/apple/Desktop/data/renmindata.pkl'
    # Path of a pretrained model to load; None means "do not load".
    # NOTE(review): not referenced by the cells visible in this notebook.
    load_model_path = None

    # ---- optimisation ----------------------------------------------------
    batch_size = 64      # samples per mini-batch
    max_epoch = 10       # number of passes over the training set
    lr = 0.001           # initial learning rate
    # Intended rule: when the validation loss increases, lr = lr * lr_decay.
    # NOTE(review): no visible cell applies this decay.
    lr_decay = 0.5
    weight_decay = 1e-5  # weight decay handed to the optimizer
    # Intended "print info every N batches".
    # NOTE(review): the visible training loop prints every 200 batches instead.
    print_freq = 20

    # ---- model -----------------------------------------------------------
    embedding_dim = 100  # size of each character embedding vector
    hidden_dim = 200     # total BiLSTM width (both directions together)
    dropout = 0.2        # dropout probability

    # Prefer the GPU when one is available, otherwise fall back to the CPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


opt = DefaultConfig()

In [3]:
# The file holds ten consecutively pickled objects; they have to be read back
# in the same order from one open handle, so everything happens inside `with`.
# SECURITY NOTE: pickle.load can execute arbitrary code -- only open trusted files.
with open(opt.pickle_path, 'rb') as inp:
    (word2id, id2word, tag2id, id2tag,
     x_train, y_train,
     x_test, y_test,
     x_valid, y_valid) = [pickle.load(inp) for _ in range(10)]

# Split sizes.
print("train len:", len(x_train))
print("test len:", len(x_test))
print("valid len", len(x_valid))
# Vocabulary, number of tags, then the raw training arrays and their shapes
# (one object per output line, exactly as six separate print calls would give).
print(
    word2id,
    len(tag2id),
    x_train,
    y_train,
    x_train.shape,
    y_train.shape,
    sep="\n",
)

train len: 24271
test len: 7585
valid len 6068
的            1
国            2
中            3
在            4
１            5
          ... 
萎         3913
凳         3914
瞪         3915
妒         3916
unknow    3917
Length: 3917, dtype: int64
11
[[ 53 754 237 ...   0   0   0]
 [ 51 523  71 ...   0   0   0]
 [169   2 392 ...   0   0   0]
 ...
 [ 61  47 302 ...   0   0   0]
 [426 201 580 ...   0   0   0]
 [180  16  20 ...   0   0   0]]
[[ 1  1  1 ...  0  0  0]
 [ 1  1  1 ...  0  0  0]
 [ 1  1  1 ...  0  0  0]
 ...
 [ 1 10  6 ...  0  0  0]
 [ 8  2  9 ...  0  0  0]
 [ 1  5  3 ...  0  0  0]]
(24271, 60)
(24271, 60)


In [4]:
class NERDataset(Dataset):
    """Map-style dataset whose items are ``{'x': sequence, 'y': labels}`` records.

    Args:
        X: indexable with a ``.shape`` attribute; ``X.shape[0]`` is the number of samples.
        Y: indexable holding the label row for every row of ``X``.
    """

    def __init__(self, X, Y):
        n_samples = X.shape[0]
        # Pair row k of X with row k of Y once, up front.
        self.data = [{'x': X[row], 'y': Y[row]} for row in range(n_samples)]

    def __len__(self):
        """Number of (sequence, labels) records."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the record stored at position ``index``."""
        return self.data[index]

# Wrap every split in a dataset of {'x', 'y'} records.
train_dataset = NERDataset(X=x_train, Y=y_train)
valid_dataset = NERDataset(X=x_valid, Y=y_valid)
test_dataset = NERDataset(X=x_test, Y=y_test)

# Batch the records with DataLoader; only the training split is shuffled so
# that validation and test batches come in a fixed order.
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=opt.batch_size,
    shuffle=True,
)
valid_dataloader = DataLoader(
    dataset=valid_dataset,
    batch_size=opt.batch_size,
    shuffle=False,
)
test_dataloader = DataLoader(
    dataset=test_dataset,
    batch_size=opt.batch_size,
    shuffle=False,
)

In [5]:
class NERLSTM(nn.Module):
    """BiLSTM tagger: embedding -> bidirectional LSTM -> dropout -> linear tag scores.

    Args:
        embedding_dim: size of each token embedding.
        hidden_dim: requested total LSTM width; each direction gets ``hidden_dim // 2`` units.
        dropout: dropout probability applied to the LSTM outputs.
        word2id: sized vocabulary mapping; only ``len(word2id)`` is used here.
        tag2id: sized tag mapping; ``len(tag2id)`` is the number of output classes.
    """

    def __init__(self, embedding_dim, hidden_dim, dropout, word2id, tag2id):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        # One extra row on top of the vocabulary size so that the largest id
        # (len(word2id)) is still a valid embedding index.
        self.vocab_size = len(word2id) + 1
        self.tag_to_ix = tag2id
        self.tagset_size = len(tag2id)
        # vocab_size tokens, each described by embedding_dim features
        self.word_embeds = nn.Embedding(self.vocab_size, self.embedding_dim)
        self.dropout = nn.Dropout(dropout)
        # Bidirectional LSTM: each direction gets hidden_dim // 2 units.
        per_direction = self.hidden_dim // 2
        self.lstm = nn.LSTM(self.embedding_dim, per_direction, num_layers=1, bidirectional=True, batch_first=True)
        # Size the projection from the LSTM's real output width (2 * per_direction).
        # For even hidden_dim this equals hidden_dim; for odd hidden_dim the two
        # differ and using hidden_dim directly would make forward() fail.
        self.hidden2tag = nn.Linear(2 * per_direction, self.tagset_size)

    def forward(self, x):  # (bs, seq_len)
        """Return tag scores of shape (bs, seq_len, tagset_size) for a batch of token ids."""
        embedding = self.word_embeds(x)
        outputs, _ = self.lstm(embedding)
        outputs = self.dropout(outputs)
        return self.hidden2tag(outputs)

In [6]:
class NERLSTM_CRF(nn.Module):
    """BiLSTM encoder followed by a CRF layer for sequence tagging.

    The LSTM is built with ``batch_first=False``, so inputs given as
    (batch_size, seq_len) are transposed to sequence-first before encoding.

    Args:
        embedding_dim: size of each token embedding.
        hidden_dim: requested total LSTM width; each direction gets ``hidden_dim // 2`` units.
        dropout: dropout probability applied to the LSTM outputs.
        word2id: sized vocabulary mapping; only ``len(word2id)`` is used here.
        tag2id: sized tag mapping; ``len(tag2id)`` is the number of CRF states.
    """

    def __init__(self, embedding_dim, hidden_dim, dropout, word2id, tag2id):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        # One extra row so that the largest id (len(word2id)) is a valid index.
        self.vocab_size = len(word2id) + 1
        self.tag_to_ix = tag2id
        self.tagset_size = len(tag2id)
        self.word_embeds = nn.Embedding(self.vocab_size, self.embedding_dim)
        self.dropout = nn.Dropout(dropout)
        per_direction = self.hidden_dim // 2
        self.lstm = nn.LSTM(self.embedding_dim, per_direction, num_layers=1, bidirectional=True, batch_first=False)
        # Use the LSTM's real output width so that an odd hidden_dim does not
        # produce a shape mismatch in the projection layer.
        self.hidden2tag = nn.Linear(2 * per_direction, self.tagset_size)
        self.crf = CRF(self.tagset_size)

    def _emissions(self, x):
        """Encode a (batch_size, seq_len) id batch into sequence-first per-token tag scores."""
        # batch_first is False, so move the sequence axis to the front
        x = x.transpose(0, 1)
        embedding = self.word_embeds(x)
        outputs, _ = self.lstm(embedding)
        outputs = self.dropout(outputs)
        return self.hidden2tag(outputs)

    def forward(self, x):  # (batch_size, seq_len)
        """Return whatever ``self.crf.decode`` yields for the batch.

        NOTE(review): presumably one best tag-id sequence per batch element
        (callers index the result per sentence) -- confirm against the CRF package.
        """
        return self.crf.decode(self._emissions(x))

    def log_likelihood(self, x, tags):  # (bs, seq_len)
        """Return the negated CRF score of ``tags`` given ``x``, used as the training loss."""
        # Tags are cast to long so they can be used as indices.
        tags = tags.transpose(0, 1).type(torch.long)
        # The CRF call is treated as a log-likelihood; negate it to obtain a loss.
        return -self.crf(self._emissions(x), tags)

In [7]:
def format_result(result, text, tag):
    """Turn inclusive ``(begin, end)`` spans into entity records.

    Args:
        result: iterable of two-item ``(begin, end)`` spans, ``end`` inclusive.
        text: sliceable source text the spans refer to.
        tag: entity type stored unchanged in every record.

    Returns:
        A list of dicts with keys ``start``, ``stop`` (exclusive), ``word`` and ``type``.
    """
    return [
        {
            "start": begin,
            "stop": end + 1,
            "word": text[begin:end + 1],
            "type": tag,
        }
        for begin, end in result
    ]

def get_tags(path, tag, tag_map):
    """Extract the ``[begin, end]`` spans (end inclusive) of one entity type from a tag path.

    A span is a ``B_<tag>`` followed by zero or more ``M_<tag>`` and closed by
    an ``E_<tag>``. Any other tag -- ``O``, padding, or a tag belonging to a
    different entity type -- cancels the span that is currently open, and a
    span is closed as soon as it has been emitted. As a result a stray
    ``M``/``E`` after a finished entity, or a ``B`` whose continuation is
    interrupted by a foreign tag, no longer yields a bogus span.

    Args:
        path: iterable of tag ids for one sentence.
        tag: entity type suffix, e.g. ``"PER"`` for ``B_PER`` / ``M_PER`` / ``E_PER``.
        tag_map: mapping with a ``.get`` method from tag name to tag id.

    Returns:
        List of ``[begin, end]`` index pairs in order of appearance; empty when
        the entity type does not occur or is absent from ``tag_map``.
    """
    begin_tag = tag_map.get("B_" + tag)
    mid_tag = tag_map.get("M_" + tag)
    end_tag = tag_map.get("E_" + tag)

    spans = []
    begin = -1  # start index of the currently open span, -1 when none is open
    # `current` instead of `tag`: the loop must not shadow the entity-type argument.
    for index, current in enumerate(path):
        if current == begin_tag:
            # A new B always (re)starts a span, dropping any unfinished one.
            begin = index
        elif begin > -1 and current == mid_tag:
            # Still inside the open span.
            continue
        elif begin > -1 and current == end_tag:
            spans.append([begin, index])
            begin = -1  # close it so later M/E tags cannot reuse this start
        else:
            # O, padding, a foreign tag, or M/E without an open span.
            begin = -1
    return spans

def f1_score(tar_path, pre_path, tag, tag_map):
    """Print and return entity-level recall, precision and F1 for one entity type.

    NOTE(review): this definition rebinds the name ``f1_score`` that the
    notebook's import cell pulls in from ``sklearn.metrics``.

    Args:
        tar_path: iterable of gold tag paths, one per sentence.
        pre_path: iterable of predicted tag paths, paired with ``tar_path`` by position.
        tag: entity type suffix handed to ``get_tags``.
        tag_map: tag-name -> tag-id mapping handed to ``get_tags``.

    Returns:
        Tuple ``(recall, precision, f1)`` of floats.
    """
    # origin: gold entities, found: predicted entities, right: predicted spans also in gold
    origin = found = right = 0.
    for tar, pre in zip(tar_path, pre_path):
        tar_tags = get_tags(tar, tag, tag_map)
        pre_tags = get_tags(pre, tag, tag_map)

        origin += len(tar_tags)
        found += len(pre_tags)
        right += sum(1 for span in pre_tags if span in tar_tags)

    # Every ratio falls back to 0. when its denominator is zero.
    recall = (right / origin) if origin != 0 else 0.
    precision = (right / found) if found != 0 else 0.
    if recall+precision == 0:
        f1 = 0.
    else:
        f1 = (2*precision*recall)/(precision + recall)
    print("\t{}\trecall {:.2f}\tprecision {:.2f}\tf1 {:.2f}".format(tag, recall, precision, f1))
    return recall, precision, f1

In [8]:
# Build the BiLSTM-CRF tagger from the configured sizes and place it on the
# configured device.
model = NERLSTM_CRF(
    embedding_dim=opt.embedding_dim,
    hidden_dim=opt.hidden_dim,
    dropout=opt.dropout,
    word2id=word2id,
    tag2id=tag2id,
).to(opt.device)
# Cross-entropy that skips label 0.
# NOTE(review): the training loop visible in this notebook optimises
# model.log_likelihood and never references `criterion`.
criterion = nn.CrossEntropyLoss(ignore_index=0)
# Adam over all model parameters with the configured learning rate and weight decay.
optimizer = optim.Adam(
    params=model.parameters(),
    lr=opt.lr,
    weight_decay=opt.weight_decay,
)

In [9]:
class ChineseNER(object):
    """Training / inference driver built on notebook-level globals.

    Relies on ``model``, ``optimizer``, ``opt``, ``train_dataloader``,
    ``valid_dataloader``, ``word2id`` and ``tag2id`` being defined by earlier cells.
    """

    # Checkpoint written by train() after every epoch and read back by predict().
    MODEL_PATH = '/Users/apple/Desktop/NLP_base/NERLSTM_CRF'

    def train(self):
        """Train for ``opt.max_epoch`` epochs; validate and checkpoint after each one."""
        for epoch in range(opt.max_epoch):
            model.train()
            for index, batch in enumerate(train_dataloader):
                # clear the gradients left over from the previous step
                optimizer.zero_grad()
                # X holds the token ids, y the tag ids
                X = batch['x'].to(opt.device)
                y = batch['y'].to(opt.device)
                # CRF negative log-likelihood of the gold tags
                loss = model.log_likelihood(X, y)
                # back-propagation
                loss.backward()
                # Rescale the gradients so that their total norm is at most 10.
                torch.nn.utils.clip_grad_norm_(model.parameters(), 10.)
                # update the parameters
                optimizer.step()
                if index % 200 == 0:
                    print('epoch:%04d,------------loss:%f'%(epoch, loss.item()))

            # ---- validation: no gradients, dropout disabled ----
            model.eval()
            total_loss = 0.
            n_sentences = 0
            # flattened predictions and gold labels over all non-padded positions
            preds, labels = [], []
            with torch.no_grad():
                for batch in valid_dataloader:
                    # (batch_size, seq_len)
                    val_x, val_y = batch['x'].to(opt.device), batch['y'].to(opt.device)
                    predict = model(val_x)
                    total_loss += model.log_likelihood(val_x, val_y).item()
                    n_sentences += val_x.size(0)

                    # Label 0 marks padding: the number of labels > 0 in a row is
                    # used as the real length of that sentence.
                    lengths = (val_y > 0).sum(dim=1).tolist()
                    for path, gold, length in zip(predict, val_y.tolist(), lengths):
                        preds += path[:length]
                        labels += gold[:length]

            # Normalise by the sentences actually seen instead of
            # len(valid_dataloader) * 64, which hard-coded the batch size and
            # over-counted the final, possibly smaller batch. The value used to
            # be computed and then dropped; it is now reported.
            aver_loss = total_loss / max(n_sentences, 1)
            print('epoch:%04d,------------valid loss:%f' % (epoch, aver_loss))
            report = classification_report(labels, preds)
            print(report)
            torch.save(model.state_dict(), self.MODEL_PATH)

    def predict(self, tag, input_str=""):
        """Print the entities of type ``tag`` found in ``input_str``.

        Args:
            tag: entity type suffix (looked up in ``tag2id`` as ``B_<tag>`` / ``M_<tag>`` / ``E_<tag>``).
            input_str: text to analyse; when empty the user is prompted for it.
        """
        with torch.no_grad():
            # map_location lets a checkpoint saved on another device be loaded here.
            model.load_state_dict(torch.load(self.MODEL_PATH, map_location=opt.device))
            # Inference must run with dropout disabled; a freshly built model
            # starts in training mode.
            model.eval()
            if not input_str:
                input_str = input("请输入文本: ")
            # Map every character to its id; characters not in the vocabulary become 0.
            input_vec = [word2id.get(i, 0) for i in input_str]
            if not input_vec:
                # Nothing to tag: an empty tensor would fail in the embedding lookup.
                print([])
                return
            # Shape (1, seq_len), on the same device as the model.
            sentences = torch.tensor(input_vec, dtype=torch.long, device=opt.device).view(1, -1)
            paths = model(sentences)
            entities = []
            tags = get_tags(paths[0], tag, tag2id)
            entities += format_result(tags, input_str, tag)
            print(entities)

In [None]:
# Create the driver and run the full training loop: opt.max_epoch epochs, each
# followed by a validation report and a checkpoint save.
cn = ChineseNER()
cn.train()

In [1]:
# NOTE(review): predict is declared as predict(self, tag, input_str=""), so this
# call binds the sentence to `tag` and leaves input_str empty (predict then
# prompts for text). The intended call is presumably
# cn.predict(<entity type>, '中华人民共和国万岁') -- confirm the entity type name.
# The NameError recorded below shows `cn` was undefined in that kernel session
# (the execution counter restarted at 1); the earlier cells must be run first.
cn.predict('中华人民共和国万岁')


NameError: name 'cn' is not defined