In [3]:
# -*- coding: utf-8 -*-

# @Author  : xmh
# @Time    : 2021/3/3 20:42
# @File    : config_ner.py

"""
file description: hyper-parameters, label vocabularies and file paths for the NER model.
"""
import torch

# Probe for a CUDA device once; the later cells read this module-level flag
# to decide whether tensors and models are moved to the GPU.
USE_CUDA = torch.cuda.is_available()
if USE_CUDA:
    print("USE_CUDA....")


class ConfigNer:
    """Hyper-parameters, label vocabularies and file paths for the NER model.

    Constructor arguments are stored verbatim as attributes of the same
    name.  Everything else (relation names, entity types, dropout rates,
    paths) is a fixed value set in ``__init__``.
    """

    def __init__(self,
                 lr=0.001,
                 epochs=100,
                 vocab_size=220000,
                 embedding_dim=100,
                 hidden_dim_lstm=128,
                 num_layers=3,
                 batch_size=32,
                 layer_size=128,
                 token_type_dim=8
                 ):
        # --- optimisation settings -------------------------------------
        self.lr, self.epochs, self.batch_size = lr, epochs, batch_size

        # --- network sizes ---------------------------------------------
        self.vocab_size, self.embedding_dim = vocab_size, embedding_dim
        self.hidden_dim_lstm, self.num_layers = hidden_dim_lstm, num_layers
        self.layer_size, self.token_type_dim = layer_size, token_type_dim

        # --- relation labels ("N" comes first, "causes" was appended last)
        self.relations = [
            "N", '丈夫', '上映时间', '专业代码', '主持人', '主演', '主角', '人口数量', '作曲', '作者', '作词', '修业年限', '出品公司', '出版社', '出生地',
            '出生日期', '创始人', '制片人', '占地面积', '号', '嘉宾', '国籍', '妻子', '字', '官方语言', '导演', '总部地点', '成立日期', '所在城市', '所属专辑',
            '改编自', '朝代', '歌手', '母亲', '毕业院校', '民族', '气候', '注册资本', '海拔', '父亲', '目', '祖籍', '简称', '编剧', '董事长', '身高',
            '连载网站', '邮政编码', '面积', '首都', "causes",
        ]
        self.num_relations = len(self.relations)

        # --- entity types; each one is expanded into B-/I- tags below ----
        self.token_types_origin = [
            'Date', 'Number', 'Text', '书籍', '人物', '企业', '作品', '出版社', '历史人物', '国家', '图书作品', '地点', '城市', '学校', '学科专业',
            '影视作品', '景点', '机构', '歌曲', '气候', '生物', '电视综艺', '目', '网站', '网络小说', '行政区', '语言', '音乐专辑', "drug", "adverse",
        ]
        self.token_types = self.get_token_types()
        self.num_token_type = len(self.token_types)

        # --- data locations and limits ---------------------------------
        self.vocab_file = '/kaggle/input/casrel-adr-data/vocab.txt'
        self.max_seq_length = 256
        self.num_sample = 204800
        self.ner_checkpoint_path = ''

        # --- regularisation / decoding knobs ---------------------------
        # The trailing notes are the original author's tuning log, translated.
        self.use_dropout = True
        self.dropout_embedding = 0.1  # changed from 0.2 to 0.1
        self.dropout_lstm = 0.1
        self.dropout_lstm_output = 0.9
        self.dropout_head = 0.9  # "only change this parameter, 0.9 to 0.5"
        self.dropout_ner = 0.8
        self.threshold_rel = 0.65  # "from 0.7 to 0.95"
        self.teach_rate = 0.2

    def get_token_types(self):
        """Return the BIO tag list: ``B-x``, ``I-x`` for every type, then ``O``."""
        bio_tags = [
            prefix + type_name
            for type_name in self.token_types_origin
            for prefix in ('B-', 'I-')
        ]
        bio_tags.append('O')
        return bio_tags



USE_CUDA....


In [4]:
import  pandas as pd


# def process_data(file_path):
#     data = pd.read_csv(file_path)
#     # print(data.columns)
#     processed_data = []
#
#     for index, line in data.iterrows():
#         dct = {
#             "text": [],
#             "spo_list": {
#                 "subject": [],
#                 "object": [],
#                 "predicate": [],
#             },
#         }
#         dct["text"] = line["text"]
#         dct["spo_list"]["object"] = line["effect"]
#         dct["spo_list"]["subject"] = line["drug"]
#         dct["spo_list"]["predicate"] = "causes"
#         processed_data.append(dct)
#
#
#     return processed_data
#
#
#
def merge_data(data):
    """Merge records that share the same ``"text"`` into one record.

    The first record seen for a text is kept (the same dict object, not a
    copy) and the triples of every later record with that text are added to
    its ``"spo_list"``.  Output order follows first appearance.

    :param data: iterable of dicts with a ``"text"`` key and a list under
        ``"spo_list"``
    :return: list of merged records
    """
    merged = []
    slot_by_text = {}  # text -> position in `merged`; avoids a linear scan per record
    for record in data:
        text = record["text"]
        slot = slot_by_text.get(text)
        if slot is None:
            slot_by_text[text] = len(merged)
            merged.append(record)
        else:
            # extend() keeps every triple of the duplicate and tolerates an
            # empty spo_list, where indexing [0] would raise IndexError.
            merged[slot]["spo_list"].extend(record["spo_list"])
    return merged


def process_data(file_path):
    """Load a CSV of (text, drug, effect) rows as merged SPO samples.

    Every row becomes one sample whose single triple has the row's ``drug``
    as subject, the row's ``effect`` as object and the fixed predicate
    ``"causes"``.  Samples sharing a text are then combined by
    ``merge_data``.

    :param file_path: path handed to ``pandas.read_csv``
    :return: whatever ``merge_data`` returns for the per-row samples
    """
    frame = pd.read_csv(file_path)
    samples = []
    for _, row in frame.iterrows():
        text = row["text"]
        effect = row["effect"]
        drug = row["drug"]
        triple = {
            "subject": drug,
            "predicate": "causes",
            "object": effect,
            "subject_type": "drug",
            "object_type": "adverse",
        }
        samples.append({"text": text, "spo_list": [triple]})

    return merge_data(samples)






In [5]:
# -*- coding: utf-8 -*-

# @Author  : xmh
# @Time    : 2021/3/3 14:31
# @File    : process_ner.py

"""
file description: builds BIO-labelled, id-encoded NER samples and DataLoaders from SPO-annotated CSV data.
"""

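'''
For the subject and object of every spo_list entry, label the matching tokens
of the original text: the first character gets B-type, the following characters
get I-type, and every other token in the text is labelled O.
(All tokens are first labelled O; the positions given by spo_list are then overwritten.)
'''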
import json
import torch
import copy

import numpy as np



class ModelDataPreparation:
    """Turn SPO-annotated CSV rows into BIO-labelled, id-encoded NER batches.

    On construction the token vocabulary (``config.vocab_file``) and the
    token-type table (``config.token_types``) are loaded into
    ``self.token2id`` and ``self.token_type2id``.
    """

    def __init__(self, config):
        self.config = config
        self.get_type2id()

    def subject_object_labeling(self, spo_list, text, text_tokened):
        """Produce BIO labels for ``text_tokened`` from the triples in ``spo_list``.

        Each subject/object is split into characters, lower-cased and
        vocabulary-filtered the same way ``get_data`` prepares the text, then
        located (first occurrence) in ``text_tokened``.

        :param spo_list: list of dicts with ``subject``, ``subject_type``,
            ``object`` and ``object_type`` keys
        :param text: the raw text (unused, kept for interface compatibility)
        :param text_tokened: list of tokens of the text
        :return: ``(labeling_list, have_error)``; ``have_error`` is True as
            soon as one entity cannot be located (or is empty after
            filtering), in which case labelling stops at that triple
        """

        def _index_q_list_in_k_list(q_list, k_list):
            """Return the first index where q_list occurs in k_list, else None."""
            span = len(q_list)
            if span == 0:
                # An empty entity would trivially "match" at index 0 and put a
                # bogus B- tag on the first token; treat it as not found.
                return None
            for start in range(len(k_list) - span + 1):
                if list(k_list[start:start + span]) == q_list:
                    return start
            return None

        def _labeling_type(subject_object, so_type):
            """Write B-/I- tags for one entity; return its start index or None."""
            so_tokened = list(subject_object)
            idx_start = _index_q_list_in_k_list(q_list=so_tokened, k_list=text_tokened)
            # idx_start is None when the entity is absent from the tokens.
            # Original author's example: entity "1981年" vs. the sentence
            # "●1981年2月27日，中国人口学会成立" tokenised by a BERT tokenizer
            # into ['●', '##19', '##81', '年', ...] -- the "##" pieces prevent
            # the match.
            if idx_start is not None:
                idx_end = idx_start + len(so_tokened)
                labeling_list[idx_start] = "B-" + so_type
                # Empty slice for a one-token entity, so no special-casing.
                labeling_list[idx_start + 1:idx_end] = ["I-" + so_type] * (len(so_tokened) - 1)
            return idx_start

        def _normalise(entity):
            """Lower-case per character, then drop out-of-vocabulary characters.

            Same order as ``get_data`` uses for the text; filtering before
            lower-casing could drop characters that survive in the text.
            """
            return self.get_rid_unkonwn_word([char.lower() for char in entity])

        labeling_list = ["O"] * len(text_tokened)
        for spo_item in spo_list:
            subject_tokens = _normalise(spo_item["subject"])
            object_tokens = _normalise(spo_item["object"])
            subject_idx_start = _labeling_type(subject_tokens, spo_item["subject_type"])
            object_idx_start = _labeling_type(object_tokens, spo_item["object_type"])
            if subject_idx_start is None or object_idx_start is None:
                return labeling_list, True
        return labeling_list, False

    def get_rid_unkonwn_word(self, text):
        """Return the tokens of ``text`` that exist in ``self.token2id``."""
        return [token for token in text if token in self.token2id]

    def get_type2id(self):
        """Build ``self.token_type2id`` and ``self.token2id``.

        A token's id is the zero-based line number of its vocabulary line
        (first whitespace-separated field).  The space character is mapped to
        the number of lines read.  A blank vocabulary line raises IndexError.
        """
        self.token_type2id = {
            token_type: i for i, token_type in enumerate(self.config.token_types)
        }
        self.token2id = {}
        with open(self.config.vocab_file, 'r', encoding='utf-8') as f:
            cnt = 0
            for line in f:
                self.token2id[line.rstrip().split()[0]] = cnt
                cnt += 1
        self.token2id[' '] = cnt

    def get_data(self, file_path, is_test=False):
        """Read one CSV file and wrap its encoded samples in a DataLoader.

        :param file_path: CSV path passed to ``process_data``
        :param is_test: when True no labels are built (``spo_list`` is empty,
            ``token_type_list`` and ``token_type_origin`` are None)
        :return: ``torch.utils.data.DataLoader`` over a ``Dataset``
        """
        samples = []
        for cnt, data_item in enumerate(process_data(file_path), start=1):
            if cnt > self.config.num_sample:
                break
            spo_list = [] if is_test else data_item['spo_list']
            text = data_item['text']
            # Character-level tokenisation: lower-case, then drop OOV characters.
            text_tokened = self.get_rid_unkonwn_word([c.lower() for c in text])

            token_type_list, token_type_origin = None, None
            if not is_test:
                token_type_list, have_error = self.subject_object_labeling(
                    spo_list=spo_list, text=text, text_tokened=text_tokened
                )
                if have_error:
                    continue  # an entity could not be aligned; skip the sample
                token_type_origin = token_type_list  # string tags, kept for metrics
                token_type_list = [self.token_type2id[x] for x in token_type_list]

            samples.append({
                'text_tokened': [self.token2id[x] for x in text_tokened],
                'token_type_list': token_type_list,
                'text': ''.join(text_tokened),  # text after removing OOV characters
                'spo_list': spo_list,
                'token_type_origin': token_type_origin,
            })

        dataset = Dataset(samples)
        if is_test:
            dataset.is_test = True
        # drop_last=True means a trailing partial batch is never yielded.
        return torch.utils.data.DataLoader(
            dataset=dataset,
            batch_size=self.config.batch_size,
            collate_fn=dataset.collate_fn,
            drop_last=True
        )

    def get_train_dev_data(self, path_train=None, path_dev=None, path_test=None):
        """Return ``(train_loader, dev_loader, test_loader)``; None for omitted paths.

        Only the test path is loaded with ``is_test=True``.
        """
        train_loader = self.get_data(path_train) if path_train is not None else None
        dev_loader = self.get_data(path_dev) if path_dev is not None else None
        test_loader = self.get_data(path_test, is_test=True) if path_test is not None else None
        return train_loader, dev_loader, test_loader
        

class Dataset(torch.utils.data.Dataset):
    """In-memory dataset of encoded NER samples plus its batch collator.

    Holds a deep copy of the sample dicts it is given.  ``is_test`` (False
    by default) tells ``collate_fn`` not to build a label tensor.
    """

    # Sample fields that are numeric sequences (padded into tensors when batching).
    _ENCODED_KEYS = ('text_tokened', 'token_type_list')

    def __init__(self, data):
        self.data = copy.deepcopy(data)
        self.is_test = False

    def __getitem__(self, index):
        """Return sample ``index`` as a dict.

        The encoded fields come first, in the key order of the first stored
        sample, followed by ``text``, ``spo_list`` and ``token_type_origin``.
        """
        record = self.data[index]
        # Explicit key set instead of probing locals(): same fields, but not
        # dependent on which local variable names happen to exist.
        data_info = {
            key: record[key] for key in self.data[0] if key in self._ENCODED_KEYS
        }
        data_info['text'] = record['text']
        data_info['spo_list'] = record['spo_list']
        data_info['token_type_origin'] = record['token_type_origin']
        return data_info

    def __len__(self):
        return len(self.data)

    def collate_fn(self, data_batch):
        """Pad a list of samples into batch tensors.

        :param data_batch: list of dicts as returned by ``__getitem__``
        :return: dict with ``mask_tokens`` (uint8, 1 on real tokens),
            the per-sample lists ``text``, ``spo_list`` and
            ``token_type_origin``, the zero-padded ``text_tokened`` tensor
            and ``token_type_list`` (zero-padded tensor, or None when
            ``self.is_test``).  Tensors are moved to the GPU when the
            module-level ``USE_CUDA`` flag is set.
        """

        def merge(sequences):
            """Zero-pad ``sequences`` to a common length; also return the mask."""
            lengths = [len(seq) for seq in sequences]
            max_length = max(lengths)
            padded_seqs = torch.zeros(len(sequences), max_length)
            mask_tokens = torch.zeros(len(sequences), max_length)
            for i, (seq, end) in enumerate(zip(sequences, lengths)):
                if end:
                    padded_seqs[i, :end] = torch.LongTensor(seq)
                    mask_tokens[i, :end] = 1
            return padded_seqs, mask_tokens

        item_info = {key: [d[key] for d in data_batch] for key in data_batch[0]}

        text_tokened, mask_tokens = merge(item_info['text_tokened'])
        token_type_list = None
        if not self.is_test:
            token_type_list, _ = merge(item_info['token_type_list'])

        text_tokened = text_tokened.contiguous()
        mask_tokens = mask_tokens.contiguous()
        if token_type_list is not None:
            token_type_list = token_type_list.contiguous()
        if USE_CUDA:
            text_tokened = text_tokened.cuda()
            mask_tokens = mask_tokens.cuda()
            if token_type_list is not None:
                token_type_list = token_type_list.cuda()

        data_info = {
            "mask_tokens": mask_tokens.to(torch.uint8),
            'text': item_info['text'],
            'spo_list': item_info['spo_list'],
            'token_type_origin': item_info['token_type_origin'],
        }
        batched = {'text_tokened': text_tokened, 'token_type_list': token_type_list}
        for key in item_info:
            if key in batched:
                data_info[key] = batched[key]

        return data_info

# Smoke test of the data pipeline: build the config and the preparation
# helper, then load only the training CSV.  Because no dev/test paths are
# passed, dev_loader and test_loader come back as None.
if __name__ == '__main__':
    config = ConfigNer()
    process = ModelDataPreparation(config)
    train_loader, dev_loader, test_loader = process.get_train_dev_data('/kaggle/input/casrel-adr-data/adr-train.csv')
    # train_loader, dev_loader, test_loader = process.get_train_dev_data('../data/train_data_small.json')
   

In [6]:
# -*- coding: utf-8 -*-

# @Author  : xmh
# @Time    : 2021/3/3 10:02
# @File    : model_ner.py

"""
file description: bidirectional GRU + CRF sequence-labelling model for NER.
"""
!pip install -U -i https://pypi.tuna.tsinghua.edu.cn/simple pytorch-crf==0.7.0

import torch
import torch.nn as nn
import torch.nn.functional as F
from torchcrf import CRF

import numpy as np


def setup_seed(seed):
    """Seed PyTorch's CPU generator and every CUDA generator with ``seed``."""
    for seed_fn in (torch.manual_seed, torch.cuda.manual_seed_all):
        seed_fn(seed)


class SeqLabel(nn.Module):
    """Bidirectional GRU encoder with a two-layer scoring head and a CRF.

    ``forward`` takes the batch dict produced by ``Dataset.collate_fn`` and
    returns the CRF loss together with the decoded tag ids (or only the tag
    ids in test mode).
    """

    def __init__(self, config):
        super().__init__()
        setup_seed(1)  # fixed seed so parameter initialisation is repeatable

        self.vocab_size = config.vocab_size
        self.embedding_dim = config.embedding_dim
        self.hidden_dim = config.hidden_dim_lstm
        self.num_layers = config.num_layers
        self.batch_size = config.batch_size
        self.layer_size = config.layer_size
        self.num_token_type = config.num_token_type  # number of BIO tags
        self.config = config

        # NOTE: modules/parameters are created in the original order so that
        # the seeded initial weights do not change.
        self.word_embedding = nn.Embedding(config.vocab_size, config.embedding_dim)
        self.token_type_embedding = nn.Embedding(config.num_token_type, config.token_type_dim)
        self.gru = nn.GRU(config.embedding_dim, config.hidden_dim_lstm, num_layers=config.num_layers, batch_first=True,
                          bidirectional=True)
        self.is_train = True
        # Per-relation weights: 100 everywhere except index 0, which is 1.
        if USE_CUDA:
            self.weights_rel = (torch.ones(self.config.num_relations) * 100).cuda()
        else:
            self.weights_rel = torch.ones(self.config.num_relations) * 100
        self.weights_rel[0] = 1

        # Scoring head: tanh(h @ U^T + b_s) @ V^T + b_c
        self.V_ner = nn.Parameter(torch.rand((config.num_token_type, self.layer_size)))
        self.U_ner = nn.Parameter(torch.rand((self.layer_size, 2 * self.hidden_dim)))
        self.b_s_ner = nn.Parameter(torch.rand(self.layer_size))
        self.b_c_ner = nn.Parameter(torch.rand(config.num_token_type))

        self.dropout_embedding_layer = torch.nn.Dropout(config.dropout_embedding)
        self.dropout_ner_layer = torch.nn.Dropout(config.dropout_ner)
        self.dropout_lstm_layer = torch.nn.Dropout(config.dropout_lstm)
        self.crf_model = CRF(self.num_token_type, batch_first=True)

    def get_ner_score(self, output_lstm):
        """Map encoder outputs to per-token tag scores.

        :param output_lstm: GRU output; the GRU is built with
            ``batch_first=True`` so this is [batch, seq_len, 2*hidden_dim]
        :return: scores with last dimension ``num_token_type``
        """
        hidden = torch.matmul(output_lstm, self.U_ner.transpose(-1, -2)) + self.b_s_ner
        hidden = torch.tanh(hidden)  # [..., layer_size]
        if self.config.use_dropout:
            hidden = self.dropout_ner_layer(hidden)

        return torch.matmul(hidden, self.V_ner.transpose(-1, -2)) + self.b_c_ner

    def forward(self, data_item, is_test=False):
        """Score one batch and decode it with the CRF.

        :param data_item: batch dict with ``text_tokened`` and, unless
            ``is_test``, ``token_type_list`` and ``mask_tokens``
        :type data_item: dict
        :param is_test: skip the loss and return predictions only
        :return: ``pred_ner`` when ``is_test`` else ``(loss_ner, pred_ner)``;
            ``pred_ner`` is what ``CRF.decode`` returns for the unmasked scores
        """
        embeddings = self.word_embedding(data_item['text_tokened'].to(torch.int64))  # ids must be int64
        if self.config.use_dropout:
            embeddings = self.dropout_embedding_layer(embeddings)

        # A fresh random initial state per call (nothing is carried over
        # between batches).  It is sized from the actual input instead of the
        # configured batch size, so batches of any size work, and it follows
        # the input's device.  Sampling on the CPU first keeps the random
        # stream the same as before.
        hidden_init = torch.randn(
            2 * self.num_layers, embeddings.size(0), self.hidden_dim
        ).to(embeddings.device)
        # output_lstm [batch, seq_len, 2*hidden_dim]  h_n [2*num_layers, batch, hidden_dim]
        output_lstm, h_n = self.gru(embeddings, hidden_init)
        # Dropout on the GRU output was tried and made results worse
        # (original author's note), so it is not applied here.
        ner_score = self.get_ner_score(output_lstm)

        if USE_CUDA:
            self.crf_model = self.crf_model.cuda()

        loss_ner = None
        if not is_test:
            log_likelihood = self.crf_model(ner_score, data_item['token_type_list'].to(torch.int64),
                                            mask=data_item['mask_tokens'])
            loss_ner = -log_likelihood

        # Decoding is deliberately unmasked: every row has full padded length
        # and Trainer.restore_ner trims it with the mask afterwards.
        pred_ner = self.crf_model.decode(ner_score)

        if is_test:
            return pred_ner
        return loss_ner, pred_ner




Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Collecting pytorch-crf==0.7.0
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/8b/1f/4b11a3547623953e33f4645e1672ef21dcd9d9b8e5a48337b270840ce9a0/pytorch_crf-0.7.0-py3-none-any.whl (10 kB)
Installing collected packages: pytorch-crf
Successfully installed pytorch-crf-0.7.0
[0m

In [None]:
# -*- coding: utf-8 -*-

# @Author  : xmh
# @Time    : 2021/3/3 17:28
# @File    : trainer_ner.py

"""
file description: training, evaluation and prediction loop for the NER model.
"""

!pip install setuptools-scm
!pip install seqeval
!pip install neptune

import sys


import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm


import math

from seqeval.metrics import f1_score
from seqeval.metrics import precision_score
from seqeval.metrics import accuracy_score
from seqeval.metrics import recall_score
from seqeval.metrics import classification_report
import neptune


class Trainer:
    """Training / evaluation / prediction driver for the NER model.

    :param model: module whose call returns ``(loss, predictions)`` or, with
        ``is_test=True``, only the predictions
    :param config: object providing ``lr``, ``epochs``, ``batch_size`` and
        ``token_types``
    :param train_dataset: iterable of training batches (has ``len``)
    :param dev_dataset: iterable of labelled batches for ``evaluate``; optional
    :param test_dataset: iterable of unlabelled batches; optional
    """

    def __init__(self,
                 model,
                 config,
                 train_dataset=None,
                 dev_dataset=None,
                 test_dataset=None,
                 ):
        self.model = model
        self.train_dataset = train_dataset
        self.dev_dataset = dev_dataset
        self.test_dataset = test_dataset
        self.config = config

        if USE_CUDA:
            self.model = self.model.cuda()
        # Optimiser
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=config.lr)
        # Learning-rate schedule.
        # NOTE(review): scheduler.step() is never called in this class, so
        # the schedule currently has no effect -- confirm whether it should
        # be stepped with the dev loss.
        self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(self.optimizer, factor=0.5,
                                                                   patience=8, min_lr=1e-5, verbose=True)
        self.get_id2token_type()

    def get_id2token_type(self):
        """Build ``self.id2token_type`` (tag id -> tag string) from the config."""
        self.id2token_type = dict(enumerate(self.config.token_types))

    def train(self):
        """Run ``config.epochs`` epochs; sample/evaluate after each one.

        From the tenth epoch on, a checkpoint is written whenever the summed
        training F1 is at least as high as the best seen so far.
        """
        print('STARTING TRAIN...')
        f1_ner_total_best = 0.0
        self.num_sample_total = len(self.train_dataset) * self.config.batch_size
        for epoch in range(self.config.epochs):
            print("Epoch: {}".format(epoch))
            pbar = tqdm(enumerate(self.train_dataset), total=len(self.train_dataset))
            loss_ner_total, f1_ner_total = 0, 0
            for _, data_item in pbar:
                loss_ner, f1_ner, _ = self.train_batch(data_item)
                # .item() so the running total is a plain float and does not
                # keep every batch's autograd graph referenced.
                loss_ner_total += loss_ner.item()
                f1_ner_total += f1_ner

            # The optional loaders default to None in the constructor.
            if self.test_dataset is not None:
                self.predict_sample()
            loss_ner_train_ave = loss_ner_total / self.num_sample_total
            print("train ner loss: {0}, f1 score: {1}".format(
                loss_ner_train_ave,
                f1_ner_total / self.num_sample_total * self.config.batch_size))
            if self.dev_dataset is not None:
                self.evaluate()
            if epoch > 8 and f1_ner_total >= f1_ner_total_best:
                f1_ner_total_best = f1_ner_total
                torch.save({
                    'epoch': epoch+1, 'state_dict': self.model.state_dict(), 'f1_best': f1_ner_total,
                    'optimizer': self.optimizer.state_dict(),
                },
                str(epoch) + 'm-' + 'f'+str("%.2f"%f1_ner_total) + 'n'+
                    str("%.2f"%loss_ner_total) +'ccks2019_ner.pth'
                )

    def train_batch(self, data_item):
        """One optimisation step; return ``(loss, batch F1, predictions)``."""
        self.optimizer.zero_grad()
        loss_ner, pred_ner = self.model(data_item)
        pred_token_type = self.restore_ner(pred_ner, data_item['mask_tokens'])
        f1_ner = f1_score(data_item['token_type_origin'], pred_token_type)
        loss_ner.backward()
        self.optimizer.step()

        return loss_ner, f1_ner, pred_ner

    def restore_ner(self, pred_ner, mask_tokens):
        """Convert predicted tag ids to tag strings, cut at the first masked position.

        :param pred_ner: list of per-sample tag-id lists
        :param mask_tokens: 2-D tensor, non-zero on real tokens
        :return: list of per-sample tag-string lists
        """
        pred_token_type = []
        # One tolist() call instead of one tensor comparison per token.
        for tag_ids, mask_row in zip(pred_ner, mask_tokens.tolist()):
            tags = []
            for tag_id, keep in zip(tag_ids, mask_row):
                if keep == 0:
                    break
                tags.append(self.id2token_type[tag_id])
            pred_token_type.append(tags)

        return pred_token_type

    def evaluate(self):
        """Print and return the average loss over ``self.dev_dataset``.

        The average divides by ``len(dev_dataset) * config.batch_size``.
        """
        print('STARTING EVALUATION...')
        loss_ner_total = 0
        self.model.train(False)
        try:
            with torch.no_grad():  # no graph is needed for evaluation
                for _, data_item in tqdm(enumerate(self.dev_dataset), total=len(self.dev_dataset)):
                    loss_ner, _ = self.model(data_item)
                    loss_ner_total += loss_ner.item()
        finally:
            self.model.train(True)  # restore training mode even on failure
        loss_ner_ave = loss_ner_total / (len(self.dev_dataset) * self.config.batch_size)
        print("eval ner loss: {0}".format(loss_ner_ave))
        return loss_ner_ave

    def predict(self):
        """Return predicted tag strings for every sample of ``self.test_dataset``.

        Predictions of all batches are collected, in batch order.  Sequences
        are returned as decoded by the model and are not trimmed with the mask.
        """
        print('STARTING PREDICTING...')
        token_pred = []
        self.model.train(False)
        try:
            with torch.no_grad():
                for _, data_item in tqdm(enumerate(self.test_dataset), total=len(self.test_dataset)):
                    for tag_ids in self.model(data_item, is_test=True):
                        token_pred.append([self.id2token_type[item] for item in tag_ids])
        finally:
            self.model.train(True)
        return token_pred

    def predict_sample(self):
        """Run the test set and print the first sample of the last batch."""
        print('STARTING TESTING...')
        last_batch, last_pred = None, None
        self.model.train(False)
        try:
            with torch.no_grad():
                for _, data_item in tqdm(enumerate(self.test_dataset), total=len(self.test_dataset)):
                    last_pred = self.model(data_item, is_test=True)
                    last_batch = data_item
            if last_batch is None:
                return  # the loader yielded no batch; nothing to show
            token_pred = [self.id2token_type[tag_id] for tag_id in last_pred[0]]
            print("token_pred: {}".format(token_pred))
            print(last_batch['text'][0])
            print(last_batch['spo_list'][0])
        finally:
            self.model.train(True)
        
        
# Training entry point: build config, model and data loaders, then train.
if __name__ == '__main__':
    # SECURITY: a Neptune API token used to be hard-coded on this line. It has been
    # redacted here; revoke the old token and read it from the environment instead:
    # neptune.init(api_token=os.environ['NEPTUNE_API_TOKEN'], project_qualified_name='mangopudding/EntityRelationExtraction')
    # neptune.create_experiment('ner_train')
    print("Run EntityRelationExtraction NER ...")
    config = ConfigNer()
    model = SeqLabel(config)
    data_processor = ModelDataPreparation(config)
    # Arguments are (train, dev, test) paths. NOTE(review): dev and test both
    # point at test.csv; the dev copy is loaded with labels, the test copy without.
    train_loader, dev_loader, test_loader = data_processor.get_train_dev_data(
        '/kaggle/input/casrel-adr-data/adr-train.csv',
    '/kaggle/input/casrel-adr-data/test.csv',
    '/kaggle/input/casrel-adr-data/test.csv')
    # train_loader, dev_loader, test_loader = data_processor.get_train_dev_data('../data/train_data_small.json')
    trainer = Trainer(model, config, train_loader, dev_loader, test_loader)
    trainer.train()

Collecting setuptools-scm
  Downloading setuptools_scm-7.1.0-py3-none-any.whl (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.8/43.8 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: setuptools-scm
Successfully installed setuptools-scm-7.1.0
[0mCollecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25ldone
[?25h  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16179 sha256=e83855a83d119d0dbe205057f855129642fe8ccdcd690785a60027c3c4412e06
  Stored in directory: /root/.cache/pip/wheels/b2/a1/b7/0d3b008d0c77cd57332d724b92cf7650b4185b493dc785f00a
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-

  score = torch.where(mask[i].unsqueeze(1), next_score, score)
100%|██████████| 83/83 [00:51<00:00,  1.61it/s]


STARTING TESTING...


100%|██████████| 10/10 [00:02<00:00,  4.85it/s]


token_pred: ['B-adverse', 'I-adverse', 'I-adverse', 'I-adverse', 'I-adverse', 'I-adverse', 'I-adverse', 'I-adverse', 'I-adverse', 'I-adverse', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'I-drug', 'I-drug', 'I-drug', 'I-drug', 'I-drug', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'I-adverse', 'I-adverse', 'I-adverse', 'I-adverse', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'I-drug', 'I-drug', 'I-drug', 'I-drug', 'I-drug', 'I-drug', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 

100%|██████████| 10/10 [00:03<00:00,  3.28it/s]


eval ner loss: 126.29268646240234
Epoch: 1


100%|██████████| 83/83 [00:49<00:00,  1.68it/s]


STARTING TESTING...


 60%|██████    | 6/10 [00:01<00:00,  5.18it/s]