In [None]:
import torch
import torch.nn as nn
from torchcrf import CRF
from peft import  LoraConfig, get_peft_model
from transformers import BertTokenizerFast, BertModel, AutoModel,  AutoTokenizer, AutoModelForCausalLM

# Model.ipynb

## LoRATrainBert_BiLSTM_CRF

In [2]:
class LoRATrainBert_BiLSTM_CRF(nn.Module):
    """BiLSTM-CRF tagger on top of a frozen, LoRA-tuned Chinese BERT language model.

    The checkpoint is loaded with a causal-LM head and ``return_dict=False``;
    the first element of its output tuple is used as the per-token features.
    The encoder runs under ``torch.no_grad()``, so only the LSTM, linear and
    CRF layers receive gradients.

    NOTE(review): the default ``embedding_dim=21128`` suggests the features are
    vocabulary-sized LM logits rather than hidden states — confirm this is
    intended.
    """

    def __init__(self, tag_to_ix, embedding_dim=21128, hidden_dim=256):
        """Build the encoder, BiLSTM, projection and CRF layers.

        Args:
            tag_to_ix: Mapping of tag name to index; its length sizes the
                emission layer and the CRF.
            embedding_dim: Width of the features produced by the encoder and
                consumed by the LSTM.
            hidden_dim: Width of the BiLSTM output (both directions combined).
        """
        super().__init__()
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)
        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim

        self.bert = AutoModelForCausalLM.from_pretrained('UJForSchool/Bert_base_chinese_LoRA', is_decoder=True, return_dict=False)
        # Each direction gets hidden_dim // 2 units so the concatenated output is hidden_dim wide.
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim // 2,
                            num_layers=2, bidirectional=True, batch_first=True)
        self.dropout = nn.Dropout(p=0.1)
        self.linear = nn.Linear(hidden_dim, self.tagset_size)
        self.crf = CRF(self.tagset_size, batch_first=True)

    def _get_features(self, sentence, mask=None):
        """Return CRF emission scores for a batch of token ids.

        Args:
            sentence: Batch of token ids fed to the encoder.
            mask: Optional attention mask marking real (non-padding) tokens.
                When omitted the encoder attends to every position, which
                matches the previous behaviour.
        """
        with torch.no_grad():
            # Index instead of 2-tuple unpacking so the call does not break
            # when the encoder returns a tuple of a different length.
            embeds = self.bert(sentence, attention_mask=mask)[0]
        enc, _ = self.lstm(embeds)
        enc = self.dropout(enc)
        return self.linear(enc)

    def forward(self, sentence, tags, mask, is_test=False):
        """Return the mean negative log-likelihood, or decoded tags when ``is_test``."""
        emissions = self._get_features(sentence, mask)
        if is_test:  # Testing: return the decoded tag sequences
            return self.crf.decode(emissions, mask)
        # Training: the CRF returns a log-likelihood, so negate it to get a loss
        return -self.crf(emissions, tags, mask, reduction='mean')

## Lamma_BiLSTM_CRF

In [3]:
class Lamma_BiLSTM_CRF(nn.Module):
    """BiLSTM-CRF tagger fed by the final hidden states of a frozen Llama-2 causal LM.

    The language model runs under ``torch.no_grad()``, so only the LSTM,
    linear and CRF layers receive gradients.
    """

    def __init__(self, tag_to_ix, embedding_dim=768, hidden_dim=256):
        """Build the encoder, BiLSTM, projection and CRF layers.

        Args:
            tag_to_ix: Mapping of tag name to index; its length sizes the
                emission layer and the CRF.
            embedding_dim: Fallback LSTM input width, used only when the
                loaded model's config does not expose ``hidden_size``.
            hidden_dim: Width of the BiLSTM output (both directions combined).
        """
        super().__init__()
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)
        self.hidden_dim = hidden_dim

        self.lamma = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", use_auth_token=True)
        # self.lamma = AutoModelForCausalLM.from_pretrained("hfl/chinese-llama-2-lora-7b-64k")

        # The LSTM consumes the LM's hidden states, so its input width has to
        # be the model's hidden size; a mismatching default would fail on the
        # first forward pass.
        self.embedding_dim = getattr(self.lamma.config, 'hidden_size', embedding_dim)

        self.lstm = nn.LSTM(input_size=self.embedding_dim, hidden_size=hidden_dim // 2,
                            num_layers=2, bidirectional=True, batch_first=True)
        self.dropout = nn.Dropout(p=0.1)
        self.linear = nn.Linear(hidden_dim, self.tagset_size)
        self.crf = CRF(self.tagset_size, batch_first=True)

    def _get_features(self, sentence, mask=None):
        """Return CRF emission scores for a batch of token ids.

        Args:
            sentence: Batch of token ids fed to the language model.
            mask: Optional attention mask marking real (non-padding) tokens.
        """
        attention_mask = None if mask is None else mask.long()
        with torch.no_grad():
            # A causal-LM output carries no 'last_hidden_state' entry; request
            # the per-layer hidden states and take the final one instead.
            outputs = self.lamma(sentence, attention_mask=attention_mask, output_hidden_states=True)
            embeds = outputs.hidden_states[-1]
        enc, _ = self.lstm(embeds)
        enc = self.dropout(enc)
        return self.linear(enc)

    def forward(self, sentence, tags, mask, is_test=False):
        """Return the mean negative log-likelihood, or decoded tags when ``is_test``."""
        emissions = self._get_features(sentence, mask)
        if is_test:  # Testing: return the decoded tag sequences
            return self.crf.decode(emissions, mask)
        # Training: the CRF returns a log-likelihood, so negate it to get a loss
        return -self.crf(emissions, tags, mask, reduction='mean')

## BERTWWM_BiLSTM_CRF

In [4]:
class BERTWWM_BiLSTM_CRF(nn.Module):
    """BiLSTM-CRF tagger on top of a frozen ``hfl/chinese-bert-wwm`` encoder.

    The encoder runs under ``torch.no_grad()``, so only the LSTM, linear and
    CRF layers receive gradients.
    """

    def __init__(self, tag_to_ix, embedding_dim=768, hidden_dim=256):
        """Build the encoder, BiLSTM, projection and CRF layers.

        Args:
            tag_to_ix: Mapping of tag name to index; its length sizes the
                emission layer and the CRF.
            embedding_dim: Width of the encoder output consumed by the LSTM.
            hidden_dim: Width of the BiLSTM output (both directions combined).
        """
        super().__init__()
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)
        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim

        # Load the checkpoint with its own configuration. A tokenizer object
        # is not a model config, and only ``last_hidden_state`` is read below,
        # so no extra outputs need to be requested.
        self.bert = AutoModel.from_pretrained("hfl/chinese-bert-wwm")

        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim // 2,
                            num_layers=2, bidirectional=True, batch_first=True)
        self.dropout = nn.Dropout(p=0.1)
        self.linear = nn.Linear(hidden_dim, self.tagset_size)
        self.crf = CRF(self.tagset_size, batch_first=True)

    def _get_features(self, sentence, mask=None):
        """Return CRF emission scores for a batch of token ids.

        Args:
            sentence: Batch of token ids fed to the encoder.
            mask: Optional attention mask marking real (non-padding) tokens.
                When omitted the encoder attends to every position.
        """
        with torch.no_grad():
            embeds = self.bert(sentence, attention_mask=mask)['last_hidden_state']
        enc, _ = self.lstm(embeds)
        enc = self.dropout(enc)
        return self.linear(enc)

    def forward(self, sentence, tags, mask, is_test=False):
        """Return the mean negative log-likelihood, or decoded tags when ``is_test``."""
        emissions = self._get_features(sentence, mask)
        if is_test:  # Testing: return the decoded tag sequences
            return self.crf.decode(emissions, mask)
        # Training: the CRF returns a log-likelihood, so negate it to get a loss
        return -self.crf(emissions, tags, mask, reduction='mean')

## MacBERT_BiLSTM_CRF

In [5]:
class MacBERT_BiLSTM_CRF(nn.Module):
    """BiLSTM-CRF tagger on top of a frozen ``hfl/chinese-macbert-base`` encoder.

    The encoder runs under ``torch.no_grad()``, so only the LSTM, linear and
    CRF layers receive gradients.
    """

    def __init__(self, tag_to_ix, embedding_dim=768, hidden_dim=256):
        """Build the encoder, BiLSTM, projection and CRF layers.

        Args:
            tag_to_ix: Mapping of tag name to index; its length sizes the
                emission layer and the CRF.
            embedding_dim: Width of the encoder output consumed by the LSTM.
            hidden_dim: Width of the BiLSTM output (both directions combined).
        """
        super().__init__()
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)
        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim

        # Load the checkpoint with its own configuration. A tokenizer object
        # is not a model config, and only ``last_hidden_state`` is read below.
        self.bert = AutoModel.from_pretrained("hfl/chinese-macbert-base")

        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim // 2,
                            num_layers=2, bidirectional=True, batch_first=True)
        self.dropout = nn.Dropout(p=0.1)
        self.linear = nn.Linear(hidden_dim, self.tagset_size)
        self.crf = CRF(self.tagset_size, batch_first=True)

    def _get_features(self, sentence, mask=None):
        """Return CRF emission scores for a batch of token ids.

        Args:
            sentence: Batch of token ids fed to the encoder.
            mask: Optional attention mask marking real (non-padding) tokens.
                When omitted the encoder attends to every position.
        """
        with torch.no_grad():
            embeds = self.bert(sentence, attention_mask=mask)['last_hidden_state']
        enc, _ = self.lstm(embeds)
        enc = self.dropout(enc)
        return self.linear(enc)

    def forward(self, sentence, tags, mask, is_test=False):
        """Return the mean negative log-likelihood, or decoded tags when ``is_test``."""
        emissions = self._get_features(sentence, mask)
        if is_test:  # Testing: return the decoded tag sequences
            return self.crf.decode(emissions, mask)
        # Training: the CRF returns a log-likelihood, so negate it to get a loss
        return -self.crf(emissions, tags, mask, reduction='mean')

## Bert_LoRA_BiLSTM_CRF

In [6]:
class Bert_LoRA_BiLSTM_CRF(nn.Module):
    """BiLSTM-CRF tagger on top of ``bert-base-chinese`` with LoRA adapters.

    The top ``num_layers_to_finetune`` encoder layers and the LoRA adapters
    are trainable; the rest of the encoder is frozen.
    """

    def __init__(self, tag_to_ix, num_layers_to_finetune, r, lora_alpha, lora_dropout, embedding_dim=768, hidden_dim=256):
        """Build the LoRA-wrapped encoder, BiLSTM, projection and CRF layers.

        Args:
            tag_to_ix: Mapping of tag name to index; its length sizes the
                emission layer and the CRF.
            num_layers_to_finetune: Number of top encoder layers left trainable.
            r: LoRA rank passed to ``LoraConfig``.
            lora_alpha: LoRA scaling factor passed to ``LoraConfig``.
            lora_dropout: Dropout applied inside the LoRA adapters.
            embedding_dim: Width of the encoder output consumed by the LSTM.
            hidden_dim: Width of the BiLSTM output (both directions combined).
        """
        super().__init__()
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)
        self.num_layers_to_finetune = num_layers_to_finetune
        self.r = r
        self.lora_alpha = lora_alpha
        self.lora_dropout = lora_dropout
        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim

        # Load the plain BERT encoder.
        self.bert = AutoModel.from_pretrained('bert-base-chinese')
        # self.bert = AutoModel.from_pretrained('hfl/chinese-llama-2-lora-7b')

        # Freeze the encoder except for its top layers.
        self.fine_tune_layers()

        self.lora_config = LoraConfig(
            r=self.r,
            lora_alpha=self.lora_alpha,
            lora_dropout=self.lora_dropout,
            # No task type: the task-specific PEFT wrappers (SEQ_CLS, CAUSAL_LM,
            # SEQ_2_SEQ_LM, TOKEN_CLS) were each tried and raised
            # "forward() got an unexpected keyword argument" on this bare
            # encoder, and ``TaskType`` is not imported in this notebook.
            task_type=None,
            # The adapters are meant to be trained, so they must not be
            # created in inference mode.
            inference_mode=False,
        )

        self.bert = get_peft_model(self.bert, self.lora_config)
        # NOTE(review): wrapping with PEFT is expected to mark the base
        # weights as non-trainable again, so re-apply the unfreezing — confirm
        # against the installed peft version.
        self._unfreeze_top_layers()

        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim // 2,
                            num_layers=2, bidirectional=True, batch_first=True)
        self.dropout = nn.Dropout(p=0.1)
        self.linear = nn.Linear(hidden_dim, self.tagset_size)
        self.crf = CRF(self.tagset_size, batch_first=True)

    def _encoder_layers(self):
        """Return the encoder's layer list, whether or not BERT is PEFT-wrapped."""
        base = self.bert.get_base_model() if hasattr(self.bert, 'get_base_model') else self.bert
        return base.encoder.layer

    def _unfreeze_top_layers(self):
        """Mark the parameters of the top ``num_layers_to_finetune`` layers trainable."""
        layers = self._encoder_layers()
        n_layers = len(layers)
        # Clamp so an out-of-range request cannot index outside the encoder.
        n_trainable = max(0, min(int(self.num_layers_to_finetune), n_layers))
        for idx in range(n_layers - n_trainable, n_layers):
            for param in layers[idx].parameters():
                param.requires_grad = True

    def fine_tune_layers(self):
        """Freeze the whole encoder, then unfreeze its top layers."""
        # Freeze every layer.
        for param in self.bert.parameters():
            param.requires_grad = False

        # Unfreeze the requested top layers.
        self._unfreeze_top_layers()

    def _get_features(self, sentence, mask):
        """Return CRF emission scores for a batch of token ids.

        Gradients flow through the encoder here, otherwise neither the LoRA
        adapters nor the unfrozen layers could learn.

        Args:
            sentence: Batch of token ids fed to the encoder.
            mask: Attention mask marking real (non-padding) tokens.
        """
        embeds = self.bert(sentence, attention_mask=mask)['last_hidden_state']
        enc, _ = self.lstm(embeds)
        enc = self.dropout(enc)
        return self.linear(enc)

    def forward(self, sentence, tags, mask, is_test=False):
        """Return the mean negative log-likelihood, or decoded tags when ``is_test``."""
        emissions = self._get_features(sentence, mask)
        if is_test:  # Testing: return the decoded tag sequences
            return self.crf.decode(emissions, mask)
        # Training: the CRF returns a log-likelihood, so negate it to get a loss
        return -self.crf(emissions, tags, mask, reduction='mean')

## ClinicalDistilBERT_BiLSTM_CRF

In [7]:
class ClinicalDistilBERT_BiLSTM_CRF(nn.Module):
    """BiLSTM-CRF tagger on top of a frozen ``medicalai/ClinicalBERT`` encoder.

    The encoder runs under ``torch.no_grad()``, so only the LSTM, linear and
    CRF layers receive gradients.
    """

    def __init__(self, tag_to_ix, embedding_dim=768, hidden_dim=256):
        """Build the encoder, BiLSTM, projection and CRF layers.

        Args:
            tag_to_ix: Mapping of tag name to index; its length sizes the
                emission layer and the CRF.
            embedding_dim: Width of the encoder output consumed by the LSTM.
            hidden_dim: Width of the BiLSTM output (both directions combined).
        """
        super().__init__()
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)
        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim

        # Load the checkpoint with its own configuration. A tokenizer object
        # is not a model config, and only ``last_hidden_state`` is read below.
        self.bert = AutoModel.from_pretrained("medicalai/ClinicalBERT")

        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim // 2,
                            num_layers=2, bidirectional=True, batch_first=True)
        self.dropout = nn.Dropout(p=0.1)
        self.linear = nn.Linear(hidden_dim, self.tagset_size)
        self.crf = CRF(self.tagset_size, batch_first=True)

    def _get_features(self, sentence, mask=None):
        """Return CRF emission scores for a batch of token ids.

        Args:
            sentence: Batch of token ids fed to the encoder.
            mask: Optional attention mask marking real (non-padding) tokens.
                When omitted the encoder attends to every position.
        """
        with torch.no_grad():
            embeds = self.bert(sentence, attention_mask=mask)['last_hidden_state']
        enc, _ = self.lstm(embeds)
        enc = self.dropout(enc)
        return self.linear(enc)

    def forward(self, sentence, tags, mask, is_test=False):
        """Return the mean negative log-likelihood, or decoded tags when ``is_test``."""
        emissions = self._get_features(sentence, mask)
        if is_test:  # Testing: return the decoded tag sequences
            return self.crf.decode(emissions, mask)
        # Training: the CRF returns a log-likelihood, so negate it to get a loss
        return -self.crf(emissions, tags, mask, reduction='mean')

## Bert_BiLSTM_CRF

In [8]:
class Bert_BiLSTM_CRF(nn.Module):
    """BiLSTM-CRF tagger on top of a frozen ``bert-base-chinese`` encoder.

    The encoder runs under ``torch.no_grad()``, so only the LSTM, linear and
    CRF layers receive gradients.
    """

    def __init__(self, tag_to_ix, embedding_dim=768, hidden_dim=256):
        """Build the encoder, BiLSTM, projection and CRF layers.

        Args:
            tag_to_ix: Mapping of tag name to index; its length sizes the
                emission layer and the CRF.
            embedding_dim: Width of the encoder output consumed by the LSTM.
            hidden_dim: Width of the BiLSTM output (both directions combined).
        """
        super().__init__()
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)
        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim

        # return_dict=False makes the encoder return a plain tuple.
        self.bert = BertModel.from_pretrained('bert-base-chinese', return_dict=False)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim // 2,
                            num_layers=2, bidirectional=True, batch_first=True)
        self.dropout = nn.Dropout(p=0.1)
        self.linear = nn.Linear(hidden_dim, self.tagset_size)
        self.crf = CRF(self.tagset_size, batch_first=True)

    def _get_features(self, sentence, mask=None):
        """Return CRF emission scores for a batch of token ids.

        Args:
            sentence: Batch of token ids fed to the encoder.
            mask: Optional attention mask marking real (non-padding) tokens.
                When omitted the encoder attends to every position.
        """
        with torch.no_grad():
            # First tuple element is the per-token output used as features.
            embeds = self.bert(sentence, attention_mask=mask)[0]
        enc, _ = self.lstm(embeds)
        enc = self.dropout(enc)
        return self.linear(enc)

    def forward(self, sentence, tags, mask, is_test=False):
        """Return the mean negative log-likelihood, or decoded tags when ``is_test``."""
        emissions = self._get_features(sentence, mask)
        if is_test:  # Testing: return the decoded tag sequences
            return self.crf.decode(emissions, mask)
        # Training: the CRF returns a log-likelihood, so negate it to get a loss
        return -self.crf(emissions, tags, mask, reduction='mean')

## Bert_CRF

In [9]:
class Bert_CRF(nn.Module):
    """CRF tagger directly on top of a frozen ``bert-base-chinese`` encoder.

    The encoder runs under ``torch.no_grad()``, so only the linear and CRF
    layers receive gradients.
    """

    # def __init__(self, tag_to_ix, num_layers_to_finetune, embedding_dim=768, hidden_dim=768):
    def __init__(self, tag_to_ix, embedding_dim=768, hidden_dim=768):
        """Build the encoder, projection and CRF layers.

        Args:
            tag_to_ix: Mapping of tag name to index; its length sizes the
                emission layer and the CRF.
            embedding_dim: Stored on the instance; not used to size any layer.
            hidden_dim: Input width of the linear layer, which is applied
                directly to the encoder output.
        """
        super().__init__()
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)
        # self.num_layers_to_finetune = num_layers_to_finetune
        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim

        # return_dict=False makes the encoder return a plain tuple.
        self.bert = BertModel.from_pretrained('bert-base-chinese', return_dict=False)

        # # Fine-tune the selected layers.
        # self.fine_tune_layers()

        self.dropout = nn.Dropout(p=0.1)
        self.linear = nn.Linear(self.hidden_dim, self.tagset_size)
        self.crf = CRF(self.tagset_size, batch_first=True)

    def _get_features(self, sentence, mask=None):
        """Return CRF emission scores for a batch of token ids.

        Args:
            sentence: Batch of token ids fed to the encoder.
            mask: Optional attention mask marking real (non-padding) tokens.
                When omitted the encoder attends to every position.
        """
        with torch.no_grad():
            # First tuple element is the per-token output used as features.
            embeds = self.bert(sentence, attention_mask=mask)[0]
        enc = self.dropout(embeds)
        return self.linear(enc)

    def forward(self, sentence, tags, mask, is_test=False):
        """Return the mean negative log-likelihood, or decoded tags when ``is_test``."""
        emissions = self._get_features(sentence, mask)
        if is_test:  # Testing: return the decoded tag sequences
            return self.crf.decode(emissions, mask)
        # Training: the CRF returns a log-likelihood, so negate it to get a loss
        return -self.crf(emissions, tags, mask, reduction='mean')

    # def fine_tune_layers(self):
    #     # Freeze every layer.
    #     for param in self.bert.parameters():
    #         param.requires_grad = False

    #     # Unfreeze the selected layers.
    #     for i in range(12 - self.num_layers_to_finetune, 12):
    #         for param in self.bert.encoder.layer[i].parameters():
    #             param.requires_grad = True

## Roberta_BiLSTM_CRF

In [10]:
import os
# Sets CUDA_LAUNCH_BLOCKING for this process.
# NOTE(review): presumably a debugging aid that makes CUDA kernel launches
# synchronous; it is expected to slow GPU training — confirm it is still wanted.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

class Roberta_BiLSTM_CRF(nn.Module):
    """BiLSTM-CRF tagger on top of a frozen ``roberta-base`` encoder.

    The encoder runs under ``torch.no_grad()``, so only the LSTM, linear and
    CRF layers receive gradients.
    """

    def __init__(self, tag_to_ix, embedding_dim=768, hidden_dim=256):
        """Build the encoder, BiLSTM, projection and CRF layers.

        Args:
            tag_to_ix: Mapping of tag name to index; its length sizes the
                emission layer and the CRF.
            embedding_dim: Width of the encoder output consumed by the LSTM.
            hidden_dim: Width of the BiLSTM output (both directions combined).
        """
        super().__init__()
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)
        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim

        # Load the checkpoint with its own configuration. A tokenizer object
        # is not a model config, and only ``last_hidden_state`` is read below.
        self.bert = AutoModel.from_pretrained("roberta-base")

        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim // 2,
                            num_layers=2, bidirectional=True, batch_first=True)
        self.dropout = nn.Dropout(p=0.1)
        self.linear = nn.Linear(hidden_dim, self.tagset_size)
        self.crf = CRF(self.tagset_size, batch_first=True)

    def _get_features(self, sentence, mask=None):
        """Return CRF emission scores for a batch of token ids.

        Args:
            sentence: Batch of token ids fed to the encoder.
            mask: Optional attention mask marking real (non-padding) tokens.
                When omitted the encoder attends to every position.
        """
        with torch.no_grad():
            embeds = self.bert(sentence, attention_mask=mask)['last_hidden_state']
        enc, _ = self.lstm(embeds)
        enc = self.dropout(enc)
        return self.linear(enc)

    def forward(self, sentence, tags, mask, is_test=False):
        """Return the mean negative log-likelihood, or decoded tags when ``is_test``."""
        emissions = self._get_features(sentence, mask)
        if is_test:  # Testing: return the decoded tag sequences
            return self.crf.decode(emissions, mask)
        # Training: the CRF returns a log-likelihood, so negate it to get a loss
        return -self.crf(emissions, tags, mask, reduction='mean')

## Roberta_wwm_BiLSTM_CRF

In [11]:
class Roberta_wwm_BiLSTM_CRF(nn.Module):
    """BiLSTM-CRF tagger on top of a frozen ``hfl/chinese-roberta-wwm-ext`` encoder.

    The encoder runs under ``torch.no_grad()``, so only the LSTM, linear and
    CRF layers receive gradients.
    """

    def __init__(self, tag_to_ix, embedding_dim=768, hidden_dim=256):
        """Build the encoder, BiLSTM, projection and CRF layers.

        Args:
            tag_to_ix: Mapping of tag name to index; its length sizes the
                emission layer and the CRF.
            embedding_dim: Width of the encoder output consumed by the LSTM.
            hidden_dim: Width of the BiLSTM output (both directions combined).
        """
        super().__init__()
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)
        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim

        # Load the checkpoint with its own configuration. A tokenizer object
        # is not a model config, and only ``last_hidden_state`` is read below.
        self.bert = AutoModel.from_pretrained("hfl/chinese-roberta-wwm-ext")

        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim // 2,
                            num_layers=2, bidirectional=True, batch_first=True)
        self.dropout = nn.Dropout(p=0.1)
        self.linear = nn.Linear(hidden_dim, self.tagset_size)
        self.crf = CRF(self.tagset_size, batch_first=True)

    def _get_features(self, sentence, mask=None):
        """Return CRF emission scores for a batch of token ids.

        Args:
            sentence: Batch of token ids fed to the encoder.
            mask: Optional attention mask marking real (non-padding) tokens.
                When omitted the encoder attends to every position.
        """
        with torch.no_grad():
            embeds = self.bert(sentence, attention_mask=mask)['last_hidden_state']
        enc, _ = self.lstm(embeds)
        enc = self.dropout(enc)
        return self.linear(enc)

    def forward(self, sentence, tags, mask, is_test=False):
        """Return the mean negative log-likelihood, or decoded tags when ``is_test``."""
        emissions = self._get_features(sentence, mask)
        if is_test:  # Testing: return the decoded tag sequences
            return self.crf.decode(emissions, mask)
        # Training: the CRF returns a log-likelihood, so negate it to get a loss
        return -self.crf(emissions, tags, mask, reduction='mean')

## Bert_LoRA_CRF

In [12]:
class Bert_LoRA_CRF(nn.Module):
    """CRF tagger directly on top of ``bert-base-chinese`` with LoRA adapters.

    The top ``num_layers_to_finetune`` encoder layers and the LoRA adapters
    are trainable; the rest of the encoder is frozen.
    """

    def __init__(self, tag_to_ix, num_layers_to_finetune, r, lora_alpha, lora_dropout, embedding_dim=768, hidden_dim=768):
        """Build the LoRA-wrapped encoder, projection and CRF layers.

        Args:
            tag_to_ix: Mapping of tag name to index; its length sizes the
                emission layer and the CRF.
            num_layers_to_finetune: Number of top encoder layers left trainable.
            r: LoRA rank passed to ``LoraConfig``.
            lora_alpha: LoRA scaling factor passed to ``LoraConfig``.
            lora_dropout: Dropout applied inside the LoRA adapters.
            embedding_dim: Stored on the instance; not used to size any layer.
            hidden_dim: Input width of the linear layer, which is applied
                directly to the encoder output.
        """
        super().__init__()
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)
        self.num_layers_to_finetune = num_layers_to_finetune
        self.r = r
        self.lora_alpha = lora_alpha
        self.lora_dropout = lora_dropout
        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim

        # Load the plain BERT encoder.
        self.bert = AutoModel.from_pretrained('bert-base-chinese')
        # self.bert = AutoModel.from_pretrained('hfl/chinese-llama-2-lora-7b')

        # Freeze the encoder except for its top layers.
        self.fine_tune_layers()

        self.lora_config = LoraConfig(
            r=self.r,
            lora_alpha=self.lora_alpha,
            lora_dropout=self.lora_dropout,
            # No task type: the task-specific PEFT wrappers (SEQ_CLS, CAUSAL_LM,
            # SEQ_2_SEQ_LM, TOKEN_CLS) were each tried and raised
            # "forward() got an unexpected keyword argument" on this bare encoder.
            task_type=None,
            # The adapters are meant to be trained, so they must not be
            # created in inference mode.
            inference_mode=False,
        )

        self.bert = get_peft_model(self.bert, self.lora_config)
        # NOTE(review): wrapping with PEFT is expected to mark the base
        # weights as non-trainable again, so re-apply the unfreezing — confirm
        # against the installed peft version.
        self._unfreeze_top_layers()

        self.dropout = nn.Dropout(p=0.1)
        self.linear = nn.Linear(hidden_dim, self.tagset_size)
        self.crf = CRF(self.tagset_size, batch_first=True)

    def _encoder_layers(self):
        """Return the encoder's layer list, whether or not BERT is PEFT-wrapped."""
        base = self.bert.get_base_model() if hasattr(self.bert, 'get_base_model') else self.bert
        return base.encoder.layer

    def _unfreeze_top_layers(self):
        """Mark the parameters of the top ``num_layers_to_finetune`` layers trainable."""
        layers = self._encoder_layers()
        n_layers = len(layers)
        # Clamp so an out-of-range request cannot index outside the encoder.
        n_trainable = max(0, min(int(self.num_layers_to_finetune), n_layers))
        for idx in range(n_layers - n_trainable, n_layers):
            for param in layers[idx].parameters():
                param.requires_grad = True

    def fine_tune_layers(self):
        """Freeze the whole encoder, then unfreeze its top layers."""
        # Freeze every layer.
        for param in self.bert.parameters():
            param.requires_grad = False

        # Unfreeze the requested top layers.
        self._unfreeze_top_layers()

    def _get_features(self, sentence, mask):
        """Return CRF emission scores for a batch of token ids.

        Gradients flow through the encoder here, otherwise neither the LoRA
        adapters nor the unfrozen layers could learn.

        Args:
            sentence: Batch of token ids fed to the encoder.
            mask: Attention mask marking real (non-padding) tokens.
        """
        # The encoder returns an output object; dropout needs the hidden-state
        # tensor inside it, not the object itself.
        embeds = self.bert(sentence, attention_mask=mask)['last_hidden_state']
        enc = self.dropout(embeds)
        return self.linear(enc)

    def forward(self, sentence, tags, mask, is_test=False):
        """Return the mean negative log-likelihood, or decoded tags when ``is_test``."""
        emissions = self._get_features(sentence, mask)
        if is_test:  # Testing: return the decoded tag sequences
            return self.crf.decode(emissions, mask)
        # Training: the CRF returns a log-likelihood, so negate it to get a loss
        return -self.crf(emissions, tags, mask, reduction='mean')

# Main_config

In [13]:
# Paths to the training, test and validation corpora; they are the default
# values of the trainset / testset / validset arguments of main().
# NOTE(review): these are relative paths, so they resolve against the kernel's
# working directory.
trainset = 'data/train.txt'
testset = 'data/test.txt'
validset="data/msra_eval.txt"

In [14]:
# Tags passed as `labels=` to the classification report and confusion matrix
# in main(): the outside tag 'O' followed by a B-/I- pair per entity type.
_ENTITY_TYPES = ('BODY', 'SYMP', 'INST', 'EXAM', 'CHEM', 'DISE', 'DRUG', 'SUPP', 'TREAT', 'TIME')
labels = ['O'] + [f'{prefix}-{entity}' for entity in _ENTITY_TYPES for prefix in ('B', 'I')]

In [15]:
import os
import time
import torch
import warnings
import numpy as np
import torch.nn as nn
from sklearn import metrics
import torch.optim as optim
from torch.utils import data
from transformers import AdamW, get_linear_schedule_with_warmup

# Silence DeprecationWarning messages from here on.
warnings.filterwarnings("ignore", category=DeprecationWarning)
# Restrict CUDA to device 0.
# NOTE(review): torch is imported in earlier cells; presumably this only takes
# effect if CUDA has not been initialised yet — confirm.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

def train(e, model, iterator, optimizer, scheduler, device):
    """Run one training epoch and print the mean loss and elapsed time.

    Args:
        e: Epoch number, used only in the printed summary.
        model: Module called as ``model(x, y, z)`` that returns a scalar loss.
        iterator: Iterable of ``(x, y, z)`` batches of tensors.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch.
        device: Device the model and each batch are moved to.

    Returns:
        The mean batch loss as a float, or ``nan`` when ``iterator`` yields
        no batches.
    """
    start_time = time.time()  # Record the start time

    model.train().to(device)
    losses = 0.0
    step = 0
    for x, y, z in iterator:
        step += 1
        x = x.to(device)
        y = y.to(device)
        z = z.to(device)

        loss = model(x, y, z)
        losses += loss.item()

        # Gradient accumulation (disabled): scale the loss by 1/k, call
        # backward() every batch, and only run the optimizer/scheduler step
        # and zero_grad() every k-th batch.
        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    epoch_time = time.time() - start_time

    # Guard against an empty iterator, which would otherwise divide by zero.
    mean_loss = losses / step if step else float('nan')
    print("Epoch: {}, Loss:{:.4f}, epoch_time:{:.2f} sec".format(e, mean_loss, epoch_time))
    return mean_loss

def validate(e, model, iterator, device):
    """Evaluate the model on a validation iterator.

    For every batch the model is called twice: once with ``is_test=True`` to
    get the decoded tag sequences and once without it to get the loss.

    Args:
        e: Epoch number, used only in the printed summary.
        model: Module supporting ``model(x, y, z)`` and
            ``model(x, y, z, is_test=True)``.
        iterator: Iterable of ``(x, y, z)`` batches of tensors.
        device: Device each batch is moved to.

    Returns:
        Tuple ``(model, mean_loss, accuracy_percent)``.
    """
    started = time.time()

    model.eval()
    gold_chunks, predicted = [], []
    loss_sum = 0
    n_batches = 0
    with torch.no_grad():
        for batch in iterator:
            n_batches += 1

            x, y, z = batch
            x, y, z = x.to(device), y.to(device), z.to(device)

            decoded = model(x, y, z, is_test=True)
            loss_sum += model(x, y, z).item()

            # Collect predictions, flattened across sequences
            for path in decoded:
                predicted.extend(path)
            # Collect gold labels at the unmasked positions only
            gold_chunks.append(torch.masked_select(y, z == 1).cpu())

    Y = torch.cat(gold_chunks, dim=0).numpy()
    Y_hat = np.array(predicted)
    acc = (Y_hat == Y).mean() * 100

    elapsed = time.time() - started
    mean_loss = loss_sum / n_batches

    print("Epoch: {}, Val Loss:{:.4f}, Val Acc:{:.3f}, epoch_time:{:.2f} sec".format(e, mean_loss, acc, elapsed))
    return model, mean_loss, acc

def test(model, iterator, device):
    """Decode every batch and return gold and predicted tag names.

    Args:
        model: Module supporting ``model(x, y, z, is_test=True)``.
        iterator: Iterable of ``(x, y, z)`` batches of tensors.
        device: Device the inputs and mask are moved to; the labels are not
            moved.

    Returns:
        Tuple ``(y_true, y_pred)`` of tag-name lists, looked up through the
        module-level ``idx2tag`` mapping.
    """
    model.eval()
    gold_chunks, predicted = [], []
    with torch.no_grad():
        for x, y, z in iterator:
            x = x.to(device)
            z = z.to(device)
            # Collect predictions, flattened across sequences
            for path in model(x, y, z, is_test=True):
                predicted.extend(path)
            # Collect gold labels at the unmasked positions only
            keep = (z == 1).cpu()
            gold_chunks.append(torch.masked_select(y, keep))

    gold_ids = torch.cat(gold_chunks, dim=0).numpy()
    y_true = [idx2tag[i] for i in gold_ids]
    y_pred = [idx2tag[i] for i in predicted]

    return y_true, y_pred

## Bert + Bi-LSTM + CRF

In [None]:
import torch
from torch.utils.data import Dataset
from transformers import BertConfig, AutoTokenizer
from peft import PeftConfig

import os
# Sets the PYTORCH_CUDA_ALLOC_CONF allocator option for this process.
# NOTE(review): presumably meant to limit memory fragmentation; confirm it is
# set before CUDA is first used.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:2048"

# Checkpoint name used for the tokenizer and the BertConfig below.
bert_model = 'bert-base-chinese'

# Module-level tokenizer; NerDataset.__getitem__ uses it to map tokens to ids.
tokenizer = AutoTokenizer.from_pretrained(bert_model)

# Tag vocabulary. '<PAD>' sits at index 0, the same value PadBatch uses to pad
# label sequences.
VOCAB = ('<PAD>', '[CLS]', '[SEP]', 'O', 'B-BODY', 'I-BODY',
         'B-SYMP', 'I-SYMP', 'B-INST', 'I-INST', 'B-EXAM', 'I-EXAM',
         'B-CHEM', 'I-CHEM','B-DISE', 'I-DISE', 'B-DRUG', 'I-DRUG',
         'B-SUPP', 'I-SUPP', 'B-TREAT', 'I-TREAT', 'B-TIME', 'I-TIME')

# Lookup tables between tag names and their position in VOCAB.
tag2idx = {tag: idx for idx, tag in enumerate(VOCAB)}
idx2tag = {idx: tag for idx, tag in enumerate(VOCAB)}
# Maximum number of characters kept per sentence; the "- 2" leaves room for
# the [CLS] and [SEP] markers that NerDataset adds.
MAX_LEN = 256 - 2

# NOTE(review): bert_config is not referenced in the visible part of this notebook.
bert_config = BertConfig.from_pretrained(bert_model, output_hidden_states=True)

class NerDataset(Dataset):
    """Character-level NER dataset read from a ``token<TAB>tag`` file.

    Sentences are split on the '。' character (which is itself dropped),
    truncated to ``max_len`` characters and wrapped in [CLS] / [SEP].
    """

    def __init__(self, f_path, max_len=None):
        """Read and segment the corpus.

        Args:
            f_path: Path to a UTF-8 file with one ``token<TAB>tag`` pair per
                line; blank lines are ignored.
            max_len: Maximum characters kept per sentence. Defaults to the
                module-level ``MAX_LEN``.
        """
        if max_len is None:
            max_len = MAX_LEN
        self.sents = []
        self.tags_li = []

        with open(f_path, 'r', encoding='utf-8') as f:
            lines = [line.split('\n')[0] for line in f if len(line.strip()) != 0]

        word, tag = [], []
        for line in lines:
            fields = line.split('\t')
            char, t = fields[0], fields[1]
            if char != '。':
                word.append(char)
                tag.append(t)
            else:
                self._add_sentence(word, tag, max_len)
                word, tag = [], []

        # Keep the last sentence even when the file does not end with '。';
        # it used to be dropped silently.
        if word:
            self._add_sentence(word, tag, max_len)

    def _add_sentence(self, word, tag, max_len):
        """Store one sentence, truncated to ``max_len`` and wrapped in [CLS]/[SEP]."""
        self.sents.append(['[CLS]'] + word[:max_len] + ['[SEP]'])
        self.tags_li.append(['[CLS]'] + tag[:max_len] + ['[SEP]'])

    def __getitem__(self, idx):
        """Return ``(token_ids, label_ids, sequence_length)`` for one sentence."""
        words, tags = self.sents[idx], self.tags_li[idx]
        token_ids = tokenizer.convert_tokens_to_ids(words)
        label_ids = [tag2idx[tag] for tag in tags]
        seqlen = len(label_ids)
        return token_ids, label_ids, seqlen

    def __len__(self):
        """Return the number of sentences."""
        return len(self.sents)

def PadBatch(batch):
    """Collate ``(token_ids, label_ids, seqlen)`` samples into padded tensors.

    Every sequence is right-padded with 0 to the longest ``seqlen`` in the
    batch.

    Returns:
        Tuple ``(token_tensors, label_tensors, mask)`` where ``mask`` is a
        boolean tensor that is True wherever the token id is greater than 0.
    """
    longest = max(sample[2] for sample in batch)

    def _pad(seq):
        return seq + [0] * (longest - len(seq))

    token_tensors = torch.LongTensor([_pad(sample[0]) for sample in batch])
    label_tensors = torch.LongTensor([_pad(sample[1]) for sample in batch])
    mask = token_tensors.gt(0).to(torch.bool)

    return token_tensors, label_tensors, mask

In [17]:
def main(batch_size=64, lr=0.001, n_epochs=40, num_layers_to_finetune=2, r=16, lora_alpha=16,
         lora_dropout=0.05, trainset= trainset, validset=validset, testset=testset):
    """Train a Bert_BiLSTM_CRF tagger, keep the best epoch and report test metrics.

    After each epoch the model is validated. An epoch counts as the new best
    only when it both lowers the validation loss and raises the validation
    accuracy; its weights are then written to disk and kept in memory. The
    best weights are restored before the test set is scored.

    Args:
        batch_size: Training batch size; validation and test use half of it.
        lr: Learning rate for AdamW.
        n_epochs: Number of training epochs.
        num_layers_to_finetune, r, lora_alpha, lora_dropout: Accepted but not
            used by this model.
        trainset, validset, testset: Paths of the corpora to load.
    """
    best_state = None
    _best_val_loss = 1e18
    _best_val_acc = 1e-18

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Use the detected device rather than .cuda(), which fails on CPU-only machines.
    model = Bert_BiLSTM_CRF(tag2idx).to(device)

    print('Initial model Done.')
    train_dataset = NerDataset(trainset)
    eval_dataset = NerDataset(validset)
    test_dataset = NerDataset(testset)
    print('Load Data Done.')

    train_iter = data.DataLoader(dataset=train_dataset,
                                 batch_size=batch_size,
                                 shuffle=True,
                                 num_workers=4,
                                 collate_fn=PadBatch)

    eval_iter = data.DataLoader(dataset=eval_dataset,
                                 batch_size=(batch_size)//2,
                                 shuffle=False,
                                 num_workers=4,
                                 collate_fn=PadBatch)

    test_iter = data.DataLoader(dataset=test_dataset,
                                batch_size=(batch_size)//2,
                                shuffle=False,
                                num_workers=4,
                                collate_fn=PadBatch)

    #optimizer = optim.Adam(self.model.parameters(), lr=lr, weight_decay=0.01)
    optimizer = AdamW(model.parameters(), lr=lr, eps=1e-6)

    # Warmup: one scheduler step per batch, so the total is
    # ceil(len(dataset) / batch_size) batches per epoch times the epoch count.
    len_dataset = len(train_dataset)
    steps_per_epoch = (len_dataset + batch_size - 1) // batch_size
    total_steps = steps_per_epoch * n_epochs

    warm_up_ratio = 0.1 # Define 10% steps
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=int(warm_up_ratio * total_steps),
                                                num_training_steps=total_steps)

    # NOTE(review): hard-coded absolute output directory — consider making it configurable.
    save_dir = '/home/yenling/Code In Lunix/mamba/Save Model'
    os.makedirs(save_dir, exist_ok=True)

    print('Start Train...,')
    for epoch in range(1, n_epochs+1):

        train(epoch, model, train_iter, optimizer, scheduler, device)
        _, loss, acc = validate(epoch, model, eval_iter, device)

        if loss < _best_val_loss and acc > _best_val_acc:
          _best_val_loss = loss
          _best_val_acc = acc
          # Snapshot the weights on the CPU. Keeping only a reference to the
          # model would track later epochs instead of the best one.
          best_state = {name: tensor.detach().cpu().clone()
                        for name, tensor in model.state_dict().items()}

          # NOTE(review): the file-name prefix says "Roberta + WWM" although
          # this cell trains Bert_BiLSTM_CRF — confirm before renaming.
          save_path = os.path.join(save_dir, f'Roberta + WWM+ Bi-LSTM + CRF best_model_epoch_{epoch}_loss_{loss:.4f}_acc_{acc:.4f}.pt')
          torch.save(model.state_dict(), save_path)
          print(f"Best model saved at epoch {epoch} with val_loss: {loss:.4f} and val_acc: {acc:.4f} to {save_path}")

        print("=============================================")

    # Score the best epoch; if no epoch ever improved, score the final weights.
    if best_state is not None:
        model.load_state_dict(best_state)

    y_test, y_pred = test(model, test_iter, device)
    print(metrics.classification_report(y_test, y_pred, labels=labels, digits=3))
    print(metrics.confusion_matrix(y_test, y_pred, labels=labels))


In [None]:
# Launch training with 100 epochs instead of main()'s default of 40. At this
# point `main` is the definition from the preceding cell, which trains
# Bert_BiLSTM_CRF.
main(n_epochs=100)

## Bert + CRF

In [None]:
import torch
from torch.utils.data import Dataset
from transformers import BertConfig, AutoTokenizer
from peft import PeftConfig

import os
# Sets the PYTORCH_CUDA_ALLOC_CONF allocator option for this process.
# NOTE(review): presumably meant to limit memory fragmentation; confirm it is
# set before CUDA is first used.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:2048"

# Checkpoint name used for the tokenizer and the BertConfig below.
bert_model = 'bert-base-chinese'

# Module-level tokenizer; NerDataset.__getitem__ uses it to map tokens to ids.
tokenizer = AutoTokenizer.from_pretrained(bert_model)

# Tag vocabulary. '<PAD>' sits at index 0, the same value PadBatch uses to pad
# label sequences.
VOCAB = ('<PAD>', '[CLS]', '[SEP]', 'O', 'B-BODY', 'I-BODY',
         'B-SYMP', 'I-SYMP', 'B-INST', 'I-INST', 'B-EXAM', 'I-EXAM',
         'B-CHEM', 'I-CHEM','B-DISE', 'I-DISE', 'B-DRUG', 'I-DRUG',
         'B-SUPP', 'I-SUPP', 'B-TREAT', 'I-TREAT', 'B-TIME', 'I-TIME')

# Lookup tables between tag names and their position in VOCAB.
tag2idx = {tag: idx for idx, tag in enumerate(VOCAB)}
idx2tag = {idx: tag for idx, tag in enumerate(VOCAB)}
# Maximum number of characters kept per sentence; the "- 2" leaves room for
# the [CLS] and [SEP] markers that NerDataset adds.
MAX_LEN = 256 - 2

# NOTE(review): bert_config is not referenced in the visible part of this notebook.
bert_config = BertConfig.from_pretrained(bert_model, output_hidden_states=True)


class NerDataset(Dataset):
    """Character-level NER dataset read from a ``token<TAB>tag`` file.

    Sentences are split on the '。' character (which is itself dropped),
    truncated to ``max_len`` characters and wrapped in [CLS] / [SEP].
    """

    def __init__(self, f_path, max_len=None):
        """Read and segment the corpus.

        Args:
            f_path: Path to a UTF-8 file with one ``token<TAB>tag`` pair per
                line; blank lines are ignored.
            max_len: Maximum characters kept per sentence. Defaults to the
                module-level ``MAX_LEN``.
        """
        if max_len is None:
            max_len = MAX_LEN
        self.sents = []
        self.tags_li = []

        with open(f_path, 'r', encoding='utf-8') as f:
            lines = [line.split('\n')[0] for line in f if len(line.strip()) != 0]

        word, tag = [], []
        for line in lines:
            fields = line.split('\t')
            char, t = fields[0], fields[1]
            if char != '。':
                word.append(char)
                tag.append(t)
            else:
                self._add_sentence(word, tag, max_len)
                word, tag = [], []

        # Keep the last sentence even when the file does not end with '。';
        # it used to be dropped silently.
        if word:
            self._add_sentence(word, tag, max_len)

    def _add_sentence(self, word, tag, max_len):
        """Store one sentence, truncated to ``max_len`` and wrapped in [CLS]/[SEP]."""
        self.sents.append(['[CLS]'] + word[:max_len] + ['[SEP]'])
        self.tags_li.append(['[CLS]'] + tag[:max_len] + ['[SEP]'])

    def __getitem__(self, idx):
        """Return ``(token_ids, label_ids, sequence_length)`` for one sentence."""
        words, tags = self.sents[idx], self.tags_li[idx]
        token_ids = tokenizer.convert_tokens_to_ids(words)
        label_ids = [tag2idx[tag] for tag in tags]
        seqlen = len(label_ids)
        return token_ids, label_ids, seqlen

    def __len__(self):
        """Return the number of sentences."""
        return len(self.sents)

def PadBatch(batch):
    """Collate ``(token_ids, label_ids, seqlen)`` samples into padded tensors.

    Every sequence is right-padded with 0 to the longest ``seqlen`` in the
    batch.

    Returns:
        Tuple ``(token_tensors, label_tensors, mask)`` where ``mask`` is a
        boolean tensor that is True wherever the token id is greater than 0.
    """
    target_len = max(item[2] for item in batch)

    padded_tokens = []
    padded_labels = []
    for tokens, tag_ids, _ in batch:
        padded_tokens.append(tokens + [0] * (target_len - len(tokens)))
        padded_labels.append(tag_ids + [0] * (target_len - len(tag_ids)))

    token_tensors = torch.LongTensor(padded_tokens)
    label_tensors = torch.LongTensor(padded_labels)
    mask = (token_tensors > 0).to(torch.bool)

    return token_tensors, label_tensors, mask

In [20]:
def main(batch_size=64, lr=0.001, n_epochs=40, num_layers_to_finetune=2, r=16, lora_alpha=16,
         lora_dropout=0.05, trainset= trainset, validset=validset, testset=testset):
    """Train a ``Bert_CRF`` tagger, checkpoint the best epoch and report test metrics.

    Args:
        batch_size: Batch size of the training loader; the validation and
            test loaders use half of it (at least 1).
        lr: Learning rate handed to ``AdamW``.
        n_epochs: Number of train/validate rounds.
        num_layers_to_finetune: Not read by this function; kept so existing
            calls keep working.
        r: Not read by this function.
        lora_alpha: Not read by this function.
        lora_dropout: Not read by this function.
        trainset: Path passed to ``NerDataset`` for the training split.
        validset: Path passed to ``NerDataset`` for the validation split.
        testset: Path passed to ``NerDataset`` for the test split.
            The three defaults are captured from the notebook globals at the
            moment this cell is executed.

    Side effects:
        Creates ``save_dir`` and writes a state-dict checkpoint there each
        time an epoch improves both validation loss and accuracy; prints
        progress, a classification report and a confusion matrix.
    """
    best_model = None
    best_path = None
    _best_val_loss = 1e18
    _best_val_acc = 1e-18

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Place the model on the detected device; an unconditional .cuda() raises
    # on machines without a GPU even though `device` falls back to 'cpu'.
    model = Bert_CRF(tag2idx).to(device)
    print('Initial model Done.')

    train_dataset = NerDataset(trainset)
    eval_dataset = NerDataset(validset)
    test_dataset = NerDataset(testset)
    print('Load Data Done.')

    # batch_size // 2 is 0 for batch_size == 1, which DataLoader rejects.
    eval_batch_size = max(1, batch_size // 2)

    train_iter = data.DataLoader(dataset=train_dataset,
                                 batch_size=batch_size,
                                 shuffle=True,
                                 num_workers=4,
                                 collate_fn=PadBatch)

    eval_iter = data.DataLoader(dataset=eval_dataset,
                                batch_size=eval_batch_size,
                                shuffle=False,
                                num_workers=4,
                                collate_fn=PadBatch)

    test_iter = data.DataLoader(dataset=test_dataset,
                                batch_size=eval_batch_size,
                                shuffle=False,
                                num_workers=4,
                                collate_fn=PadBatch)

    optimizer = AdamW(model.parameters(), lr=lr, eps=1e-6)

    # Warmup over the first 10% of ceil(len / batch_size) * n_epochs steps.
    # NOTE(review): assumes train() steps the scheduler once per batch - confirm.
    steps_per_epoch = (len(train_dataset) + batch_size - 1) // batch_size
    total_steps = steps_per_epoch * n_epochs
    warm_up_ratio = 0.1
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=warm_up_ratio * total_steps,
                                                num_training_steps=total_steps)

    # Created once up front instead of on every epoch.
    save_dir = '/home/yenling/Code In Lunix/mamba/Save Model'
    os.makedirs(save_dir, exist_ok=True)
    # Label checkpoints with the class actually trained here; the previous
    # hard-coded 'Roberta + WWM+ Bi-LSTM + CRF' prefix mislabelled them.
    model_label = type(model).__name__

    print('Start Train...,')
    for epoch in range(1, n_epochs + 1):

        train(epoch, model, train_iter, optimizer, scheduler, device)
        candidate_model, loss, acc = validate(epoch, model, eval_iter, device)

        # A new best must improve BOTH the loss and the accuracy.
        if loss < _best_val_loss and acc > _best_val_acc:
            best_model = candidate_model
            _best_val_loss = loss
            _best_val_acc = acc

            best_path = os.path.join(save_dir, f'{model_label} best_model_epoch_{epoch}_loss_{loss:.4f}_acc_{acc:.4f}.pt')
            torch.save(best_model.state_dict(), best_path)
            print(f"Best model saved at epoch {epoch} with val_loss: {loss:.4f} and val_acc: {acc:.4f} to {best_path}")

        print("=============================================")

    if best_model is None:
        # No epoch qualified (n_epochs == 0, NaN loss, ...): evaluate the
        # live model rather than passing None to test().
        best_model = model
    else:
        # validate() is defined elsewhere; if it returns the live model
        # object rather than a copy, best_model would carry last-epoch
        # weights here.  Reloading the checkpoint is correct in both cases.
        best_model.load_state_dict(torch.load(best_path, map_location=device))

    y_test, y_pred = test(best_model, test_iter, device)
    print(metrics.classification_report(y_test, y_pred, labels=labels, digits=3))
    print(metrics.confusion_matrix(y_test, y_pred, labels=labels))


In [None]:
# Launch the run with 100 epochs instead of main()'s default of 40.
main(n_epochs=100)

## ClinicalDistil BERT + Bi-LSTM + CRF


In [None]:
import torch
from torch.utils.data import Dataset
from transformers import BertConfig, AutoTokenizer
from peft import PeftConfig

import os
# Allocator option handed to PyTorch through the environment.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:2048"

# Hugging Face checkpoint id used for this section's tokenizer and config.
bert_model = 'medicalai/ClinicalBERT'

tokenizer = AutoTokenizer.from_pretrained(bert_model)

# Tag inventory; a tag's position in the tuple is its integer id.
VOCAB = (
    '<PAD>', '[CLS]', '[SEP]', 'O',
    'B-BODY', 'I-BODY',
    'B-SYMP', 'I-SYMP',
    'B-INST', 'I-INST',
    'B-EXAM', 'I-EXAM',
    'B-CHEM', 'I-CHEM',
    'B-DISE', 'I-DISE',
    'B-DRUG', 'I-DRUG',
    'B-SUPP', 'I-SUPP',
    'B-TREAT', 'I-TREAT',
    'B-TIME', 'I-TIME',
)

# Forward and reverse lookups between tag strings and their ids.
tag2idx = {label: position for position, label in enumerate(VOCAB)}
idx2tag = dict(enumerate(VOCAB))

# Leaves two positions for the '[CLS]' and '[SEP]' markers NerDataset adds.
MAX_LEN = 256 - 2

bert_config = BertConfig.from_pretrained(bert_model, output_hidden_states=True)


class NerDataset(Dataset):
    """Sentence-level NER dataset read from a tab-separated token/tag file.

    Every non-blank line is split on tabs: field 0 is the token and field 1
    its tag.  Tokens are grouped into sentences ending at the full stop '。'
    (the delimiter and its tag are not kept).  Each sentence is truncated to
    ``MAX_LEN`` tokens and wrapped in '[CLS]' / '[SEP]' in both the token and
    the tag sequence.  ``MAX_LEN``, ``tokenizer`` and ``tag2idx`` are read
    from the notebook's module-level names.
    """

    def __init__(self, f_path):
        """Parse ``f_path`` into ``self.sents`` and ``self.tags_li``.

        Args:
            f_path: Path of a UTF-8 text file with one ``token<TAB>tag`` pair
                per line.

        Raises:
            IndexError: If a non-blank line contains no tab-separated tag.
        """
        self.sents = []
        self.tags_li = []

        with open(f_path, 'r', encoding='utf-8') as f:
            lines = [line.split('\n')[0] for line in f if len(line.strip()) != 0]

        word, tag = [], []
        for line in lines:
            fields = line.split('\t')
            char, t = fields[0], fields[1]
            if char != '。':
                word.append(char)
                tag.append(t)
            else:
                self._add_sentence(word, tag)
                word, tag = [], []

        # Flush what follows the last '。'.  Without this, a file that does
        # not end with a full stop silently loses its final sentence.
        if word:
            self._add_sentence(word, tag)

    def _add_sentence(self, word, tag):
        """Store one sentence, truncated to MAX_LEN and wrapped in [CLS]/[SEP]."""
        # Slicing is a no-op for sentences that are already short enough.
        self.sents.append(['[CLS]'] + word[:MAX_LEN] + ['[SEP]'])
        self.tags_li.append(['[CLS]'] + tag[:MAX_LEN] + ['[SEP]'])

    def __getitem__(self, idx):
        """Return ``(token_ids, label_ids, seqlen)`` for sentence ``idx``.

        Raises:
            KeyError: If a tag of the sentence is missing from ``tag2idx``.
        """
        words, tags = self.sents[idx], self.tags_li[idx]
        token_ids = tokenizer.convert_tokens_to_ids(words)
        label_ids = [tag2idx[tag] for tag in tags]
        seqlen = len(label_ids)
        return token_ids, label_ids, seqlen

    def __len__(self):
        """Number of stored sentences."""
        return len(self.sents)

def PadBatch(batch):
    """Collate ``(token_ids, label_ids, seqlen)`` samples into padded tensors.

    Every id list is right-padded with 0 up to the largest ``seqlen`` in the
    batch.

    Args:
        batch: Sequence of ``(token_ids, label_ids, seqlen)`` tuples whose
            first two items are Python lists.

    Returns:
        ``(token_tensors, label_tensors, mask)``: two int64 tensors and a
        bool tensor that is True wherever the token id is greater than 0.
    """
    target_len = max(sample[2] for sample in batch)

    def _right_pad(ids):
        # Append zeros until the list reaches the batch-wide target length.
        return ids + [0] * (target_len - len(ids))

    padded_tokens, padded_labels = [], []
    for sample in batch:
        padded_tokens.append(_right_pad(sample[0]))
        padded_labels.append(_right_pad(sample[1]))

    token_tensors = torch.LongTensor(padded_tokens)
    label_tensors = torch.LongTensor(padded_labels)
    mask = token_tensors.gt(0)

    return token_tensors, label_tensors, mask

In [23]:
def main(batch_size=64, lr=0.001, n_epochs=40, num_layers_to_finetune=2, r=16, lora_alpha=16,
         lora_dropout=0.05, trainset= trainset, validset=validset, testset=testset):
    """Train a ``ClinicalDistilBERT_BiLSTM_CRF`` tagger, checkpoint the best epoch and report test metrics.

    Args:
        batch_size: Batch size of the training loader; the validation and
            test loaders use half of it (at least 1).
        lr: Learning rate handed to ``AdamW``.
        n_epochs: Number of train/validate rounds.
        num_layers_to_finetune: Not read by this function; kept so existing
            calls keep working.
        r: Not read by this function.
        lora_alpha: Not read by this function.
        lora_dropout: Not read by this function.
        trainset: Path passed to ``NerDataset`` for the training split.
        validset: Path passed to ``NerDataset`` for the validation split.
        testset: Path passed to ``NerDataset`` for the test split.
            The three defaults are captured from the notebook globals at the
            moment this cell is executed.

    Side effects:
        Creates ``save_dir`` and writes a state-dict checkpoint there each
        time an epoch improves both validation loss and accuracy; prints
        progress, a classification report and a confusion matrix.
    """
    best_model = None
    best_path = None
    _best_val_loss = 1e18
    _best_val_acc = 1e-18

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Place the model on the detected device; an unconditional .cuda() raises
    # on machines without a GPU even though `device` falls back to 'cpu'.
    model = ClinicalDistilBERT_BiLSTM_CRF(tag2idx).to(device)
    print('Initial model Done.')

    train_dataset = NerDataset(trainset)
    eval_dataset = NerDataset(validset)
    test_dataset = NerDataset(testset)
    print('Load Data Done.')

    # batch_size // 2 is 0 for batch_size == 1, which DataLoader rejects.
    eval_batch_size = max(1, batch_size // 2)

    train_iter = data.DataLoader(dataset=train_dataset,
                                 batch_size=batch_size,
                                 shuffle=True,
                                 num_workers=4,
                                 collate_fn=PadBatch)

    eval_iter = data.DataLoader(dataset=eval_dataset,
                                batch_size=eval_batch_size,
                                shuffle=False,
                                num_workers=4,
                                collate_fn=PadBatch)

    test_iter = data.DataLoader(dataset=test_dataset,
                                batch_size=eval_batch_size,
                                shuffle=False,
                                num_workers=4,
                                collate_fn=PadBatch)

    optimizer = AdamW(model.parameters(), lr=lr, eps=1e-6)

    # Warmup over the first 10% of ceil(len / batch_size) * n_epochs steps.
    # NOTE(review): assumes train() steps the scheduler once per batch - confirm.
    steps_per_epoch = (len(train_dataset) + batch_size - 1) // batch_size
    total_steps = steps_per_epoch * n_epochs
    warm_up_ratio = 0.1
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=warm_up_ratio * total_steps,
                                                num_training_steps=total_steps)

    # Created once up front instead of on every epoch.
    save_dir = '/home/yenling/Code In Lunix/mamba/Save Model'
    os.makedirs(save_dir, exist_ok=True)
    # Label checkpoints with the class actually trained here; the previous
    # hard-coded 'Roberta + WWM+ Bi-LSTM + CRF' prefix mislabelled them.
    model_label = type(model).__name__

    print('Start Train...,')
    for epoch in range(1, n_epochs + 1):

        train(epoch, model, train_iter, optimizer, scheduler, device)
        candidate_model, loss, acc = validate(epoch, model, eval_iter, device)

        # A new best must improve BOTH the loss and the accuracy.
        if loss < _best_val_loss and acc > _best_val_acc:
            best_model = candidate_model
            _best_val_loss = loss
            _best_val_acc = acc

            best_path = os.path.join(save_dir, f'{model_label} best_model_epoch_{epoch}_loss_{loss:.4f}_acc_{acc:.4f}.pt')
            torch.save(best_model.state_dict(), best_path)
            print(f"Best model saved at epoch {epoch} with val_loss: {loss:.4f} and val_acc: {acc:.4f} to {best_path}")

        print("=============================================")

    if best_model is None:
        # No epoch qualified (n_epochs == 0, NaN loss, ...): evaluate the
        # live model rather than passing None to test().
        best_model = model
    else:
        # validate() is defined elsewhere; if it returns the live model
        # object rather than a copy, best_model would carry last-epoch
        # weights here.  Reloading the checkpoint is correct in both cases.
        best_model.load_state_dict(torch.load(best_path, map_location=device))

    y_test, y_pred = test(best_model, test_iter, device)
    print(metrics.classification_report(y_test, y_pred, labels=labels, digits=3))
    print(metrics.confusion_matrix(y_test, y_pred, labels=labels))


In [None]:
# Launch the run with 100 epochs instead of main()'s default of 40.
main(n_epochs=100)

## Mac BERT + Bi-LSTM + CRF


In [None]:
import torch
from torch.utils.data import Dataset
from transformers import BertConfig, AutoTokenizer
from peft import PeftConfig

import os
# Allocator option handed to PyTorch through the environment.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:2048"

# Hugging Face checkpoint id used for this section's tokenizer and config.
bert_model = 'hfl/chinese-macbert-base'

tokenizer = AutoTokenizer.from_pretrained(bert_model)

# Tag inventory; a tag's position in the tuple is its integer id.
VOCAB = (
    '<PAD>', '[CLS]', '[SEP]', 'O',
    'B-BODY', 'I-BODY',
    'B-SYMP', 'I-SYMP',
    'B-INST', 'I-INST',
    'B-EXAM', 'I-EXAM',
    'B-CHEM', 'I-CHEM',
    'B-DISE', 'I-DISE',
    'B-DRUG', 'I-DRUG',
    'B-SUPP', 'I-SUPP',
    'B-TREAT', 'I-TREAT',
    'B-TIME', 'I-TIME',
)

# Forward and reverse lookups between tag strings and their ids.
tag2idx = {label: position for position, label in enumerate(VOCAB)}
idx2tag = dict(enumerate(VOCAB))

# Leaves two positions for the '[CLS]' and '[SEP]' markers NerDataset adds.
MAX_LEN = 256 - 2

bert_config = BertConfig.from_pretrained(bert_model, output_hidden_states=True)


class NerDataset(Dataset):
    """Sentence-level NER dataset read from a tab-separated token/tag file.

    Every non-blank line is split on tabs: field 0 is the token and field 1
    its tag.  Tokens are grouped into sentences ending at the full stop '。'
    (the delimiter and its tag are not kept).  Each sentence is truncated to
    ``MAX_LEN`` tokens and wrapped in '[CLS]' / '[SEP]' in both the token and
    the tag sequence.  ``MAX_LEN``, ``tokenizer`` and ``tag2idx`` are read
    from the notebook's module-level names.
    """

    def __init__(self, f_path):
        """Parse ``f_path`` into ``self.sents`` and ``self.tags_li``.

        Args:
            f_path: Path of a UTF-8 text file with one ``token<TAB>tag`` pair
                per line.

        Raises:
            IndexError: If a non-blank line contains no tab-separated tag.
        """
        self.sents = []
        self.tags_li = []

        with open(f_path, 'r', encoding='utf-8') as f:
            lines = [line.split('\n')[0] for line in f if len(line.strip()) != 0]

        word, tag = [], []
        for line in lines:
            fields = line.split('\t')
            char, t = fields[0], fields[1]
            if char != '。':
                word.append(char)
                tag.append(t)
            else:
                self._add_sentence(word, tag)
                word, tag = [], []

        # Flush what follows the last '。'.  Without this, a file that does
        # not end with a full stop silently loses its final sentence.
        if word:
            self._add_sentence(word, tag)

    def _add_sentence(self, word, tag):
        """Store one sentence, truncated to MAX_LEN and wrapped in [CLS]/[SEP]."""
        # Slicing is a no-op for sentences that are already short enough.
        self.sents.append(['[CLS]'] + word[:MAX_LEN] + ['[SEP]'])
        self.tags_li.append(['[CLS]'] + tag[:MAX_LEN] + ['[SEP]'])

    def __getitem__(self, idx):
        """Return ``(token_ids, label_ids, seqlen)`` for sentence ``idx``.

        Raises:
            KeyError: If a tag of the sentence is missing from ``tag2idx``.
        """
        words, tags = self.sents[idx], self.tags_li[idx]
        token_ids = tokenizer.convert_tokens_to_ids(words)
        label_ids = [tag2idx[tag] for tag in tags]
        seqlen = len(label_ids)
        return token_ids, label_ids, seqlen

    def __len__(self):
        """Number of stored sentences."""
        return len(self.sents)

def PadBatch(batch):
    """Collate ``(token_ids, label_ids, seqlen)`` samples into padded tensors.

    Every id list is right-padded with 0 up to the largest ``seqlen`` in the
    batch.

    Args:
        batch: Sequence of ``(token_ids, label_ids, seqlen)`` tuples whose
            first two items are Python lists.

    Returns:
        ``(token_tensors, label_tensors, mask)``: two int64 tensors and a
        bool tensor that is True wherever the token id is greater than 0.
    """
    target_len = max(sample[2] for sample in batch)

    def _right_pad(ids):
        # Append zeros until the list reaches the batch-wide target length.
        return ids + [0] * (target_len - len(ids))

    padded_tokens, padded_labels = [], []
    for sample in batch:
        padded_tokens.append(_right_pad(sample[0]))
        padded_labels.append(_right_pad(sample[1]))

    token_tensors = torch.LongTensor(padded_tokens)
    label_tensors = torch.LongTensor(padded_labels)
    mask = token_tensors.gt(0)

    return token_tensors, label_tensors, mask

In [26]:
def main(batch_size=64, lr=0.001, n_epochs=40, num_layers_to_finetune=2, r=16, lora_alpha=16,
         lora_dropout=0.05, trainset= trainset, validset=validset, testset=testset):
    """Train a ``MacBERT_BiLSTM_CRF`` tagger, checkpoint the best epoch and report test metrics.

    Args:
        batch_size: Batch size of the training loader; the validation and
            test loaders use half of it (at least 1).
        lr: Learning rate handed to ``AdamW``.
        n_epochs: Number of train/validate rounds.
        num_layers_to_finetune: Not read by this function; kept so existing
            calls keep working.
        r: Not read by this function.
        lora_alpha: Not read by this function.
        lora_dropout: Not read by this function.
        trainset: Path passed to ``NerDataset`` for the training split.
        validset: Path passed to ``NerDataset`` for the validation split.
        testset: Path passed to ``NerDataset`` for the test split.
            The three defaults are captured from the notebook globals at the
            moment this cell is executed.

    Side effects:
        Creates ``save_dir`` and writes a state-dict checkpoint there each
        time an epoch improves both validation loss and accuracy; prints
        progress, a classification report and a confusion matrix.
    """
    best_model = None
    best_path = None
    _best_val_loss = 1e18
    _best_val_acc = 1e-18

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Place the model on the detected device; an unconditional .cuda() raises
    # on machines without a GPU even though `device` falls back to 'cpu'.
    model = MacBERT_BiLSTM_CRF(tag2idx).to(device)
    print('Initial model Done.')

    train_dataset = NerDataset(trainset)
    eval_dataset = NerDataset(validset)
    test_dataset = NerDataset(testset)
    print('Load Data Done.')

    # batch_size // 2 is 0 for batch_size == 1, which DataLoader rejects.
    eval_batch_size = max(1, batch_size // 2)

    train_iter = data.DataLoader(dataset=train_dataset,
                                 batch_size=batch_size,
                                 shuffle=True,
                                 num_workers=4,
                                 collate_fn=PadBatch)

    eval_iter = data.DataLoader(dataset=eval_dataset,
                                batch_size=eval_batch_size,
                                shuffle=False,
                                num_workers=4,
                                collate_fn=PadBatch)

    test_iter = data.DataLoader(dataset=test_dataset,
                                batch_size=eval_batch_size,
                                shuffle=False,
                                num_workers=4,
                                collate_fn=PadBatch)

    optimizer = AdamW(model.parameters(), lr=lr, eps=1e-6)

    # Warmup over the first 10% of ceil(len / batch_size) * n_epochs steps.
    # NOTE(review): assumes train() steps the scheduler once per batch - confirm.
    steps_per_epoch = (len(train_dataset) + batch_size - 1) // batch_size
    total_steps = steps_per_epoch * n_epochs
    warm_up_ratio = 0.1
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=warm_up_ratio * total_steps,
                                                num_training_steps=total_steps)

    # Created once up front instead of on every epoch.
    save_dir = '/home/yenling/Code In Lunix/mamba/Save Model'
    os.makedirs(save_dir, exist_ok=True)
    # Label checkpoints with the class actually trained here; the previous
    # hard-coded 'Roberta + WWM+ Bi-LSTM + CRF' prefix mislabelled them.
    model_label = type(model).__name__

    print('Start Train...,')
    for epoch in range(1, n_epochs + 1):

        train(epoch, model, train_iter, optimizer, scheduler, device)
        candidate_model, loss, acc = validate(epoch, model, eval_iter, device)

        # A new best must improve BOTH the loss and the accuracy.
        if loss < _best_val_loss and acc > _best_val_acc:
            best_model = candidate_model
            _best_val_loss = loss
            _best_val_acc = acc

            best_path = os.path.join(save_dir, f'{model_label} best_model_epoch_{epoch}_loss_{loss:.4f}_acc_{acc:.4f}.pt')
            torch.save(best_model.state_dict(), best_path)
            print(f"Best model saved at epoch {epoch} with val_loss: {loss:.4f} and val_acc: {acc:.4f} to {best_path}")

        print("=============================================")

    if best_model is None:
        # No epoch qualified (n_epochs == 0, NaN loss, ...): evaluate the
        # live model rather than passing None to test().
        best_model = model
    else:
        # validate() is defined elsewhere; if it returns the live model
        # object rather than a copy, best_model would carry last-epoch
        # weights here.  Reloading the checkpoint is correct in both cases.
        best_model.load_state_dict(torch.load(best_path, map_location=device))

    y_test, y_pred = test(best_model, test_iter, device)
    print(metrics.classification_report(y_test, y_pred, labels=labels, digits=3))
    print(metrics.confusion_matrix(y_test, y_pred, labels=labels))


In [None]:
# Launch the run with 100 epochs instead of main()'s default of 40.
main(n_epochs=100)

## BERTWWM + Bi-LSTM + CRF


In [None]:
import torch
from torch.utils.data import Dataset
from transformers import BertConfig, AutoTokenizer
from peft import PeftConfig

import os
# Allocator option handed to PyTorch through the environment.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:2048"

# Hugging Face checkpoint id used for this section's tokenizer and config.
bert_model = 'hfl/chinese-bert-wwm'

tokenizer = AutoTokenizer.from_pretrained(bert_model)

# Tag inventory; a tag's position in the tuple is its integer id.
VOCAB = (
    '<PAD>', '[CLS]', '[SEP]', 'O',
    'B-BODY', 'I-BODY',
    'B-SYMP', 'I-SYMP',
    'B-INST', 'I-INST',
    'B-EXAM', 'I-EXAM',
    'B-CHEM', 'I-CHEM',
    'B-DISE', 'I-DISE',
    'B-DRUG', 'I-DRUG',
    'B-SUPP', 'I-SUPP',
    'B-TREAT', 'I-TREAT',
    'B-TIME', 'I-TIME',
)

# Forward and reverse lookups between tag strings and their ids.
tag2idx = {label: position for position, label in enumerate(VOCAB)}
idx2tag = dict(enumerate(VOCAB))

# Leaves two positions for the '[CLS]' and '[SEP]' markers NerDataset adds.
MAX_LEN = 256 - 2

bert_config = BertConfig.from_pretrained(bert_model, output_hidden_states=True)


class NerDataset(Dataset):
    """Sentence-level NER dataset read from a tab-separated token/tag file.

    Every non-blank line is split on tabs: field 0 is the token and field 1
    its tag.  Tokens are grouped into sentences ending at the full stop '。'
    (the delimiter and its tag are not kept).  Each sentence is truncated to
    ``MAX_LEN`` tokens and wrapped in '[CLS]' / '[SEP]' in both the token and
    the tag sequence.  ``MAX_LEN``, ``tokenizer`` and ``tag2idx`` are read
    from the notebook's module-level names.
    """

    def __init__(self, f_path):
        """Parse ``f_path`` into ``self.sents`` and ``self.tags_li``.

        Args:
            f_path: Path of a UTF-8 text file with one ``token<TAB>tag`` pair
                per line.

        Raises:
            IndexError: If a non-blank line contains no tab-separated tag.
        """
        self.sents = []
        self.tags_li = []

        with open(f_path, 'r', encoding='utf-8') as f:
            lines = [line.split('\n')[0] for line in f if len(line.strip()) != 0]

        word, tag = [], []
        for line in lines:
            fields = line.split('\t')
            char, t = fields[0], fields[1]
            if char != '。':
                word.append(char)
                tag.append(t)
            else:
                self._add_sentence(word, tag)
                word, tag = [], []

        # Flush what follows the last '。'.  Without this, a file that does
        # not end with a full stop silently loses its final sentence.
        if word:
            self._add_sentence(word, tag)

    def _add_sentence(self, word, tag):
        """Store one sentence, truncated to MAX_LEN and wrapped in [CLS]/[SEP]."""
        # Slicing is a no-op for sentences that are already short enough.
        self.sents.append(['[CLS]'] + word[:MAX_LEN] + ['[SEP]'])
        self.tags_li.append(['[CLS]'] + tag[:MAX_LEN] + ['[SEP]'])

    def __getitem__(self, idx):
        """Return ``(token_ids, label_ids, seqlen)`` for sentence ``idx``.

        Raises:
            KeyError: If a tag of the sentence is missing from ``tag2idx``.
        """
        words, tags = self.sents[idx], self.tags_li[idx]
        token_ids = tokenizer.convert_tokens_to_ids(words)
        label_ids = [tag2idx[tag] for tag in tags]
        seqlen = len(label_ids)
        return token_ids, label_ids, seqlen

    def __len__(self):
        """Number of stored sentences."""
        return len(self.sents)

def PadBatch(batch):
    """Collate ``(token_ids, label_ids, seqlen)`` samples into padded tensors.

    Every id list is right-padded with 0 up to the largest ``seqlen`` in the
    batch.

    Args:
        batch: Sequence of ``(token_ids, label_ids, seqlen)`` tuples whose
            first two items are Python lists.

    Returns:
        ``(token_tensors, label_tensors, mask)``: two int64 tensors and a
        bool tensor that is True wherever the token id is greater than 0.
    """
    target_len = max(sample[2] for sample in batch)

    def _right_pad(ids):
        # Append zeros until the list reaches the batch-wide target length.
        return ids + [0] * (target_len - len(ids))

    padded_tokens, padded_labels = [], []
    for sample in batch:
        padded_tokens.append(_right_pad(sample[0]))
        padded_labels.append(_right_pad(sample[1]))

    token_tensors = torch.LongTensor(padded_tokens)
    label_tensors = torch.LongTensor(padded_labels)
    mask = token_tensors.gt(0)

    return token_tensors, label_tensors, mask

In [35]:
def main(batch_size=64, lr=0.001, n_epochs=40, num_layers_to_finetune=2, r=16, lora_alpha=16,
         lora_dropout=0.05, trainset= trainset, validset=validset, testset=testset):
    """Train a ``BERTWWM_BiLSTM_CRF`` tagger, checkpoint the best epoch and report test metrics.

    Args:
        batch_size: Batch size of the training loader; the validation and
            test loaders use half of it (at least 1).
        lr: Learning rate handed to ``AdamW``.
        n_epochs: Number of train/validate rounds.
        num_layers_to_finetune: Not read by this function; kept so existing
            calls keep working.
        r: Not read by this function.
        lora_alpha: Not read by this function.
        lora_dropout: Not read by this function.
        trainset: Path passed to ``NerDataset`` for the training split.
        validset: Path passed to ``NerDataset`` for the validation split.
        testset: Path passed to ``NerDataset`` for the test split.
            The three defaults are captured from the notebook globals at the
            moment this cell is executed.

    Side effects:
        Creates ``save_dir`` and writes a state-dict checkpoint there each
        time an epoch improves both validation loss and accuracy; prints
        progress, a classification report and a confusion matrix.
    """
    best_model = None
    best_path = None
    _best_val_loss = 1e18
    _best_val_acc = 1e-18

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Place the model on the detected device; an unconditional .cuda() raises
    # on machines without a GPU even though `device` falls back to 'cpu'.
    model = BERTWWM_BiLSTM_CRF(tag2idx).to(device)
    print('Initial model Done.')

    train_dataset = NerDataset(trainset)
    eval_dataset = NerDataset(validset)
    test_dataset = NerDataset(testset)
    print('Load Data Done.')

    # batch_size // 2 is 0 for batch_size == 1, which DataLoader rejects.
    eval_batch_size = max(1, batch_size // 2)

    train_iter = data.DataLoader(dataset=train_dataset,
                                 batch_size=batch_size,
                                 shuffle=True,
                                 num_workers=4,
                                 collate_fn=PadBatch)

    eval_iter = data.DataLoader(dataset=eval_dataset,
                                batch_size=eval_batch_size,
                                shuffle=False,
                                num_workers=4,
                                collate_fn=PadBatch)

    test_iter = data.DataLoader(dataset=test_dataset,
                                batch_size=eval_batch_size,
                                shuffle=False,
                                num_workers=4,
                                collate_fn=PadBatch)

    optimizer = AdamW(model.parameters(), lr=lr, eps=1e-6)

    # Warmup over the first 10% of ceil(len / batch_size) * n_epochs steps.
    # NOTE(review): assumes train() steps the scheduler once per batch - confirm.
    steps_per_epoch = (len(train_dataset) + batch_size - 1) // batch_size
    total_steps = steps_per_epoch * n_epochs
    warm_up_ratio = 0.1
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=warm_up_ratio * total_steps,
                                                num_training_steps=total_steps)

    # Created once up front instead of on every epoch.
    save_dir = '/home/yenling/Code In Lunix/mamba/Save Model'
    os.makedirs(save_dir, exist_ok=True)
    # Label checkpoints with the class actually trained here; the previous
    # hard-coded 'Roberta + WWM+ Bi-LSTM + CRF' prefix mislabelled them.
    model_label = type(model).__name__

    print('Start Train...,')
    for epoch in range(1, n_epochs + 1):

        train(epoch, model, train_iter, optimizer, scheduler, device)
        candidate_model, loss, acc = validate(epoch, model, eval_iter, device)

        # A new best must improve BOTH the loss and the accuracy.
        if loss < _best_val_loss and acc > _best_val_acc:
            best_model = candidate_model
            _best_val_loss = loss
            _best_val_acc = acc

            best_path = os.path.join(save_dir, f'{model_label} best_model_epoch_{epoch}_loss_{loss:.4f}_acc_{acc:.4f}.pt')
            torch.save(best_model.state_dict(), best_path)
            print(f"Best model saved at epoch {epoch} with val_loss: {loss:.4f} and val_acc: {acc:.4f} to {best_path}")

        print("=============================================")

    if best_model is None:
        # No epoch qualified (n_epochs == 0, NaN loss, ...): evaluate the
        # live model rather than passing None to test().
        best_model = model
    else:
        # validate() is defined elsewhere; if it returns the live model
        # object rather than a copy, best_model would carry last-epoch
        # weights here.  Reloading the checkpoint is correct in both cases.
        best_model.load_state_dict(torch.load(best_path, map_location=device))

    y_test, y_pred = test(best_model, test_iter, device)
    print(metrics.classification_report(y_test, y_pred, labels=labels, digits=3))
    print(metrics.confusion_matrix(y_test, y_pred, labels=labels))


In [None]:
# Launch the run with 100 epochs instead of main()'s default of 40.
main(n_epochs=100)

## Roberta + WWM+ Bi-LSTM + CRF


In [31]:
import torch
from torch.utils.data import Dataset
from transformers import BertConfig, AutoTokenizer
from peft import PeftConfig

import os
# Allocator option handed to PyTorch through the environment.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:2048"

# Hugging Face checkpoint id used for this section's tokenizer and config.
bert_model = 'hfl/chinese-roberta-wwm-ext'

tokenizer = AutoTokenizer.from_pretrained(bert_model)

# Tag inventory; a tag's position in the tuple is its integer id.
VOCAB = (
    '<PAD>', '[CLS]', '[SEP]', 'O',
    'B-BODY', 'I-BODY',
    'B-SYMP', 'I-SYMP',
    'B-INST', 'I-INST',
    'B-EXAM', 'I-EXAM',
    'B-CHEM', 'I-CHEM',
    'B-DISE', 'I-DISE',
    'B-DRUG', 'I-DRUG',
    'B-SUPP', 'I-SUPP',
    'B-TREAT', 'I-TREAT',
    'B-TIME', 'I-TIME',
)

# Forward and reverse lookups between tag strings and their ids.
tag2idx = {label: position for position, label in enumerate(VOCAB)}
idx2tag = dict(enumerate(VOCAB))

# Leaves two positions for the '[CLS]' and '[SEP]' markers NerDataset adds.
MAX_LEN = 256 - 2

bert_config = BertConfig.from_pretrained(bert_model, output_hidden_states=True)


class NerDataset(Dataset):
    """Sentence-level NER dataset read from a tab-separated token/tag file.

    Every non-blank line is split on tabs: field 0 is the token and field 1
    its tag.  Tokens are grouped into sentences ending at the full stop '。'
    (the delimiter and its tag are not kept).  Each sentence is truncated to
    ``MAX_LEN`` tokens and wrapped in '[CLS]' / '[SEP]' in both the token and
    the tag sequence.  ``MAX_LEN``, ``tokenizer`` and ``tag2idx`` are read
    from the notebook's module-level names.
    """

    def __init__(self, f_path):
        """Parse ``f_path`` into ``self.sents`` and ``self.tags_li``.

        Args:
            f_path: Path of a UTF-8 text file with one ``token<TAB>tag`` pair
                per line.

        Raises:
            IndexError: If a non-blank line contains no tab-separated tag.
        """
        self.sents = []
        self.tags_li = []

        with open(f_path, 'r', encoding='utf-8') as f:
            lines = [line.split('\n')[0] for line in f if len(line.strip()) != 0]

        word, tag = [], []
        for line in lines:
            fields = line.split('\t')
            char, t = fields[0], fields[1]
            if char != '。':
                word.append(char)
                tag.append(t)
            else:
                self._add_sentence(word, tag)
                word, tag = [], []

        # Flush what follows the last '。'.  Without this, a file that does
        # not end with a full stop silently loses its final sentence.
        if word:
            self._add_sentence(word, tag)

    def _add_sentence(self, word, tag):
        """Store one sentence, truncated to MAX_LEN and wrapped in [CLS]/[SEP]."""
        # Slicing is a no-op for sentences that are already short enough.
        self.sents.append(['[CLS]'] + word[:MAX_LEN] + ['[SEP]'])
        self.tags_li.append(['[CLS]'] + tag[:MAX_LEN] + ['[SEP]'])

    def __getitem__(self, idx):
        """Return ``(token_ids, label_ids, seqlen)`` for sentence ``idx``.

        Raises:
            KeyError: If a tag of the sentence is missing from ``tag2idx``.
        """
        words, tags = self.sents[idx], self.tags_li[idx]
        token_ids = tokenizer.convert_tokens_to_ids(words)
        label_ids = [tag2idx[tag] for tag in tags]
        seqlen = len(label_ids)
        return token_ids, label_ids, seqlen

    def __len__(self):
        """Number of stored sentences."""
        return len(self.sents)

def PadBatch(batch):
    """Collate ``(token_ids, label_ids, seqlen)`` samples into padded tensors.

    Every id list is right-padded with 0 up to the largest ``seqlen`` in the
    batch.

    Args:
        batch: Sequence of ``(token_ids, label_ids, seqlen)`` tuples whose
            first two items are Python lists.

    Returns:
        ``(token_tensors, label_tensors, mask)``: two int64 tensors and a
        bool tensor that is True wherever the token id is greater than 0.
    """
    target_len = max(sample[2] for sample in batch)

    def _right_pad(ids):
        # Append zeros until the list reaches the batch-wide target length.
        return ids + [0] * (target_len - len(ids))

    padded_tokens, padded_labels = [], []
    for sample in batch:
        padded_tokens.append(_right_pad(sample[0]))
        padded_labels.append(_right_pad(sample[1]))

    token_tensors = torch.LongTensor(padded_tokens)
    label_tensors = torch.LongTensor(padded_labels)
    mask = token_tensors.gt(0)

    return token_tensors, label_tensors, mask

In [32]:
def main(batch_size=64, lr=0.001, n_epochs=40, num_layers_to_finetune=2, r=16, lora_alpha=16,
         lora_dropout=0.05, trainset= trainset, validset=validset, testset=testset):
    """Train a ``Roberta_wwm_BiLSTM_CRF`` tagger, checkpoint the best epoch and report test metrics.

    Args:
        batch_size: Batch size of the training loader; the validation and
            test loaders use half of it (at least 1).
        lr: Learning rate handed to ``AdamW``.
        n_epochs: Number of train/validate rounds.
        num_layers_to_finetune: Not read by this function; kept so existing
            calls keep working.
        r: Not read by this function.
        lora_alpha: Not read by this function.
        lora_dropout: Not read by this function.
        trainset: Path passed to ``NerDataset`` for the training split.
        validset: Path passed to ``NerDataset`` for the validation split.
        testset: Path passed to ``NerDataset`` for the test split.
            The three defaults are captured from the notebook globals at the
            moment this cell is executed.

    Side effects:
        Creates ``save_dir`` and writes a state-dict checkpoint there each
        time an epoch improves both validation loss and accuracy; prints
        progress, a classification report and a confusion matrix.
    """
    best_model = None
    best_path = None
    _best_val_loss = 1e18
    _best_val_acc = 1e-18

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Place the model on the detected device; an unconditional .cuda() raises
    # on machines without a GPU even though `device` falls back to 'cpu'.
    model = Roberta_wwm_BiLSTM_CRF(tag2idx).to(device)
    print('Initial model Done.')

    train_dataset = NerDataset(trainset)
    eval_dataset = NerDataset(validset)
    test_dataset = NerDataset(testset)
    print('Load Data Done.')

    # batch_size // 2 is 0 for batch_size == 1, which DataLoader rejects.
    eval_batch_size = max(1, batch_size // 2)

    train_iter = data.DataLoader(dataset=train_dataset,
                                 batch_size=batch_size,
                                 shuffle=True,
                                 num_workers=4,
                                 collate_fn=PadBatch)

    eval_iter = data.DataLoader(dataset=eval_dataset,
                                batch_size=eval_batch_size,
                                shuffle=False,
                                num_workers=4,
                                collate_fn=PadBatch)

    test_iter = data.DataLoader(dataset=test_dataset,
                                batch_size=eval_batch_size,
                                shuffle=False,
                                num_workers=4,
                                collate_fn=PadBatch)

    optimizer = AdamW(model.parameters(), lr=lr, eps=1e-6)

    # Warmup over the first 10% of ceil(len / batch_size) * n_epochs steps.
    # NOTE(review): assumes train() steps the scheduler once per batch - confirm.
    steps_per_epoch = (len(train_dataset) + batch_size - 1) // batch_size
    total_steps = steps_per_epoch * n_epochs
    warm_up_ratio = 0.1
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=warm_up_ratio * total_steps,
                                                num_training_steps=total_steps)

    # Created once up front instead of on every epoch.
    save_dir = '/home/yenling/Code In Lunix/mamba/Save Model'
    os.makedirs(save_dir, exist_ok=True)

    print('Start Train...,')
    for epoch in range(1, n_epochs + 1):

        train(epoch, model, train_iter, optimizer, scheduler, device)
        candidate_model, loss, acc = validate(epoch, model, eval_iter, device)

        # A new best must improve BOTH the loss and the accuracy.
        if loss < _best_val_loss and acc > _best_val_acc:
            best_model = candidate_model
            _best_val_loss = loss
            _best_val_acc = acc

            best_path = os.path.join(save_dir, f'Roberta + WWM+ Bi-LSTM + CRF best_model_epoch_{epoch}_loss_{loss:.4f}_acc_{acc:.4f}.pt')
            torch.save(best_model.state_dict(), best_path)
            print(f"Best model saved at epoch {epoch} with val_loss: {loss:.4f} and val_acc: {acc:.4f} to {best_path}")

        print("=============================================")

    if best_model is None:
        # No epoch qualified (n_epochs == 0, NaN loss, ...): evaluate the
        # live model rather than passing None to test().
        best_model = model
    else:
        # validate() is defined elsewhere; if it returns the live model
        # object rather than a copy, best_model would carry last-epoch
        # weights here.  Reloading the checkpoint is correct in both cases.
        best_model.load_state_dict(torch.load(best_path, map_location=device))

    y_test, y_pred = test(best_model, test_iter, device)
    print(metrics.classification_report(y_test, y_pred, labels=labels, digits=3))
    print(metrics.confusion_matrix(y_test, y_pred, labels=labels))


In [None]:
# Launch the run with 100 epochs instead of main()'s default of 40.
main(n_epochs=100)