## NER prediction

In [4]:
import pandas as pd
from ipywidgets import widgets
from IPython.display import display
import re
from pymatgen.core import Composition
from torch.utils.data import DataLoader
from torch import cuda
from transformers import BertTokenizerFast
from seqeval.metrics import classification_report
import os
import json

import psie

import nltk
# nltk.download("punkt", quiet=True)

In [6]:
device = "cuda" if cuda.is_available() else "cpu"
print(device)

cuda


In [1]:
radio_buttons = widgets.RadioButtons(
    options=["Solvus"], value="Solvus", description=''
)
print("Extraction Target: ")
display(radio_buttons)

NameError: name 'widgets' is not defined

In [8]:
extr_target = "Solvus"
MAX_LEN = 256
MAIN_DIR = os.getcwd()
MODEL_DIR = os.path.join("models", extr_target, "ner")          # Fine-tuned NER model
CORPUS = os.path.join("corpus", extr_target, "relevant_sentences.json")
OUTPUT = "test_extraction"      # Name of the output file
BERT_VERSION = r'/pretrained_models/m3rg-iitd/matscibert'

In [9]:
id_to_BOI = {
    1: "B-CHEM",     # Chemical entity
    0: "O",          # No entity
}

if extr_target == "Solvus":
  id_to_BOI[2] = "B-SolvusTemp"

In [10]:
BOI_to_id = {'B-CHEM': 1, "0": 0, 'B-SolvusTemp': 2}

## NER微调

In [1]:
from datasets import load_dataset

dataset = load_dataset('json', data_files='./ner_input.json')["train"]

Generating train split: 0 examples [00:00, ? examples/s]

In [5]:
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'  # 启用 CUDA 调试

In [11]:
model = psie.BertForNer.from_pretrained(BERT_VERSION, num_labels=3)
# model = psie.BertForNer.from_pretrained('./models/Tc/ner', num_labels=3)
model.to(device)

Some weights of BertForNer were not initialized from the model checkpoint at /datashare/dir_cyx/pretrained_models/m3rg-iitd/matscibert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForNer(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(31090, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine

In [12]:
# 创建实例，对文本进行编码
tokenizer = BertTokenizerFast.from_pretrained(BERT_VERSION)

def encode(paper):
  return tokenizer(paper["sentence"], truncation=True, max_length=256, padding="max_length")

dataset = dataset.map(encode, batched=True)
dataset.set_format(type="torch", columns=["source", "sentence", "input_ids", "attention_mask",
                                         "label"])

Map:   0%|          | 0/219 [00:00<?, ? examples/s]

划分数据集
* train_size = 0.6  # 训练集比例
* val_size = 0.2  # 验证集比例
* test_size = 0.2  # 测试集比例

In [14]:
train_test = dataset.train_test_split(
    test_size=0.4,shuffle=True,seed=666)
train_dataset = train_test['train']
test_dataset = train_test['test']

test_val = test_dataset.train_test_split(
    test_size=0.5,shuffle=True,seed=666)
test_dataset = test_val['train']
val_dataset = test_val['test']

In [16]:
# 加载编码后的数据集
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

In [17]:
import torch
import torch.optim as optim
from transformers import AdamW, get_linear_schedule_with_warmup

# 优化器
num_epochs = 150

max_norm = 1.0  # 梯度裁剪的最大范数
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
total_steps = len(train_loader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)



In [2]:
tr_Loss_list = []
tr_Acc_list = []
val_Acc_list = []
val_Loss_list = []
val_recall_list = []
val_f1_list = []
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    tr_loss, tr_accuracy, val_loss, val_accuracy, val_recall, val_f1= model.finetuning(train_loader, val_loader, device, max_norm, optimizer,scheduler)
    tr_Loss_list.append(tr_loss)
    tr_Acc_list.append(tr_accuracy)
    val_Loss_list.append(val_loss)
    val_Acc_list.append(val_accuracy)
    val_recall_list.append(val_recall)
    val_f1_list.append(val_f1)

NameError: name 'num_epochs' is not defined

In [28]:
# 保存模型参数
torch.save(model.state_dict(), './ner_epoch.pt')

## test

In [42]:
# strict=False
model = psie.BertForNer.from_pretrained(BERT_VERSION, num_labels=3)
model.load_state_dict(torch.load('./ner_epoch.pt'))
model.eval()    # 制定model.eval()固定dropout和BN层。
model.to(device)

Some weights of BertForNer were not initialized from the model checkpoint at /datashare/dir_cyx/pretrained_models/m3rg-iitd/matscibert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForNer(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(31090, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine

In [43]:
labels, predictions = model.testLabeledData(test_loader, device, id_to_BOI)

Validation loss per 100 evaluation steps: 0.2686489224433899
Loss: 0.24651015301545462
Accuracy: 0.9363878038194445


In [44]:
BOI_to_id = {'B-CHEM': 1, 'O': 0, 'B-SolvusTemp': 2}
new_predictions = [BOI_to_id[label] for label in predictions]
new_labels = [BOI_to_id[label] for label in labels]

在多类别分类问题中，评估指标的 average 参数通常可以选择以下几个选项：
* None: 返回每个类别的评估指标，不进行平均。
* micro: 对所有类别的真阳性、假阳性和假阴性进行全局计算，然后计算指标。
* macro: 对每个类别计算指标，然后求取平均值。每个类别的权重相同。
* weighted: 对每个类别计算指标，然后按照每个类别的支持样本数进行加权平均。