## 文本分类

## 单晶合金

In [None]:
import numpy as np
import pandas as pd
from ipywidgets import widgets
from IPython.display import display
import torch
from torch.utils.data import DataLoader, random_split
from transformers import BertTokenizerFast
import psie

from seqeval.metrics import classification_report
import os
from torch import cuda
import nltk
# nltk.download("punkt", quiet=True)

import re
from pymatgen.core import Composition

import json

In [None]:
import torch

# Report whether PyTorch can see a CUDA device in this environment.
backend_message = "PyTorch is using GPU" if torch.cuda.is_available() else "PyTorch is using CPU"
print(backend_message)

In [None]:
# ---------------------------------------------------------------------------
# Notebook configuration. Every path/hyper-parameter the cells below read is
# collected here so a run can be reconfigured in one place.
# ---------------------------------------------------------------------------

# Location of the pretrained BERT checkpoint handed to `from_pretrained`.
BERT_VERSION = r'/pretrained_models/m3rg-iitd/matscibert'
# Every sentence is truncated/padded to exactly this many tokens.
MAX_LEN = 256
# Name of the extraction target; used to build the model and corpus paths.
extr_target = 'Solvus'
MAIN_DIR = os.getcwd()
MODEL_DIR = os.path.join("models", extr_target, "classifier")
CORPUS = os.path.join("corpus", extr_target, "classifier/corpus_sentences.json")
OUTPUT = "relevant_sentences"

# The three names below are read by later cells (training loop and batch
# prediction) but were never assigned anywhere in the notebook, so running it
# top to bottom raised NameError.
# NOTE(review): the values are placeholders -- confirm them before running.
Num_Epochs = 5  # number of fine-tuning epochs (placeholder)
# Directory scanned for *.json sentence files in the batch-prediction cell (placeholder).
INPUT_DIR = os.path.join(MAIN_DIR, "corpus", extr_target, "sentences")
# File that receives the combined relevant sentences; derived from OUTPUT.
OUTPUT_FILE = os.path.join(MAIN_DIR, OUTPUT + ".json")

In [None]:
# Default to the CPU and switch to CUDA only when a GPU is actually available.
device = "cpu"
if cuda.is_available():
    device = "cuda"
print(device)

In [None]:
from datasets import load_dataset

# Load the labelled sentence corpus; the JSON loader exposes a single file as the "train" split.
corpus_path = os.path.join(MAIN_DIR, CORPUS)
dataset = load_dataset('json', data_files=corpus_path)["train"]

In [None]:
import importlib
from psie import classifier

# Reload psie.classifier so edits made to its source are picked up without
# restarting the kernel; importlib.reload returns the reloaded module object.
classifier = importlib.reload(classifier)

In [None]:
# Instantiate the sentence classifier defined in psie.classifier.
model = psie.classifier.BertClassifier()
model.to(device)   # move the model to the selected device (GPU when available, otherwise CPU)

# # Optionally load saved model weights. strict=False lets load_state_dict
# # tolerate keys that do not match, so only part of the weights may be loaded.
# model.load_state_dict(torch.load(r'./models/Tc/classifier.pt'),strict=False)

In [None]:
# Tokenizer matching the pretrained checkpoint; used to encode every sentence.
tokenizer = BertTokenizerFast.from_pretrained(BERT_VERSION)


def encode(paper):
    """Tokenize the "sentence" field of a batch, truncating/padding to MAX_LEN tokens."""
    sentences = paper["sentence"]
    return tokenizer(sentences, truncation=True, max_length=MAX_LEN, padding="max_length")


# Add the tokenizer outputs to the dataset, then expose only the columns the
# training code reads, formatted for PyTorch.
torch_columns = ["source", "sentence", "input_ids", "attention_mask", "isrelevant"]
dataset = dataset.map(encode, batched=True)
dataset.set_format(type="torch", columns=torch_columns)

In [None]:
# Sanity check: display the attention-mask length of the first encoded sentence.
# With truncation and padding="max_length" in `encode`, this should equal MAX_LEN.
len(dataset[0]['attention_mask'])

划分数据集
* train_size = 0.6  # 训练集比例
* val_size = 0.2  # 验证集比例
* test_size = 0.2  # 测试集比例

In [None]:
# First cut: 60 % for training, the remaining 40 % held out.
train_val = dataset.train_test_split(test_size=0.4, shuffle=True, seed=666)
train_dataset = train_val['train']

# Second cut: split the held-out 40 % in half -> 20 % validation, 20 % test.
test_val = train_val['test'].train_test_split(test_size=0.5, shuffle=True, seed=666)
val_dataset, test_dataset = test_val['train'], test_val['test']

In [None]:
# Wrap the three encoded splits in batched loaders; all share the same settings.
# NOTE(review): shuffle=False also applies to the training loader, so batches are
# served in the same order every epoch -- confirm this is intended.
loader_options = {"batch_size": 32, "shuffle": False}
train_loader = DataLoader(train_dataset, **loader_options)
val_loader = DataLoader(val_dataset, **loader_options)
test_loader = DataLoader(test_dataset, **loader_options)

训练

In [None]:
import torch
import torch.optim as optim
# Adam over all model parameters.
optimizer = optim.Adam(model.parameters(), lr=1e-4)
# Maximum gradient norm passed to model.finetuning for gradient clipping.
max_norm = 1.0

# Per-class loss weights, created directly on the training device.
# NOTE(review): 303/50 presumably reflects the class imbalance of the labelled
# corpus (negatives / positives) -- confirm against the data.
class_weights = torch.tensor([1.0, 303.0 / 50.0], dtype=torch.float32, device=device)

In [None]:
# Display the optimizer configuration as the cell output.
optimizer

In [None]:
# NOTE(review): Num_Epochs is not assigned in this cell; it must already be
# defined (e.g. in the configuration cell) when this cell runs.
num_epochs = Num_Epochs

# Per-epoch metric histories, in the order model.finetuning returns its values.
tr_Loss_list, tr_Acc_list = [], []
val_Loss_list, val_Acc_list = [], []
val_f1_list, val_recall_list = [], []
histories = (tr_Loss_list, tr_Acc_list, val_Loss_list, val_Acc_list, val_f1_list, val_recall_list)

for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    # One finetuning call per epoch; its six return values are unpacked below.
    epoch_loss_tr, tr_accuracy, val_loss, val_accuracy, val_f1, val_recall = model.finetuning(
        train_loader, val_loader, device, max_norm, optimizer, weight=class_weights
    )
    epoch_metrics = (epoch_loss_tr, tr_accuracy, val_loss, val_accuracy, val_f1, val_recall)
    for history, value in zip(histories, epoch_metrics):
        history.append(value)

In [None]:
# Save the model parameters (state dict) to ./classifier.pt in the working
# directory; the test and prediction cells reload them from this same path.
torch.save(model.state_dict(), './classifier.pt')

测试

In [None]:
# Rebuild the classifier and restore the weights stored in ./classifier.pt.
model = psie.classifier.BertClassifier()
# map_location remaps the stored tensors onto the active device, so a checkpoint
# written on a GPU machine still loads on a CPU-only host.
# strict=False tolerates missing/unexpected keys, so only part of the weights may load.
model.load_state_dict(torch.load('./classifier.pt', map_location=device), strict=False)
model.to(device)   # move the model to the selected device

In [None]:
# Run the classifier over the labelled test split.
labels_tensor, preds = model.testLabeledData(test_loader, device)

# Convert the label tensors to plain Python numbers.
labels = [label_tensor.item() for label_tensor in labels_tensor]
# Predicted class = index of the largest score for each sentence.
predictions = [np.argmax(scores.cpu().numpy()) for scores in preds]

In [None]:
# Inspect the first gold label and the first raw model output, one per line.
print(labels_tensor[0], preds[0], sep="\n")

预测

In [None]:
from datasets import load_dataset
# Rebuild the classifier and restore the weights stored in ./classifier.pt.
model = psie.classifier.BertClassifier()
# map_location remaps the stored tensors onto the active device, so a checkpoint
# written on a GPU machine still loads on a CPU-only host.
# strict=False tolerates missing/unexpected keys, so only part of the weights may load.
model.load_state_dict(torch.load('./classifier.pt', map_location=device), strict=False)
model.to(device)   # move the model to the selected device

In [None]:
# Accumulates the sentences predicted as relevant (class 1) across all input files.
combined_filtered_sentences = {"sentence": [], "source": []}
tokenizer = BertTokenizerFast.from_pretrained(BERT_VERSION)


def encode(paper):
    """Tokenize the "sentence" field of a batch, truncating/padding to MAX_LEN tokens."""
    return tokenizer(paper["sentence"], truncation=True, max_length=MAX_LEN, padding="max_length")


# NOTE(review): INPUT_DIR and OUTPUT_FILE must be defined before this cell runs
# (e.g. in the configuration cell); this cell does not assign them itself.
# os.listdir() returns entries in arbitrary order, so sort the names to make the
# order of the combined output reproducible between runs.
for filename in sorted(os.listdir(INPUT_DIR)):
    if not filename.endswith(".json"):
        continue
    file_path = os.path.join(INPUT_DIR, filename)

    # Load and encode the sentences of this file.
    dataset = load_dataset('json', data_files=file_path)['train']
    dataset = dataset.map(encode, batched=True)
    dataset.set_format(type="torch", columns=["source", "sentence", "input_ids", "attention_mask"])
    # shuffle=False keeps predictions aligned with the dataset row order.
    dataset_loader = torch.utils.data.DataLoader(dataset, batch_size=32, shuffle=False)

    # Predicted class = index of the largest score for each sentence.
    pred = model.predict(dataset_loader, device)
    predictions = [np.argmax(scores.cpu().numpy()) for scores in pred]

    # Read each text column once instead of formatting a full row (tensors
    # included) twice per sentence through dataset[i].
    sentences = dataset["sentence"]
    sources = dataset["source"]

    filtered_sentences = {"sentence": [], "source": []}
    for label, sentence, source in zip(predictions, sentences, sources):
        if label == 1:
            filtered_sentences["sentence"].append(sentence)
            filtered_sentences["source"].append(source)

    combined_filtered_sentences["sentence"].extend(filtered_sentences["sentence"])
    combined_filtered_sentences["source"].extend(filtered_sentences["source"])


# Write all relevant sentences from every file as one human-readable UTF-8 JSON document.
with open(OUTPUT_FILE, 'w', encoding='utf-8') as file:
    json.dump(combined_filtered_sentences, file, ensure_ascii=False, indent=4)

处理单个文档

In [None]:
from datasets import load_dataset
# Rebuild the classifier and restore the weights stored in ./classifier.pt.
model = psie.classifier.BertClassifier()
# map_location remaps the stored tensors onto the active device, so a checkpoint
# written on a GPU machine still loads on a CPU-only host.
# strict=False tolerates missing/unexpected keys, so only part of the weights may load.
model.load_state_dict(torch.load('./classifier.pt', map_location=device), strict=False)
model.to(device)   # move the model to the selected device

In [None]:
from datasets import load_dataset
# Load the single document to classify from <MAIN_DIR>/results.json.
# The file name must be a relative component: with a leading "/" os.path.join
# discards MAIN_DIR and resolves to the filesystem root instead.
dataset = load_dataset('json', data_files=os.path.join(MAIN_DIR, 'results.json'))['train']

In [None]:
# Tokenizer matching the pretrained checkpoint; used to encode every sentence.
tokenizer = BertTokenizerFast.from_pretrained(BERT_VERSION)


def encode(paper):
    """Tokenize the "sentence" field of a batch, truncating/padding to MAX_LEN tokens."""
    sentences = paper["sentence"]
    return tokenizer(sentences, truncation=True, max_length=MAX_LEN, padding="max_length")


# Encode the document and expose only the columns needed for prediction.
prediction_columns = ["source", "sentence", "input_ids", "attention_mask"]
dataset = dataset.map(encode, batched=True)
dataset.set_format(type="torch", columns=prediction_columns)

# Batched loader over the encoded sentences; shuffle=False preserves row order.
dataset_loader = torch.utils.data.DataLoader(dataset, batch_size=32, shuffle=False)

In [None]:
# Score every sentence, then take the index of the largest score as the predicted class.
pred = model.predict(dataset_loader, device)
predictions = [np.argmax(scores.cpu().numpy()) for scores in pred]

In [None]:
# Keep only the sentences whose predicted class is 1, together with their sources.
filtered_sentences = {"sentence": [], "source": []}
for i, label in enumerate(predictions):
    if label == 1:
        row = dataset[i]  # fetch the row once instead of once per field
        filtered_sentences["sentence"].append(row["sentence"])
        filtered_sentences["source"].append(row["source"])

# The file name must be a relative component: with a leading "/" os.path.join
# discards MAIN_DIR and the file would be written to the filesystem root.
output_path = os.path.join(MAIN_DIR, "relevant_sentences_all.json")
# Explicit UTF-8 with ensure_ascii=False keeps non-ASCII text readable,
# matching how the batch-prediction cell writes its output.
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(filtered_sentences, f, ensure_ascii=False)