In [1]:
import json
# 将数据转为BIO格式
def convert_to_bio(data):
    bio_data = []
    
    for item in data:
        sentence_tokens = []
        sentence_labels = []
        
        # Process the metric (indicator)
        if item["metric"]:
            metric_tokens = item["metric"].split()
            sentence_tokens.extend(metric_tokens)
            # 标记第一个词为B-INDICATOR，其余为I-INDICATOR
            sentence_labels.append("B-INDICATOR")
            sentence_labels.extend(["I-INDICATOR"] * (len(metric_tokens) - 1))
        
        # Process the value
        if item["value"]:
            value_tokens = str(item["value"]).split()
            sentence_tokens.extend(value_tokens)
            # 标记第一个词为B-VALUE
            sentence_labels.append("B-VALUE")
            sentence_labels.extend(["I-VALUE"] * (len(value_tokens) - 1))
        
        # Process the unit
        if item["unit"]:
            unit_tokens = item["unit"].split()
            sentence_tokens.extend(unit_tokens)
            # 标记第一个词为B-UNIT，其余为I-UNIT
            sentence_labels.append("B-UNIT")
            sentence_labels.extend(["I-UNIT"] * (len(unit_tokens) - 1))
        
        # Append sentence tokens and labels to bio_data
        bio_sentence = list(zip(sentence_tokens, sentence_labels))
        bio_data.append(bio_sentence)
    
    return bio_data

# 读取JSON文件
with open("../json/grouped_data_full1.json", "r") as file:
    data = json.load(file)

# 转换为BIO格式
bio_data = convert_to_bio(data)

# 保存为BIO格式的文本文件
with open("../output/bio_data2.txt", "w") as file:
    for sentence in bio_data:
        for word, label in sentence:
            file.write(f"{word} {label}\n")
        file.write("\n")  # 每个句子之间用空行分隔


In [25]:
import json
import torch
from transformers import DebertaTokenizer, DebertaForTokenClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset

# 定义标签映射
label2id = {"O": 0, "B-INDICATOR": 1, "I-INDICATOR": 2, "B-VALUE": 3, "I-VALUE": 4, "B-UNIT": 5, "I-UNIT": 6}
id2label = {v: k for k, v in label2id.items()}

def load_json_data(filepath):
    with open(filepath, 'r') as file:
        data = json.load(file)
    return data

def load_raw_text(filepath):
    with open(filepath, 'r') as file:
        return [line.strip() for line in file if line.strip()]

# 1. 加载并解析BIO格式数据
def load_bio_data(filepath):
    texts, labels = [], []
    with open(filepath, 'r') as file:
        text, label = [], []
        for line in file:
            if line.strip() == "":
                if text:
                    texts.append(" ".join(text))
                    labels.append(label)
                    text, label = [], []
                continue
            
            # 从文件中读取单词和BIO标签，并将标签映射为数字
            word, tag = line.strip().split()
            text.append(word)
            label.append(label2id[tag])  # 使用自定义的label2id映射
            
        if text:  # 处理文件最后一行
            texts.append(" ".join(text))
            labels.append(label)
    return texts, labels



In [26]:
id2label

{0: 'O',
 1: 'B-INDICATOR',
 2: 'I-INDICATOR',
 3: 'B-VALUE',
 4: 'I-VALUE',
 5: 'B-UNIT',
 6: 'I-UNIT'}

In [27]:
# File paths
json_filepath = '../json/grouped_data_full1.json'
bio_filepath = '../output/bio_data.txt'
raw_text_filepath = '../txt/AML.txt'

# Load data
json_data = load_json_data(json_filepath)
bio_texts, bio_labels = load_bio_data(bio_filepath)
raw_texts = load_raw_text(raw_text_filepath)

In [28]:
# 加载BIO格式数据
bio_texts, bio_labels = load_bio_data(bio_filepath)

# 2. 自定义Dataset
class ESGDataset(Dataset):
    def __init__(self, texts, labels=None):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = tokenizer(self.texts[idx], truncation=True, padding="max_length", max_length=128, return_tensors="pt")
        item = {key: val.squeeze() for key, val in encoding.items()}
        if self.labels:
            labels = self.labels[idx] + [0] * (128 - len(self.labels[idx]))  # 使用0填充标签
            item["labels"] = torch.tensor(labels)
        return item

# 加载分词器和模型
model_name = "microsoft/deberta-base"
tokenizer = DebertaTokenizer.from_pretrained(model_name)
model = DebertaForTokenClassification.from_pretrained(model_name, num_labels=len(label2id))

# 设置标签映射
model.config.label2id = label2id
model.config.id2label = id2label

# 3. 准备训练数据和Trainer
train_dataset = ESGDataset(bio_texts, bio_labels)
unlabeled_dataset = ESGDataset(raw_texts)

# 4. Training parameters
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="no",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    # logging_dir='./logs',  # 启用日志
    logging_steps=10,
)

# 5. Trainer for supervised training with labeled data
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)


loading file vocab.json from cache at C:\Users\ariaH/.cache\huggingface\hub\models--microsoft--deberta-base\snapshots\0d1b43ccf21b5acd9f4e5f7b077fa698f05cf195\vocab.json
loading file merges.txt from cache at C:\Users\ariaH/.cache\huggingface\hub\models--microsoft--deberta-base\snapshots\0d1b43ccf21b5acd9f4e5f7b077fa698f05cf195\merges.txt
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at C:\Users\ariaH/.cache\huggingface\hub\models--microsoft--deberta-base\snapshots\0d1b43ccf21b5acd9f4e5f7b077fa698f05cf195\tokenizer_config.json
loading configuration file config.json from cache at C:\Users\ariaH/.cache\huggingface\hub\models--microsoft--deberta-base\snapshots\0d1b43ccf21b5acd9f4e5f7b077fa698f05cf195\config.json
Model config DebertaConfig {
  "_name_or_path": "microsoft/deberta-base",
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden

In [None]:
# 6. 开始微调并保存模型
trainer.train()
trainer.save_model("../model/fine_tuned_deberta")
tokenizer.save_pretrained("../model/fine_tuned_deberta")

***** Running training *****
  Num examples = 18
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 9
  Number of trainable parameters = 138607111


  0%|          | 0/9 [00:00<?, ?it/s]



Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to ./fine_tuned_deberta
Configuration saved in ./fine_tuned_deberta\config.json


{'train_runtime': 75.0504, 'train_samples_per_second': 0.72, 'train_steps_per_second': 0.12, 'train_loss': 0.6539309819539388, 'epoch': 3.0}


Model weights saved in ./fine_tuned_deberta\pytorch_model.bin
tokenizer config file saved in ./fine_tuned_deberta\tokenizer_config.json
Special tokens file saved in ./fine_tuned_deberta\special_tokens_map.json


('./fine_tuned_deberta\\tokenizer_config.json',
 './fine_tuned_deberta\\special_tokens_map.json',
 './fine_tuned_deberta\\vocab.json',
 './fine_tuned_deberta\\merges.txt',
 './fine_tuned_deberta\\added_tokens.json')

In [None]:
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification
from tqdm import tqdm
import re

# 加载保存的微调模型和分词器
model_path = "../model/fine_tuned_deberta"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForTokenClassification.from_pretrained(model_path)

# 初始化NER pipeline
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# 读取新的TXT文件
def read_text_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()
    return text

# 分段处理长文本
def split_text_by_sentences(text, max_len=512):
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s', text)
    chunks = []
    current_chunk = ""
    
    for sentence in sentences:
        if len(current_chunk) + len(sentence) < max_len:
            current_chunk += " " + sentence
        else:
            chunks.append(current_chunk.strip())
            current_chunk = sentence
    if current_chunk:
        chunks.append(current_chunk.strip())
    
    return chunks

# 提取结构化数据
def extract_structured_data(text_chunks):
    structured_data = []
    for chunk in tqdm(text_chunks, desc="Processing Text Chunks"):
        entities = ner_pipeline(chunk)
        current_data = {}
        for entity in entities:
            label_id = entity["entity_group"]  # 直接获取数值标签
            word = entity["word"]
            
            # 根据数值标签进行分类
            if label_id == 1:  # B-METRIC
                current_data["indicator"] = word
            elif label_id == 2:  # I-METRIC
                if "indicator" in current_data:
                    current_data["indicator"] += " " + word
            elif label_id == 3:  # B-VALUE
                current_data["value"] = word
            elif label_id == 4:  # I-VALUE
                if "value" in current_data:
                    current_data["value"] += " " + word
            elif label_id == 5:  # B-UNIT
                current_data["unit"] = word
            elif label_id == 6:  # I-UNIT
                if "unit" in current_data:
                    current_data["unit"] += " " + word
            
            # 保存完整的结构化数据项
            if "indicator" in current_data and "value" in current_data and "unit" in current_data:
                structured_data.append(current_data)
                current_data = {}
    return structured_data
    

# 使用示例
file_path = raw_text_filepath  # 新的报告TXT文件
text = read_text_file(file_path)
text_chunks = split_text_by_sentences(text)  # 分段处理文本，避免超长输入

# 提取结构化数据
structured_data = extract_structured_data(text_chunks)

# 输出结果
for item in structured_data:
    print(item)


loading file vocab.json
loading file merges.txt
loading file tokenizer.json
loading file added_tokens.json


loading file special_tokens_map.json
loading file tokenizer_config.json
loading configuration file ./fine_tuned_deberta\config.json
Model config DebertaConfig {
  "_name_or_path": "./fine_tuned_deberta",
  "architectures": [
    "DebertaForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "O",
    "1": "B-INDICATOR",
    "2": "I-INDICATOR",
    "3": "B-VALUE",
    "4": "I-VALUE",
    "5": "B-UNIT",
    "6": "I-UNIT"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "B-INDICATOR": 1,
    "B-UNIT": 5,
    "B-VALUE": 3,
    "I-INDICATOR": 2,
    "I-UNIT": 6,
    "I-VALUE": 4,
    "O": 0
  },
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "p

In [31]:
structured_data

[]

In [32]:
bio_texts, bio_labels = load_bio_data(bio_filepath)
print(bio_texts[:2])
print(bio_labels[:2])

['Net profit 1.9 S$ million', 'Total greenhouse gas ( GHG ) emissions 346 tonnes CO2e']
[[1, 2, 3, 5, 6], [1, 2, 2, 2, 2, 2, 2, 3, 5, 6]]
