## Import Packages

In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split

import json

from transformers import BertTokenizer, BertForTokenClassification

from transformers import BertTokenizerFast



import torch

from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

from transformers import AdamW, get_linear_schedule_with_warmup

import numpy as np

import random



# Progress bar used by the training and evaluation loops

from tqdm.auto import tqdm



# Pick the compute device: CUDA when a GPU is available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def same_seeds(seed):
    """Seed all random number generators used here and pin cuDNN behaviour.

    Args:
        seed: Integer seed applied to Python's ``random``, NumPy and PyTorch
            (CPU generator and, when CUDA is available, the CUDA generators).
    """
    # Framework-independent generators first.
    random.seed(seed)
    np.random.seed(seed)

    # PyTorch CPU generator, then every CUDA device if there is one.
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

    # Disable cuDNN auto-tuning and request deterministic kernels.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


same_seeds(0)

  from .autonotebook import tqdm as notebook_tqdm


## Read Data



- {train/test}set:



  - List of dicts with the following keys:



   - ID (string)



   - Sentence (string)



   - Aspect (string)



   - AspectFromTo (string): start point#end point



   - Category (string) : "食物#品質"



   - Opinion (string)



   - OpinionFromTo (string):  start point#end point





   - Intensity (string) :  "6.17#6.33"







- valid set:



  - List of [ID, Sentence]


In [3]:
def read_data(file):
    """Load a JSON dataset and split it into training and test DataFrames.

    Args:
        file: Path to a UTF-8 encoded JSON file.

    Returns:
        A ``(train_df, test_df)`` pair produced by ``train_test_split`` with
        ``test_size=0.2`` and ``random_state=42``.
    """
    with open(file, encoding="utf-8") as handle:
        records = json.load(handle)

    # Flatten the parsed JSON records into a tabular DataFrame.
    frame = pd.json_normalize(records)

    # Hold out 20% of the rows; the fixed seed keeps the split repeatable.
    train_part, test_part = train_test_split(frame, test_size=0.2, random_state=42)
    return train_part, test_part
# Load the training JSON and split it into the training and held-out test DataFrames.

train_df, test_df = read_data(r'E:\NYCU-Project\Class\NLP\Dimensional ASTE\NYCU_NLP_113A_Dataset\NYCU_NLP_113A_TrainingSet.json')

# test_questions, test_paragraphs = read_data('/kaggle/input/ml2021-spring-hw7/hw7_test.json')

## Load Model and Tokenizer


### 創建自定義的 Dataset 類別

我們需要創建一個繼承自 torch.utils.data.Dataset 的類別，來處理資料並提供給 DataLoader。

In [4]:
from torch.utils.data import Dataset



class TripletExtractionDataset(torch.utils.data.Dataset):
    """Map-style dataset that serves one encoded sentence per index.

    Args:
        inputs: Mapping with ``'input_ids'`` and ``'attention_mask'`` entries,
            each an indexable collection with one item per sample.
        labels: Per-sample tag ids, indexable like ``inputs['input_ids']``.
        intensity_labels: Per-sample intensity targets, indexable the same way.
    """

    def __init__(self, inputs, labels, intensity_labels):
        self.input_ids = inputs['input_ids']
        self.attention_mask = inputs['attention_mask']
        self.labels = labels
        self.intensity_labels = intensity_labels

    def __len__(self):
        # The number of samples is taken from the input id collection.
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors (float for intensities, long otherwise)."""
        columns = (
            ('input_ids', self.input_ids, torch.long),
            ('attention_mask', self.attention_mask, torch.long),
            ('labels', self.labels, torch.long),
            ('intensity_labels', self.intensity_labels, torch.float),
        )
        return {
            key: torch.tensor(values[idx], dtype=dtype)
            for key, values, dtype in columns
        }



### Tokenize Data



将数据准备为 BERT 输入格式

您可以提取 Sentence 字段并将其分词。为了准备 BERT 的输入，可以根据需要将 Aspect 和 Opinion 部分组合到句子中。这里假设要为每个 Sentence 生成分词结果并对齐其 Aspect 和 Opinion 信息：

In [5]:
# Count the distinct values found in the 'Category' field of the training rows.
unique_labels = {
    category
    for _, row in train_df.iterrows()
    for category in row['Category']
}

# for i in unique_labels:
    # print(i)

num_labels = len(unique_labels)
#print("num_labels:", num_labels)

model_name = 'bert-base-chinese'

# Use the fast tokenizer: it can return offset mappings, which are used to
# align character positions with tokens.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-chinese')

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [6]:
# BIO tag vocabulary used for token classification.
label_to_id = {'O': 0, 'B-Aspect': 1, 'I-Aspect': 2, 'B-Opinion': 3, 'I-Opinion': 4}


def encode_labels(text, aspects, opinions, aspect_positions, opinion_positions, intensities):
    """Build per-token BIO tag ids and intensity targets for one sentence.

    The sentence is tokenized (truncated to 32 tokens, special tokens
    included) and each ``"start#end"`` position string is mapped onto tokens
    through the tokenizer's offset mapping. The start value is decremented by
    one before the lookup and the end value is looked up at ``end - 1``.
    Spans whose boundary characters are not covered by any token are skipped.

    Args:
        text: Sentence to tokenize.
        aspects: Aspect terms; only used to pair up with ``aspect_positions``.
        opinions: Opinion terms; only used to pair up with the opinion data.
        aspect_positions: ``"start#end"`` strings, one per aspect.
        opinion_positions: ``"start#end"`` strings, one per opinion.
        intensities: ``"valence#arousal"`` strings, one per opinion.

    Returns:
        ``(label_ids, intensity_labels)``: a list of tag ids and a list of
        ``[valence, arousal]`` pairs, both with one entry per token. Tokens
        outside opinion spans keep ``[0.0, 0.0]``.
    """
    # A single tokenizer call keeps tokens and offsets aligned.
    encoding = tokenizer(text, return_offsets_mapping=True, truncation=True, max_length=32, add_special_tokens=True)
    token_count = len(encoding.tokens())

    tag_names = ['O'] * token_count
    intensity_labels = [[0.0, 0.0] for _ in range(token_count)]

    # Character position -> index of the token covering it. Zero-width
    # offsets (start == end) contribute nothing because their range is empty.
    char_to_token_map = {
        char_pos: token_idx
        for token_idx, (start, end) in enumerate(encoding['offset_mapping'])
        for char_pos in range(start, end)
    }

    def token_span(position):
        """Return (first_token, last_token) for a "start#end" string, or None."""
        first_char, last_char = map(int, position.split('#'))
        try:
            return char_to_token_map[first_char - 1], char_to_token_map[last_char - 1]
        except KeyError:
            return None

    # Aspect spans.
    for _, position in zip(aspects, aspect_positions):
        span = token_span(position)
        if span is None:
            continue
        first_token, last_token = span
        tag_names[first_token] = 'B-Aspect'
        for token_idx in range(first_token + 1, last_token + 1):
            tag_names[token_idx] = 'I-Aspect'

    # Opinion spans; every token in the span also receives the intensity pair.
    for _, position, intensity in zip(opinions, opinion_positions, intensities):
        span = token_span(position)
        valence, arousal = map(float, intensity.split('#'))
        if span is None:
            continue
        first_token, last_token = span
        tag_names[first_token] = 'B-Opinion'
        intensity_labels[first_token] = [valence, arousal]
        for token_idx in range(first_token + 1, last_token + 1):
            tag_names[token_idx] = 'I-Opinion'
            intensity_labels[token_idx] = [valence, arousal]

    label_ids = [label_to_id[name] for name in tag_names]
    return label_ids, intensity_labels



def data_generator(df):
    """Encode every row of ``df`` and wrap the result in a dataset.

    Each sentence is tokenized to exactly 32 tokens (padded or truncated),
    and the tag ids / intensity targets from ``encode_labels`` are padded
    with ``'O'`` / ``[0.0, 0.0]`` or cut so they match that length.

    Args:
        df: DataFrame with ``Sentence``, ``Aspect``, ``Opinion``,
            ``AspectFromTo``, ``OpinionFromTo`` and ``Intensity`` columns.

    Returns:
        A ``TripletExtractionDataset`` over all rows of ``df``.
    """
    all_input_ids, all_attention_masks = [], []
    all_label_ids, all_intensity_labels = [], []

    for _, row in df.iterrows():
        sentence = row['Sentence']
        row_aspects, row_opinions = row['Aspect'], row['Opinion']
        row_aspect_pos, row_opinion_pos = row['AspectFromTo'], row['OpinionFromTo']
        row_intensities = row['Intensity']

        # Fixed-length model inputs.
        encoding = tokenizer(sentence, truncation=True, padding='max_length', max_length=32, return_offsets_mapping=True)

        # Token-level targets (unpadded at this point).
        label_ids, intensity_labels = encode_labels(
            sentence, row_aspects, row_opinions, row_aspect_pos, row_opinion_pos, row_intensities
        )

        # Bring the targets to the same length as the padded inputs.
        seq_length = len(encoding['input_ids'])
        shortfall = seq_length - len(label_ids)
        if shortfall > 0:
            label_ids.extend([label_to_id['O']] * shortfall)
            intensity_labels.extend([[0.0, 0.0]] * (seq_length - len(intensity_labels)))
        else:
            label_ids = label_ids[:seq_length]
            intensity_labels = intensity_labels[:seq_length]

        # The offsets are not needed past this point.
        encoding.pop('offset_mapping')

        all_input_ids.append(encoding['input_ids'])
        all_attention_masks.append(encoding['attention_mask'])
        all_label_ids.append(label_ids)
        all_intensity_labels.append(intensity_labels)

    inputs = {'input_ids': all_input_ids, 'attention_mask': all_attention_masks}
    return TripletExtractionDataset(inputs, all_label_ids, all_intensity_labels)



# Encode both splits.
Train_dataset, test_dataset = (data_generator(frame) for frame in (train_df, test_df))

# Batches of 8: random order for training, fixed order for evaluation.
train_loader = DataLoader(
    Train_dataset,
    batch_size=8,
    sampler=RandomSampler(Train_dataset),
)
test_loader = DataLoader(
    test_dataset,
    batch_size=8,
    sampler=SequentialSampler(test_dataset),
)


## Train

In [None]:
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# # 定义模型
# model = BertForTokenClassification.from_pretrained('bert-base-chinese', num_labels=len(label_to_id))
# model.to(device)


# # 训练参数
# epochs = 3
# learning_rate = 5e-5
# validation = True



# optimizer = AdamW(model.parameters(), lr=learning_rate, eps=1e-8)

# total_steps = len(train_loader) * epochs

# scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# print("开始训练...")


# for epoch in range(epochs):

#     print(f'正在训练第 {epoch + 1}/{epochs} 个 epoch')

#     model.train()

#     total_loss = 0  # 初始化本 epoch 的总损失

#     # 训练步骤

#     for step, batch in enumerate(tqdm(train_loader)):

#         # print(f"Batch type: {type(batch)}")

#         # print(f"Batch keys: {batch.keys() if isinstance(batch, dict) else 'Not a dict'}")
#         batch_input_ids = batch['input_ids'].to(device)
#         batch_attention_masks = batch['attention_mask'].to(device)
#         batch_labels = batch['labels'].to(device)
#         optimizer.zero_grad()
#         outputs = model(input_ids=batch_input_ids, attention_mask=batch_attention_masks, labels=batch_labels)
#         loss = outputs.loss



#         loss.backward()

#         torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

#         scheduler.step()
#         total_loss += loss.item()
        
#     avg_train_loss = total_loss / len(train_loader)
#     print(f'平均训练损失: {avg_train_loss}')

#     if validation:
#         model.eval()
#         eval_loss = 0
#         eval_accuracy = 0
#         nb_eval_steps = 0

#         for batch in test_loader:
#             batch_input_ids = batch['input_ids'].to(device)
#             batch_attention_masks = batch['attention_mask'].to(device)
#             batch_labels = batch['labels'].to(device)

#             with torch.no_grad():

#                 outputs = model(input_ids=batch_input_ids, attention_mask=batch_attention_masks, labels=batch_labels)

#             loss = outputs.loss
#             logits = outputs.logits
#             eval_loss += loss.item()

#             predictions = torch.argmax(logits, dim=2).detach().cpu().numpy()
#             labels = batch_labels.cpu().numpy()



#             # 忽略填充标签 (-100) 计算准确度

#             for i in range(len(labels)):

#                 valid_labels = labels[i] != -100

#                 eval_accuracy += np.sum(predictions[i][valid_labels] == labels[i][valid_labels]) / np.sum(valid_labels)

#                 nb_eval_steps += 1

#         avg_eval_loss = eval_loss / len(test_loader)
#         avg_eval_accuracy = eval_accuracy / nb_eval_steps
#         print(f'验证损失: {avg_eval_loss}')
#         print(f'验证准确度: {avg_eval_accuracy}')
       
# # 保存模型和分词器
# model.save_pretrained('saved_model')
# tokenizer.save_pretrained('saved_model')
# print("训练完成！")

### Define ASTE model

In [8]:
import torch
import torch.nn as nn
from transformers import BertModel, BertPreTrainedModel

class ASTEModel(BertPreTrainedModel):
    """BERT encoder with a token tagging head and a valence/arousal regression head.

    ``classifier`` maps every token's hidden state to ``config.num_labels``
    tag logits; ``regressor`` maps a hidden state to two values (valence and
    arousal).
    """

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels  # number of sequence tags
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # Token tagging head.
        self.classifier = nn.Linear(config.hidden_size, self.num_labels)
        # Intensity regression head: outputs (valence, arousal).
        self.regressor = nn.Linear(config.hidden_size, 2)

        self.init_weights()

    def forward(self, input_ids, attention_mask=None, token_type_ids=None, labels=None, intensity_labels=None):
        """Run the encoder and, when targets are given, compute the training loss.

        Args:
            input_ids: Token ids, ``[batch_size, seq_len]``.
            attention_mask: Optional 0/1 mask, ``[batch_size, seq_len]``. When
                omitted, every position counts towards the losses.
            token_type_ids: Optional segment ids forwarded to BERT.
            labels: Optional tag ids, ``[batch_size, seq_len]``.
            intensity_labels: Optional regression targets,
                ``[batch_size, seq_len, 2]``. Only used together with
                ``labels``, which identify the opinion tokens.

        Returns:
            ``(logits,)`` when no loss could be computed, otherwise
            ``(total_loss, logits)`` where ``total_loss`` is the tagging
            cross-entropy plus, when opinion tokens are present, the MSE of
            the intensity predictions on those tokens.
        """
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )
        sequence_output = self.dropout(outputs.last_hidden_state)  # [batch_size, seq_len, hidden_size]

        # Tag logits for every token.
        logits = self.classifier(sequence_output)  # [batch_size, seq_len, num_labels]

        # Boolean mask of the positions that take part in the losses.
        if attention_mask is not None:
            active_mask = attention_mask == 1
        else:
            active_mask = torch.ones(logits.shape[:2], dtype=torch.bool, device=logits.device)

        total_loss = None

        # Tagging loss over non-padded positions only.
        if labels is not None:
            flat_active = active_mask.reshape(-1)
            active_logits = logits.view(-1, self.num_labels)[flat_active]
            active_labels = labels.view(-1)[flat_active]
            total_loss = nn.CrossEntropyLoss()(active_logits, active_labels)

        # Intensity regression loss over opinion tokens only.
        if intensity_labels is not None and labels is not None:
            # Tag ids 3 and 4 are B-Opinion / I-Opinion in this file's label map.
            # The mask must stay boolean: combining it with the integer
            # attention mask would promote it to int64, and indexing with an
            # int64 tensor gathers rows 0/1 instead of selecting masked rows.
            opinion_mask = ((labels == 3) | (labels == 4)) & active_mask
            opinion_mask_flat = opinion_mask.reshape(-1)  # [batch_size * seq_len]
            if opinion_mask_flat.any():
                hidden_size = sequence_output.size(-1)
                opinion_outputs = sequence_output.reshape(-1, hidden_size)[opinion_mask_flat]  # [num_opinion_tokens, hidden_size]
                intensity_preds = self.regressor(opinion_outputs)  # [num_opinion_tokens, 2]
                intensity_targets = intensity_labels.reshape(-1, 2)[opinion_mask_flat]  # [num_opinion_tokens, 2]
                regression_loss = nn.MSELoss()(intensity_preds, intensity_targets)
                total_loss = total_loss + regression_loss

        output = (logits,)
        return ((total_loss,) + output) if total_loss is not None else output
    


### 辅助函数的定义
1. 提取 BIO 标签对应的实体（方面和观点）:extract_spans
2. 提取预测的三元组
3. 提取真实的三元组
4. 计算评估指标

In [9]:
def extract_spans(tags, tokens, label_prefix):
    """Collect the BIO spans of one entity type.

    A span starts at a ``B-<label_prefix>`` tag and extends over the
    ``I-<label_prefix>`` tags that directly follow it.

    Args:
        tags: Sequence of tag strings.
        tokens: Sequence of tokens aligned with ``tags``.
        label_prefix: Entity type, e.g. ``'Aspect'`` or ``'Opinion'``.

    Returns:
        List of ``(start, end, span_tokens)`` tuples with ``end`` exclusive.
    """
    begin_tag = f'B-{label_prefix}'
    inside_tag = f'I-{label_prefix}'
    tag_count = len(tags)

    found = []
    for start, tag in enumerate(tags):
        if tag != begin_tag:
            continue
        # Walk forward over the continuation tags of this span.
        end = start + 1
        while end < tag_count and tags[end] == inside_tag:
            end += 1
        found.append((start, end, tokens[start:end]))
    return found


def extract_triplets(tokens, tags, model, input_ids, attention_mask):
    """Turn predicted tags into (aspect, opinion, intensity) triplets for one sentence.

    Every opinion span is paired with every aspect span; when no aspect span
    exists the opinion is paired with an empty aspect string. The intensity
    is predicted by ``model.regressor`` from the mean hidden state of the
    opinion span and rounded to integers.

    Args:
        tokens: Token strings of the sentence, padding included.
        tags: Predicted tag names aligned with ``tokens``.
        model: Model exposing ``bert`` and ``regressor`` sub-modules.
        input_ids: 1-D tensor of token ids for the sentence.
        attention_mask: 1-D 0/1 tensor marking the non-padded positions.

    Returns:
        List of ``(aspect_text, opinion_text, "valence#arousal")`` tuples.
    """
    # Tags predicted on padded positions are meaningless; neutralise them so
    # they cannot start or extend a span. Indices stay aligned with the
    # hidden states because nothing is removed.
    tags = [
        tag if keep else 'O'
        for tag, keep in zip(tags, attention_mask.tolist())
    ]

    aspect_spans = extract_spans(tags, tokens, 'Aspect')
    opinion_spans = extract_spans(tags, tokens, 'Opinion')

    # Aspect texts do not depend on the opinion, so build them once. A single
    # empty string stands in when no aspect was detected.
    aspect_texts = [
        ''.join(aspect_tokens).replace('##', '')
        for _, _, aspect_tokens in aspect_spans
    ] or ['']

    triplets = []

    # Inference only: keep autograd out of both the encoder and the regressor.
    with torch.no_grad():
        outputs = model.bert(input_ids=input_ids.unsqueeze(0), attention_mask=attention_mask.unsqueeze(0))
        sequence_output = outputs.last_hidden_state.squeeze(0)  # [seq_len, hidden_size]

        for start, end, opinion_tokens in opinion_spans:
            # Mean hidden state of the opinion span.
            opinion_representation = sequence_output[start:end].mean(dim=0)  # [hidden_size]

            intensity_pred = model.regressor(opinion_representation.unsqueeze(0))  # [1, 2]
            intensity_pred = intensity_pred.squeeze(0).detach().cpu().numpy()

            # Round valence and arousal to integers.
            valence = int(round(intensity_pred[0]))
            arousal = int(round(intensity_pred[1]))

            opinion_text = ''.join(opinion_tokens).replace('##', '')
            intensity = f"{valence}#{arousal}"

            for aspect_text in aspect_texts:
                triplets.append((aspect_text, opinion_text, intensity))

    return triplets

def extract_gold_triplets(tokens, tags, intensity_labels):
    """Build the reference (aspect, opinion, intensity) triplets for one sentence.

    Every opinion span is paired with every aspect span, or with an empty
    aspect string when there is no aspect span. The intensity is the mean of
    the span's intensity targets, rounded to integers.

    Args:
        tokens: Token strings of the sentence.
        tags: Gold tag names aligned with ``tokens``.
        intensity_labels: Tensor of per-token ``[valence, arousal]`` targets.

    Returns:
        List of ``(aspect_text, opinion_text, "valence#arousal")`` tuples.
    """
    def as_text(pieces):
        # Join the tokens and drop the '##' markers.
        return ''.join(pieces).replace('##', '')

    aspect_texts = [as_text(pieces) for _, _, pieces in extract_spans(tags, tokens, 'Aspect')]
    if not aspect_texts:
        # No aspect found: pair each opinion with an empty aspect.
        aspect_texts = ['']

    gold = []
    for begin, stop, pieces in extract_spans(tags, tokens, 'Opinion'):
        # Average the targets over the span, then round to integers.
        mean_scores = intensity_labels[begin:stop].mean(dim=0).detach().cpu().numpy()
        valence = int(round(mean_scores[0]))
        arousal = int(round(mean_scores[1]))
        intensity = f"{valence}#{arousal}"

        opinion_text = as_text(pieces)
        gold.extend((aspect_text, opinion_text, intensity) for aspect_text in aspect_texts)

    return gold


def compute_metrics(pred_triplets, gold_triplets):
    """Compute set-based precision, recall and F1 between two triplet collections.

    Duplicates inside either collection are ignored because both are
    converted to sets before comparison.

    Args:
        pred_triplets: Iterable of hashable predicted triplets.
        gold_triplets: Iterable of hashable reference triplets.

    Returns:
        ``(precision, recall, f1)``; each value is ``0.0`` when its
        denominator would be zero.
    """
    predicted = set(pred_triplets)
    expected = set(gold_triplets)
    hits = len(predicted & expected)

    precision = hits / len(predicted) if predicted else 0.0
    recall = hits / len(expected) if expected else 0.0

    denominator = precision + recall
    if denominator > 0:
        f1 = 2 * precision * recall / denominator
    else:
        f1 = 0.0

    return precision, recall, f1


### Training

In [11]:
from transformers import BertConfig
from torch.amp import autocast, GradScaler

import os
import numpy as np
from tqdm import tqdm

epochs = 10000
# Early-stopping state (driven by the validation loss).
best_val_loss = float('inf')
best_avg_loss = float('inf')  # best training loss seen so far, tracked for reference only
patience = 15
patience_counter = 0
min_delta = 0.001  # minimum decrease that counts as an improvement
best_checkpoint_saved = False


label_to_id = {'O': 0, 'B-Aspect': 1, 'I-Aspect': 2, 'B-Opinion': 3, 'I-Opinion': 4}
id_to_label = {v: k for k, v in label_to_id.items()}
num_labels = len(label_to_id)
config = BertConfig.from_pretrained('bert-base-chinese', num_labels=num_labels)
model = ASTEModel.from_pretrained('bert-base-chinese', config=config)
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
total_steps = len(train_loader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
model.to(device)

# Mixed precision only makes sense on CUDA; on CPU both helpers are disabled
# and behave as pass-throughs.
use_amp = device.type == 'cuda'
scaler = GradScaler(device.type, enabled=use_amp)

for epoch in range(epochs):
    model.train()
    total_loss = 0

    # Training pass.
    for step, batch in enumerate(tqdm(train_loader)):
        batch_input_ids = batch['input_ids'].to(device)
        batch_attention_masks = batch['attention_mask'].to(device)
        batch_labels = batch['labels'].to(device)
        batch_intensity_labels = batch['intensity_labels'].to(device)
        optimizer.zero_grad()

        with autocast(device.type, enabled=use_amp):
            outputs = model(
                input_ids=batch_input_ids,
                attention_mask=batch_attention_masks,
                labels=batch_labels,
                intensity_labels=batch_intensity_labels
            )
        loss = outputs[0]

        scaler.scale(loss).backward()
        # Gradients are still multiplied by the loss scale here; unscale them
        # first so that max_norm applies to the true gradient magnitudes.
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    best_avg_loss = min(best_avg_loss, avg_loss)
    print(f"Epoch {epoch+1}/{epochs}, Average Loss: {avg_loss}")

    # Validation pass.
    model.eval()
    total_eval_loss = 0
    all_pred_triplets = []
    all_gold_triplets = []
    sample_index = 0  # running sentence id, keeps triplets of different sentences apart

    with torch.no_grad():
        for batch in tqdm(test_loader):
            batch_input_ids = batch['input_ids'].to(device)
            batch_attention_masks = batch['attention_mask'].to(device)
            batch_labels = batch['labels'].to(device)
            batch_intensity_labels = batch['intensity_labels'].to(device)

            outputs = model(
                input_ids=batch_input_ids,
                attention_mask=batch_attention_masks,
                labels=batch_labels,
                intensity_labels=batch_intensity_labels
            )
            loss = outputs[0]
            total_eval_loss += loss.item()

            logits = outputs[1]
            predictions = torch.argmax(logits, dim=2)  # predicted tag id per token

            # Walk over the sentences of the batch.
            for i in range(batch_input_ids.size(0)):
                input_ids = batch_input_ids[i]
                pred_labels = predictions[i]
                gold_labels = batch_labels[i]
                attention_mask = batch_attention_masks[i]
                intensity_labels = batch_intensity_labels[i]

                tokens = tokenizer.convert_ids_to_tokens(input_ids.cpu().numpy())

                pred_tags = [id_to_label[label_id.item()] for label_id in pred_labels]
                gold_tags = [id_to_label[label_id.item()] for label_id in gold_labels]

                pred_triplets = extract_triplets(tokens, pred_tags, model, input_ids, attention_mask)
                gold_triplets = extract_gold_triplets(tokens, gold_tags, intensity_labels)

                # compute_metrics compares sets, so prefix each triplet with
                # its sentence id: otherwise identical triplets from different
                # sentences collapse and can match across sentences.
                all_pred_triplets.extend((sample_index, *triplet) for triplet in pred_triplets)
                all_gold_triplets.extend((sample_index, *triplet) for triplet in gold_triplets)
                sample_index += 1

    precision, recall, f1 = compute_metrics(all_pred_triplets, all_gold_triplets)

    avg_val_loss = total_eval_loss / len(test_loader)
    print(f"Validation Loss: {avg_val_loss},Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f1:.4f}")

    # Early stopping on the validation loss only. The previous condition also
    # compared the training loss against a value that was never updated from
    # +inf, so it was always true and the patience counter never advanced.
    if avg_val_loss < best_val_loss - min_delta:
        best_val_loss = avg_val_loss
        patience_counter = 0
        # Keep the best weights seen so far.
        torch.save(model.state_dict(), 'best_model.pt')
        best_checkpoint_saved = True
    else:
        patience_counter += 1
        print(f"验证损失未改善，耐心值计数：{patience_counter}")
        if patience_counter >= patience:
            print("验证损失在连续多个 epoch 中未改善，停止训练。")
            break

print("--------------")
print("训练完成")

# Export the best checkpoint of this run rather than the last-epoch weights.
if best_checkpoint_saved:
    model.load_state_dict(torch.load('best_model.pt', map_location=device))

output_dir = './saved_model/'
os.makedirs(output_dir, exist_ok=True)
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)


Some weights of ASTEModel were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight', 'regressor.bias', 'regressor.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  2%|▏         | 12/605 [00:01<01:12,  8.23it/s]


KeyboardInterrupt: 

# Testing

 1. 使用自定义的 ASTEModel 进行预测
 2. 提取实体并格式化输出

In [None]:
import torch
from transformers import BertConfig, BertTokenizerFast

# BIO tag vocabulary and its inverse (tag id -> tag name).
label_to_id = {'O': 0, 'B-Aspect': 1, 'I-Aspect': 2, 'B-Opinion': 3, 'I-Opinion': 4}
id_to_label = dict((tag_id, tag_name) for tag_name, tag_id in label_to_id.items())

# Compute device: CUDA when available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Restore the configuration and the custom ASTEModel from the saved
# directory, move the model to the device and switch it to evaluation mode.
config = BertConfig.from_pretrained('./saved_model/', num_labels=len(label_to_id))
model = ASTEModel.from_pretrained('./saved_model/', config=config)
model.to(device)
model.eval()

# Restore the tokenizer from the same directory.
tokenizer = BertTokenizerFast.from_pretrained('./saved_model/')

def predict(sentence):
    """Tag one sentence and return everything needed to extract entities.

    The encoder is run once; its hidden states feed the tagging head and are
    also returned for the later intensity prediction.

    Args:
        sentence: Raw input sentence.

    Returns:
        ``(tokens, predicted_labels, offset_mapping, sequence_output)``, all
        trimmed to the non-padded length: token strings, predicted tag names,
        per-token character offsets, and the encoder hidden states
        ``[seq_len, hidden_size]``.
    """
    encoding = tokenizer(
        sentence,
        return_offsets_mapping=True,
        truncation=True,
        padding='max_length',
        max_length=128,
        return_tensors='pt',
        add_special_tokens=True
    )
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)
    offset_mapping = encoding['offset_mapping'][0]  # batch_size=1

    with torch.no_grad():
        # Single encoder pass; the tagging head is applied the same way the
        # model's forward() does (dropout, then classifier).
        hidden_states = model.bert(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state
        logits = model.classifier(model.dropout(hidden_states))
    predictions = torch.argmax(logits, dim=2)[0].cpu().numpy()  # [seq_len]

    # Tag ids -> tag names.
    predicted_labels = [id_to_label[int(label_id)] for label_id in predictions]

    # Token ids -> token strings.
    tokens = tokenizer.convert_ids_to_tokens(input_ids[0])

    # Drop the padded tail.
    effective_length = (attention_mask[0] == 1).sum().item()
    tokens = tokens[:effective_length]
    predicted_labels = predicted_labels[:effective_length]
    offset_mapping = offset_mapping[:effective_length]

    # Hidden states of the real tokens, used for intensity prediction.
    sequence_output = hidden_states[0, :effective_length, :]  # [seq_len, hidden_size]

    return tokens, predicted_labels, offset_mapping, sequence_output

def _strip_wordpiece_prefix(token):
    """Remove a leading '##' continuation marker, leaving other '#' characters intact."""
    return token[2:] if token.startswith('##') else token


def _collect_span(tokens, labels, offsets, begin_idx, inside_label):
    """Gather the span that begins at ``begin_idx`` and continues over ``inside_label`` tags.

    Returns ``(text, char_start, char_end, positions, next_idx)`` where
    ``next_idx`` is the first index after the span.
    """
    positions = [begin_idx]
    char_start = offsets[begin_idx][0].item()
    char_end = offsets[begin_idx][1].item()
    idx = begin_idx + 1
    while idx < len(labels) and str(labels[idx]) == inside_label:
        positions.append(idx)
        char_end = offsets[idx][1].item()
        idx += 1
    text = ''.join(_strip_wordpiece_prefix(tokens[pos]) for pos in positions)
    return text, char_start, char_end, positions, idx


def get_entities(tokens, labels, offsets, sequence_output):
    """Extract aspect and opinion spans from tagged tokens.

    For each opinion span the intensity is predicted by ``model.regressor``
    from the mean hidden state of the span and rounded to integers.

    Args:
        tokens: Token strings.
        labels: Tag names aligned with ``tokens``.
        offsets: Per-token ``(start, end)`` character offsets; each element
            must support ``.item()``.
        sequence_output: Hidden states ``[seq_len, hidden_size]``; only read
            when an opinion span is found.

    Returns:
        ``(aspects, opinions)``: lists of dicts. Aspects carry ``text``,
        ``start``, ``end`` and ``positions``; opinions carry ``text``,
        ``start``, ``end`` and ``intensity`` (``"valence#arousal"``).
    """
    aspects = []
    opinions = []
    idx = 0
    while idx < len(labels):
        label = str(labels[idx])  # make sure the tag is compared as a string
        if label == 'B-Aspect':
            aspect_text, aspect_start, aspect_end, aspect_positions, idx = _collect_span(
                tokens, labels, offsets, idx, 'I-Aspect'
            )
            aspects.append({'text': aspect_text, 'start': aspect_start, 'end': aspect_end, 'positions': aspect_positions})
        elif label == 'B-Opinion':
            opinion_text, opinion_start, opinion_end, opinion_positions, idx = _collect_span(
                tokens, labels, offsets, idx, 'I-Opinion'
            )

            # Mean hidden state of the opinion span.
            opinion_hidden_states = sequence_output[opinion_positions, :]  # [opinion_len, hidden_size]
            opinion_representation = opinion_hidden_states.mean(dim=0)  # [hidden_size]
            with torch.no_grad():
                intensity_pred = model.regressor(opinion_representation.unsqueeze(0))  # [1, 2]
                intensity_pred = intensity_pred.squeeze(0).detach().cpu().numpy()
                # Round valence and arousal to integers.
                valence = int(round(intensity_pred[0]))
                arousal = int(round(intensity_pred[1]))
                intensity = f"{valence}#{arousal}"

            opinions.append({
                'text': opinion_text,
                'start': opinion_start,
                'end': opinion_end,
                'intensity': intensity
            })
        else:
            idx += 1
    return aspects, opinions

def format_output(sentence_id, aspects, opinions):
    """Render one submission line for a sentence.

    Every aspect is paired with every opinion, using the opinion's predicted
    intensity.

    Args:
        sentence_id: Identifier written at the start of the line.
        aspects: Dicts with a ``'text'`` entry.
        opinions: Dicts with ``'text'`` and ``'intensity'`` entries.

    Returns:
        ``"<sentence_id>\\t<triplets>"`` where the triplets are
        ``"(aspect, opinion, intensity)"`` items separated by single spaces.
    """
    triplets_str = ' '.join(
        f"({aspect['text']}, {opinion['text']}, {opinion['intensity']})"
        for aspect in aspects
        for opinion in opinions
    )
    return f"{sentence_id}\t{triplets_str}"

# Example sentence and its identifier.
sentence_id, sentence = 'R3530:S002', '整体上菜速度非常快。'

# Tag the sentence, then extract entities and their predicted intensities.
tokens, predict_labels, offsets, sequence_output = predict(sentence)
aspects, opinions = get_entities(tokens, predict_labels, offsets, sequence_output)

# Show the intermediate results.
for caption, value in (
    ("Tokens:", tokens),
    ("Predicted Labels:", predict_labels),
    ("Aspects:", aspects),
    ("Opinions:", opinions),
):
    print(caption, value)

# Print the formatted submission line under its header.
output_line = format_output(sentence_id, aspects, opinions)
print('ID\tTriplets')
print(output_line)


Tokens: ['[CLS]', '整', '体', '上', '菜', '速', '度', '非', '常', '快', '。', '[SEP]']
Predicted Labels: ['O', 'O', 'O', 'B-Aspect', 'I-Aspect', 'I-Aspect', 'I-Aspect', 'B-Opinion', 'I-Opinion', 'I-Opinion', 'O', 'O']
Aspects: [{'text': '上菜速度', 'start': 2, 'end': 6, 'positions': [3, 4, 5, 6]}]
Opinions: [{'text': '非常快', 'start': 6, 'end': 9, 'intensity': '3#3'}]
ID	Triplets
R3530:S002	(上菜速度, 非常快, 3#3)
