## Import Packages

In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split

import json

from transformers import BertTokenizer, BertForTokenClassification

from transformers import BertTokenizerFast



import torch

from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

from transformers import AdamW, get_linear_schedule_with_warmup

import numpy as np

import random



# 把預訓練好的modelload進來

from tqdm.auto import tqdm



# 设置设备（如果有GPU，则使用GPU）

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')



def same_seeds(seed):



    torch.manual_seed(seed)



    if torch.cuda.is_available():



        torch.cuda.manual_seed(seed)



        torch.cuda.manual_seed_all(seed)



    np.random.seed(seed)



    random.seed(seed)



    torch.backends.cudnn.benchmark = False



    torch.backends.cudnn.deterministic = True



same_seeds(0)

  from .autonotebook import tqdm as notebook_tqdm


## Read Data



- {train/test}set:



  - List of dicts with the following keys:



   - ID (string)



   - Sentence (string)



   - Aspect (string)



   - AspectFromTo (string): start point#end point



   - Category (string) : "食物#品質"



   - Opinion (string)



   - OpinionFromTo (string):  start point#end point





   - Intensity (string) :  "6.17#6.33"







- vaild set:



  - List of[ID, Sentence] 


In [36]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split, KFold

# 读取数据函数
def read_data(file):
    with open(file, 'r', encoding="utf-8") as reader:
        data = json.load(reader)
    df = pd.json_normalize(data)
    return df


## Load Model and Tokenizer


### 創建自定義的 Dataset 類別

我們需要創建一個繼承自 torch.utils.data.Dataset 的類別，來處理資料並提供給 DataLoader。

In [37]:
from torch.utils.data import Dataset



class TripletExtractionDataset(torch.utils.data.Dataset):
    def __init__(self, inputs, labels, intensity_labels):
        self.input_ids = inputs['input_ids']
        self.attention_mask = inputs['attention_mask']
        self.labels = labels
        self.intensity_labels = intensity_labels

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            'input_ids': torch.tensor(self.input_ids[idx], dtype=torch.long),
            'attention_mask': torch.tensor(self.attention_mask[idx], dtype=torch.long),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long),
            'intensity_labels': torch.tensor(self.intensity_labels[idx], dtype=torch.float)
        }



### Tokenize Data



将数据准备为 BERT 输入格式

您可以提取 Sentence 字段并将其分词。为了准备 BERT 的输入，可以根据需要将 Aspect 和 Opinion 部分组合到句子中。这里假设要为每个 Sentence 生成分词结果并对齐其 Aspect 和 Opinion 信息：

In [38]:
# 確認標籤數量
unique_labels = set()

for _, row in train_df.iterrows():
    for category in row['Category']:
        unique_labels.add(category)

# for i in unique_labels:

    # print(i)

num_labels = len(unique_labels)
#print("num_labels:", num_labels)

# 使用BERT快速標記器來自動對齊標記和標籤

tokenizer = BertTokenizerFast.from_pretrained('bert-base-chinese') # 使用支持 offset_mapping 的 BertTokenizerFast

In [39]:
# 定义标签映射

label_to_id = {'O': 0, 'B-Aspect': 1, 'I-Aspect': 2, 'B-Opinion': 3, 'I-Opinion': 4}



def encode_labels(text, aspects, opinions, aspect_positions, opinion_positions, intensities):
    # 使用相同的编码过程，确保 tokens 和 offset_mapping 对齐
    encoding = tokenizer(text, return_offsets_mapping=True, truncation=True, max_length=32, add_special_tokens=True)
    tokens = encoding.tokens()
    offset_mapping = encoding['offset_mapping']
    labels = ['O'] * len(tokens)
    # 初始化 intensity_labels，每个 token 对应一个 [Valence, Arousal]，初始为 [5.0, 5.0]
    intensity_labels = [[5.0, 5.0] for _ in range(len(tokens))]

    # 创建一个 mapping，将每个字符位置映射到对应的 token 索引
    char_to_token_map = {}
    for idx, (start, end) in enumerate(offset_mapping):
        if start == end:
            continue  # 跳过特殊标记或填充部分
        for char_pos in range(start, end):
            char_to_token_map[char_pos] = idx

    # 处理 Aspect 标签
    for aspect, pos in zip(aspects, aspect_positions):
        start_char, end_char = map(int, pos.split('#'))
        start_char -= 1  # 调整为 0-based 索引
        # 获取起始和结束 token 索引
        try:
            start_token_idx = char_to_token_map[start_char]
            end_token_idx = char_to_token_map[end_char - 1]
        except KeyError:
            continue  # 如果字符位置不在映射中，跳过
        labels[start_token_idx] = 'B-Aspect'
        for idx in range(start_token_idx + 1, end_token_idx + 1):
            labels[idx] = 'I-Aspect'

    # 处理 Opinion 标签，并赋予对应的情感强度
    for opinion, pos, intensity in zip(opinions, opinion_positions, intensities):
        start_char, end_char = map(int, pos.split('#'))
        start_char -= 1  # 调整为 0-based 索引
        valence, arousal = map(float, intensity.split('#'))  # 解析情感强度值
        try:
            start_token_idx = char_to_token_map[start_char]
            end_token_idx = char_to_token_map[end_char - 1]
        except KeyError:
            continue
        labels[start_token_idx] = 'B-Opinion'
        intensity_labels[start_token_idx] = [valence, arousal]
        for idx in range(start_token_idx + 1, end_token_idx + 1):
            labels[idx] = 'I-Opinion'
            intensity_labels[idx] = [valence, arousal]

    # 将标签转换为ID
    label_ids = [label_to_id[label] for label in labels]
    return label_ids, intensity_labels



def data_generator(df):
    # 处理整个数据集
    inputs = {'input_ids': [], 'attention_mask': []}
    label_ids_list = []
    intensity_labels_list = []

    for idx, row in df.iterrows():
        text = row['Sentence']
        aspects = row['Aspect']
        opinions = row['Opinion']
        aspect_positions = row['AspectFromTo']
        opinion_positions = row['OpinionFromTo']
        intensities = row['Intensity']  # 获取情感强度

        # 分词并编码
        encoding = tokenizer(text, truncation=True, padding='max_length', max_length=32, return_offsets_mapping=True)

        # 标签编码，获取 label_ids 和 intensity_labels
        label_ids, intensity_labels = encode_labels(text, aspects, opinions, aspect_positions, opinion_positions, intensities)

        # 填充或截断标签
        seq_length = len(encoding['input_ids'])
        if len(label_ids) < seq_length:
            label_ids += [label_to_id['O']] * (seq_length - len(label_ids))
            intensity_labels += [[0.0, 0.0]] * (seq_length - len(intensity_labels))
        else:
            label_ids = label_ids[:seq_length]
            intensity_labels = intensity_labels[:seq_length]

        # 移除 offset_mapping，因后续不需要
        encoding.pop('offset_mapping')

        # 添加到列表
        inputs['input_ids'].append(encoding['input_ids'])
        inputs['attention_mask'].append(encoding['attention_mask'])
        label_ids_list.append(label_ids)
        intensity_labels_list.append(intensity_labels)

    # 创建 Dataset
    dataset = TripletExtractionDataset(inputs, label_ids_list, intensity_labels_list)
    return dataset


# Train

### Define ASTE model

In [43]:
class SimpleASTEModel(BertPreTrainedModel):
    def __init__(self, config):
        super(SimpleASTEModel, self).__init__(config)
        self.num_labels = config.num_labels
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, self.num_labels)
        self.init_weights()

    def forward(self, input_ids, attention_mask=None, token_type_ids=None, labels=None):
        outputs = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        sequence_output = self.dropout(outputs.last_hidden_state)  # [batch_size, seq_len, hidden_size]
        logits = self.classifier(sequence_output)  # [batch_size, seq_len, num_labels]
        loss = None

        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            active_loss = attention_mask.view(-1) == 1  # 忽略填充部分
            active_logits = logits.view(-1, self.num_labels)[active_loss]
            active_labels = labels.view(-1)[active_loss]
            loss = loss_fct(active_logits, active_labels)
        # print(f"Sequence output shape: {sequence_output.shape}")
        # print(f"Logits shape: {logits.shape}")

        return (loss, logits) if loss is not None else logits


### 辅助函数的定义
1. 提取 BIO 标签对应的实体（方面和观点）:extract_spans
2. 提取预测的三元组
3. 提取真实的三元组
4. 计算评估指标

In [44]:
def extract_spans(tags, tokens, label_prefix):
    spans = []
    i = 0
    while i < len(tags):
        if tags[i] == f'B-{label_prefix}':
            start = i
            i += 1
            while i < len(tags) and tags[i] == f'I-{label_prefix}':
                i += 1
            end = i  # 结束位置的下一个索引
            span_tokens = tokens[start:end]
            spans.append((start, end, span_tokens))
        else:
            i += 1
    return spans

def extract_triplets(tokens, tags, model, input_ids, attention_mask):
    aspect_spans = extract_spans(tags, tokens, 'Aspect')
    opinion_spans = extract_spans(tags, tokens, 'Opinion')

    binary_aspect_opinion = []  # 1. Aspect-Opinion 二元组
    v_triplets = []            # 2. 包含 Valence 的 triplets
    a_triplets = []            # 3. 包含 Arousal 的 triplets
    full_triplets = []         # 4. 原始的 Aspect-Opinion-Valence-Arousal 三元组

    # 获取模型的序列输出
    with torch.no_grad():
        outputs = model.bert(input_ids=input_ids.unsqueeze(0), attention_mask=attention_mask.unsqueeze(0))
        sequence_output = outputs.last_hidden_state.squeeze(0)  # [seq_len, hidden_size]

    for opinion_span in opinion_spans:
        start, end, opinion_tokens = opinion_span

        # 获取观点词的隐藏状态
        opinion_hidden_states = sequence_output[start:end]

        # 计算平均表示
        opinion_representation = opinion_hidden_states.mean(dim=0)  # [hidden_size]

        # 预测情感强度
        intensity_pred = model.regressor(opinion_representation.unsqueeze(0))  # [1, 2]
        intensity_pred = intensity_pred.squeeze(0).detach().cpu().numpy()

        # 原本只有round
        valence = int(round(intensity_pred[0] * 10))
        arousal = int(round(abs(intensity_pred[1] * 10)))

        opinion_text = ''.join(opinion_tokens).replace('##', '')

        if len(aspect_spans) == 0:
            aspect_text = ''
            binary_aspect_opinion.append((aspect_text, opinion_text))
            v_triplets.append((aspect_text, opinion_text, valence))
            a_triplets.append((aspect_text, opinion_text, arousal))
            full_triplets.append((aspect_text, opinion_text, valence, arousal))
        else:
            for aspect_span in aspect_spans:
                a_start, a_end, aspect_tokens = aspect_span
                aspect_text = ''.join(aspect_tokens).replace('##', '')
                binary_aspect_opinion.append((aspect_text, opinion_text))
                v_triplets.append((aspect_text, opinion_text, valence))
                a_triplets.append((aspect_text, opinion_text, arousal))
                full_triplets.append((aspect_text, opinion_text, valence, arousal))

    return binary_aspect_opinion, v_triplets, a_triplets, full_triplets


def extract_gold_triplets(tokens, tags, intensity_labels):
    aspect_spans = extract_spans(tags, tokens, 'Aspect')
    opinion_spans = extract_spans(tags, tokens, 'Opinion')

    binary_aspect_opinion = []  # 1. Aspect-Opinion 二元组
    v_triplets = []            # 2. 包含 Valence 的 triplets
    a_triplets = []            # 3. 包含 Arousal 的 triplets
    full_triplets = []         # 4. 原始的 Aspect-Opinion-Valence-Arousal 三元组

    for opinion_span in opinion_spans:
        start, end, opinion_tokens = opinion_span

        # 获取观点词的情感强度标签
        intensity_values = intensity_labels[start:end]  # [span_len, 2]
        intensity_values = intensity_values.mean(dim=0).detach().cpu().numpy()

        valence = int(round(intensity_values[0]))
        arousal = int(round(intensity_values[1]))

        opinion_text = ''.join(opinion_tokens).replace('##', '')

        if len(aspect_spans) == 0:
            aspect_text = ''
            binary_aspect_opinion.append((aspect_text, opinion_text))
            v_triplets.append((aspect_text, opinion_text, valence))
            a_triplets.append((aspect_text, opinion_text, arousal))
            full_triplets.append((aspect_text, opinion_text, valence, arousal))
        else:
            for aspect_span in aspect_spans:
                a_start, a_end, aspect_tokens = aspect_span
                aspect_text = ''.join(aspect_tokens).replace('##', '')
                binary_aspect_opinion.append((aspect_text, opinion_text))
                v_triplets.append((aspect_text, opinion_text, valence))
                a_triplets.append((aspect_text, opinion_text, arousal))
                full_triplets.append((aspect_text, opinion_text, valence, arousal))

    return binary_aspect_opinion, v_triplets, a_triplets, full_triplets


def compute_metrics(pred_triplets, gold_triplets):

    
    pred_set = set(pred_triplets)
    gold_set = set(gold_triplets)

    true_positives = pred_set & gold_set
    # if(true_positives):
    #     print(pred_triplets[:3])
    #     print(gold_triplets[:3])
    # print(true_positives)
        
    precision = len(true_positives) / len(pred_set) if len(pred_set) > 0 else 0.0
    recall = len(true_positives) / len(gold_set) if len(gold_set) > 0 else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0

    return precision, recall, f1


### Training

In [None]:
from transformers import BertConfig
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import AdamW, get_scheduler
from sklearn.model_selection import train_test_split
import torch
from sklearn.metrics import classification_report


import os

# Define the save path
save_path = r"E:\NYCU-Project\Class\NLP\Dimensional ASTE\log"

# Ensure the directory exists
os.makedirs(save_path, exist_ok=True)

# 加載數據集並設置標籤
data_path = r'E:\NYCU-Project\Class\NLP\Dimensional ASTE\NYCU_NLP_113A_Dataset\NYCU_NLP_113A_TrainingSet.json'
df = read_data(data_path)

label_to_id = {'O': 0, 'B-Aspect': 1, 'I-Aspect': 2, 'B-Opinion': 3, 'I-Opinion': 4}
id_to_label = {v: k for k, v in label_to_id.items()}
num_labels = len(label_to_id)

# 分割訓練和驗證集
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

train_dataset = data_generator(train_df)
val_dataset = data_generator(val_df)

train_loader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=32)
val_loader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=32)

# 加載模型
config = BertConfig.from_pretrained('bert-base-chinese', num_labels=num_labels)
model = SimpleASTEModel.from_pretrained('bert-base-chinese', config=config)
model.to(device)

optimizer = AdamW(model.parameters(), lr=5e-5, weight_decay=0.01)

num_training_steps = len(train_loader) * 3  # 假設訓練 3 個 epoch
scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# 使用BERT快速標記器來自動對齊標記和標籤

tokenizer = BertTokenizerFast.from_pretrained('bert-base-chinese') # 使用支持 offset_mapping 的 BertTokenizerFast
# 訓練
for epoch in range(3):
    model.train()
    total_loss = 0
    for batch in train_loader:
        batch_input_ids = batch['input_ids'].to(device)
        batch_attention_mask = batch['attention_mask'].to(device)
        batch_labels = batch['labels'].to(device)
        
        optimizer.zero_grad()
        outputs = model(input_ids=batch_input_ids, attention_mask=batch_attention_mask, labels=batch_labels)
        loss = outputs[0]
        loss.backward()
        optimizer.step()
        scheduler.step()
        total_loss += loss.item()

    print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_loader)}")

    # Evaluation
    model.eval()
    all_preds = []
    all_labels = []
    all_attention_masks = []

    with torch.no_grad():
        for batch in val_loader:
            batch_input_ids = batch['input_ids'].to(device)
            batch_attention_mask = batch['attention_mask'].to(device)
            batch_labels = batch['labels'].to(device)

            outputs = model(input_ids=batch_input_ids, attention_mask=batch_attention_mask)
            preds = torch.argmax(outputs, dim=2)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(batch_labels.cpu().numpy())
            all_attention_masks.extend(batch_attention_mask.cpu().numpy())

    # 展平預測和標籤
    flat_preds = []
    flat_labels = []

    for pred, label, mask in zip(all_preds, all_labels, all_attention_masks):
        active_indices = mask.flatten() == 1
        flat_preds.extend(pred.flatten()[active_indices])
        flat_labels.extend(label.flatten()[active_indices])

    # 將數值標籤轉換為標籤名稱
    flat_preds = [id_to_label[pred_id] for pred_id in flat_preds]
    flat_labels = [id_to_label[label_id] for label_id in flat_labels]
    # print(flat_labels[:3])

    # 計算分類報告
    report = classification_report(flat_labels, flat_preds, labels=list(label_to_id.keys()), digits=4)

    print(report)
    # After training is complete

# Save the model and tokenizer after training
model_save_path = os.path.join(save_path, "trained_model.pt")
torch.save(model.state_dict(), model_save_path)
print(f"Model saved at {model_save_path}")

# Save the tokenizer
tokenizer.save_pretrained(save_path)
print(f"Tokenizer saved at {save_path}")


Some weights of SimpleASTEModel were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Loss: 0.30141684094890514
              precision    recall  f1-score   support

           O     0.9421    0.9233    0.9326      4863
    B-Aspect     0.9253    0.9114    0.9183       734
    I-Aspect     0.9166    0.9239    0.9202      1498
   B-Opinion     0.8834    0.8855    0.8844       847
   I-Opinion     0.8834    0.9313    0.9067      1733

    accuracy                         0.9206      9675
   macro avg     0.9102    0.9151    0.9125      9675
weighted avg     0.9212    0.9206    0.9208      9675

Epoch 2, Loss: 0.14351983266005738
              precision    recall  f1-score   support

           O     0.9586    0.9134    0.9355      4863
    B-Aspect     0.9179    0.9292    0.9235       734
    I-Aspect     0.9029    0.9559    0.9287      1498
   B-Opinion     0.8834    0.9032    0.8932       847
   I-Opinion     0.8808    0.9383    0.9086      1733

    accuracy                         0.9248      9675
   macro avg     0.9087    0.9280    0.9179      9675
weighte

# Testing

In [54]:
import torch
from transformers import BertTokenizerFast, BertConfig

# Load the tokenizer from the saved path
tokenizer = BertTokenizerFast.from_pretrained(save_path)
print(f"Tokenizer loaded from {save_path}")

# Label mappings
label_to_id = {'O': 0, 'B-Aspect': 1, 'I-Aspect': 2, 'B-Opinion': 3, 'I-Opinion': 4}
id_to_label = {v: k for k, v in label_to_id.items()}
num_labels = len(label_to_id)

# Device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the model configuration
config = BertConfig.from_pretrained('bert-base-chinese', num_labels=num_labels)

# Initialize the model
model = SimpleASTEModel.from_pretrained('bert-base-chinese', config=config)

# Load the saved state dictionary
model_load_path = os.path.join(save_path, "trained_model.pt")
model.load_state_dict(torch.load(model_load_path, map_location=device))

# Move model to device
model.to(device)

def preprocess_sentence(sentence):
    encoding = tokenizer(sentence, return_offsets_mapping=True, add_special_tokens=False)
    tokens = tokenizer.convert_ids_to_tokens(encoding['input_ids'])
    input_ids = encoding['input_ids']
    attention_mask = encoding['attention_mask']
    return tokens, input_ids, attention_mask

def create_input_tensors(input_ids, attention_mask):
    input_ids_tensor = torch.tensor([input_ids], dtype=torch.long).to(device)
    attention_mask_tensor = torch.tensor([attention_mask], dtype=torch.long).to(device)
    return input_ids_tensor, attention_mask_tensor

def get_predictions(model, input_ids_tensor, attention_mask_tensor):
    model.eval()
    with torch.no_grad():
        outputs = model(input_ids=input_ids_tensor, attention_mask=attention_mask_tensor)
        predictions = torch.argmax(outputs, dim=2)  # outputs is logits tensor
        predictions = predictions.cpu().numpy()[0]
    return predictions

def map_predictions_to_labels(predictions):
    predicted_labels = [id_to_label.get(pred_id, 'O') for pred_id in predictions]
    return predicted_labels

def extract_terms(tokens, predicted_labels):
    aspects = []
    opinions = []
    i = 0
    while i < len(predicted_labels):
        label = predicted_labels[i]
        if label == 'B-Aspect':
            aspect_tokens = [tokens[i]]
            i += 1
            while i < len(predicted_labels) and predicted_labels[i] == 'I-Aspect':
                aspect_tokens.append(tokens[i])
                i += 1
            aspect = ''.join(aspect_tokens).replace('##', '')
            aspects.append(aspect)
        elif label == 'B-Opinion':
            opinion_tokens = [tokens[i]]
            i += 1
            while i < len(predicted_labels) and predicted_labels[i] == 'I-Opinion':
                opinion_tokens.append(tokens[i])
                i += 1
            opinion = ''.join(opinion_tokens).replace('##', '')
            opinions.append(opinion)
        else:
            i += 1
    return aspects, opinions

def test_model(sentence):
    tokens, input_ids, attention_mask = preprocess_sentence(sentence)
    print("Tokens:", tokens)
    input_ids_tensor, attention_mask_tensor = create_input_tensors(input_ids, attention_mask)
    predictions = get_predictions(model, input_ids_tensor, attention_mask_tensor)
    print("Predictions:", predictions)
    predicted_labels = map_predictions_to_labels(predictions)
    print("Predicted Labels:", predicted_labels)
    aspects, opinions = extract_terms(tokens, predicted_labels)
    print("Input Sentence:", sentence)
    print("Aspects:", aspects)
    print("Opinions:", opinions)

# Test the model
test_sentence = "上菜速度相當的快，服務人員也很親切"
test_model(test_sentence)


Tokenizer loaded from E:\NYCU-Project\Class\NLP\Dimensional ASTE\log


Some weights of SimpleASTEModel were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  model.load_state_dict(torch.load(model_load_path, map_location=device))


Tokens: ['上', '菜', '速', '度', '相', '當', '的', '快', '，', '服', '務', '人', '員', '也', '很', '親', '切']
Predictions: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
Predicted Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Input Sentence: 上菜速度相當的快，服務人員也很親切
Aspects: []
Opinions: []
