## Import Packages

In [77]:
import pandas as pd
from sklearn.model_selection import train_test_split
import json
from transformers import BertTokenizer, BertForTokenClassification
from transformers import BertTokenizerFast

import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AdamW, get_linear_schedule_with_warmup
import numpy as np
import random

# 把預訓練好的modelload進來
from tqdm.auto import tqdm

# 设置设备（如果有GPU，则使用GPU）
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def same_seeds(seed):

    torch.manual_seed(seed)

    if torch.cuda.is_available():

        torch.cuda.manual_seed(seed)

        torch.cuda.manual_seed_all(seed)

    np.random.seed(seed)

    random.seed(seed)

    torch.backends.cudnn.benchmark = False

    torch.backends.cudnn.deterministic = True

same_seeds(0)

## Read Data

- {train/test}set:

  - List of dicts with the following keys:

   - ID (string)

   - Sentence (string)

   - Aspect (string)

   - AspectFromTo (string): start point#end point

   - Category (string) : "食物#品質"

   - Opinion (string)

   - OpinionFromTo (string):  start point#end point


   - Intensity (string) :  "6.17#6.33"



- vaild set:

  - List of[ID, Sentence] 


In [78]:
def read_data(file):
    with open(file, 'r', encoding="utf-8") as reader:
        data = json.load(reader)

    # 將JSON數據加載到Pandas DataFrame中
    df = pd.json_normalize(data)
    
    # 将数据分成训练集和测试集
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
    
    return train_df, test_df

# 读取训练集和开发集
train_df, test_df = read_data('/mnt/md0/chen-wei/zi/Dimensional-ASTE/NYCU_NLP_113A_Dataset/NYCU_NLP_113A_TrainingSet.json')

# test_questions, test_paragraphs = read_data('/kaggle/input/ml2021-spring-hw7/hw7_test.json')

## Load Model and Tokenizer


### 創建自定義的 Dataset 類別
我們需要創建一個繼承自 torch.utils.data.Dataset 的類別，來處理資料並提供給 DataLoader。

In [80]:
from torch.utils.data import Dataset

class TripletExtractionDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings  # 包含 input_ids 和 attention_mask
        self.labels = labels        # 標籤列表

    def __len__(self):
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        # 取得編碼後的輸入和對應的標籤
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item


### Tokenize Data

将数据准备为 BERT 输入格式
您可以提取 Sentence 字段并将其分词。为了准备 BERT 的输入，可以根据需要将 Aspect 和 Opinion 部分组合到句子中。这里假设要为每个 Sentence 生成分词结果并对齐其 Aspect 和 Opinion 信息：

In [81]:
# 確認標籤數量
unique_labels = set()
for _, row in train_df.iterrows():
    for category in row['Category']:
        unique_labels.add(category)

# for i in unique_labels:
    # print(i)
num_labels = len(unique_labels)
#print("num_labels:", num_labels)

model_name = 'bert-base-chinese'


# 使用BERT快速標記器來自動對齊標記和標籤
tokenizer = BertTokenizerFast.from_pretrained('bert-base-chinese') # 使用支持 offset_mapping 的 BertTokenizerFast

model = BertForTokenClassification.from_pretrained(model_name, num_labels=num_labels)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [87]:
# 定义标签映射
label_to_id = {'O': 0, 'B-Aspect': 1, 'I-Aspect': 2, 'B-Opinion': 3, 'I-Opinion': 4}

def encode_labels(text, aspects, opinions, aspect_positions, opinion_positions):
    # 使用相同的编码过程，确保 tokens 和 offset_mapping 对齐
    encoding = tokenizer(text, return_offsets_mapping=True, truncation=True, max_length=128, add_special_tokens=False)
    tokens = encoding.tokens()
    offset_mapping = encoding['offset_mapping']
    labels = ['O'] * len(tokens)
    # for idx, (token, (start, end)) in enumerate(zip(tokens, offset_mapping)):
    #     print(f"Index: {idx}, Token: {token}, Start: {start}, End: {end}")
    
    # 處理 Aspect 標籤
    for aspect, pos in zip(aspects, aspect_positions):
        start_char, end_char = map(int, pos.split('#'))
        # 找到 token 位置
        for idx, (start, end) in enumerate(offset_mapping):
            if start == start_char-1:
                labels[idx] = 'B-Aspect'
                # 如果 aspect 跨越多个 tokens，需要标记后续的 I-Aspect
                for i in range(idx+1, len(tokens)):
                    if offset_mapping[i][0] >= end_char:
                        break
                    labels[i] = 'I-Aspect'
                break
    # 處理 Opinion 標籤
    for opinion, pos in zip(opinions, opinion_positions):
        start_char, end_char = map(int, pos.split('#'))
        for idx, (start, end) in enumerate(offset_mapping):
            if start == start_char-1:
                labels[idx] = 'B-Opinion'
                for i in range(idx+1, len(tokens)):
                    if offset_mapping[i][0] >= end_char:
                        break
                    labels[i] = 'I-Opinion'
                print("Labels:", labels)
                break
    # 将标签转换为ID
    label_ids = [label_to_id[label] for label in labels]
    return label_ids


def data_generator(df):
    # 處理整個資料集
    inputs = {'input_ids': [], 'attention_mask': []}
    label_ids_list = []

    for idx, row in df.iterrows():
        text = row['Sentence']
        aspects = row['Aspect']
        opinions = row['Opinion']
        aspect_positions = row['AspectFromTo']
        opinion_positions = row['OpinionFromTo']
        # 分詞並編碼
        encoding = tokenizer(text, truncation=True, padding='max_length', max_length=128, return_offsets_mapping=True)
        # 標籤編碼
        label_ids = encode_labels(text, aspects, opinions, aspect_positions, opinion_positions)
        
        print(text)
        print(label_ids)
        # 填充或截斷標籤
        if len(label_ids) < 128:
            label_ids += [label_to_id['O']] * (128 - len(label_ids))
        else:
            label_ids = label_ids[:128]
        # 移除 offset_mapping，因為後續不需要
        encoding.pop('offset_mapping')
        # 添加到列表
        inputs['input_ids'].append(encoding['input_ids'])
        inputs['attention_mask'].append(encoding['attention_mask'])
        label_ids_list.append(label_ids)

    # 創建 Dataset
    dataset = TripletExtractionDataset(inputs, label_ids_list)

    return dataset

Train_dataset = data_generator(train_df)

test_dataset = data_generator(test_df)


# 创建DataLoader
train_loader = DataLoader(Train_dataset, sampler=RandomSampler(Train_dataset), batch_size=16)
test_loader = DataLoader(test_dataset, sampler=SequentialSampler(test_dataset), batch_size=16)


Labels: ['B-Aspect', 'I-Aspect', 'I-Aspect', 'I-Aspect', 'I-Aspect', 'I-Aspect', 'I-Aspect', 'I-Aspect', 'I-Aspect', 'B-Opinion', 'I-Opinion', 'I-Opinion', 'O']
炙烤牛排佐波特酒醬超好吃。
[1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 4, 4, 0]
Labels: ['B-Aspect', 'I-Aspect', 'B-Opinion', 'I-Opinion', 'I-Opinion', 'I-Opinion', 'O']
脆度蠻不錯的。
[1, 2, 3, 4, 4, 4, 0]
Labels: ['B-Aspect', 'I-Aspect', 'O', 'B-Opinion', 'I-Opinion', 'I-Opinion', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-Aspect', 'I-Aspect', 'O', 'O', 'O', 'O', 'O']
Labels: ['B-Aspect', 'I-Aspect', 'O', 'B-Opinion', 'I-Opinion', 'I-Opinion', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-Aspect', 'I-Aspect', 'O', 'B-Opinion', 'I-Opinion', 'I-Opinion', 'O']
鱸魚也很好吃，怎麼會有人失望呢?附上的配菜也很用心。
[1, 2, 0, 3, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 3, 4, 4, 0]
Labels: ['B-Aspect', 'I-Aspect', 'I-Aspect', 'I-Aspect', 'I-Aspect', 'B-Opinion', 'I-Opinion', 'O']
豆腐的醬汁爽口。
[1, 2, 2, 2, 2, 3, 4, 0]
Labels: ['O', 'O', 

In [83]:
for i in inputs:
    print(i)

input_ids
attention_mask


In [84]:
count =0
for i in label_ids_list:
    for j in i:
        if j == 0:
            count+=1
    print(i)
    print(count)
    count=0


[1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
116
[1, 2, 3, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
122
[1, 2, 0, 3, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 3, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

## Train

In [91]:

# 设置设备
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 定义模型
model = BertForTokenClassification.from_pretrained('bert-base-chinese', num_labels=len(label_to_id))
model.to(device)

# 训练参数
epochs = 3
learning_rate = 5e-5
validation = True

optimizer = AdamW(model.parameters(), lr=learning_rate, eps=1e-8)
total_steps = len(train_loader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

print("开始训练...")

for epoch in range(epochs):
    print(f'正在训练第 {epoch + 1}/{epochs} 个 epoch')
    model.train()
    
    total_loss = 0  # 初始化本 epoch 的总损失

    # 训练步骤
    for step, batch in enumerate(tqdm(train_loader)):
        print(f"Batch type: {type(batch)}")
        print(f"Batch keys: {batch.keys() if isinstance(batch, dict) else 'Not a dict'}")
        batch_input_ids = batch['input_ids'].to(device)
        batch_attention_masks = batch['attention_mask'].to(device)
        batch_labels = batch['labels'].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids=batch_input_ids, attention_mask=batch_attention_masks, labels=batch_labels)

        loss = outputs.loss

        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        optimizer.step()

        scheduler.step()

        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_loader)
    print(f'平均训练损失: {avg_train_loss}')

    if validation:
        model.eval()
        eval_loss = 0
        eval_accuracy = 0
        nb_eval_steps = 0

        for batch in test_loader:
            
            batch_input_ids = batch[0].to(device)
            batch_attention_masks = batch[1].to(device)
            batch_labels = batch[2].to(device)

            with torch.no_grad():
                outputs = model(input_ids=batch_input_ids, attention_mask=batch_attention_masks, labels=batch_labels)

            loss = outputs.loss
            logits = outputs.logits

            eval_loss += loss.item()

            predictions = torch.argmax(logits, dim=2).detach().cpu().numpy()
            labels = batch_labels.cpu().numpy()

            # 忽略填充标签 (-100) 计算准确度
            for i in range(len(labels)):
                valid_labels = labels[i] != -100
                eval_accuracy += np.sum(predictions[i][valid_labels] == labels[i][valid_labels]) / np.sum(valid_labels)
                nb_eval_steps += 1

        avg_eval_loss = eval_loss / len(test_loader)
        avg_eval_accuracy = eval_accuracy / nb_eval_steps
        print(f'验证损失: {avg_eval_loss}')
        print(f'验证准确度: {avg_eval_accuracy}')

print("训练完成！")

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


开始训练...
正在训练第 1/3 个 epoch


  0%|          | 0/303 [00:00<?, ?it/s]

Batch type: <class 'dict'>
Batch keys: dict_keys(['input_ids', 'attention_mask', 'labels'])


  0%|          | 0/303 [00:00<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 12.00 MiB (GPU 0; 7.79 GiB total capacity; 849.66 MiB already allocated; 3.94 MiB free; 906.00 MiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF