Label imbalance

In [None]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import TensorDataset, DataLoader
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
import warnings
warnings.filterwarnings("ignore")

# Prefer the GPU when CUDA is available; otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("使用的設備:", device)

# Read the training set
train_df = pd.read_csv("hw2_train.csv")

# Instantiate the pretrained BERT tokenizer and encoder, then place the encoder on the selected device
tokenizer = BertTokenizer.from_pretrained("bert-large-uncased")
bert_model = BertModel.from_pretrained("bert-large-uncased")
bert_model = bert_model.to(device)

# Maximum sequence length: every utterance is padded or truncated to this many tokens
MAX_LENGTH = 32


def encode_texts(texts, tokenizer, bert_model, device, max_length=MAX_LENGTH, batch_size=64):
    """Encode texts into per-token BERT embeddings, processing them in mini-batches.

    The texts are tokenized with ``padding="max_length"`` and ``truncation=True``,
    so every sequence has exactly ``max_length`` positions and the result does not
    depend on how the texts are grouped into batches. Running the model chunk by
    chunk keeps peak memory bounded instead of pushing the whole corpus through
    the model in a single forward pass.

    Args:
        texts: A single string or an iterable of strings.
        tokenizer: Callable tokenizer; its output must support ``.to(device)``,
            ``**`` unpacking and ``["attention_mask"]``.
        bert_model: Callable model whose output exposes ``last_hidden_state``.
        device: Device the tokenized inputs are moved to.
        max_length: Padded/truncated sequence length passed to the tokenizer.
        batch_size: Number of texts encoded per forward pass.

    Returns:
        A tuple ``(embeddings, lengths)``: ``embeddings`` is the concatenated
        ``last_hidden_state`` of shape (num_texts, max_length, hidden_dim) and
        ``lengths`` holds the attention-mask sum for each text.

    Raises:
        ValueError: If ``texts`` is empty or ``batch_size`` is smaller than 1.
    """
    # A bare string is treated as a batch of one, as the tokenizer itself would do.
    texts = [texts] if isinstance(texts, str) else list(texts)
    if not texts:
        raise ValueError("encode_texts requires at least one text")
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    embedding_chunks = []
    length_chunks = []
    for start in range(0, len(texts), batch_size):
        inputs = tokenizer(
            texts[start:start + batch_size],
            padding="max_length",
            truncation=True,
            max_length=max_length,
            return_tensors="pt",
        ).to(device)
        # Feature extraction only: no gradients are needed for the encoder.
        with torch.no_grad():
            outputs = bert_model(**inputs)
        embedding_chunks.append(outputs.last_hidden_state)
        length_chunks.append(inputs["attention_mask"].sum(dim=1))

    return torch.cat(embedding_chunks, dim=0), torch.cat(length_chunks, dim=0)

# Compute BERT embeddings (and attention-mask lengths) for every training utterance
utterances = train_df["utterances"].tolist()
embeddings, sequence_lengths = encode_texts(
    texts=utterances,
    tokenizer=tokenizer,
    bert_model=bert_model,
    device=device,
)

# Build the label <-> index mappings.
# The labels are enumerated in sorted order: iterating a set of strings directly
# yields a different order on each interpreter run (string hash randomization),
# which would silently change the label ids between runs and invalidate any
# saved model or cached predictions.
unique_labels = set(label for tags in train_df['IOB Slot tags'] for label in tags.split())
label_to_index = {label: idx for idx, label in enumerate(sorted(unique_labels))}
idx_to_label = {idx: label for label, idx in label_to_index.items()}

# Count every tag occurrence and derive "balanced" class weights from those counts.
all_labels = [label for tags in train_df['IOB Slot tags'] for label in tags.split()]
label_classes = np.unique(all_labels)  # computed once and reused for the weights and the zip below
class_weights = compute_class_weight('balanced', classes=label_classes, y=all_labels)
# Re-key the weights by label index so the tensor below follows the label_to_index ordering.
class_weights_dict = {label_to_index[label]: weight for label, weight in zip(label_classes, class_weights)}
class_weights_tensor = torch.tensor(
    [class_weights_dict[i] for i in range(len(label_to_index))], dtype=torch.float
).to(device)

# Convert each tag sequence to label indices, then pad it with the "O" index up to
# MAX_LENGTH. Sequences longer than MAX_LENGTH are truncated, mirroring the
# tokenizer's truncation=True; without the slice they would stay over-long (the
# pad count goes negative) and no longer match the BERT output length.
# NOTE(review): labels are word-level and start at position 0, while the tokenizer
# output normally begins with [CLS] and may split a word into several word pieces —
# confirm that downstream code accounts for this alignment.
# NOTE(review): padded positions carry the "O" label and therefore take part in the
# loss — confirm this is intended (an ignore index may be preferable).
labels = train_df['IOB Slot tags'].apply(lambda x: [label_to_index[label] for label in x.split()])
labels_padded = pad_sequence(
    [
        torch.tensor((label + [label_to_index["O"]] * (MAX_LENGTH - len(label)))[:MAX_LENGTH])
        for label in labels
    ],
    batch_first=True,
).to(device)

# Final check that BERT embeddings and labels share the same sequence length.
# Raised explicitly rather than asserted so the check survives `python -O`.
if embeddings.shape[1] != labels_padded.shape[1]:
    raise ValueError("BERT 輸出序列長度和標籤序列長度不匹配")

# Hold out 20% of the examples; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(embeddings, labels_padded, test_size=0.2, random_state=42)

# Wrap the train / held-out tensors as datasets and batch them (only the training batches are shuffled)
train_dataset, test_dataset = TensorDataset(X_train, y_train), TensorDataset(X_test, y_test)
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)

# Cross-entropy loss weighted per class by class_weights_tensor
criterion = nn.CrossEntropyLoss(weight=class_weights_tensor)
