In [1]:
import torch
torch.cuda.empty_cache()

import gc
gc.collect()
torch.cuda.empty_cache()
!nvidia-smi

Sun Nov 10 00:44:10 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.94                 Driver Version: 560.94         CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3070 ...  WDDM  |   00000000:01:00.0  On |                  N/A |
| N/A   42C    P8             16W /  111W |    7421MiB /   8192MiB |     10%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import TensorDataset, DataLoader
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
import warnings
warnings.filterwarnings("ignore")

# Pick the compute device: CUDA when it is available, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("使用的設備:", device)

# Load the training data.
train_df = pd.read_csv('hw2_train.csv')

# Create the pretrained BERT tokenizer and encoder; the encoder is moved to
# the selected device.
tokenizer = BertTokenizer.from_pretrained("bert-large-uncased")
bert_model = BertModel.from_pretrained("bert-large-uncased")
bert_model = bert_model.to(device)

# Maximum sequence length (in tokenizer positions) every utterance is padded/truncated to.
MAX_LENGTH = 32

# BERT encoding helper.
def encode_texts(texts, tokenizer, bert_model, device, max_length=MAX_LENGTH, batch_size=64):
    """Encode a list of texts with BERT, one mini-batch at a time.

    The texts are tokenized (padded and truncated to ``max_length``) and passed
    through ``bert_model`` without gradient tracking. Working in chunks of
    ``batch_size`` keeps the peak memory of the forward pass bounded instead of
    pushing the whole corpus through the encoder in a single call.

    Args:
        texts: Sequence of strings to encode.
        tokenizer: Callable tokenizer; its result must support ``.to(device)``,
            ``**`` unpacking into ``bert_model`` and an ``"attention_mask"`` entry.
        bert_model: Encoder whose output exposes ``last_hidden_state``.
        device: Device the tokenized inputs are moved to.
        max_length: Length every sequence is padded/truncated to.
        batch_size: Number of texts encoded per forward pass.

    Returns:
        Tuple ``(embeddings, lengths)``: ``embeddings`` is the concatenation of
        ``last_hidden_state`` over all chunks, shape
        (num_texts, max_length, hidden_dim); ``lengths`` is the per-text sum of
        the attention mask.

    Raises:
        ValueError: If ``texts`` is empty or ``batch_size`` is smaller than 1.
    """
    texts = list(texts)
    if not texts:
        raise ValueError("texts must contain at least one item")
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1")

    embedding_chunks = []
    length_chunks = []
    for start in range(0, len(texts), batch_size):
        chunk = texts[start:start + batch_size]
        inputs = tokenizer(
            chunk, padding="max_length", truncation=True, max_length=max_length, return_tensors="pt"
        ).to(device)
        with torch.no_grad():
            outputs = bert_model(**inputs)
        embedding_chunks.append(outputs.last_hidden_state)
        length_chunks.append(inputs["attention_mask"].sum(dim=1))

    # Every chunk is padded to the same max_length, so they stack along dim 0.
    return torch.cat(embedding_chunks, dim=0), torch.cat(length_chunks, dim=0)

# Encode every training utterance with BERT.
utterances = train_df['utterances'].tolist()
embeddings, sequence_lengths = encode_texts(utterances, tokenizer, bert_model, device)

# Build the label <-> index lookup tables from the tags seen in the training data.
# NOTE(review): the indices follow the iteration order of a set of strings, which
# can differ between kernel sessions; they are only stable within one session.
unique_labels = set(label for tags in train_df['IOB Slot tags'] for label in tags.split())
label_to_index = {label: idx for idx, label in enumerate(unique_labels)}
idx_to_label = {idx: label for label, idx in label_to_index.items()}

# Count every tag occurrence and derive "balanced" class weights, ordered by label index.
all_labels = [label for tags in train_df['IOB Slot tags'] for label in tags.split()]
label_names = np.unique(all_labels)
class_weights = compute_class_weight('balanced', classes=label_names, y=all_labels)
class_weights_dict = {label_to_index[label]: weight for label, weight in zip(label_names, class_weights)}
class_weights_tensor = torch.tensor([class_weights_dict[i] for i in range(len(label_to_index))], dtype=torch.float).to(device)

# Convert the tags to indices and bring every sequence to exactly MAX_LENGTH:
# shorter sequences are padded with the "O" index, longer ones are truncated
# (the embeddings are truncated to MAX_LENGTH as well, so an over-long tag
# sequence would otherwise break the shape check below).
# NOTE(review): tags are per whitespace word while the embeddings are per
# tokenizer position; position i of the labels is presumably not guaranteed to
# describe token i of the embeddings — confirm the intended alignment.
labels = train_df['IOB Slot tags'].apply(lambda x: [label_to_index[label] for label in x.split()])
pad_index = label_to_index["O"]
labels_padded = torch.tensor(
    [(label + [pad_index] * (MAX_LENGTH - len(label)))[:MAX_LENGTH] for label in labels],
    dtype=torch.long,
).to(device)

# Final check that the BERT embeddings and the labels agree in shape.
assert embeddings.shape[0] == labels_padded.shape[0], "number of embedded utterances and label sequences differ"
assert embeddings.shape[1] == labels_padded.shape[1], "BERT 輸出序列長度和標籤序列長度不匹配"
X_train, X_test, y_train, y_test = train_test_split(embeddings, labels_padded, test_size=0.2, random_state=42)

# Wrap the splits in datasets and data loaders.
train_dataset = TensorDataset(X_train, y_train)
test_dataset = TensorDataset(X_test, y_test)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Loss function with the class weights applied.
criterion = nn.CrossEntropyLoss(weight=class_weights_tensor)


  from .autonotebook import tqdm as notebook_tqdm


使用的設備: cuda


In [None]:
import torch
import torch.nn as nn
from transformers import AdamW
from torch.optim.lr_scheduler import StepLR
from seqeval.metrics import classification_report, f1_score, precision_score, recall_score

# GRU-based slot tagging model.
class SlotTaggingModelGRU(nn.Module):
    """Bidirectional GRU tagger applied on top of precomputed BERT embeddings.

    The GRU output is layer-normalised and passed through dropout; a linear
    scoring layer followed by a softmax over dim 1 produces one weight per
    position, each position is scaled by its weight, and a final linear layer
    maps the result to ``output_dim`` scores per position.
    """

    def __init__(self, bert_hidden_dim, hidden_dim=91, output_dim=None, dropout_prob=0.24224302298896844, num_layers=2):
        super().__init__()
        # Both GRU directions are concatenated, hence twice the hidden size.
        feature_dim = hidden_dim * 2
        self.gru = nn.GRU(
            input_size=bert_hidden_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
            dropout=dropout_prob,
        )
        self.layer_norm = nn.LayerNorm(feature_dim)
        self.dropout = nn.Dropout(dropout_prob)
        self.attention = nn.Linear(feature_dim, 1)
        self.fc = nn.Linear(feature_dim, output_dim)

    def forward(self, x):
        """Return per-position scores for input ``x`` (batch-first sequence of embeddings)."""
        states = self.gru(x)[0]
        states = self.dropout(self.layer_norm(states))

        # Attention: softmax over dim 1 of the per-position scores, used to rescale each position.
        weights = self.attention(states).softmax(dim=1)
        return self.fc(states * weights)

# Size the tagger from the data: input width is the last dimension of the BERT
# embeddings, output width is the number of distinct labels.
bert_hidden_dim = embeddings.shape[2]
output_dim = len(label_to_index)
model = SlotTaggingModelGRU(
    bert_hidden_dim,
    hidden_dim=91,
    output_dim=output_dim,
    dropout_prob=0.24224302298896844,
    num_layers=2,
)
model = model.to(device)

# Class-weighted loss, AdamW optimizer and a step learning-rate schedule
# (learning rate multiplied by 0.5 every 5 scheduler steps).
criterion = nn.CrossEntropyLoss(weight=class_weights_tensor)
optimizer = AdamW(
    model.parameters(),
    lr=0.0008037087742113169,
    weight_decay=1e-5,
)
scheduler = StepLR(optimizer, step_size=5, gamma=0.5)

# Train the model for one epoch.
def train_model(model, train_loader, criterion, optimizer, device):
    """Run one training pass over ``train_loader`` and return the mean batch loss.

    The number of classes is taken from the last dimension of the model output,
    so the function does not rely on any notebook-level variable.

    Args:
        model: Module returning one score vector per position.
        train_loader: Iterable of ``(batch_x, batch_y)`` tensor pairs.
        criterion: Loss taking flattened ``(N, num_classes)`` scores and ``(N,)`` targets.
        optimizer: Optimizer updating ``model``'s parameters.
        device: Device the batches are moved to.

    Returns:
        Average of the per-batch losses as a float; ``0.0`` when the loader
        yields no batches.
    """
    model.train()
    total_loss = 0.0
    num_batches = 0
    for batch_x, batch_y in train_loader:
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)

        optimizer.zero_grad()
        logits = model(batch_x)
        # Flatten every position into one row so the loss sees (N, num_classes) vs (N,).
        loss = criterion(logits.reshape(-1, logits.size(-1)), batch_y.reshape(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        num_batches += 1

    if num_batches == 0:
        return 0.0
    return total_loss / num_batches

# Evaluate the model.
def evaluate_model(model, test_loader, criterion, device, idx_to_label):
    """Evaluate ``model`` on ``test_loader`` and print seqeval metrics.

    Predictions and gold tags are collected as one tag list per sentence (one
    row of the batch) before being handed to seqeval, rather than as one long
    list per batch, so a predicted span cannot run across a sentence boundary.
    The number of classes is read from the model output instead of a
    notebook-level variable.

    Args:
        model: Module returning scores of shape (batch, seq_len, num_classes)
            — assumed from how the rows are iterated below.
        test_loader: Iterable of ``(batch_x, batch_y)``; ``batch_y`` is assumed
            to be (batch, seq_len) label indices.
        criterion: Loss taking flattened scores and targets.
        device: Device the batches are moved to.
        idx_to_label: Mapping from label index to tag string.

    Returns:
        Tuple ``(avg_loss, precision, recall, f1)``.
    """
    model.eval()
    total_loss = 0.0
    num_batches = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch_x, batch_y in test_loader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)

            logits = model(batch_x)
            loss = criterion(logits.reshape(-1, logits.size(-1)), batch_y.reshape(-1))
            total_loss += loss.item()
            num_batches += 1

            pred_rows = torch.argmax(logits, dim=-1).cpu().tolist()
            gold_rows = batch_y.cpu().tolist()

            # Convert indices to tag strings, keeping one list per sentence.
            # Padded positions are still included; they carry whatever label the
            # dataset assigned to them.
            for pred_row, gold_row in zip(pred_rows, gold_rows):
                all_preds.append([idx_to_label[idx] for idx in pred_row])
                all_labels.append([idx_to_label[idx] for idx in gold_row])

    avg_loss = total_loss / num_batches if num_batches else 0.0

    # Compute precision, recall and F1 with seqeval.
    report = classification_report(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds)
    recall = recall_score(all_labels, all_preds)

    print(f"Test Loss: {avg_loss:.4f}")
    print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}")
    print("Classification Report:\n", report)

    return avg_loss, precision, recall, f1

# Train/evaluate cycle: one training pass, one evaluation pass and one
# scheduler step per epoch.
num_epochs = 300  # number of epochs to run (described as the best epoch count)
for epoch in range(num_epochs):
    train_loss = train_model(
        model=model, train_loader=train_loader, criterion=criterion, optimizer=optimizer, device=device
    )
    test_loss, precision, recall, f1 = evaluate_model(
        model=model, test_loader=test_loader, criterion=criterion, device=device, idx_to_label=idx_to_label
    )
    # Advance the learning-rate schedule.
    scheduler.step()
    print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}')


Test Loss: 2.8715
Precision: 0.0280, Recall: 0.1363, F1 Score: 0.0464
Classification Report:
                precision    recall  f1-score   support

        _cast       0.00      0.00      0.00        18
        _char       0.00      0.00      0.00         1
     _country       0.00      0.00      0.00        23
    _director       0.00      0.00      0.00        32
       _genre       0.38      0.23      0.29        13
    _language       0.11      0.60      0.18        20
    _location       0.00      0.00      0.00         1
       _movie       0.02      0.09      0.04       217
 _mpaa_rating       0.02      0.31      0.03        32
      _person       0.07      0.18      0.10        39
    _producer       0.05      0.03      0.03        38
_release_year       0.00      0.00      0.00         0
     _subject       0.04      0.48      0.07        21

    micro avg       0.03      0.14      0.05       455
    macro avg       0.05      0.15      0.06       455
 weighted avg       0.04

In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import TensorDataset, DataLoader
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
import warnings
warnings.filterwarnings("ignore")

# Pick the compute device: CUDA when it is available, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("使用的設備:", device)

# Load the training data.
train_df = pd.read_csv('hw2_train.csv')

# Create the pretrained BERT tokenizer and encoder; the encoder is moved to
# the selected device.
tokenizer = BertTokenizer.from_pretrained("bert-large-uncased")
bert_model = BertModel.from_pretrained("bert-large-uncased")
bert_model = bert_model.to(device)

# Maximum sequence length (in tokenizer positions) every utterance is padded/truncated to.
MAX_LENGTH = 32

# BERT encoding helper.
def encode_texts(texts, tokenizer, bert_model, device, max_length=MAX_LENGTH, batch_size=64):
    """Encode a list of texts with BERT, one mini-batch at a time.

    The texts are tokenized (padded and truncated to ``max_length``) and passed
    through ``bert_model`` without gradient tracking. Working in chunks of
    ``batch_size`` keeps the peak memory of the forward pass bounded instead of
    pushing the whole corpus through the encoder in a single call.

    Args:
        texts: Sequence of strings to encode.
        tokenizer: Callable tokenizer; its result must support ``.to(device)``,
            ``**`` unpacking into ``bert_model`` and an ``"attention_mask"`` entry.
        bert_model: Encoder whose output exposes ``last_hidden_state``.
        device: Device the tokenized inputs are moved to.
        max_length: Length every sequence is padded/truncated to.
        batch_size: Number of texts encoded per forward pass.

    Returns:
        Tuple ``(embeddings, lengths)``: ``embeddings`` is the concatenation of
        ``last_hidden_state`` over all chunks, shape
        (num_texts, max_length, hidden_dim); ``lengths`` is the per-text sum of
        the attention mask.

    Raises:
        ValueError: If ``texts`` is empty or ``batch_size`` is smaller than 1.
    """
    texts = list(texts)
    if not texts:
        raise ValueError("texts must contain at least one item")
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1")

    embedding_chunks = []
    length_chunks = []
    for start in range(0, len(texts), batch_size):
        chunk = texts[start:start + batch_size]
        inputs = tokenizer(
            chunk, padding="max_length", truncation=True, max_length=max_length, return_tensors="pt"
        ).to(device)
        with torch.no_grad():
            outputs = bert_model(**inputs)
        embedding_chunks.append(outputs.last_hidden_state)
        length_chunks.append(inputs["attention_mask"].sum(dim=1))

    # Every chunk is padded to the same max_length, so they stack along dim 0.
    return torch.cat(embedding_chunks, dim=0), torch.cat(length_chunks, dim=0)

# Encode every training utterance with BERT.
utterances = train_df['utterances'].tolist()
embeddings, sequence_lengths = encode_texts(utterances, tokenizer, bert_model, device)

# Build the label <-> index lookup tables from the tags seen in the training data.
# NOTE(review): the indices follow the iteration order of a set of strings, which
# can differ between kernel sessions; they are only stable within one session.
unique_labels = set(label for tags in train_df['IOB Slot tags'] for label in tags.split())
label_to_index = {label: idx for idx, label in enumerate(unique_labels)}
idx_to_label = {idx: label for label, idx in label_to_index.items()}

# Count every tag occurrence and derive "balanced" class weights, ordered by label index.
all_labels = [label for tags in train_df['IOB Slot tags'] for label in tags.split()]
label_names = np.unique(all_labels)
class_weights = compute_class_weight('balanced', classes=label_names, y=all_labels)
class_weights_dict = {label_to_index[label]: weight for label, weight in zip(label_names, class_weights)}
class_weights_tensor = torch.tensor([class_weights_dict[i] for i in range(len(label_to_index))], dtype=torch.float).to(device)

# Convert the tags to indices and bring every sequence to exactly MAX_LENGTH:
# shorter sequences are padded with the "O" index, longer ones are truncated
# (the embeddings are truncated to MAX_LENGTH as well, so an over-long tag
# sequence would otherwise break the shape check below).
# NOTE(review): tags are per whitespace word while the embeddings are per
# tokenizer position; position i of the labels is presumably not guaranteed to
# describe token i of the embeddings — confirm the intended alignment.
labels = train_df['IOB Slot tags'].apply(lambda x: [label_to_index[label] for label in x.split()])
pad_index = label_to_index["O"]
labels_padded = torch.tensor(
    [(label + [pad_index] * (MAX_LENGTH - len(label)))[:MAX_LENGTH] for label in labels],
    dtype=torch.long,
).to(device)

# Final check that the BERT embeddings and the labels agree in shape.
assert embeddings.shape[0] == labels_padded.shape[0], "number of embedded utterances and label sequences differ"
assert embeddings.shape[1] == labels_padded.shape[1], "BERT 輸出序列長度和標籤序列長度不匹配"
X_train, X_test, y_train, y_test = train_test_split(embeddings, labels_padded, test_size=0.2, random_state=42)

# Wrap the splits in datasets and data loaders.
train_dataset = TensorDataset(X_train, y_train)
test_dataset = TensorDataset(X_test, y_test)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Loss function with the class weights applied.
criterion = nn.CrossEntropyLoss(weight=class_weights_tensor)

from transformers import AdamW
from torch.optim.lr_scheduler import StepLR
from seqeval.metrics import classification_report, f1_score, precision_score, recall_score

# GRU-based slot tagging model.
class SlotTaggingModelGRU(nn.Module):
    """Bidirectional GRU tagger applied on top of precomputed BERT embeddings.

    The GRU output is layer-normalised and passed through dropout; a linear
    scoring layer followed by a softmax over dim 1 produces one weight per
    position, each position is scaled by its weight, and a final linear layer
    maps the result to ``output_dim`` scores per position.
    """

    def __init__(self, bert_hidden_dim, hidden_dim=91, output_dim=None, dropout_prob=0.24224302298896844, num_layers=2):
        super().__init__()
        # Both GRU directions are concatenated, hence twice the hidden size.
        feature_dim = hidden_dim * 2
        self.gru = nn.GRU(
            input_size=bert_hidden_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
            dropout=dropout_prob,
        )
        self.layer_norm = nn.LayerNorm(feature_dim)
        self.dropout = nn.Dropout(dropout_prob)
        self.attention = nn.Linear(feature_dim, 1)
        self.fc = nn.Linear(feature_dim, output_dim)

    def forward(self, x):
        """Return per-position scores for input ``x`` (batch-first sequence of embeddings)."""
        states = self.gru(x)[0]
        states = self.dropout(self.layer_norm(states))

        # Attention: softmax over dim 1 of the per-position scores, used to rescale each position.
        weights = self.attention(states).softmax(dim=1)
        return self.fc(states * weights)

# Size the tagger from the data: input width is the last dimension of the BERT
# embeddings, output width is the number of distinct labels.
bert_hidden_dim = embeddings.shape[2]
output_dim = len(label_to_index)
model = SlotTaggingModelGRU(
    bert_hidden_dim,
    hidden_dim=91,
    output_dim=output_dim,
    dropout_prob=0.24224302298896844,
    num_layers=2,
)
model = model.to(device)

# Class-weighted loss, AdamW optimizer and a step learning-rate schedule
# (learning rate multiplied by 0.5 every 5 scheduler steps).
criterion = nn.CrossEntropyLoss(weight=class_weights_tensor)
optimizer = AdamW(
    model.parameters(),
    lr=0.0008037087742113169,
    weight_decay=1e-5,
)
scheduler = StepLR(optimizer, step_size=5, gamma=0.5)

# Train the model for one epoch.
def train_model(model, train_loader, criterion, optimizer, device):
    """Run one training pass over ``train_loader`` and return the mean batch loss.

    The number of classes is taken from the last dimension of the model output,
    so the function does not rely on any notebook-level variable.

    Args:
        model: Module returning one score vector per position.
        train_loader: Iterable of ``(batch_x, batch_y)`` tensor pairs.
        criterion: Loss taking flattened ``(N, num_classes)`` scores and ``(N,)`` targets.
        optimizer: Optimizer updating ``model``'s parameters.
        device: Device the batches are moved to.

    Returns:
        Average of the per-batch losses as a float; ``0.0`` when the loader
        yields no batches.
    """
    model.train()
    total_loss = 0.0
    num_batches = 0
    for batch_x, batch_y in train_loader:
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)

        optimizer.zero_grad()
        logits = model(batch_x)
        # Flatten every position into one row so the loss sees (N, num_classes) vs (N,).
        loss = criterion(logits.reshape(-1, logits.size(-1)), batch_y.reshape(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        num_batches += 1

    if num_batches == 0:
        return 0.0
    return total_loss / num_batches

# Evaluate the model.
def evaluate_model(model, test_loader, criterion, device, idx_to_label):
    """Evaluate ``model`` on ``test_loader`` and print seqeval metrics.

    Predictions and gold tags are collected as one tag list per sentence (one
    row of the batch) before being handed to seqeval, rather than as one long
    list per batch, so a predicted span cannot run across a sentence boundary.
    The number of classes is read from the model output instead of a
    notebook-level variable.

    Args:
        model: Module returning scores of shape (batch, seq_len, num_classes)
            — assumed from how the rows are iterated below.
        test_loader: Iterable of ``(batch_x, batch_y)``; ``batch_y`` is assumed
            to be (batch, seq_len) label indices.
        criterion: Loss taking flattened scores and targets.
        device: Device the batches are moved to.
        idx_to_label: Mapping from label index to tag string.

    Returns:
        Tuple ``(avg_loss, precision, recall, f1)``.
    """
    model.eval()
    total_loss = 0.0
    num_batches = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch_x, batch_y in test_loader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)

            logits = model(batch_x)
            loss = criterion(logits.reshape(-1, logits.size(-1)), batch_y.reshape(-1))
            total_loss += loss.item()
            num_batches += 1

            pred_rows = torch.argmax(logits, dim=-1).cpu().tolist()
            gold_rows = batch_y.cpu().tolist()

            # Convert indices to tag strings, keeping one list per sentence.
            # Padded positions are still included; they carry whatever label the
            # dataset assigned to them.
            for pred_row, gold_row in zip(pred_rows, gold_rows):
                all_preds.append([idx_to_label[idx] for idx in pred_row])
                all_labels.append([idx_to_label[idx] for idx in gold_row])

    avg_loss = total_loss / num_batches if num_batches else 0.0

    # Compute precision, recall and F1 with seqeval.
    report = classification_report(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds)
    recall = recall_score(all_labels, all_preds)

    print(f"Test Loss: {avg_loss:.4f}")
    print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}")
    print("Classification Report:\n", report)

    return avg_loss, precision, recall, f1

# Train/evaluate cycle: one training pass, one evaluation pass and one
# scheduler step per epoch.
num_epochs = 19  # number of epochs to run (described as the best epoch count)
for epoch in range(num_epochs):
    train_loss = train_model(
        model=model, train_loader=train_loader, criterion=criterion, optimizer=optimizer, device=device
    )
    test_loss, precision, recall, f1 = evaluate_model(
        model=model, test_loader=test_loader, criterion=criterion, device=device, idx_to_label=idx_to_label
    )
    # Advance the learning-rate schedule.
    scheduler.step()
    print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}')


  from .autonotebook import tqdm as notebook_tqdm


使用的設備: cuda
Test Loss: 3.0629
Precision: 0.0501, Recall: 0.1187, F1 Score: 0.0705
Classification Report:
               precision    recall  f1-score   support

       _cast       0.01      0.11      0.02        18
       _char       0.00      0.00      0.00         1
    _country       0.00      0.00      0.00        23
   _director       0.02      0.09      0.03        32
      _genre       0.00      0.00      0.00        13
   _language       0.06      0.50      0.10        20
   _location       0.00      0.00      0.00         1
      _movie       0.01      0.00      0.01       217
_mpaa_rating       0.10      0.62      0.18        32
     _person       0.20      0.08      0.11        39
   _producer       0.03      0.11      0.05        38
    _subject       0.09      0.52      0.15        21

   micro avg       0.05      0.12      0.07       455
   macro avg       0.04      0.17      0.05       455
weighted avg       0.04      0.12      0.04       455

Epoch 1/19, Train Loss: 3.2

In [2]:
# Continue training the existing model, optimizer and scheduler for more epochs.
num_epochs = 190  # number of epochs to run (described as the best epoch count)
for epoch in range(num_epochs):
    train_loss = train_model(
        model=model, train_loader=train_loader, criterion=criterion, optimizer=optimizer, device=device
    )
    test_loss, precision, recall, f1 = evaluate_model(
        model=model, test_loader=test_loader, criterion=criterion, device=device, idx_to_label=idx_to_label
    )
    # Advance the learning-rate schedule.
    scheduler.step()
    print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}')


Test Loss: 1.3013
Precision: 0.4848, Recall: 0.6308, F1 Score: 0.5482
Classification Report:
               precision    recall  f1-score   support

       _cast       0.61      0.78      0.68        18
       _char       1.00      1.00      1.00         1
    _country       0.80      0.87      0.83        23
   _director       0.62      0.81      0.70        32
      _genre       0.70      0.54      0.61        13
   _language       0.74      1.00      0.85        20
   _location       0.00      0.00      0.00         1
      _movie       0.37      0.52      0.44       217
_mpaa_rating       0.50      0.69      0.58        32
     _person       0.63      0.69      0.66        39
   _producer       0.46      0.55      0.50        38
    _subject       0.57      0.76      0.65        21
       movie       0.00      0.00      0.00         0

   micro avg       0.48      0.63      0.55       455
   macro avg       0.54      0.63      0.58       455
weighted avg       0.49      0.63      0

In [4]:
# Load the test data that the submission is generated for.
test_df = pd.read_csv('hw2_test.csv')
# Build the submission file.
def generate_submission_file(model, test_df, tokenizer, bert_model, idx_to_label, device, output_file="submission_GRU_Best.csv", max_length=32):
    """Predict one tag per whitespace word of every test utterance and write a CSV.

    Inference mirrors how the training tensors in this notebook are laid out:
    utterances are padded/truncated to the same length used for training
    (``MAX_LENGTH = 32``), and the training labels place the tag of the i-th
    whitespace word at sequence position i, so the tag for word i is read from
    position i of the prediction. Each output row therefore contains exactly as
    many tags as the utterance has words; words beyond ``max_length`` get "O".

    Args:
        model: Trained tagger mapping embeddings to per-position scores.
        test_df: DataFrame with "ID" and "utterances" columns.
        tokenizer: Tokenizer used to prepare the BERT inputs.
        bert_model: Encoder whose output exposes ``last_hidden_state``.
        idx_to_label: Mapping from predicted index to tag string.
        device: Device the tokenized inputs are moved to.
        output_file: Path of the CSV to write.
        max_length: Sequence length used for tokenization; should match training.
    """
    model.eval()
    predictions = []

    with torch.no_grad():
        for utterance in test_df["utterances"]:
            num_words = len(utterance.split())

            # 1. Turn the utterance into BERT embeddings.
            inputs = tokenizer(
                utterance, padding="max_length", truncation=True, max_length=max_length, return_tensors="pt"
            ).to(device)
            embeddings = bert_model(**inputs).last_hidden_state

            # 2. Predict per-position scores and take the best class at every position.
            outputs = model(embeddings)
            pred_ids = torch.argmax(outputs, dim=-1).reshape(-1).tolist()

            # 3. Word i is tagged with the prediction at position i.
            tags = [idx_to_label[idx] for idx in pred_ids[:num_words]]
            # Words that did not fit into the sequence have no prediction.
            tags.extend(["O"] * (num_words - len(tags)))

            # 4. Store the space-separated tags for this utterance.
            predictions.append(" ".join(tags))

    # Write the predictions in the submission format.
    submission_df = pd.DataFrame({"ID": test_df["ID"], "IOB Slot tags": predictions})
    submission_df.to_csv(output_file, index=False)
    print(f"提交文件已生成：{output_file}")

# Decode with the label mapping that was built together with the training
# labels: the model's output indices only have meaning under that mapping, so
# it is reused here instead of being rebuilt. Check that it contains the "O"
# tag and that the two lookup tables are inverses of each other.
assert "O" in label_to_index, 'label mapping is missing the "O" tag'
assert all(label_to_index[label] == idx for idx, label in idx_to_label.items()), "idx_to_label is not the inverse of label_to_index"

# Generate the submission file.
generate_submission_file(model, test_df, tokenizer, bert_model, idx_to_label, device)


提交文件已生成：submission_GRU_Best.csv
