In [16]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import TensorDataset, DataLoader

# 檢查是否有 GPU 可用
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("使用的設備:", device)

# 加載數據
train_df = pd.read_csv('hw2_train.csv')
print(train_df)

# 初始化 BERT Tokenizer 和 Model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = BertModel.from_pretrained("bert-base-uncased").to(device)

# 設定最大序列長度
MAX_LENGTH = 128  # 可根據需要調整

# 調整 BERT 編碼函數
def encode_texts(texts, tokenizer, bert_model, device, max_length=MAX_LENGTH):
    inputs = tokenizer(
        texts, padding="max_length", truncation=True, max_length=max_length, return_tensors="pt"
    ).to(device)
    with torch.no_grad():
        outputs = bert_model(**inputs)
    # 使用 BERT 的最後一層隱藏狀態作為嵌入表示
    embeddings = outputs.last_hidden_state  # Shape: (batch_size, max_length, hidden_dim)
    return embeddings, inputs["attention_mask"].sum(dim=1)  # 返回序列長度以進行對比

# 提取句子的 BERT 嵌入
utterances = train_df['utterances'].tolist()
embeddings, sequence_lengths = encode_texts(utterances, tokenizer, bert_model, device)

# ======================== 新增的標籤處理部分 ========================
# 構建標籤映射字典
unique_labels = set(label for tags in train_df['IOB Slot tags'] for label in tags.split())
label_to_index = {label: idx for idx, label in enumerate(unique_labels)}

# 將標籤轉換為數字格式並填充到 BERT 輸出的 max_length 長度
labels = train_df['IOB Slot tags'].apply(lambda x: [label_to_index[label] for label in x.split()])

# 強制填充標籤到 BERT 輸出的長度（128）
labels_padded = pad_sequence(
    [torch.tensor(label + [label_to_index["O"]] * (MAX_LENGTH - len(label))) for label in labels], 
    batch_first=True
).to(device)

# 打印標籤填充後的形狀以進行檢查
print(f"Labels padded shape: {labels_padded.shape}")
# ===============================================================

# 最後確認 BERT 嵌入與標籤的形狀一致
assert embeddings.shape[1] == labels_padded.shape[1], "BERT 輸出序列長度和標籤序列長度不匹配"
# 分割訓練和測試數據
X_train, X_test, y_train, y_test = train_test_split(embeddings, labels_padded, test_size=0.2, random_state=42)
print
# 創建數據集和數據加載器
train_dataset = TensorDataset(X_train, y_train)
test_dataset = TensorDataset(X_test, y_test)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# 定義 LSTM 模型來處理 BERT 輸出
class SlotTaggingModel(nn.Module):
    def __init__(self, bert_hidden_dim, hidden_dim, output_dim):
        super(SlotTaggingModel, self).__init__()
        self.lstm = nn.LSTM(bert_hidden_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, x):
        lstm_out, _ = self.lstm(x)
        output = self.fc(lstm_out)
        return output

# 初始化模型
bert_hidden_dim = embeddings.shape[2]  # BERT 隱藏層的輸出維度
hidden_dim = 128  # LSTM 隱藏層的維度
output_dim = len(label_to_index)  # 標籤數量
model = SlotTaggingModel(bert_hidden_dim, hidden_dim, output_dim).to(device)

# 定義損失函數和優化器
criterion = nn.CrossEntropyLoss(ignore_index=label_to_index["O"])  # 忽略填充標籤
optimizer = optim.Adam(model.parameters(), lr=0.001)

# 訓練模型
def train_model(model, train_loader, criterion, optimizer, device):
    model.train()
    total_loss = 0.0
    for batch_x, batch_y in train_loader:
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)
        
        # 檢查輸入和標籤的形狀
        assert batch_x.shape[1] == batch_y.shape[1], "輸出序列長度和標籤序列長度不匹配"

        optimizer.zero_grad()
        outputs = model(batch_x).view(-1, output_dim)  # (batch_size * seq_len, output_dim)
        batch_y = batch_y.view(-1)  # (batch_size * seq_len)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(train_loader)

# 評估模型
def evaluate_model(model, test_loader, criterion, device):
    model.eval()
    total_loss = 0.0
    with torch.no_grad():
        for batch_x, batch_y in test_loader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            
            # 檢查輸入和標籤的形狀
            assert batch_x.shape[1] == batch_y.shape[1], "輸出序列長度和標籤序列長度不匹配"

            outputs = model(batch_x).view(-1, output_dim)
            batch_y = batch_y.view(-1)
            loss = criterion(outputs, batch_y)
            total_loss += loss.item()
    return total_loss / len(test_loader)

# 訓練和評估過程
num_epochs = 10
for epoch in range(num_epochs):
    train_loss = train_model(model, train_loader, criterion, optimizer, device)
    test_loss = evaluate_model(model, test_loader, criterion, device)
    print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}')


使用的設備: cpu
        ID                                         utterances  \
0        1               who plays luke on star wars new hope   
1        2                     show credits for the godfather   
2        3             who was the main actor in the exorcist   
3        4  find the female actress from the movie she 's ...   
4        5                    who played dory on finding nemo   
...    ...                                                ...   
2307  2308               what was the revenue for toy story 3   
2308  2309                                dark knight revenue   
2309  2310               how much did the dark night generate   
2310  2311                 can i see the lion king 's revenue   
2311  2312        can i see what the lion king 's revenue was   

                                      IOB Slot tags  
0      O O B_char O B_movie I_movie I_movie I_movie  
1                             O O O B_movie I_movie  
2                       O O O O O O B_movie I_



Labels padded shape: torch.Size([2312, 128])
Epoch 1/10, Train Loss: 1.7926, Test Loss: 1.0691
Epoch 2/10, Train Loss: 0.7987, Test Loss: 0.6648
Epoch 3/10, Train Loss: 0.4944, Test Loss: 0.5183
Epoch 4/10, Train Loss: 0.3511, Test Loss: 0.4474
Epoch 5/10, Train Loss: 0.2493, Test Loss: 0.4068
Epoch 6/10, Train Loss: 0.1879, Test Loss: 0.3916
Epoch 7/10, Train Loss: 0.1455, Test Loss: 0.3882
Epoch 8/10, Train Loss: 0.1225, Test Loss: 0.3616
Epoch 9/10, Train Loss: 0.0898, Test Loss: 0.3688
Epoch 10/10, Train Loss: 0.0732, Test Loss: 0.3705


In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import TensorDataset, DataLoader
from seqeval.metrics import classification_report, f1_score, precision_score, recall_score

# 檢查是否有 GPU 可用
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("使用的設備:", device)

# 加載數據
train_df = pd.read_csv('hw2_train.csv')
print(train_df)

# 初始化 BERT Tokenizer 和 Model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = BertModel.from_pretrained("bert-base-uncased").to(device)

# 設定最大序列長度
MAX_LENGTH = 128  # 可根據需要調整

# 調整 BERT 編碼函數
def encode_texts(texts, tokenizer, bert_model, device, max_length=MAX_LENGTH):
    inputs = tokenizer(
        texts, padding="max_length", truncation=True, max_length=max_length, return_tensors="pt"
    ).to(device)
    with torch.no_grad():
        outputs = bert_model(**inputs)
    embeddings = outputs.last_hidden_state  # Shape: (batch_size, max_length, hidden_dim)
    return embeddings, inputs["attention_mask"].sum(dim=1)

# 提取句子的 BERT 嵌入
utterances = train_df['utterances'].tolist()
embeddings, sequence_lengths = encode_texts(utterances, tokenizer, bert_model, device)

# 構建標籤映射字典
unique_labels = set(label for tags in train_df['IOB Slot tags'] for label in tags.split())
label_to_index = {label: idx for idx, label in enumerate(unique_labels)}
idx_to_label = {idx: label for label, idx in label_to_index.items()}

# 將標籤轉換為數字格式並填充到 BERT 輸出的 max_length 長度
labels = train_df['IOB Slot tags'].apply(lambda x: [label_to_index[label] for label in x.split()])
labels_padded = pad_sequence(
    [torch.tensor(label + [label_to_index["O"]] * (MAX_LENGTH - len(label))) for label in labels], 
    batch_first=True
).to(device)

# 最後確認 BERT 嵌入與標籤的形狀一致
assert embeddings.shape[1] == labels_padded.shape[1], "BERT 輸出序列長度和標籤序列長度不匹配"
X_train, X_test, y_train, y_test = train_test_split(embeddings, labels_padded, test_size=0.2, random_state=42)

# 創建數據集和數據加載器
train_dataset = TensorDataset(X_train, y_train)
test_dataset = TensorDataset(X_test, y_test)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# 定義 LSTM 模型來處理 BERT 輸出
class SlotTaggingModel(nn.Module):
    def __init__(self, bert_hidden_dim, hidden_dim, output_dim):
        super(SlotTaggingModel, self).__init__()
        self.lstm = nn.LSTM(bert_hidden_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, x):
        lstm_out, _ = self.lstm(x)
        output = self.fc(lstm_out)
        return output

# 初始化模型
bert_hidden_dim = embeddings.shape[2]
hidden_dim = 128
output_dim = len(label_to_index)
model = SlotTaggingModel(bert_hidden_dim, hidden_dim, output_dim).to(device)

# 定義損失函數和優化器
criterion = nn.CrossEntropyLoss(ignore_index=label_to_index["O"])
optimizer = optim.Adam(model.parameters(), lr=0.001)

# 訓練模型
def train_model(model, train_loader, criterion, optimizer, device):
    model.train()
    total_loss = 0.0
    for batch_x, batch_y in train_loader:
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)
        assert batch_x.shape[1] == batch_y.shape[1], "輸出序列長度和標籤序列長度不匹配"

        optimizer.zero_grad()
        outputs = model(batch_x).view(-1, output_dim)
        batch_y = batch_y.view(-1)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(train_loader)

# 調整後的評估模型
def evaluate_model(model, test_loader, criterion, device, idx_to_label):
    model.eval()
    total_loss = 0.0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch_x, batch_y in test_loader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            assert batch_x.shape[1] == batch_y.shape[1], "輸出序列長度和標籤序列長度不匹配"

            outputs = model(batch_x).view(-1, output_dim)
            batch_y = batch_y.view(-1)
            loss = criterion(outputs, batch_y)
            total_loss += loss.item()

            preds = torch.argmax(outputs, dim=1).cpu().numpy()
            labels = batch_y.cpu().numpy()

            # 將 index 轉換為對應的標籤
            preds = [idx_to_label[idx] for idx in preds]
            labels = [idx_to_label[idx] for idx in labels]

            all_preds.append(preds)
            all_labels.append(labels)

    avg_loss = total_loss / len(test_loader)
    
    # 使用 seqeval 計算精確度、召回率和 F1 分數
    report = classification_report(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds)
    recall = recall_score(all_labels, all_preds)

    print(f"Test Loss: {avg_loss:.4f}")
    print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}")
    print("Classification Report:\n", report)

    return avg_loss, precision, recall, f1

# 訓練和評估過程
num_epochs = 10
for epoch in range(num_epochs):
    train_loss = train_model(model, train_loader, criterion, optimizer, device)
    test_loss, precision, recall, f1 = evaluate_model(model, test_loader, criterion, device, idx_to_label)
    print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}')


  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


使用的設備: cpu
        ID                                         utterances  \
0        1               who plays luke on star wars new hope   
1        2                     show credits for the godfather   
2        3             who was the main actor in the exorcist   
3        4  find the female actress from the movie she 's ...   
4        5                    who played dory on finding nemo   
...    ...                                                ...   
2307  2308               what was the revenue for toy story 3   
2308  2309                                dark knight revenue   
2309  2310               how much did the dark night generate   
2310  2311                 can i see the lion king 's revenue   
2311  2312        can i see what the lion king 's revenue was   

                                      IOB Slot tags  
0      O O B_char O B_movie I_movie I_movie I_movie  
1                             O O O B_movie I_movie  
2                       O O O O O O B_movie I_

  _warn_prf(average, modifier, msg_start, len(result))


Test Loss: 1.0489
Precision: 0.0019, Recall: 0.2286, F1 Score: 0.0038
Classification Report:
               precision    recall  f1-score   support

       _cast       0.02      0.11      0.03        18
       _char       0.00      0.00      0.00         1
    _country       0.00      0.83      0.01        23
   _director       0.00      0.09      0.00        32
      _genre       0.50      0.08      0.13        13
   _language       0.01      0.60      0.02        20
   _location       0.00      0.00      0.00         1
      _movie       0.00      0.13      0.00       217
_mpaa_rating       0.01      0.78      0.02        32
     _person       0.01      0.13      0.02        39
   _producer       0.01      0.05      0.02        38
    _subject       0.01      0.33      0.02        21

   micro avg       0.00      0.23      0.00       455
   macro avg       0.05      0.26      0.02       455
weighted avg       0.02      0.23      0.01       455

Epoch 1/10, Train Loss: 1.8051, Test Lo



Test Loss: 0.4121
Precision: 0.0020, Recall: 0.2132, F1 Score: 0.0040
Classification Report:
               precision    recall  f1-score   support

       _cast       0.00      0.00      0.00        18
       _char       0.33      1.00      0.50         1
    _country       0.01      0.91      0.01        23
   _director       0.00      0.06      0.00        32
      _genre       0.01      0.38      0.02        13
   _language       0.00      0.60      0.01        20
   _location       0.00      0.00      0.00         1
      _movie       0.00      0.10      0.00       217
_mpaa_rating       0.00      0.75      0.01        32
     _person       0.00      0.10      0.01        39
   _producer       0.00      0.03      0.00        38
    _subject       0.00      0.29      0.01        21

   micro avg       0.00      0.21      0.00       455
   macro avg       0.03      0.35      0.05       455
weighted avg       0.00      0.21      0.00       455

Epoch 5/10, Train Loss: 0.2563, Test Lo

In [9]:
import pandas as pd
import torch

# 假設已經定義和訓練好的模型 SlotTaggingModel
# 載入測試數據
test_df = pd.read_csv('hw2_test.csv')

# 定義生成提交文件的函數
def generate_submission_file(model, test_df, tokenizer, bert_model, idx_to_label, device, output_file="submission.csv"):
    model.eval()
    predictions = []

    with torch.no_grad():
        for idx, row in test_df.iterrows():
            utterance = row["utterances"]
            
            # 1. 使用 BERT 將句子轉換為嵌入
            inputs = tokenizer(utterance, padding="max_length", truncation=True, max_length=128, return_tensors="pt").to(device)
            embeddings = bert_model(**inputs).last_hidden_state  # 取得 BERT 最後一層的輸出 (batch_size, seq_len, hidden_dim)
            
            # 2. 使用模型進行預測
            outputs = model(embeddings)  # 確認這裡的輸出是 (batch_size, seq_len, output_dim)
            
            # 3. 確保 outputs 的維度符合預期
            if outputs.dim() == 2:  # 當輸出只有 (seq_len, output_dim) 時進行擴展
                outputs = outputs.unsqueeze(0)  # 增加 batch 維度

            # 4. 獲取每個位置的預測標籤
            pred_labels = torch.argmax(outputs, dim=2).squeeze().cpu().numpy()
            pred_labels = [idx_to_label[label] for label in pred_labels[:len(inputs['input_ids'][0])]]

            # 5. 將子詞標籤整合成單詞標籤
            tokens = tokenizer.tokenize(utterance)
            final_labels = []
            token_idx = 0

            for label in pred_labels:
                if token_idx >= len(tokens):
                    break  # 防止 token_idx 超出 tokens 的長度
                
                # 跳過以 "##" 開頭的子詞標籤
                if tokens[token_idx].startswith("##"):
                    token_idx += 1
                    continue
                
                final_labels.append(label)
                token_idx += 1

            # 6. 加入預測結果
            predictions.append(" ".join(final_labels))

    # 建立提交文件格式
    submission_df = pd.DataFrame({"ID": test_df["ID"], "IOB Slot Tags": predictions})
    submission_df.to_csv(output_file, index=False)
    print(f"提交文件已生成：{output_file}")

# 構建標籤映射字典
unique_labels = set(label for tags in train_df['IOB Slot tags'] for label in tags.split())
label_to_index = {label: idx for idx, label in enumerate(unique_labels)}
idx_to_label = {idx: label for label, idx in label_to_index.items()}

# 生成提交文件
generate_submission_file(model, test_df, tokenizer, bert_model, idx_to_label, device)


提交文件已生成：submission.csv
