In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader
import sys

# 檢查是否有 GPU 可用
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("使用的設備:", device)

使用的設備: cuda


In [7]:
train_file_path = r"C:\Users\USER\Downloads\NLP-Courses\NLP243\Assignments\HW2\hw2_train.csv"

# Load the file
train_df = pd.read_csv(train_file_path)
print(train_df)

        ID                                         utterances  \
0        1               who plays luke on star wars new hope   
1        2                     show credits for the godfather   
2        3             who was the main actor in the exorcist   
3        4  find the female actress from the movie she 's ...   
4        5                    who played dory on finding nemo   
...    ...                                                ...   
2307  2308               what was the revenue for toy story 3   
2308  2309                                dark knight revenue   
2309  2310               how much did the dark night generate   
2310  2311                 can i see the lion king 's revenue   
2311  2312        can i see what the lion king 's revenue was   

                                      IOB Slot tags  
0      O O B_char O B_movie I_movie I_movie I_movie  
1                             O O O B_movie I_movie  
2                       O O O O O O B_movie I_movie  
3  

In [11]:
# 使用 CountVectorizer 將文本數據轉換為詞袋表示
vectorizer = CountVectorizer(max_features=1000)  # 限制特徵數量為 1000
X = vectorizer.fit_transform(train_df['utterances'])  # 假設 CSV 中有 'sentence' 列
X = X.toarray()  # 將稀疏矩陣轉換為密集矩陣

In [13]:

# 假設我們有標籤列 'tags'，並進行標籤二值化
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(train_df['IOB Slot tags'].apply(lambda x: x.split()))  # 將標籤列轉換為列表

In [15]:

# 將數據分割為訓練集和測試集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
# 將數據轉換為 PyTorch 張量
X_train_tensor = torch.tensor(X_train, dtype=torch.float32).to(device)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).to(device)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32).to(device)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).to(device)

In [19]:
# 創建數據集和數據加載器
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [21]:
class SlotTaggingModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(SlotTaggingModel, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        self.softmax = nn.Softmax(dim=1)  # 对输出进行softmax

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return self.softmax(x)

# 模型初始化
input_dim = X_train.shape[1]  # 詞袋模型的特徵數量
hidden_dim = 128  # 隱藏層的大小，可以根據需求調整
output_dim = y_train.shape[1]  # 標籤的數量

model = SlotTaggingModel(input_dim, hidden_dim, output_dim).to(device)

# 定義損失函數和優化器
criterion = nn.BCELoss()  # 二元交叉熵損失
optimizer = optim.Adam(model.parameters(), lr=0.001)


In [25]:
# 訓練模型
def train_model(model, train_loader, criterion, optimizer, device):
    model.train()
    running_loss = 0.0
    for data in train_loader:
        inputs, labels = data
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    return running_loss / len(train_loader)

# 評估模型
def evaluate_model(model, test_loader, criterion, device):
    model.eval()
    total_loss = 0.0
    with torch.no_grad():
        for data in test_loader:
            inputs, labels = data
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            total_loss += loss.item()
    return total_loss / len(test_loader)

# 訓練和評估過程
num_epochs = 500
for epoch in range(num_epochs):
    train_loss = train_model(model, train_loader, criterion, optimizer, device)
    test_loss = evaluate_model(model, test_loader, criterion, device)
    print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}')


Epoch 1/500, Train Loss: 0.1000, Test Loss: 0.1125
Epoch 2/500, Train Loss: 0.0982, Test Loss: 0.1122
Epoch 3/500, Train Loss: 0.0970, Test Loss: 0.1113
Epoch 4/500, Train Loss: 0.0959, Test Loss: 0.1113
Epoch 5/500, Train Loss: 0.0951, Test Loss: 0.1113
Epoch 6/500, Train Loss: 0.0942, Test Loss: 0.1115
Epoch 7/500, Train Loss: 0.0936, Test Loss: 0.1114
Epoch 8/500, Train Loss: 0.0930, Test Loss: 0.1112
Epoch 9/500, Train Loss: 0.0925, Test Loss: 0.1114
Epoch 10/500, Train Loss: 0.0921, Test Loss: 0.1115
Epoch 11/500, Train Loss: 0.0916, Test Loss: 0.1118
Epoch 12/500, Train Loss: 0.0913, Test Loss: 0.1120
Epoch 13/500, Train Loss: 0.0911, Test Loss: 0.1122
Epoch 14/500, Train Loss: 0.0907, Test Loss: 0.1125
Epoch 15/500, Train Loss: 0.0905, Test Loss: 0.1127
Epoch 16/500, Train Loss: 0.0903, Test Loss: 0.1128
Epoch 17/500, Train Loss: 0.0901, Test Loss: 0.1133
Epoch 18/500, Train Loss: 0.0899, Test Loss: 0.1135
Epoch 19/500, Train Loss: 0.0898, Test Loss: 0.1139
Epoch 20/500, Train L