In [24]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader
import sys

# 檢查是否有 GPU 可用
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("使用的設備:", device)
train_file_path = r"C:\Users\USER\Downloads\NLP-Courses\NLP243\Assignments\HW2\hw2_train.csv"

# Load the file
train_df = pd.read_csv(train_file_path)
print(train_df)

使用的設備: cpu
        ID                                         utterances  \
0        1               who plays luke on star wars new hope   
1        2                     show credits for the godfather   
2        3             who was the main actor in the exorcist   
3        4  find the female actress from the movie she 's ...   
4        5                    who played dory on finding nemo   
...    ...                                                ...   
2307  2308               what was the revenue for toy story 3   
2308  2309                                dark knight revenue   
2309  2310               how much did the dark night generate   
2310  2311                 can i see the lion king 's revenue   
2311  2312        can i see what the lion king 's revenue was   

                                      IOB Slot tags  
0      O O B_char O B_movie I_movie I_movie I_movie  
1                             O O O B_movie I_movie  
2                       O O O O O O B_movie I_

In [19]:
# Load the data
train_df = pd.read_csv(train_file_path)

# Tokenize utterances and slot tags
X_utterances = train_df["utterances"].values
y_tags = train_df["IOB Slot tags"].apply(lambda x: x.split()).values

# Flatten the tags and then one-hot encode for PyTorch
all_tags = set(tag for tags in y_tags for tag in tags)
tag2idx = {tag: idx for idx, tag in enumerate(sorted(all_tags))}
idx2tag = {idx: tag for tag, idx in tag2idx.items()}
# print("all_tags = " , all_tags)
# print("tag2idx = " , tag2idx)
# print("idx2tag = " , idx2tag)

In [20]:
# Encode tags
y_encoded = [[tag2idx[tag] for tag in tags] for tags in y_tags]
# print("y_encoded = " , y_encoded)

In [8]:
# Convert utterances to feature vectors
vectorizer = CountVectorizer(tokenizer=lambda x: x.split())
X_features = vectorizer.fit_transform(X_utterances).toarray()

In [21]:
# Define a fixed maximum sequence length
max_len = max(len(seq) for seq in y_encoded)

# Pad feature vectors and labels to max_len
X_padded = np.array([np.pad(seq, (0, max_len - len(seq)), 'constant') for seq in X_features])
y_padded = np.array([np.pad(seq, (0, max_len - len(seq)), 'constant', constant_values=tag2idx['O']) for seq in y_encoded])

# Train-test split
X_train, X_val, y_train, y_val = train_test_split(X_padded, y_padded, test_size=0.2, random_state=42)

# Convert to tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val, dtype=torch.long)

# Create DataLoader
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
val_loader = DataLoader(val_dataset, batch_size=32)


ValueError: index can't contain negative values

In [22]:
X_train_tensor = X_train_tensor.unsqueeze(1)  # Shape will be (batch_size, 1, feature_vector_length)
X_val_tensor = X_val_tensor.unsqueeze(1)
class SlotTaggingLSTM(nn.Module):
    def __init__(self, input_size, hidden_dim, output_dim):
        super(SlotTaggingLSTM, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        lstm_out, _ = self.lstm(x)
        tag_scores = self.fc(lstm_out.squeeze(1))  # Squeeze to remove the sequence dimension
        return tag_scores

# Adjust input size and initialize
input_size = X_train_tensor.shape[2]  # Feature vector length
hidden_dim = 128
output_dim = len(tag2idx)
model = SlotTaggingLSTM(input_size, hidden_dim, output_dim).to(device)


In [23]:
# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    train_loss = 0
    for batch_x, batch_y in train_loader:
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)
        
        # Forward pass
        optimizer.zero_grad()
        outputs = model(batch_x)  # Expected shape: (batch_size, sequence_length, output_dim)
        
        # Print shapes for debugging
        print(f"Outputs shape before view: {outputs.shape}")  # Should be (batch_size, sequence_length, output_dim)
        print(f"Batch_y shape before view: {batch_y.shape}")  # Should be (batch_size, sequence_length)
        
        # Reshape outputs and targets for loss computation
        outputs = outputs.view(-1, output_dim)  # Flatten to (batch_size * sequence_length, output_dim)
        batch_y = batch_y.view(-1)  # Flatten to (batch_size * sequence_length)
        
        # Print shapes after reshaping
        print(f"Outputs shape after view: {outputs.shape}")  # Should be (batch_size * sequence_length, output_dim)
        print(f"Batch_y shape after view: {batch_y.shape}")  # Should be (batch_size * sequence_length)
        
        # Compute loss and update weights
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        
        train_loss += loss.item()
    
    # Validation
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch_x, batch_y in val_loader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            outputs = model(batch_x).view(-1, output_dim)
            batch_y = batch_y.view(-1)
            val_loss += criterion(outputs, batch_y).item()
    
    print(f'Epoch {epoch+1}/{num_epochs}, Training Loss: {train_loss/len(train_loader)}, Validation Loss: {val_loss/len(val_loader)}')


RuntimeError: input.size(-1) must be equal to input_size. Expected 1, got 1159

In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from seqeval.metrics import f1_score, classification_report

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Load the dataset
train_file_path = "./hw2_train.csv"
train_df = pd.read_csv(train_file_path)
train_df['tokens'] = train_df['utterances'].apply(lambda x: x.split())
train_df['labels'] = train_df['IOB Slot tags'].apply(lambda x: x.split())

# Build vocabularies
all_tokens = [token for tokens in train_df['tokens'] for token in tokens]
word2idx = {word: idx + 2 for idx, word in enumerate(set(all_tokens))}
word2idx['<PAD>'] = 0
word2idx['<UNK>'] = 1

all_labels = [label for labels in train_df['labels'] for label in labels]
tag2idx = {tag: idx for idx, tag in enumerate(set(all_labels))}
tag2idx['O'] = 0  # Assume 'O' is the padding label

# Encoding and padding sequences
def encode_and_pad(tokens_list, labels_list, word2idx, tag2idx):
    sequences = []
    label_sequences = []
    lengths = []
    for tokens, labels in zip(tokens_list, labels_list):
        seq = [word2idx.get(token, word2idx['<UNK>']) for token in tokens]
        label_seq = [tag2idx[label] for label in labels]
        sequences.append(torch.tensor(seq))
        label_sequences.append(torch.tensor(label_seq))
        lengths.append(len(seq))
    sequences_padded = pad_sequence(sequences, batch_first=True, padding_value=word2idx['<PAD>'])
    labels_padded = pad_sequence(label_sequences, batch_first=True, padding_value=tag2idx['O'])
    lengths = torch.tensor(lengths)
    return sequences_padded, labels_padded, lengths

tokens_list = train_df['tokens'].tolist()
labels_list = train_df['labels'].tolist()
X_padded, y_padded, seq_lengths = encode_and_pad(tokens_list, labels_list, word2idx, tag2idx)

# Split data into training and validation sets
X_train, X_val, y_train, y_val, train_lengths, val_lengths = train_test_split(
    X_padded, y_padded, seq_lengths, test_size=0.2, random_state=42
)

# Dataset and DataLoader
class SlotDataset(Dataset):
    def __init__(self, sequences, labels, lengths):
        self.sequences = sequences
        self.labels = labels
        self.lengths = lengths

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        return self.sequences[idx], self.labels[idx], self.lengths[idx]

train_dataset = SlotDataset(X_train, y_train, train_lengths)
val_dataset = SlotDataset(X_val, y_val, val_lengths)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

# Model definition
class SlotTaggingLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(SlotTaggingLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=word2idx['<PAD>'])
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, x, lengths):
        embedded = self.embedding(x)
        packed_embedded = pack_padded_sequence(embedded, lengths.cpu(), batch_first=True, enforce_sorted=False)
        packed_output, _ = self.lstm(packed_embedded)
        output, _ = pad_packed_sequence(packed_output, batch_first=True)
        logits = self.fc(output)
        return logits

model = SlotTaggingLSTM(len(word2idx), embedding_dim=100, hidden_dim=128, output_dim=len(tag2idx)).to(device)
criterion = nn.CrossEntropyLoss(ignore_index=tag2idx['O'])
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training function
def train_model(model, dataloader, criterion, optimizer, device):
    model.train()
    total_loss = 0
    for sequences, labels, lengths in dataloader:
        sequences, labels, lengths = sequences.to(device), labels.to(device), lengths.to(device)
        
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(sequences, lengths)  # [batch_size, seq_len, num_classes]
        
        # Reshape outputs and labels for loss calculation
        outputs = outputs.view(-1, outputs.shape[-1])  # [batch_size * seq_len, num_classes]
        labels = labels.view(-1)                       # [batch_size * seq_len]
        
        # Compute loss with mask applied
        loss = criterion(outputs, labels)
        
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)

# Evaluation function
def evaluate_model(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0
    all_preds, all_labels = [], []
    with torch.no_grad():
        for sequences, labels, lengths in dataloader:
            sequences, labels, lengths = sequences.to(device), labels.to(device), lengths.to(device)
            
            # Forward pass
            outputs = model(sequences, lengths)  # [batch_size, seq_len, num_classes]
            
            # Reshape for loss calculation
            outputs = outputs.view(-1, outputs.shape[-1])  # [batch_size * seq_len, num_classes]
            labels = labels.view(-1)                       # [batch_size * seq_len]
            
            # Compute loss
            loss = criterion(outputs, labels)
            total_loss += loss.item()
            
            # Predictions
            preds = torch.argmax(outputs, dim=1)
            
            # Collect predictions and labels for evaluation
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    return total_loss / len(dataloader), all_preds, all_labels

# Training and evaluation
num_epochs = 10
for epoch in range(num_epochs):
    train_loss = train_model(model, train_loader, criterion, optimizer, device)
    val_loss, val_preds, val_labels = evaluate_model(model, val_loader, criterion, device)
    print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')

    # Convert predictions and labels to tags for F1 evaluation
    val_preds_tags = [idx2tag[idx] for idx in val_preds]
    val_labels_tags = [idx2tag[idx] for idx in val_labels]
    
    # Calculate F1 score using seqeval
    val_preds_split, val_labels_split = [], []
    idx = 0
    for length in val_lengths:
        val_preds_split.append(val_preds_tags[idx:idx+length])
        val_labels_split.append(val_labels_tags[idx:idx+length])
        idx += length
    f1 = f1_score(val_labels_split, val_preds_split)
    print(f'Validation F1 Score: {f1:.4f}')


使用的設備: cpu
