In [1]:
import csv
import re
import torch
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from collections import Counter
from torch import nn
from torch.optim import AdamW
import numpy as np
from datasets import load_dataset

# Specify device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Load the dataset
ptb = load_dataset('ptb_text_only', split=['train', 'validation', 'test'], trust_remote_code=True)

# Tokenization and Vocabulary Building
def tokenize(text):
    return re.findall(r'\w+', text.lower())

def build_vocab(dataset):
    counter = Counter()
    for example in dataset:
        tokens = tokenize(example['sentence'])
        counter.update(tokens)
    vocab = {word: idx for idx, (word, _) in enumerate(counter.items())}
    vocab['<PAD>'] = len(vocab)
    return vocab

vocab = build_vocab(ptb[0])
vocab_size = len(vocab)
pad_token_idx = vocab['<PAD>']

# Convert text to sequences of indices
def encode_text(text, vocab):
    return [vocab[word] for word in tokenize(text) if word in vocab]

# Process each split
train_data = [torch.tensor(encode_text(example['sentence'], vocab)) for example in ptb[0]]
val_data = [torch.tensor(encode_text(example['sentence'], vocab)) for example in ptb[1]]
test_data = [torch.tensor(encode_text(example['sentence'], vocab)) for example in ptb[2]]

# DataLoader preparation
def collate_batch(batch):
    sequences = pad_sequence(batch, batch_first=True, padding_value=pad_token_idx)
    return sequences[:, :-1], sequences[:, 1:]  # Inputs and targets

train_loader = DataLoader(train_data, batch_size=32, shuffle=True, collate_fn=collate_batch)
val_loader = DataLoader(val_data, batch_size=32, shuffle=False, collate_fn=collate_batch)
test_loader = DataLoader(test_data, batch_size=1, shuffle=False, collate_fn=collate_batch)  # Batch size 1 for sentence-level perplexity

# Define the LSTM Language Model with dropout
class LanguageModelLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, pad_idx, dropout_prob=0.5):
        super(LanguageModelLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=1, dropout=dropout_prob, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)
    
    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        logits = self.fc(lstm_out)
        return logits

# Model Initialization
embedding_dim = 100
hidden_dim = 128
model = LanguageModelLSTM(vocab_size, embedding_dim, hidden_dim, pad_token_idx, dropout_prob=0.5).to(device)

# Loss and Optimizer
criterion = nn.CrossEntropyLoss(ignore_index=pad_token_idx)
optimizer = AdamW(model.parameters(), lr=0.0005)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3, verbose=True)

def train_model(model, dataloader, optimizer, criterion):
    model.train()
    total_loss = 0
    for inputs, targets in dataloader:
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        logits = model(inputs)
        loss = criterion(logits.reshape(-1, vocab_size), targets.reshape(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)

def evaluate_model(model, dataloader, criterion):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for inputs, targets in dataloader:
            inputs, targets = inputs.to(device), targets.to(device)
            logits = model(inputs)
            loss = criterion(logits.reshape(-1, vocab_size), targets.reshape(-1))
            total_loss += loss.item()
    avg_loss = total_loss / len(dataloader)
    perplexity = np.exp(avg_loss)
    return perplexity

# Calculate sentence-wise perplexities on the test set
def evaluate_sentence_perplexities(model, dataloader, criterion, total_sentences):
    model.eval()
    sentence_perplexities = []
    with torch.no_grad():
        for idx, (inputs, targets) in enumerate(dataloader):
            if idx >= total_sentences:  # Stop after required sentences
                break
            inputs, targets = inputs.to(device), targets.to(device)
            if inputs.size(1) == 0:
                perplexity = -1  # Placeholder for empty sequence
            else:
                logits = model(inputs)
                loss = criterion(logits.reshape(-1, vocab_size), targets.reshape(-1))
                perplexity = np.exp(loss.item())
            sentence_perplexities.append((idx, perplexity))
    return sentence_perplexities

# Training Loop with Early Stopping
epochs = 50
best_val_perplexity = float("inf")
epochs_no_improve = 0
early_stop_patience = 5

for epoch in range(epochs):
    train_loss = train_model(model, train_loader, optimizer, criterion)
    val_perplexity = evaluate_model(model, val_loader, criterion)
    
    print(f"Epoch {epoch+1}, Train Loss: {train_loss:.4f}, Validation Perplexity: {val_perplexity:.4f}")
    
    # Update learning rate based on validation perplexity
    scheduler.step(val_perplexity)
    
    # Check for early stopping
    if val_perplexity < best_val_perplexity:
        best_val_perplexity = val_perplexity
        epochs_no_improve = 0
    else:
        epochs_no_improve += 1
        if epochs_no_improve == early_stop_patience:
            print("Early stopping triggered.")
            break

# Evaluate on the test set and save sentence-wise perplexities
total_sentences = 3761  # Expected number of test sentences
test_perplexities = evaluate_sentence_perplexities(model, test_loader, criterion, total_sentences)

# Save the perplexities to the required CSV submission file format
with open("submission_output.csv", mode="w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(["ID", "ppl"])  # Header as per requirement
    for idx, perplexity in test_perplexities:
        writer.writerow([idx, perplexity])

print("Submission file 'LSTM_submission_output.csv' generated.")



  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda




Epoch 1, Train Loss: 6.5245, Validation Perplexity: 450.9675
Epoch 2, Train Loss: 5.9175, Validation Perplexity: 331.0002
Epoch 3, Train Loss: 5.6454, Validation Perplexity: 277.7363
Epoch 4, Train Loss: 5.4629, Validation Perplexity: 247.3272
Epoch 5, Train Loss: 5.3237, Validation Perplexity: 227.1906
Epoch 6, Train Loss: 5.2104, Validation Perplexity: 213.3520
Epoch 7, Train Loss: 5.1152, Validation Perplexity: 202.2631
Epoch 8, Train Loss: 5.0331, Validation Perplexity: 194.3026
Epoch 9, Train Loss: 4.9598, Validation Perplexity: 187.9996
Epoch 10, Train Loss: 4.8942, Validation Perplexity: 182.9223
Epoch 11, Train Loss: 4.8362, Validation Perplexity: 179.0538
Epoch 12, Train Loss: 4.7829, Validation Perplexity: 175.5508
Epoch 13, Train Loss: 4.7346, Validation Perplexity: 173.4530
Epoch 14, Train Loss: 4.6902, Validation Perplexity: 171.4615
Epoch 15, Train Loss: 4.6482, Validation Perplexity: 169.8248
Epoch 16, Train Loss: 4.6086, Validation Perplexity: 168.6048
Epoch 17, Train L

In [None]:
# Define RNN, LSTM, GRU, and Transformer models
class LanguageModelRNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, pad_idx, dropout_prob=0.5):
        super(LanguageModelRNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.rnn = nn.RNN(embedding_dim, hidden_dim, num_layers=1, batch_first=True, dropout=dropout_prob)
        self.fc = nn.Linear(hidden_dim, vocab_size)
    
    def forward(self, x):
        embedded = self.embedding(x)
        rnn_out, _ = self.rnn(embedded)
        logits = self.fc(rnn_out)
        return logits

class LanguageModelGRU(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, pad_idx, dropout_prob=0.5):
        super(LanguageModelGRU, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.gru = nn.GRU(embedding_dim, hidden_dim, num_layers=1, batch_first=True, dropout=dropout_prob)
        self.fc = nn.Linear(hidden_dim, vocab_size)
    
    def forward(self, x):
        embedded = self.embedding(x)
        gru_out, _ = self.gru(embedded)
        logits = self.fc(gru_out)
        return logits

class LanguageModelTransformer(nn.Module):
    def __init__(self, vocab_size, embedding_dim, num_heads, num_layers, pad_idx, dropout_prob=0.5):
        super(LanguageModelTransformer, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.positional_encoding = nn.Parameter(torch.zeros(1, 512, embedding_dim))  # Support up to 512 tokens
        self.encoder_layer = nn.TransformerEncoderLayer(d_model=embedding_dim, nhead=num_heads, dropout=dropout_prob)
        self.transformer = nn.TransformerEncoder(self.encoder_layer, num_layers=num_layers)
        self.fc = nn.Linear(embedding_dim, vocab_size)
    
    def forward(self, x):
        embedded = self.embedding(x) + self.positional_encoding[:, :x.size(1), :]
        transformer_out = self.transformer(embedded)
        logits = self.fc(transformer_out)
        return logits

# Model Initialization
model_types = {
    "RNN": LanguageModelRNN,
    "LSTM": LanguageModelLSTM,
    "GRU": LanguageModelGRU,
    "Transformer": LanguageModelTransformer
}

embedding_dim = 100
hidden_dim = 128
num_heads = 4
num_layers = 2
dropout_prob = 0.5
model_choice = "Transformer"  # Change to "RNN", "LSTM", or "GRU" to test other models

if model_choice == "Transformer":
    model = model_types[model_choice](vocab_size, embedding_dim, num_heads, num_layers, pad_token_idx, dropout_prob).to(device)
else:
    model = model_types[model_choice](vocab_size, embedding_dim, hidden_dim, pad_token_idx, dropout_prob).to(device)

# Training and evaluation functions remain the same
# ...

# Start training the chosen model
print(f"Training {model_choice} model...")
for epoch in range(epochs):
    train_loss = train_model(model, train_loader, optimizer, criterion)
    val_perplexity = evaluate_model(model, val_loader, criterion)
    
    print(f"Epoch {epoch+1}, Train Loss: {train_loss:.4f}, Validation Perplexity: {val_perplexity:.4f}")
    
    scheduler.step(val_perplexity)
    
    if val_perplexity < best_val_perplexity:
        best_val_perplexity = val_perplexity
        epochs_no_improve = 0
    else:
        epochs_no_improve += 1
        if epochs_no_improve == early_stop_patience:
            print("Early stopping triggered.")
            break

# Evaluate and save test perplexities
test_perplexities = evaluate_sentence_perplexities(model, test_loader, criterion, total_sentences)
with open(f"{model_choice}_submission_output.csv", mode="w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(["ID", "ppl"])
    for idx, perplexity in test_perplexities:
        writer.writerow([idx, perplexity])

print(f"Submission file '{model_choice}_submission_output.csv' generated.")


In [6]:
import optuna

# Define RNN, LSTM, GRU, and Transformer models
class LanguageModelRNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, pad_idx, dropout_prob=0.5):
        super(LanguageModelRNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.rnn = nn.RNN(embedding_dim, hidden_dim, num_layers=1, batch_first=True, dropout=dropout_prob)
        self.fc = nn.Linear(hidden_dim, vocab_size)
    
    def forward(self, x):
        embedded = self.embedding(x)
        rnn_out, _ = self.rnn(embedded)
        logits = self.fc(rnn_out)
        return logits

class LanguageModelGRU(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, pad_idx, dropout_prob=0.5):
        super(LanguageModelGRU, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.gru = nn.GRU(embedding_dim, hidden_dim, num_layers=1, batch_first=True, dropout=dropout_prob)
        self.fc = nn.Linear(hidden_dim, vocab_size)
    
    def forward(self, x):
        embedded = self.embedding(x)
        gru_out, _ = self.gru(embedded)
        logits = self.fc(gru_out)
        return logits

class LanguageModelTransformer(nn.Module):
    def __init__(self, vocab_size, embedding_dim, num_heads, num_layers, pad_idx, dropout_prob=0.5):
        super(LanguageModelTransformer, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.positional_encoding = nn.Parameter(torch.zeros(1, 512, embedding_dim))  # Support up to 512 tokens
        self.encoder_layer = nn.TransformerEncoderLayer(d_model=embedding_dim, nhead=num_heads, dropout=dropout_prob)
        self.transformer = nn.TransformerEncoder(self.encoder_layer, num_layers=num_layers)
        self.fc = nn.Linear(embedding_dim, vocab_size)
    
    def forward(self, x):
        embedded = self.embedding(x) + self.positional_encoding[:, :x.size(1), :]
        transformer_out = self.transformer(embedded)
        logits = self.fc(transformer_out)
        return logits

# Model Initialization
model_types = {
    "RNN": LanguageModelRNN,
    "LSTM": LanguageModelLSTM,
    "GRU": LanguageModelGRU,
    "Transformer": LanguageModelTransformer
}
model_results = {}


def objective(trial, model_type):
    embedding_dim = trial.suggest_int("embedding_dim", 50, 200, step=25)
    
    # For Transformer, ensure embedding_dim is divisible by num_heads
    if model_type == "Transformer":
        num_heads = trial.suggest_int("num_heads", 2, 8, step=2)
        embedding_dim = trial.suggest_int("embedding_dim", num_heads * 8, num_heads * 32, step=num_heads * 8)
        num_layers = trial.suggest_int("num_layers", 1, 4, step=1)
    else:
        num_heads = None
        num_layers = None

    hidden_dim = trial.suggest_int("hidden_dim", 64, 256, step=32) if model_type != "Transformer" else None
    dropout_prob = trial.suggest_float("dropout_prob", 0.1, 0.5, step=0.1)
    learning_rate = trial.suggest_loguniform("learning_rate", 1e-4, 1e-2)
    batch_size = trial.suggest_categorical("batch_size", [16, 32, 64])

    # Initialize model
    if model_type == "Transformer":
        model = LanguageModelTransformer(
            vocab_size, embedding_dim, num_heads, num_layers, pad_token_idx, dropout_prob
        ).to(device)
    else:
        model_class = model_types[model_type]
        model = model_class(vocab_size, embedding_dim, hidden_dim, pad_token_idx, dropout_prob).to(device)
    
    # Update DataLoader with suggested batch size
    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)
    val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=False, collate_fn=collate_batch)

    # Define optimizer and loss
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss(ignore_index=pad_token_idx)

    # Training for a few epochs (to save time during hyperparameter tuning)
    epochs = 5
    for epoch in range(epochs):
        train_model(model, train_loader, optimizer, criterion)

    # Evaluate validation perplexity
    val_perplexity = evaluate_model(model, val_loader, criterion)
    return val_perplexity


# Run Optuna for each model type
for model_type in ["RNN", "LSTM", "GRU", "Transformer"]:
    print(f"Optimizing hyperparameters for {model_type}...")
    study = optuna.create_study(direction="minimize")
    study.optimize(lambda trial: objective(trial, model_type), n_trials=20)  # Adjust n_trials as needed
    
    # Save best trial and perplexity
    best_trial = study.best_trial
    model_results[model_type] = {
        "best_params": best_trial.params,
        "best_val_perplexity": best_trial.value
    }
    print(f"Best hyperparameters for {model_type}: {best_trial.params}")
    print(f"Best validation perplexity for {model_type}: {best_trial.value}")
Run Optuna for each model type

# Compare results and find the best model
best_model = min(model_results, key=lambda x: model_results[x]["best_val_perplexity"])
print("\nSummary of results:")
for model_type, result in model_results.items():
    print(f"{model_type}: Best perplexity = {result['best_val_perplexity']}, Best params = {result['best_params']}")
print(f"\nBest model: {best_model} with perplexity {model_results[best_model]['best_val_perplexity']}")

# Train and evaluate the best model with its optimal hyperparameters
best_params = model_results[best_model]["best_params"]
if best_model == "Transformer":
    model = LanguageModelTransformer(
        vocab_size,
        best_params["embedding_dim"],
        best_params["num_heads"],
        best_params["num_layers"],
        pad_token_idx,
        best_params["dropout_prob"]
    ).to(device)
else:
    model_class = model_types[best_model]
    model = model_class(
        vocab_size,
        best_params["embedding_dim"],
        best_params["hidden_dim"],
        pad_token_idx,
        best_params["dropout_prob"]
    ).to(device)

# Recreate loaders with optimal batch size
train_loader = DataLoader(train_data, batch_size=best_params["batch_size"], shuffle=True, collate_fn=collate_batch)
val_loader = DataLoader(val_data, batch_size=best_params["batch_size"], shuffle=False, collate_fn=collate_batch)

# Train the best model
optimizer = AdamW(model.parameters(), lr=best_params["learning_rate"])
criterion = nn.CrossEntropyLoss(ignore_index=pad_token_idx)
epochs = 50  # Train for more epochs with optimal hyperparameters
for epoch in range(epochs):
    train_loss = train_model(model, train_loader, optimizer, criterion)
    val_perplexity = evaluate_model(model, val_loader, criterion)
    print(f"Epoch {epoch+1}, Train Loss: {train_loss:.4f}, Validation Perplexity: {val_perplexity:.4f}")

print(f"Final Validation Perplexity of Best Model ({best_model}): {val_perplexity:.4f}")


[I 2024-11-23 18:47:07,929] A new study created in memory with name: no-name-0231d9c9-3870-4a88-b1b1-7e2cec79978c
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-4, 1e-2)
  attn_output = scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal)


Optimizing hyperparameters for Transformer...


[I 2024-11-23 18:49:27,079] Trial 0 finished with value: 219.2230314053985 and parameters: {'embedding_dim': 200, 'num_heads': 2, 'num_layers': 4, 'dropout_prob': 0.1, 'learning_rate': 0.00015025947000242787, 'batch_size': 32}. Best is trial 0 with value: 219.2230314053985.
[W 2024-11-23 18:49:27,095] Trial 1 failed with parameters: {'embedding_dim': 175, 'num_heads': 4, 'num_layers': 2, 'dropout_prob': 0.5, 'learning_rate': 0.0016972383513934142, 'batch_size': 32} because of the following error: AssertionError('embed_dim must be divisible by num_heads').
Traceback (most recent call last):
  File "C:\Users\USER\anaconda3\envs\pytorch\lib\site-packages\optuna\study\_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\USER\AppData\Local\Temp\ipykernel_28784\2520461970.py", line 117, in <lambda>
    study.optimize(lambda trial: objective(trial, model_type), n_trials=20)  # Adjust n_trials as needed
  File "C:\Users\USER\AppData\Local\Temp\ipykernel_287

AssertionError: embed_dim must be divisible by num_heads

Best hyperparameters for RNN: {'embedding_dim': 150, 'hidden_dim': 128, 'dropout_prob': 0.5, 'learning_rate': 0.0016579694418827292, 'batch_size': 64}
Best validation perplexity for RNN: 173.18619457496038
Optimizing hyperparameters for LSTM...
Best hyperparameters for LSTM: {'embedding_dim': 175, 'hidden_dim': 256, 'dropout_prob': 0.2, 'learning_rate': 0.00048762132730701416, 'batch_size': 16}
Best validation perplexity for LSTM: 153.97874027835306
Optimizing hyperparameters for GRU...
Best hyperparameters for GRU: {'embedding_dim': 75, 'hidden_dim': 192, 'dropout_prob': 0.5, 'learning_rate': 0.001014428519610825, 'batch_size': 16}
Best validation perplexity for GRU: 156.944322817553
Optimizing hyperparameters for Transformer...

In [7]:
import optuna
import torch
from torch import nn
from torch.optim import AdamW
from torch.utils.data import DataLoader

# Transformer 模型定義
class LanguageModelTransformer(nn.Module):
    def __init__(self, vocab_size, embedding_dim, num_heads, num_layers, pad_idx, dropout_prob=0.5):
        super(LanguageModelTransformer, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.positional_encoding = nn.Parameter(torch.zeros(1, 512, embedding_dim))  # 支持最多512 tokens
        self.encoder_layer = nn.TransformerEncoderLayer(d_model=embedding_dim, nhead=num_heads, dropout=dropout_prob)
        self.transformer = nn.TransformerEncoder(self.encoder_layer, num_layers=num_layers)
        self.fc = nn.Linear(embedding_dim, vocab_size)

    def forward(self, x):
        embedded = self.embedding(x) + self.positional_encoding[:, :x.size(1), :]
        transformer_out = self.transformer(embedded)
        logits = self.fc(transformer_out)
        return logits

# 定義調參目標函數
def objective(trial):
    # Transformer 專屬超參數
    num_heads = trial.suggest_int("num_heads", 2, 8, step=2)
    embedding_dim = trial.suggest_int("embedding_dim", num_heads * 8, num_heads * 32, step=num_heads * 8)
    num_layers = trial.suggest_int("num_layers", 1, 4, step=1)
    dropout_prob = trial.suggest_float("dropout_prob", 0.1, 0.5, step=0.1)
    learning_rate = trial.suggest_float("learning_rate", 1e-4, 1e-2, log=True)
    batch_size = trial.suggest_categorical("batch_size", [16, 32, 64])

    # 初始化模型
    model = LanguageModelTransformer(
        vocab_size, embedding_dim, num_heads, num_layers, pad_token_idx, dropout_prob
    ).to(device)

    # 設定 DataLoader
    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)
    val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=False, collate_fn=collate_batch)

    # 定義損失函數和優化器
    criterion = nn.CrossEntropyLoss(ignore_index=pad_token_idx)
    optimizer = AdamW(model.parameters(), lr=learning_rate)

    # 簡單訓練數個 epoch
    epochs = 5
    for epoch in range(epochs):
        model.train()
        for inputs, targets in train_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            optimizer.zero_grad()
            logits = model(inputs)
            loss = criterion(logits.reshape(-1, vocab_size), targets.reshape(-1))
            loss.backward()
            optimizer.step()

    # 驗證模型並計算困惑度
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for inputs, targets in val_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            logits = model(inputs)
            loss = criterion(logits.reshape(-1, vocab_size), targets.reshape(-1))
            total_loss += loss.item()
    avg_loss = total_loss / len(val_loader)
    perplexity = torch.exp(torch.tensor(avg_loss)).item()

    return perplexity

# Optuna 調參
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=20)  # 調整 n_trials 以增加或減少試驗次數

# 最佳結果
best_trial = study.best_trial
print("Best hyperparameters:", best_trial.params)
print("Best validation perplexity:", best_trial.value)

# 使用最佳超參數初始化模型
best_params = best_trial.params
final_model = LanguageModelTransformer(
    vocab_size,
    best_params["embedding_dim"],
    best_params["num_heads"],
    best_params["num_layers"],
    pad_token_idx,
    best_params["dropout_prob"]
).to(device)

# 打印最佳模型的結構
print(final_model)


[I 2024-11-23 18:56:53,846] A new study created in memory with name: no-name-dfef2af9-4732-43dc-9aec-ec7b7fd779e0
[I 2024-11-23 18:58:13,063] Trial 0 finished with value: 240.25912475585938 and parameters: {'num_heads': 8, 'embedding_dim': 64, 'num_layers': 1, 'dropout_prob': 0.1, 'learning_rate': 0.00040652048652476926, 'batch_size': 16}. Best is trial 0 with value: 240.25912475585938.
[I 2024-11-23 18:58:58,324] Trial 1 finished with value: 229.2498779296875 and parameters: {'num_heads': 4, 'embedding_dim': 96, 'num_layers': 1, 'dropout_prob': 0.5, 'learning_rate': 0.004776870580403055, 'batch_size': 32}. Best is trial 1 with value: 229.2498779296875.
[I 2024-11-23 18:59:40,809] Trial 2 finished with value: 221.57119750976562 and parameters: {'num_heads': 4, 'embedding_dim': 96, 'num_layers': 1, 'dropout_prob': 0.1, 'learning_rate': 0.0026013361040733095, 'batch_size': 64}. Best is trial 2 with value: 221.57119750976562.
[I 2024-11-23 19:01:44,482] Trial 3 finished with value: 217.09

Best hyperparameters: {'num_heads': 6, 'embedding_dim': 192, 'num_layers': 4, 'dropout_prob': 0.2, 'learning_rate': 0.0007258591107050929, 'batch_size': 64}
Best validation perplexity: 214.95179748535156
LanguageModelTransformer(
  (embedding): Embedding(9644, 192, padding_idx=9643)
  (encoder_layer): TransformerEncoderLayer(
    (self_attn): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=192, out_features=192, bias=True)
    )
    (linear1): Linear(in_features=192, out_features=2048, bias=True)
    (dropout): Dropout(p=0.2, inplace=False)
    (linear2): Linear(in_features=2048, out_features=192, bias=True)
    (norm1): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
    (norm2): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
    (dropout1): Dropout(p=0.2, inplace=False)
    (dropout2): Dropout(p=0.2, inplace=False)
  )
  (transformer): TransformerEncoder(
    (layers): ModuleList(
      (0-3): 4 x TransformerEncoderLayer(
        (self_a

In [13]:
import torch
from torch.nn.utils.rnn import pad_sequence
from torch import nn
import numpy as np
from torch.utils.data import DataLoader

# 測試集困惑度計算函數
def calculate_test_perplexity(model, dataloader, criterion):
    """
    Calculate overall perplexity using CrossEntropyLoss and exp.
    """
    model.eval()
    total_loss = 0
    total_tokens = 0

    with torch.no_grad():
        for inputs, targets in dataloader:
            inputs, targets = inputs.to(device), targets.to(device)
            
            # Skip empty sequences
            if inputs.size(1) == 0:
                continue

            # Forward pass
            logits = model(inputs)
            loss = criterion(logits.reshape(-1, vocab_size), targets.reshape(-1))

            # Accumulate loss and token count
            total_loss += loss.item() * targets.numel()
            total_tokens += targets.numel()

    # Calculate perplexity
    if total_tokens > 0:
        average_loss = total_loss / total_tokens
        perplexity = np.exp(average_loss)
        print(f"Average Loss: {average_loss:.4f}")
        print(f"Perplexity: {perplexity:.4f}")
        return perplexity
    else:
        return float('inf')  # Handle edge case where there are no tokens

# 創建測試集 DataLoader
def create_test_loader(test_data, batch_size):
    def collate_batch(batch):
        sequences = pad_sequence(batch, batch_first=True, padding_value=pad_token_idx)
        return sequences[:, :-1], sequences[:, 1:]  # Inputs and targets

    return DataLoader(test_data, batch_size=batch_size, shuffle=False, collate_fn=collate_batch)

# 定義最佳模型參數
best_params_dict = {
    "RNN": {
        "embedding_dim": 150,
        "hidden_dim": 128,
        "dropout_prob": 0.5,
        "learning_rate": 0.0016579694418827292,
        "batch_size": 64,
    },
    "LSTM": {
        "embedding_dim": 175,
        "hidden_dim": 256,
        "dropout_prob": 0.2,
        "learning_rate": 0.00048762132730701416,
        "batch_size": 16,
    },
    "GRU": {
        "embedding_dim": 75,
        "hidden_dim": 192,
        "dropout_prob": 0.5,
        "learning_rate": 0.001014428519610825,
        "batch_size": 16,
    },
    "Transformer": {
        "num_heads": 6,
        "embedding_dim": 192,
        "num_layers": 4,
        "dropout_prob": 0.2,
        "learning_rate": 0.0007258591107050929,
        "batch_size": 64,
    },
}

# 定義模型類別
model_classes = {
    "RNN": LanguageModelRNN,
    "LSTM": LanguageModelLSTM,
    "GRU": LanguageModelGRU,
    "Transformer": LanguageModelTransformer,
}

# 測試每個模型的整體困惑度
test_results = {}
criterion = nn.CrossEntropyLoss(ignore_index=pad_token_idx)

for model_name, params in best_params_dict.items():
    print(f"Evaluating {model_name} on the test set...")

    if model_name == "Transformer":
        model = model_classes[model_name](
            vocab_size,
            params["embedding_dim"],
            params["num_heads"],
            params["num_layers"],
            pad_token_idx,
            params["dropout_prob"],
        ).to(device)
    else:
        model = model_classes[model_name](
            vocab_size,
            params["embedding_dim"],
            params["hidden_dim"],
            pad_token_idx,
            params["dropout_prob"],
        ).to(device)

    # 創建測試集 DataLoader
    test_loader = create_test_loader(test_data, batch_size=params["batch_size"])

    # 計算整體困惑度
    overall_perplexity = calculate_test_perplexity(model, test_loader, criterion)
    test_results[model_name] = overall_perplexity
    print(f"{model_name} Overall Test Perplexity: {overall_perplexity:.2f}")

# 結果總結
print("\nTest Set Perplexity Results:")
for model_name, perplexity in test_results.items():
    print(f"{model_name}: {perplexity:.2f}")


Evaluating RNN on the test set...
Average Loss: 9.2183
Perplexity: 10079.7970
RNN Overall Test Perplexity: 10079.80
Evaluating LSTM on the test set...
Average Loss: 9.1765
Perplexity: 9667.7301
LSTM Overall Test Perplexity: 9667.73
Evaluating GRU on the test set...
Average Loss: 9.1799
Perplexity: 9700.6330
GRU Overall Test Perplexity: 9700.63
Evaluating Transformer on the test set...
Average Loss: 9.3371
Perplexity: 11351.5771
Transformer Overall Test Perplexity: 11351.58

Test Set Perplexity Results:
RNN: 10079.80
LSTM: 9667.73
GRU: 9700.63
Transformer: 11351.58
