In [None]:
# Best hyperparameters found from Optuna
best_params = {
    'hidden_dim': 451,
    'num_layers': 2,
    'learning_rate': 0.00030167213777739784,
    'batch_size': 64,
    'dropout_prob': 0.36114231298781496,
    'weight_decay': 0.009457391494207868
}

# Define the GRU Language Model with the best parameters
class LanguageModelGRU(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers, pad_idx, dropout_prob):
        super(LanguageModelGRU, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.gru = nn.GRU(embedding_dim, hidden_dim, num_layers=num_layers, dropout=dropout_prob, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)
    
    def forward(self, x):
        embedded = self.embedding(x)
        gru_out, _ = self.gru(embedded)
        logits = self.fc(gru_out)
        return logits

# Initialize the model with the best hyperparameters
embedding_dim = 100  # Assuming embedding_dim is not tuned
model = LanguageModelGRU(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    hidden_dim=best_params['hidden_dim'],
    num_layers=best_params['num_layers'],
    pad_idx=pad_token_idx,
    dropout_prob=best_params['dropout_prob']
).to(device)

# Define criterion and optimizer with the best learning rate and weight decay
criterion = nn.CrossEntropyLoss(ignore_index=pad_token_idx)
optimizer = AdamW(
    model.parameters(),
    lr=best_params['learning_rate'],
    weight_decay=best_params['weight_decay']
)

# Prepare the DataLoader with the best batch size
train_loader = DataLoader(train_data, batch_size=best_params['batch_size'], shuffle=True, collate_fn=collate_batch)
val_loader = DataLoader(val_data, batch_size=best_params['batch_size'], shuffle=False, collate_fn=collate_batch)

# Training Loop with Early Stopping
epochs = 100
best_val_perplexity = float("inf")
epochs_no_improve = 0
early_stop_patience = 5
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3, verbose=True)

for epoch in range(epochs):
    train_loss = train_model(model, train_loader, optimizer, criterion)
    val_perplexity = evaluate_model(model, val_loader, criterion)
    
    print(f"Epoch {epoch+1}, Train Loss: {train_loss:.4f}, Validation Perplexity: {val_perplexity:.4f}")
    
    # Update learning rate based on validation perplexity
    scheduler.step(val_perplexity)
    
    # Check for early stopping
    if val_perplexity < best_val_perplexity:
        best_val_perplexity = val_perplexity
        epochs_no_improve = 0
    else:
        epochs_no_improve += 1
        if epochs_no_improve == early_stop_patience:
            print("Early stopping triggered.")
            break

# Evaluate on the test set and save sentence-wise perplexities
def evaluate_sentence_perplexities(model, dataloader, criterion, total_sentences):
    model.eval()
    sentence_perplexities = []
    with torch.no_grad():
        for idx in range(total_sentences):
            try:
                inputs, targets = next(iter(dataloader))
                inputs, targets = inputs.to(device), targets.to(device)
                
                if inputs.size(1) == 0:  # Skip if sequence length is 0
                    perplexity = -1  # Placeholder for empty sequence
                else:
                    logits = model(inputs)
                    loss = criterion(logits.reshape(-1, vocab_size), targets.reshape(-1))
                    perplexity = np.exp(loss.item())
            except StopIteration:
                perplexity = -1  # Placeholder if no more data available

            sentence_perplexities.append((idx, perplexity))
    
    return sentence_perplexities

# Generate the sentence-wise perplexities for submission
total_sentences = 3761  # Total required rows based on the error message
test_perplexities = evaluate_sentence_perplexities(model, test_loader, criterion, total_sentences)

# Save the perplexities to the required CSV submission file format
with open("submission.csv", mode="w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(["ID", "ppl"])  # Header as per requirement
    for idx, perplexity in test_perplexities:
        writer.writerow([idx, perplexity])

print("Submission file 'submission.csv' generated.")
