In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import numpy as np

# Define a custom Dataset class
class Seq2SeqDataset(Dataset):
    """Index-aligned triples of (encoder input, decoder input, decoder target).

    The three containers are stored as given and indexed with the same
    position; the dataset length is the length of ``encoder_inputs``.
    """

    def __init__(self, encoder_inputs, decoder_inputs, decoder_targets):
        self.encoder_inputs = encoder_inputs
        self.decoder_inputs = decoder_inputs
        self.decoder_targets = decoder_targets

    def __len__(self):
        """Return the number of encoder input sequences."""
        return len(self.encoder_inputs)

    def __getitem__(self, idx):
        """Return the ``idx``-th triple as three ``torch.long`` tensors."""
        columns = (self.encoder_inputs, self.decoder_inputs, self.decoder_targets)
        # A fresh tensor is built on every access, in encoder/input/target order.
        return tuple(torch.tensor(column[idx], dtype=torch.long) for column in columns)

# Sample data and preprocessing
class DummyTokenizer:
    """Whitespace tokenizer over a small fixed vocabulary of HTTP tokens.

    Vocabulary ids start at 1, so '<pad>' has id 1. Id 0 is not in the
    vocabulary: ``encode`` emits it for unknown words and ``decode`` renders
    it (and any other unmapped id) as '<unk>'.
    """

    def __init__(self):
        vocabulary = ['<pad>', 'GET', '/index.html', 'HTTP/1.1', 'POST', '/submit', '200', 'OK', '400', 'Bad', 'Request']
        self.word_to_id = {}
        self.id_to_word = {}
        # Build both directions in one pass so they cannot drift apart.
        for token_id, word in enumerate(vocabulary, start=1):
            self.word_to_id[word] = token_id
            self.id_to_word[token_id] = word

    def encode(self, text):
        """Split ``text`` on whitespace and map each word to its id (0 if unknown)."""
        lookup = self.word_to_id.get
        return [lookup(word, 0) for word in text.split()]

    def decode(self, ids):
        """Map ids back to words and join them with single spaces."""
        words = (self.id_to_word.get(token_id, '<unk>') for token_id in ids)
        return ' '.join(words)

tokenizer = DummyTokenizer()

# Toy corpus: requests[i] is paired with responses[i] when the dataset is built.
requests = [
    'GET /index.html HTTP/1.1',
    'POST /submit HTTP/1.1',
]
responses = [
    'HTTP/1.1 200 OK',
    'HTTP/1.1 400 Bad Request',
]

def tokenize_and_pad(texts, max_len, tokenizer):
    """Encode each text and force every id sequence to exactly ``max_len`` ids.

    Args:
        texts: Iterable of strings to encode.
        max_len: Length of every returned sequence; must not be negative.
        tokenizer: Object whose ``encode(text)`` returns a list of int ids.

    Returns:
        A list with one list of ids per text. Sequences shorter than
        ``max_len`` are right-padded with 0; longer ones are cut to their
        first ``max_len`` ids so the result is always rectangular.

    Raises:
        ValueError: If ``max_len`` is negative.
    """
    if max_len < 0:
        raise ValueError(f"max_len must be non-negative, got {max_len}")

    padded = []
    for text in texts:
        # Truncate first: without this, an over-long sequence would come back
        # longer than max_len and the batch could not be stacked into a tensor.
        ids = tokenizer.encode(text)[:max_len]
        padded.append(ids + [0] * (max_len - len(ids)))
    return padded

# Every sequence is padded to the longest request / response in the corpus.
max_request_len = max(len(ids) for ids in map(tokenizer.encode, requests))
max_response_len = max(len(ids) for ids in map(tokenizer.encode, responses))

encoder_input_data = tokenize_and_pad(requests, max_request_len, tokenizer)
decoder_input_data = tokenize_and_pad(responses, max_response_len, tokenizer)
# Teacher-forcing targets: the decoder input shifted one step to the left
# (first id dropped, 0 appended), so the target at position t is the
# decoder input's id at position t + 1.
decoder_target_data = [[*ids[1:], 0] for ids in decoder_input_data]

# Define the Seq2Seq model
class Seq2Seq(nn.Module):
    """LSTM encoder-decoder that shares one embedding table between both sides.

    The encoder's final (hidden, cell) state seeds the decoder LSTM, and a
    linear layer maps every decoder step to ``output_dim`` scores.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        # Submodule names and creation order are kept stable: they define the
        # state_dict keys and the order in which parameters are initialised.
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.encoder_lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.decoder_lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, encoder_input, decoder_input, hidden=None):
        """Score every decoder position given the encoded request.

        Args:
            encoder_input: Batch-first tensor of source token ids.
            decoder_input: Batch-first tensor of decoder token ids.
            hidden: Accepted but never read; the decoder is always seeded
                from the encoder's final state.

        Returns:
            The linear layer's output for every decoder step; its last
            dimension has size ``output_dim``.
        """
        # Only the encoder's final state is used; its per-step outputs are dropped.
        _, encoder_state = self.encoder_lstm(self.embedding(encoder_input))
        decoder_steps, _ = self.decoder_lstm(self.embedding(decoder_input), encoder_state)
        return self.fc(decoder_steps)

# Hyperparameters
# Vocabulary ids start at 1 (see DummyTokenizer), so the embedding table and
# the output layer are sized len(word_to_id) + 1 to also cover id 0, which is
# used both for unknown words and for padding.
input_dim = len(tokenizer.word_to_id) + 1
embedding_dim = 50
hidden_dim = 100
output_dim = len(tokenizer.word_to_id) + 1
batch_size = 2
epochs = 10

# DataLoader
dataset = Seq2SeqDataset(encoder_input_data, decoder_input_data, decoder_target_data)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Model, loss, optimizer
model = Seq2Seq(input_dim, embedding_dim, hidden_dim, output_dim)
# ignore_index=0: target positions equal to 0 (the padding value) are left
# out of the loss.
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(model.parameters())

# Training loop
for epoch in range(epochs):
    for encoder_inputs, decoder_inputs, decoder_targets in dataloader:
        optimizer.zero_grad()

        # Teacher forcing: the decoder is fed the reference response, not its
        # own predictions.
        outputs = model(encoder_inputs, decoder_inputs)
        # Flatten to one row of output_dim scores per decoder position and one
        # target id per position, the 2-D / 1-D layout passed to the criterion.
        outputs = outputs.view(-1, output_dim)
        targets = decoder_targets.contiguous().view(-1)

        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

    # Reports the loss of the last batch processed in this epoch only, not an
    # average over the epoch.
    print(f'Epoch {epoch + 1}/{epochs}, Loss: {loss.item()}')

# Inference
def decode_sequence(model, input_seq):
    """Greedily decode a response for one tokenized request.

    The decoder buffer starts as all zeros. At each step the model is run on
    the request and the buffer, the highest-scoring id at position ``t`` is
    written into buffer position ``t``, and decoding stops early once id 0
    is produced. The whole buffer (including untouched zero positions) is
    then passed to the module-level ``tokenizer`` for decoding.

    Args:
        model: Callable ``model(encoder_input, decoder_input)`` returning
            scores indexed as (batch, position, vocabulary). It is switched
            to eval mode and left in that mode.
        input_seq: List of int token ids for a single request.

    Returns:
        The decoded response string, ``max_response_len`` tokens long.

    NOTE(review): each prediction is written into the same position whose
    zero input produced it, whereas the training targets are the decoder
    input shifted one step left. Confirm whether this offset between
    training and inference is intended.
    """
    model.eval()
    with torch.no_grad():
        input_tensor = torch.tensor(input_seq, dtype=torch.long).unsqueeze(0)
        decoder_input = torch.zeros(1, max_response_len).long()

        for t in range(max_response_len):
            output = model(input_tensor, decoder_input)
            top1 = output[:, t, :].argmax(1).item()
            decoder_input[0, t] = top1

            if top1 == 0:  # Assuming 0 is <pad>
                break

    # Index the single batch row explicitly: squeeze() would also drop the
    # sequence axis when max_response_len is 1 and hand decode() a bare int.
    return tokenizer.decode(decoder_input[0].tolist())

# Example inference
# '/index' is not in the tokenizer vocabulary, so encode() maps it to id 0.
input_seq = tokenizer.encode('POST /index HTTP/1.1')
decoded_sentence = decode_sequence(model, input_seq)
print(decoded_sentence)


Epoch 1/10, Loss: 2.521040439605713
Epoch 2/10, Loss: 2.4622442722320557
Epoch 3/10, Loss: 2.4041213989257812
Epoch 4/10, Loss: 2.345937967300415
Epoch 5/10, Loss: 2.2869880199432373
Epoch 6/10, Loss: 2.226607322692871
Epoch 7/10, Loss: 2.164182186126709
Epoch 8/10, Loss: 2.0991594791412354
Epoch 9/10, Loss: 2.0310542583465576
Epoch 10/10, Loss: 1.959471344947815
400 Bad Request /index.html
