Step 1: Load and inspect the data

In [18]:
import pandas as pd

# Load the CSV file
file_path = "combined_data.csv"
data = pd.read_csv(file_path)

# Inspect the first few rows of the data
print(data.head())


  Method Path  Query                                    Request Headers  \
0    GET    /  <EMP>  Host: 18.118.191.213\nProxy-Connection: keep-a...   
1    GET    /  <EMP>  Host: 16.182.101.101\nProxy-Connection: keep-a...   
2    GET    /  <EMP>  Host: 216.171.160.223\nProxy-Connection: keep-...   
3    GET    /  <EMP>  Host: 18.118.191.213\nProxy-Connection: keep-a...   
4    GET    /  <EMP>  Host: 16.182.101.101\nProxy-Connection: keep-a...   

                                        Request Body  Response Status  \
0  {"source": "", "btnFlyerAction": "%00", "btnAc...              302   
1         {"searchQuery": "'; DROP TABLE users; --"}              301   
2  {"_wpcf7": "admin'--", "_wpcf7_version": "", "...              301   
3  {"source": "1234567890", "btnFlyerAction": "%0...              302   
4                     {"searchQuery": "normalInput"}              301   

                                    Response Headers  \
0  Server: nginx/1.18.0 (Ubuntu)\nDate: Thu, 22 A...  

Step 2: Extract the relevant columns

In [19]:
# Separate the request and response columns
requests = data[['Method', 'Path', 'Query', 'Request Headers']]
responses = data[['Response Status', 'Response Headers', 'Response Body']]

# Inspect the first few rows of requests and responses
print("Requests:\n", requests.head())
print("Responses:\n", responses.head())


Requests:
   Method Path  Query                                    Request Headers
0    GET    /  <EMP>  Host: 18.118.191.213\nProxy-Connection: keep-a...
1    GET    /  <EMP>  Host: 16.182.101.101\nProxy-Connection: keep-a...
2    GET    /  <EMP>  Host: 216.171.160.223\nProxy-Connection: keep-...
3    GET    /  <EMP>  Host: 18.118.191.213\nProxy-Connection: keep-a...
4    GET    /  <EMP>  Host: 16.182.101.101\nProxy-Connection: keep-a...
Responses:
    Response Status                                   Response Headers  \
0              302  Server: nginx/1.18.0 (Ubuntu)\nDate: Thu, 22 A...   
1              301  x-amz-error-code: WebsiteRedirect\nx-amz-error...   
2              301  Date: Thu, 22 Aug 2024 09:39:06 GMT\nServer: A...   
3              302  Server: nginx/1.18.0 (Ubuntu)\nDate: Thu, 22 A...   
4              301  x-amz-error-code: WebsiteRedirect\nx-amz-error...   

                                       Response Body  
0  <html>\r\n<head><title>302 Found</title></head..

Step 3: Tokenize 

In [20]:
import re

def tokenize(text):
    # Convert to lowercase
    # text = text.lower()

    # Replace or normalize specific patterns
    text = re.sub(r'\n', ' ', text)  # Replace newlines with spaces

    # Tokenize by splitting on spaces, slashes, commas, and colons
    tokens = re.split(r'[ ,/:]+', text)  # Split by space, comma, slash, or colon

    # Remove empty tokens (if any)
    tokens = [token for token in tokens if token]

    return tokens


In [21]:
# Apply enhanced tokenization to the request and response components
requests['tokenized'] = requests.apply(lambda row: tokenize(" ".join(row.dropna().astype(str))), axis=1)
responses['tokenized'] = responses.apply(lambda row: tokenize(" ".join(row.dropna().astype(str))), axis=1)

# Inspect the tokenized data
print("Tokenized Requests:\n", requests['tokenized'][0])
print("Tokenized Responses:\n", responses['tokenized'][1])


Tokenized Requests:
 ['GET', '<EMP>', 'Host', '18.118.191.213', 'Proxy-Connection', 'keep-alive', 'Upgrade-Insecure-Requests', '1', 'User-Agent', 'Mozilla', '5.0', '(Macintosh;', 'Intel', 'Mac', 'OS', 'X', '10_15_7)', 'AppleWebKit', '537.36', '(KHTML', 'like', 'Gecko)', 'Chrome', '127.0.0.0', 'Safari', '537.36', 'Accept', 'text', 'html', 'application', 'xhtml+xml', 'application', 'xml;q=0.9', 'image', 'avif', 'image', 'webp', 'image', 'apng', '*', '*;q=0.8', 'application', 'signed-exchange;v=b3;q=0.7', 'Accept-Encoding', 'gzip', 'deflate', 'Accept-Language', 'en-GB', 'en-US;q=0.9', 'en;q=0.8']
Tokenized Responses:
 ['301', 'x-amz-error-code', 'WebsiteRedirect', 'x-amz-error-message', 'Request', 'does', 'not', 'contain', 'a', 'bucket', 'name.', 'x-amz-request-id', '7G7YJ115EK8TM5KM', 'x-amz-id-2', 'kkXjU5v8vvjazOiNuWOBHEAViLoy5WWSfBbHj5VpyZr5G5UBqZhW', 'dr+hXGsE2d3eaj7znOlLKI=', 'Location', 'https', 'aws.amazon.com', 's3', 'Content-Type', 'text', 'html;', 'charset=utf-8', 'Date', 'Thu',

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  requests['tokenized'] = requests.apply(lambda row: tokenize(" ".join(row.dropna().astype(str))), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  responses['tokenized'] = responses.apply(lambda row: tokenize(" ".join(row.dropna().astype(str))), axis=1)


Step 4: Build the vocabulary

In [22]:
from collections import Counter
import re

import re

def clean_tokens(tokens):
    # Keep periods and other specified punctuation in tokens
    return [re.sub(r'[^\w\s.]', '', token) for token in tokens if token]


def build_vocab(tokenized_data, max_vocab_size=10000):
    vocab = Counter()
    for tokens in tokenized_data:
        clean_tokens_list = clean_tokens(tokens)
        vocab.update(clean_tokens_list)
    
    # Keep only the most frequent `max_vocab_size` words
    most_common = vocab.most_common(max_vocab_size)
    vocab = {word: i+4 for i, (word, _) in enumerate(most_common)}
    
    # Add special tokens
    vocab['<PAD>'] = 0
    vocab['<UNK>'] = 1
    vocab['<SOS>'] = 2
    vocab['<EOS>'] = 3
    return vocab

# Build vocabulary with a limited size
vocab_responses = build_vocab(responses['tokenized'], max_vocab_size=10000)  # Example size limit
vocab_requests = build_vocab(requests['tokenized'], max_vocab_size=10000)
print("Request Vocabulary Size:", len(vocab_requests))
print("Response Vocabulary Size:", len(vocab_responses))

print(vocab_requests)


Request Vocabulary Size: 429
Response Vocabulary Size: 10004
{'537.36': 4, '': 5, '171.237.155.62': 6, 'Mozilla': 7, '5.0': 8, 'Macintosh': 9, 'Intel': 10, 'Mac': 11, 'OS': 12, 'X': 13, '10_15_7': 14, 'AppleWebKit': 15, 'KHTML': 16, 'like': 17, 'Gecko': 18, 'Chrome': 19, '127.0.0.0': 20, 'Safari': 21, 'gzip': 22, 'deflate': 23, 'enGB': 24, 'enUSq0.9': 25, 'enq0.8': 26, 'Host': 27, 'ProxyConnection': 28, 'keepalive': 29, 'UserAgent': 30, 'Accept': 31, 'AcceptEncoding': 32, 'AcceptLanguage': 33, 'GET': 34, 'http': 35, 'Referer': 36, '75.186.51.109': 37, '5000': 38, 'image': 39, 'css': 40, 'text': 41, 'Cookie': 42, 'secure': 43, 'EMP': 44, 'application': 45, 'q0.1': 46, 'IfModifiedSince': 47, 'Fri': 48, 'GMT': 49, 'IfNoneMatch': 50, '31': 51, 'Jul': 52, '2020': 53, '08': 54, '41': 55, '59': 56, '1596184919': 57, 'jsBase': 58, 'versionWebVersion': 59, 'webman': 60, 'style.css': 61, 'modules': 62, 'widget': 63, 'js': 64, 'avif': 65, 'webp': 66, 'apng': 67, 'q0.8': 68, 'WebVersionWebVersion'

Step 5: Numericalize and pad

In [23]:
from torch.nn.utils.rnn import pad_sequence
import torch

# Numericalize the sequences using the separate vocabularies
def numericalize(tokens, vocab):
    return [vocab.get(token, vocab['<UNK>']) for token in tokens]

numericalized_requests = requests['tokenized'].apply(lambda x: numericalize(x, vocab_requests))
numericalized_responses = responses['tokenized'].apply(lambda x: numericalize(x, vocab_responses))

# Add <SOS> and <EOS> tokens to the responses for sequence generation
numericalized_responses = numericalized_responses.apply(lambda x: [vocab_responses['<SOS>']] + x + [vocab_responses['<EOS>']])

# Pad the sequences
padded_requests = pad_sequence([torch.tensor(seq) for seq in numericalized_requests],
                               batch_first=True, padding_value=vocab_requests['<PAD>'])
padded_responses = pad_sequence([torch.tensor(seq) for seq in numericalized_responses],
                                batch_first=True, padding_value=vocab_responses['<PAD>'])

# Inspect the padded sequences
print(padded_requests.shape)
print(padded_responses.shape)


torch.Size([248, 73])
torch.Size([248, 36023])


Step 6: Create a dataset and data loader

In [24]:
from torch.utils.data import Dataset, DataLoader

class HTTPDataset(Dataset):
    def __init__(self, requests, responses):
        self.requests = requests
        self.responses = responses

    def __len__(self):
        return len(self.requests)

    def __getitem__(self, idx):
        return self.requests[idx], self.responses[idx]

# Create dataset and dataloader
dataset = HTTPDataset(padded_requests, padded_responses)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# Inspect the first batch
for batch in dataloader:
    requests_batch, responses_batch = batch
    print(requests_batch.shape, responses_batch.shape)
    break


torch.Size([32, 73]) torch.Size([32, 36023])


Step 7: Define the Transformer model

In [25]:
import torch.nn as nn
import torch.optim as optim

class TransformerModel(nn.Module):
    def __init__(self, vocab_size_request, vocab_size_response, embed_size, num_heads, num_encoder_layers, num_decoder_layers, dim_feedforward):
        super(TransformerModel, self).__init__()
        self.request_embedding = nn.Embedding(vocab_size_request, embed_size)
        self.response_embedding = nn.Embedding(vocab_size_response, embed_size)
        self.transformer = nn.Transformer(embed_size, num_heads, num_encoder_layers, num_decoder_layers, dim_feedforward)
        self.fc_out = nn.Linear(embed_size, vocab_size_response)

    def forward(self, src, tgt):
        src = self.request_embedding(src)
        tgt = self.response_embedding(tgt)
        src = src.permute(1, 0, 2)  # Transformer expects (seq_len, batch, embed_size)
        tgt = tgt.permute(1, 0, 2)
        transformer_out = self.transformer(src, tgt)
        out = self.fc_out(transformer_out)
        return out

# Hyperparameters
vocab_size_request = len(vocab_requests)
vocab_size_response = len(vocab_responses)
embed_size = 512
num_heads = 8
num_encoder_layers = 3
num_decoder_layers = 3
dim_feedforward = 512

# Initialize model
model = TransformerModel(vocab_size_request, vocab_size_response, embed_size, num_heads, num_encoder_layers, num_decoder_layers, dim_feedforward)

# Inspect model
print(model)


TransformerModel(
  (request_embedding): Embedding(429, 512)
  (response_embedding): Embedding(10004, 512)
  (transformer): Transformer(
    (encoder): TransformerEncoder(
      (layers): ModuleList(
        (0-2): 3 x TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
          )
          (linear1): Linear(in_features=512, out_features=512, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=512, out_features=512, bias=True)
          (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.1, inplace=False)
          (dropout2): Dropout(p=0.1, inplace=False)
        )
      )
      (norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    )
    (decoder): TransformerDecoder(
      (layers): ModuleL

Step 8: Train the model 

In [27]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.cuda.amp import GradScaler, autocast
import gc

# Assume the TransformerModel class is already defined
# Also assume the dataset and vocabulary dictionaries (vocab_requests, vocab_responses) are already created

# Hyperparameters
learning_rate = 0.0005
num_epochs = 10
batch_size = 8  # Reduced batch size to avoid memory issues
embed_size = 256  # Reduced from 512
num_heads = 4     # Reduced number of attention heads
num_encoder_layers = 2  # Reduced number of layers
num_decoder_layers = 2  # Reduced number of layers
dim_feedforward = 256  # Reduced feedforward dimension

# Initialize model
vocab_size_request = len(vocab_requests)
vocab_size_response = len(vocab_responses)
model = TransformerModel(vocab_size_request, vocab_size_response, embed_size, num_heads, num_encoder_layers, num_decoder_layers, dim_feedforward)

# Move model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss(ignore_index=vocab_responses['<PAD>'])
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Mixed precision training scaler
scaler = GradScaler()

# Define DataLoader
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Training loop
for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0

    for requests_batch, responses_batch in train_loader:
        requests_batch, responses_batch = requests_batch.to(device), responses_batch.to(device)

        optimizer.zero_grad()

        # Use autocast for mixed precision
        with autocast():
            # Prepare inputs and targets for the decoder
            tgt_input = responses_batch[:, :-1]
            tgt_output = responses_batch[:, 1:]

            # Forward pass through the model
            output = model(requests_batch, tgt_input)

            # Reshape output to (batch_size, vocab_size_response, seq_len) for loss calculation
            output = output.permute(1, 2, 0)  # (batch_size, vocab_size_response, seq_len)

            # Compute loss
            loss = criterion(output, tgt_output)

        # Backward pass and optimization
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        epoch_loss += loss.item()

        # Clear GPU cache
        del requests_batch, responses_batch, tgt_input, tgt_output, output
        torch.cuda.empty_cache()
        gc.collect()

    avg_loss = epoch_loss / len(train_loader)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}")

    # Optionally, add validation and save checkpoints here

# You can add validation code and model checkpointing if needed.




RuntimeError: [enforce fail at ..\c10\core\impl\alloc_cpu.cpp:72] data. DefaultCPUAllocator: not enough memory: you tried to allocate 166090813952 bytes.

Step 9: Inference

In [None]:
def generate_response(model, request_seq, max_len=100):
    model.eval()
    generated_response = [vocab_responses['<SOS>']]

    request_seq = request_seq.unsqueeze(0).to(device)

    for _ in range(max_len):
        tgt_input = torch.tensor(generated_response).unsqueeze(0).to(device)

        with torch.no_grad():
            output = model(request_seq, tgt_input)

        next_token = output.argmax(2)[:, -1].item()
        generated_response.append(next_token)

        if next_token == vocab_responses['<EOS>']:
            break

    generated_response = [list(vocab_responses.keys())[list(vocab_responses.values()).index(idx)] for idx in generated_response]
    return generated_response
