## Data preprocessing for Autoencoders

This notebook preprocesses the data for the autoencoder model. Running it produces one preprocessed CSV file for training and, after switching the input file, another one for testing.

In [None]:
import os

import pandas as pd
import torch
from skipgram import *
from torch.nn.utils.rnn import pad_sequence

In [None]:
# Available input splits: files[0] is the training set, files[1] the testing set.
files = [
    "sitges_access_clean_whole_set_but_last",
    "sitges_access_clean_last",
]

# Select the split to preprocess by changing the index used here.
file = f'../data/{files[0]}.csv'

logs_df = pd.read_csv(file)

# Missing status_1 entries are filled with False before the column is cast to int.
status_1_filled = logs_df['status_1'].fillna(False)
logs_df['status_1'] = status_1_filled.astype(int)

In [None]:
# Project root: the parent directory of the current working directory.
ROOT_DIR = os.path.dirname(os.path.abspath(""))
# Single location for every embedding artefact, so all paths below are built
# the same way instead of mixing "models", "x" with "models/x".
MODELS_DIR = os.path.join(ROOT_DIR, "models")

# ---------------------------------------------------------------- URL ----
# Load the embeddings
embeddings_url = load_embeddings(os.path.join(MODELS_DIR, "embeddings-url.pt"))
# Load the idx2word. This is the vocabulary where each token is associated with an index
idx2word_url = load_idx2word(os.path.join(MODELS_DIR, "idx2word-url.json"))
# Load the tokenizer. Just specify the name `charbpe-url` and it will load the tokenizer, which is saved
# in the files `charbpe-url-vocab.json` and `charbpe-url-merges.txt`
tokenizer_url = load_tokenizer(MODELS_DIR, "charbpe-url")

url_embeddings = extract_embeddings(
    sequence=logs_df["URL"],
    embeddings=embeddings_url,
    idx2word=idx2word_url,
    tokenizer=tokenizer_url,
)

# ------------------------------------------------------------ referer ----
embeddings_referer = load_embeddings(os.path.join(MODELS_DIR, "embeddings-referer.pt"))
idx2word_referer = load_idx2word(os.path.join(MODELS_DIR, "idx2word-referer.json"))
tokenizer_referer = load_tokenizer(MODELS_DIR, "charbpe-referer")
# NOTE: the former bare `shape, mean(), std()` expression was removed here: it
# was not the last expression of the cell, so its value was computed and then
# discarded. Evaluate it in its own cell to inspect the table.

# --- this will take additional 3.3 GB of memory---
referers_embeddings = extract_embeddings(
    sequence=logs_df["referer"],
    embeddings=embeddings_referer,
    idx2word=idx2word_referer,
    tokenizer=tokenizer_referer,
)

# --------------------------------------------------------- user-agent ----
embeddings_useragent = load_embeddings(os.path.join(MODELS_DIR, "embeddings-useragent.pt"))
idx2word_useragent = load_idx2word(os.path.join(MODELS_DIR, "idx2word-useragent.json"))
tokenizer_useragent = load_tokenizer(MODELS_DIR, "charbpe-useragent")

# --- this will take additional 3.3 GB of memory---
useragents_embeddings = extract_embeddings(
    sequence=logs_df["user-agent"],
    embeddings=embeddings_useragent,
    idx2word=idx2word_useragent,
    tokenizer=tokenizer_useragent,
)

  0%|          | 0/58365 [00:00<?, ?it/s]

100%|██████████| 58365/58365 [00:10<00:00, 5454.44it/s]
100%|██████████| 58365/58365 [00:11<00:00, 5046.36it/s]
100%|██████████| 58365/58365 [00:11<00:00, 5266.18it/s]


In [None]:
# One pooled vector per log line and per text field. Note that `usernames`
# actually collects the user-agent vectors (it is fed from
# `useragents_embeddings`); the name is kept because later cells refer to it.
urls, referers, usernames = [], [], []

# The three embedding collections are walked in lock-step; each item is
# averaged along axis 0, cast to float32 and converted to a NumPy array.
# (assumes every item is a torch tensor with tokens along axis 0 — TODO confirm)
for triple in zip(url_embeddings, referers_embeddings, useragents_embeddings):
    for bucket, embedding in zip((urls, referers, usernames), triple):
        bucket.append(embedding.mean(0).float().numpy())

In [None]:
# ensure that the columns are in the correct order
logs_df = logs_df.reindex(columns=['bytes','elapsed', 'IP_oct0', 'IP_oct1', 'IP_oct2', 'IP_oct3', 'month_sin',
       'month_cos', 'day_sin', 'day_cos', 'weekday_sin', 'weekday_cos',
       'hour_sin', 'hour_cos', 'minute_sin', 'minute_cos', 'petition_-',
       'petition_GET', 'petition_HEAD', 'petition_POST', 'petition_other',
       'status_1', 'status_2', 'status_3', 'status_4'])

In [None]:
def convert_to_sequence(list_sequence):
    """Pad numeric sequences to a common length and return them as a DataFrame.

    Args:
        list_sequence: Iterable of 1-D numeric sequences (lists, NumPy arrays,
            ...). Each one is converted to a float32 tensor.

    Returns:
        A ``pandas.DataFrame`` with one row per input sequence and one column
        per position. Sequences shorter than the longest one are right-padded
        with ``0.0``. An empty input yields an empty DataFrame.
    """
    # Convert sequences to PyTorch tensors
    sequences = [torch.tensor(seq, dtype=torch.float32) for seq in list_sequence]

    # pad_sequence raises on an empty list, so handle "nothing to pad"
    # explicitly instead of surfacing an opaque RuntimeError.
    if not sequences:
        return pd.DataFrame()

    # Padding sequences (batch_first=True -> one row per sequence)
    padded_sequences = pad_sequence(sequences, batch_first=True, padding_value=0.0)

    # Convert padded sequences back to DataFrame
    return pd.DataFrame(padded_sequences.numpy())

def add_sequence_to_dataframe(logs_df, df, column_after):
    """Return a new DataFrame with the columns of ``df`` spliced into ``logs_df``.

    The columns of ``df`` are placed immediately BEFORE the column named
    ``column_after``; that column (and everything to its right) ends up after
    the inserted block. The input frames are not modified.

    Args:
        logs_df: DataFrame receiving the new columns.
        df: DataFrame whose columns are inserted.
        column_after: Name of the ``logs_df`` column that will follow the
            inserted block.

    Returns:
        The concatenated DataFrame (column-wise ``pd.concat``, so rows are
        matched on the index).
    """
    split_at = logs_df.columns.get_loc(column_after)
    left_part = logs_df.iloc[:, :split_at]
    right_part = logs_df.iloc[:, split_at:]
    return pd.concat([left_part, df, right_part], axis=1)



# Turn each list of pooled vectors into a zero-padded DataFrame.
sequenced_urls = convert_to_sequence(urls)
sequenced_referers = convert_to_sequence(referers)
sequenced_usernames = convert_to_sequence(usernames)
print(logs_df.columns)

# Add the sequences to the original dataframe. Each block is inserted right
# before the named column, in this order: URL vectors before "bytes", then
# referer vectors before "elapsed", then user-agent vectors before "elapsed".
insertion_plan = (
    (sequenced_urls, "bytes"),
    (sequenced_referers, "elapsed"),
    (sequenced_usernames, "elapsed"),
)
for sequence_block, anchor_column in insertion_plan:
    logs_df = add_sequence_to_dataframe(logs_df, sequence_block, anchor_column)


Index(['bytes', 'elapsed', 'IP_oct0', 'IP_oct1', 'IP_oct2', 'IP_oct3',
       'month_sin', 'month_cos', 'day_sin', 'day_cos', 'weekday_sin',
       'weekday_cos', 'hour_sin', 'hour_cos', 'minute_sin', 'minute_cos',
       'petition_-', 'petition_GET', 'petition_HEAD', 'petition_POST',
       'petition_other', 'status_1', 'status_2', 'status_3', 'status_4'],
      dtype='object')


In [None]:
# Name the output after the split that was loaded: the training split
# (files[0]) goes to the "whole_set_but_last" file, anything else to "last".
if file == f'../data/{files[0]}.csv':
    prepared_name = "data/sitges_access_prepared_whole_set_but_last.csv"
else:
    prepared_name = "data/sitges_access_prepared_last.csv"

logs_df.to_csv(os.path.join(ROOT_DIR, prepared_name), index=False)