## Data cleaning Autoencoders

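This notebook cleans the access-log data for the autoencoder model. It produces one cleaned CSV for training (built from the combined daily logs) and another for testing (the last log file); choose which one to generate by changing the `file` variable in the cleaning cell.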

In [None]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from tqdm import tqdm
import json

%pip install tokenizers
from skipgram import *

Note: you may need to restart the kernel to use updated packages.


In [None]:
# Merge every daily access log except the last one into a single CSV.
# NOTE(review): sitges_access.20240128 is not in this list either — confirm that is intended.

list_csv=["sitges_access.20240122", "sitges_access.20240123", "sitges_access.20240124", "sitges_access.20240125", "sitges_access.20240126", "sitges_access.20240127"]

# Read each daily file from ../data into its own frame.
dataframes = [pd.read_csv(f"../data/{csv_file}.csv") for csv_file in list_csv]

# Stack the frames with a fresh 0..n-1 index and persist the result.
combined_df = pd.concat(dataframes, ignore_index=True)
combined_df.to_csv("../data/combined_csvs.csv", index=False)

In [None]:
# Parent directory of the current working directory; used to locate models/ and data/.
ROOT_DIR = os.path.dirname(os.path.abspath(""))

# Candidate inputs: files[0] is the training set, files[1] is the test set.
files=["combined_csvs", "sitges_access.20240129"]

file = f"../data/{files[0]}.csv" # Change the file as needed

df = pd.read_csv(file)

# Shuffle the rows reproducibly (fixed seed), renumber them 0..n-1 and
# discard the columns that are not used as features.
df_clean = (
	df.sample(frac=1, random_state=42)
	.reset_index(drop=True)
	.drop(columns=["logname", "authenticate", "Unnamed: 0", "server_name"])
)

# Give every (IP, user-agent) pair an integer id stored in the "group" column.
df_temp = df_clean.copy()
groups = df_temp.groupby(["IP", "user-agent"])
df_temp["group"] = groups.ngroup()
counts = df_temp.value_counts("group")  # rows per group (not used further in this cell)
df_clean["group"] = df_temp["group"]

# Elapsed-time feature: log(1 + seconds since the previous request of the same group).
df_temp = df_clean.copy()
df_temp["date"] = pd.to_datetime(df_temp["date"])

# Sort chronologically so that a per-group diff means "time since the previous request".
df_temp = df_temp.sort_values("date")

# Vectorised per-group diff. This replaces a Python loop that re-scanned the
# whole frame with a boolean mask once per group. The first request of each
# group has no predecessor, so its elapsed time is defined as 0.
seconds_since_previous = (
	df_temp.groupby("group")["date"].diff().dt.total_seconds().fillna(0)
)
df_temp["elapsed"] = np.log(seconds_since_previous + 1)

# Index-aligned copy back onto the (shuffled) working frame.
df_clean["elapsed"] = df_temp["elapsed"]
# NOTE(review): an Index is assigned positionally, so "order" is simply 0..n-1 in
# df_clean's current (shuffled) row order, not the chronological rank. Confirm
# whether the chronological position was intended here.
df_clean["order"] = df_temp.reset_index().index


# IP: split the dotted address into four octets and z-score each one,
# recording the statistics used in models/normalize.json.
normalize_path = os.path.join(ROOT_DIR, "models/normalize.json")

# Read the existing statistics and close the file before it is rewritten below.
with open(normalize_path, "r") as f:
	normalize = json.load(f)

df_temp = df_clean.copy()
IP_octs = df_temp["IP"].apply(lambda x: x.split("."))
for i in range(4):
	octet_col = "IP_oct" + str(i)
	df_temp[octet_col] = IP_octs.apply(lambda x: float(x[i]))
	mean, std = df_temp[octet_col].mean(), df_temp[octet_col].std()
	df_temp[octet_col] = (df_temp[octet_col] - mean) / std
	normalize[octet_col] = {"mean": mean, "std": std}

# NOTE(review): the statistics are recomputed from whichever file is loaded, so
# running this on the test file overwrites the training statistics — confirm intended.
# Write through a context manager so the handle is flushed and closed deterministically.
with open(normalize_path, "w") as f:
	json.dump(normalize, f)

# The raw address is no longer needed once the octet columns exist.
df_temp = df_temp.drop(columns=["IP"])
df_clean = df_temp.copy()

# Keep only the part of the timestamp before "+" (drops the UTC offset).
# Read from df_clean itself: df_clean was shuffled and re-indexed, so taking the
# column from the raw `df` would index-align dates onto the wrong rows.
df_clean["date"] = df_clean["date"].str.split("+").str[0]

def sin_transform(x, period=24):
	"""Return the sine component of a cyclical encoding of ``x``.

	``x`` is mapped onto a circle of length ``period`` (default 24), i.e. the
	result is ``sin(2 * pi * x / period)``. Works on scalars and array-likes.
	"""
	angle = 2 * np.pi * x / period
	return np.sin(angle)

def cos_transform(x, period=24):
	"""Return the cosine component of a cyclical encoding of ``x``.

	``x`` is mapped onto a circle of length ``period`` (default 24), i.e. the
	result is ``cos(2 * pi * x / period)``. Works on scalars and array-likes.
	"""
	angle = 2 * np.pi * x / period
	return np.cos(angle)

# Date: replace the timestamp with sine/cosine encodings of its components.
df_temp = df_clean.copy()
df_temp["date"] = pd.to_datetime(df_temp["date"])

# Datetime component -> length of its cycle, in the order the output columns appear.
date_parts = {"month": 12, "day": 31, "weekday": 7, "hour": 24, "minute": 60}

for part, cycle in date_parts.items():
	# Raw component (e.g. df_temp["date"].dt.hour); dropped again below.
	df_temp[part] = getattr(df_temp["date"].dt, part)
	df_temp[part + "_sin"] = sin_transform(df_temp[part], cycle)
	df_temp[part + "_cos"] = cos_transform(df_temp[part], cycle)

# Only the encoded columns are kept; the timestamp and raw components are removed.
df_temp = df_temp.drop(columns=["date", *date_parts])

df_clean = df_temp.copy()

# Petition: one-hot encode the request method and fold the listed methods
# into a single "petition_other" indicator.
df_clean = pd.get_dummies(df_clean, columns=["petition"], dtype=int)
columns_to_combine = ["petition_CONNECT", "petition_USER", "petition_OPTIONS"]

# A given log file may not contain every one of these methods, in which case
# get_dummies creates no column for it; only combine the columns that exist
# instead of failing with a KeyError.
present_columns = [col for col in columns_to_combine if col in df_clean.columns]
if present_columns:
	df_clean["petition_other"] = df_clean[present_columns].max(axis=1) # combine using OR
else:
	# None of the listed methods occur in this file.
	df_clean["petition_other"] = 0
df_clean = df_clean.drop(columns=present_columns)

# URL
# Keep only rows whose URL field contains an HTTP version token such as
# "HTTP/1.1"; rows with a missing URL are treated as non-matching.
has_http_version = df_clean["URL"].str.contains(r"HTTP/\d+\.\d+", na=False)
df_temp = df_clean[has_http_version]

# URL embedding artefacts (loaders come from the skipgram module); they are
# loaded here but not applied to the frame in this cell.
embeddings_url = load_embeddings(os.path.join(ROOT_DIR, "models/embeddings-url.pt"))
idx2word_url = load_idx2word(os.path.join(ROOT_DIR, "models/idx2word-url.json"))
tokenizer_url = load_tokenizer(os.path.join(ROOT_DIR, "models"), "charbpe-url")
# Mid-cell expression: evaluated but not displayed.
embeddings_url.shape, embeddings_url.mean(), embeddings_url.std()

df_clean = df_temp.copy()

# Status
# Reduce each status code to its first character (its class) and one-hot encode it.
df_temp = df_clean.copy()
df_temp["status"] = df_temp["status"].apply(lambda x: str(x)[0])

# Keep a "status_1" column in the output even when the file has no 1xx responses.
# It is added as an integer 0 (matching the int dummies) and only when
# get_dummies will not create it itself, which would otherwise yield two
# columns with the same name.
if not df_temp["status"].eq("1").any():
	df_temp["status_1"] = 0

df_temp = pd.get_dummies(df_temp, columns=["status"], dtype=int)
df_clean = df_temp.copy()

# Bytes
# Log-transform the response size, then z-score it.
df_temp = df_clean.copy()
df_temp["bytes"] = np.log(df_temp["bytes"]+1)

mean, std = df_temp["bytes"].mean(), df_temp["bytes"].std()
bytes_scaled = (df_temp["bytes"] - mean) / std
df_temp["bytes"] = bytes_scaled
df_clean = df_temp.copy()

# Record the statistics used. Read and write through separate context managers
# so the file is not opened for writing while still open for reading, and the
# write handle is always closed.
with open(normalize_path, "r") as f:
	normalize = json.load(f)
normalize["bytes"] = {"mean": mean, "std": std}
with open(normalize_path, "w") as f:
	json.dump(normalize, f)
	
# Referer
# Embedding artefacts for the referer field; loaded but not applied to the frame in this cell.
embeddings_referer = load_embeddings(os.path.join(ROOT_DIR, "models/embeddings-referer.pt"))
idx2word_referer = load_idx2word(os.path.join(ROOT_DIR, "models/idx2word-referer.json"))
tokenizer_referer = load_tokenizer(os.path.join(ROOT_DIR, "models"), "charbpe-referer")

# Discard rows that have no referer value.
df_temp = df_clean.dropna(subset=["referer"])
df_clean = df_temp.copy()

# User agent
# Embedding artefacts for the user-agent field; loaded but not applied to the frame in this cell.
embeddings_useragent = load_embeddings(os.path.join(ROOT_DIR, "models/embeddings-useragent.pt"))
idx2word_useragent = load_idx2word(os.path.join(ROOT_DIR, "models/idx2word-useragent.json"))
tokenizer_useragent = load_tokenizer(os.path.join(ROOT_DIR, "models"), "charbpe-useragent")

# Discard rows that have no user-agent value.
df_temp = df_clean.dropna(subset=["user-agent"])
df_clean = df_temp.copy()


# Save df_clean to csv
# Restore the row order recorded in "order", then drop that helper column.
df_clean = df_clean.sort_values("order").drop(columns=["order"])

# The output name depends on whether the training input (files[0]) was loaded.
is_training_input = file == f"../data/{files[0]}.csv"
if is_training_input:
	output_name = "data/sitges_access_clean_whole_set_but_last.csv"
else:
	output_name = "data/sitges_access_clean_last.csv"

df_clean.to_csv(os.path.join(ROOT_DIR, output_name), index=False)


100%|██████████| 2277/2277 [00:09<00:00, 242.59it/s]
