In [None]:
import math
import os
import re
from typing import List, Callable

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from parse import parse

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import (
  StratifiedKFold,
  train_test_split,
  GridSearchCV
)
from sklearn.ensemble import (
  RandomForestClassifier,
)
from sklearn.metrics import (
  precision_score,
  accuracy_score,
  recall_score,
  f1_score,
  roc_auc_score,
  mean_absolute_error,
  confusion_matrix,
  classification_report
)
from sklearn.linear_model import SGDClassifier
from sklearn.svm import (
  LinearSVC,
  SVC
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier

# Source CSV files for the two datasets used in this notebook.
logs_master_path = '../data/target/access_log_master_manual_labeling.csv'
csic_master_path = '../data/test_dataset/CSIC2010/csic_database_saved.csv'

# Read both files (access-log first, then CSIC2010) into raw DataFrames.
df_logs, df_csic = (
  pd.read_csv(source_path)
  for source_path in (logs_master_path, csic_master_path)
)

### Dataset de Access-Log-Master

In [None]:
# Drop the rows whose 'anomaly' value is -1 (presumably "not labeled yet" —
# TODO confirm), then preview the result and its class distribution.
df_logs_labeled = df_logs.loc[df_logs['anomaly'].ne(-1)]
display(
  df_logs_labeled.head(3),
  df_logs_labeled['anomaly'].value_counts(),
)

In [None]:
# URL-derived feature columns. The list order defines the column order of the
# selected frame, so it is kept exactly as is.
url_columns_selected = [
  # keyword / pattern counters
  'url__count_sql_words',
  'url__count_xss_words',
  'url__count_command_words',
  'url__count_auth_words',
  'url__count_error_words',
  'url__count_malware_words',
  'url__count_danger_characters',
  'url__count_obfuscation_code_words',
  'url__count_dir_words',
  # single-symbol counters
  'url__count_dot',
  'url__count_http',
  'url__count_percentage_symbol',
  'url__count_question_symbol',
  'url__count_hyphen',
  'url__count_equal',
  # length / composition features
  'url__url_length',
  'url__digit_count',
  'url__letter_count',
  'url__count_special_characters',
  'url__is_encoded',
  'url__unusual_character_ratio',
]

# Label and HTTP method first, then the URL features.
# 'size' is left out on purpose because CSIC2010 does not provide that feature.
columns_selected = ['anomaly', 'method', *url_columns_selected]

df_logs_selected = df_logs_labeled[columns_selected]
display(df_logs_selected.head(3))

### Dataset de CSIC2010

In [None]:
# Preview the raw CSIC2010 frame and the distribution of its 'Class' column.
display(
  df_csic.head(3),
  df_csic['Class'].value_counts(),
)

In [None]:
# CSIC2010 feature columns as they are named in the source CSV
# (all of them carry a 'request_' prefix).
columns_request = [
  'request_count_sql_words',
  'request_count_xss_words',
  'request_count_command_words',
  'request_count_auth_words',
  'request_count_error_words',
  'request_count_malware_words',
  'request_count_danger_characters',
  'request_count_obfuscation_code_words',
  'request_count_dir_words',
  'request_count_dot',
  'request_count_http',
  'request_count_percentage_symbol',
  'request_count_question_symbol',
  'request_count_hyphen',
  'request_count_equal',
  'request_url_length',
  'request_digit_count',
  'request_letter_count',
  'request_count_special_characters',
  'request_is_encoded',
  'request_unusual_character_ratio'
]

# Rename all features in a single pass instead of one in-place rename per
# column. Only the leading 'request_' is swapped for 'url__' (count=1), so a
# later occurrence of 'request_' inside a name would be left untouched.
# Columns missing from df_csic are ignored by rename(), which keeps this cell
# safe to re-run.
df_csic = df_csic.rename(
  columns={
    col_name: col_name.replace('request_', 'url__', 1)
    for col_name in columns_request
  }
)

In [None]:
# Label, HTTP method and the URL features, using CSIC2010's own column names
# for the first two ('Class', 'Method').
columns_selected = ['Class', 'Method'] + url_columns_selected

# .copy() makes df_csic_selected an independent frame. Later cells rename and
# overwrite its columns, which on a bare column selection triggers
# SettingWithCopyWarning (pandas < 3).
df_csic_selected = df_csic[columns_selected].copy()

In [None]:
# Give the CSIC2010 label/method columns the same names used by the
# access-log dataset. The result is rebound instead of using inplace=True so
# the cell does not mutate a frame that was derived from df_csic.
df_csic_selected = df_csic_selected.rename(
  columns={'Class': 'anomaly', 'Method': 'method'}
)
# Original author's reminder: 'size' has to be deleted from the target dataset.

In [None]:
def _encode_anomaly_label(label):
  """Convert one CSIC2010 label to an integer.

  'Normal' becomes 0 and 'Anomalous' becomes 1. Any other value is passed
  through int(), so labels that are already numeric survive a re-run of the
  cell; a value int() cannot convert raises ValueError/TypeError.
  """
  if label == 'Normal':
    return 0
  if label == 'Anomalous':
    return 1
  return int(label)


# assign() returns a new frame, so the column is replaced without writing into
# a frame derived from another one (avoids SettingWithCopyWarning).
df_csic_selected = df_csic_selected.assign(
  anomaly=df_csic_selected['anomaly'].apply(_encode_anomaly_label)
)


### Dataset Combinado: Access-Log-Master con CSIC2010
El dataset combinado está formado por registros cuyas características se derivan del método HTTP y de la URL. Los dos datasets, `df_csic_selected` (CSIC2010) y `df_logs_selected` (Access-Log-Master), tienen exactamente las mismas columnas.

In [None]:
# Both frames must expose the same set of columns before stacking them.
# On failure the message lists the columns present in only one of the frames.
csic_columns = set(df_csic_selected.columns)
logs_columns = set(df_logs_selected.columns)
assert csic_columns == logs_columns, (
  f"Column mismatch between datasets: {sorted(csic_columns ^ logs_columns, key=str)}"
)

# concat aligns columns by name; ignore_index=True rebuilds a 0..n-1 row index.
df_combined = pd.concat([df_logs_selected, df_csic_selected], ignore_index=True)
print(f"Dataset combinado shape: {df_combined.shape}")
display(df_combined['anomaly'].value_counts())

In [None]:
# Save the combined dataset.
df_combined_path = '../data/combined/dataset_combined_master_csic2010.csv'
# to_csv does not create missing folders, so make sure the output directory
# exists (a fresh checkout may not have it yet).
os.makedirs(os.path.dirname(df_combined_path), exist_ok=True)
df_combined.to_csv(df_combined_path, index=False)