In [None]:
import re 
import math 

import numpy as np
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt 

from typing import List, Callable 

from parse import parse 

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import  SelectFromModel
from sklearn.model_selection import (
  StratifiedKFold,
  train_test_split,
  GridSearchCV
)
from sklearn.ensemble import (
  RandomForestClassifier, 
)
from sklearn.metrics import (
  precision_score,
  accuracy_score,
  recall_score, 
  f1_score, 
  roc_auc_score, 
  mean_absolute_error,
  confusion_matrix, 
  classification_report
)
from sklearn.linear_model import SGDClassifier
from sklearn.svm import (
  LinearSVC,
  SVC
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier 
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier

# Relative path to the CSIC2010 CSV that the next cell loads with pd.read_csv.
log_prep_path = '../data/test_dataset/CSIC2010/csic_database.csv' 

In [None]:
# Load the raw CSV and fix two column names in a single rename:
# the unnamed first column becomes 'Class' and the misspelt 'lenght'
# becomes 'content_length'.
df = pd.read_csv(log_prep_path).rename(
  columns={'Unnamed: 0': 'Class', 'lenght': 'content_length'}
)

# Strip the scheme (http:// or https://) and then the first occurrence of
# the row's own host from each URL.
df['URL'] = df['URL'].str.replace(r'^https?://', '', regex=True)
df['URL'] = [
  url.replace(host, '', 1)
  for url, host in zip(df['URL'], df['host'])
]

def parse_url(row):
  """Split a row's 'URL' field into its request URL and protocol parts.

  The field is matched against the `parse` template
  '{request_url} {protocol}'.

  Args:
    row: mapping-like object (e.g. a DataFrame row) exposing a 'URL' entry.

  Returns:
    pd.Series: two values, [request_url, protocol].

  Raises:
    Exception: if the 'URL' value does not match the template.
  """
  parsed = parse('{request_url} {protocol}', row['URL'])

  # No match: fail loudly instead of emitting a partial row.
  if not parsed:
    raise Exception(f"Error in URL: {row['URL']}")

  return pd.Series([parsed['request_url'], parsed['protocol']])

# Row-wise split of 'URL' into the two new columns.
df[['request_url', 'protocol']] = df.apply(parse_url, axis=1)

# Columns removed before modelling.
del_columns = [
  'Pragma',
  'Cache-Control',
  'Accept',
  'Accept-encoding',
  'Accept-charset',
  'language',
  'host',
  'cookie',
  'content-type',
  'connection',
  'classification',   # already-encoded version of the categorical 'Class' column
  'content_length',   # dropped because the original value is inconsistent with the request_url column
  'URL'               # already split into request_url and protocol
]
df_selected = df.drop(columns=del_columns)

# Show both ends of the reduced frame.
display(df_selected.head(), df_selected.tail())

In [None]:
important_columns = [
  'Class',
  'Method',
  'User-Agent',
  'protocol'
]
top_n = 10  # categories drawn individually; everything else is folded into 'Otros'

# Only plot columns that are actually present in the frame.
existing_cols = [col for col in important_columns if col in df_selected.columns]
n_cols = len(existing_cols)

# One subplot row per column. squeeze=False keeps `axes` two-dimensional even
# for a single row, so no special case is needed.
fig, axes = plt.subplots(n_cols, 1, figsize=(15, 5 * n_cols), squeeze=False)
axes = axes[:, 0]

for ax, col in zip(axes, existing_cols):
  # Count on the full column first. Truncating before checking the length
  # (as this cell used to do) made the 'Otros' bucket unreachable.
  all_counts = df_selected[col].value_counts()
  value_counts = all_counts.head(top_n).copy()

  # Fold the remaining categories into a single bar so the percentages below
  # are relative to every non-null row, not only to the top categories.
  others_count = all_counts.iloc[top_n:].sum()
  if others_count > 0:
    value_counts.loc['Otros'] = others_count

  # Horizontal bar chart, one colour per bar.
  positions = range(len(value_counts))
  ax.barh(positions, value_counts.values, color=plt.cm.tab20c(positions))

  ax.set_title(f'Distribución de {col}', fontsize=12, fontweight='bold')
  ax.set_xlabel('Frecuencia', fontsize=10)
  ax.set_yticks(positions)
  ax.set_yticklabels(value_counts.index, fontsize=9)

  # Annotate each bar with its absolute count and its share of the total.
  total = value_counts.sum()
  label_offset = value_counts.max() * 0.01
  for i, v in enumerate(value_counts.values):
    percentage = (v / total) * 100
    ax.text(v + label_offset, i, f'{v} ({percentage:.1f}%)', va='center', fontsize=9, fontweight='bold')

plt.tight_layout()
plt.show()

Este dataset es muy sencillo: columnas como `protocol` y `User-Agent` apenas tienen valores únicos, por lo que se opta por eliminarlas del dataset.

In [None]:
# Second pruning pass: remove these two columns from the working frame.
del_columns = ['protocol', 'User-Agent']
df_selected = df_selected.drop(columns=del_columns)

# Show both ends of the frame after the drop.
display(df_selected.head(), df_selected.tail())

In [None]:
# Flag rows whose 'content' is missing, then replace NaN with an empty string.
# Both columns are derived from the untouched raw column in `df`. Previously
# the flag was read from df_selected['content'], which this very cell
# overwrites, so re-running the cell silently turned 'content_missing' into
# all-False.
content_raw = df['content']
df_selected['content_missing'] = content_raw.isna()
df_selected['content'] = content_raw.fillna('')
display(df_selected.isnull().sum())

In [None]:
# Add a 'content_length' column holding the string length of each 'content'
# value (missing values are treated as empty strings).
df_selected = df_selected.assign(
  content_length=lambda frame: frame['content'].fillna('').str.len()
)
display(df_selected.head(), df_selected.tail())

**Definición de Funciones Útiles para Extraer Características del Access Log**

In [None]:
def count_sql_words(url):
  """Count SQL-injection keyword hits in `url`.

  The argument is coerced with ``str()`` and scanned case-insensitively.
  Matching is by plain substring (no word boundaries), so ``offset``
  contributes one hit for ``SET``.

  Returns:
    int: number of non-overlapping keyword matches.
  """
  keywords = (
    r'SELECT',
    r'FROM',
    r'WHERE',
    r'DELETE',
    r'DROP',
    r'CREATE',
    r'TABLE',
    r'LIKE',
    r'UNION',
    r'INSERT',
    r'UPDATE',
    r'ALTER',
    r'INTO',
    r'VALUES',
    r'SET',
    r'JOIN',
    r'GRANT',
    r'REVOKE',
  )
  regex = re.compile('|'.join(keywords), flags=re.IGNORECASE)
  return sum(1 for _ in regex.finditer(str(url)))

def count_xss_words(url):
  """Count Cross-Site Scripting (XSS) keyword hits in `url`.

  The argument is coerced with ``str()`` and scanned case-insensitively for
  the substrings/patterns listed below.

  Returns:
    int: number of non-overlapping matches.
  """
  keywords = (
    r'script',
    r'alert',
    r'javascript',
    r'onerror',
    r'onload',
    r'onunload',
    r'prompt',
    r'confirm',
    r'eval',
    r'expression',
    r'function\(',
    r'xmlhttprequest',
    r'xhr',
    r'window\.',
    r'document\.',
    r'iframe',
    r'src=',
    r'cookie',
    r'document\.cookie',
    r'set-cookie',
    r'click',
    r'mouseover',
  )
  regex = re.compile('|'.join(keywords), flags=re.IGNORECASE)
  return sum(1 for _ in regex.finditer(str(url)))

def count_command_words(url):
  """Count command-execution related keyword hits in `url`.

  The argument is coerced with ``str()`` and scanned case-insensitively for
  shell names, interpreter names, script extensions and exec-style calls.

  Returns:
    int: number of non-overlapping matches.
  """
  keywords = (
    r'cmd',
    r'dir',
    r'shell',
    r'exec',
    r'cat',
    r'etc',
    r'tmp',
    r'bin',
    r'bash',
    r'sh',
    r'python',
    r'perl',
    r'ruby',
    r'php',
    r'\.exe',
    r'\.php',
    r'\.js',
    r'\.py',
    r'\.pl',
    r'\.rb',
    r'system\(',
    r'popen\(',
    r'proc_open\(',
    r'passthru\(',
  )
  regex = re.compile('|'.join(keywords), flags=re.IGNORECASE)
  return sum(1 for _ in regex.finditer(str(url)))

def count_auth_words(url):
  """Count authentication-related keyword hits in `url`.

  The argument is coerced with ``str()`` and scanned case-insensitively.

  Returns:
    int: number of non-overlapping matches.
  """
  keywords = (
    r'admin',
    r'administrator',
    r'password',
    r'login',
    r'pwd',
    r'credential',
    r'user',
    r'username',
    r'passwd',
    r'secret',
    r'token',
    r'session',
    r'auth',
    r'authentication',
    r'key',
  )
  regex = re.compile('|'.join(keywords), flags=re.IGNORECASE)
  return sum(1 for _ in regex.finditer(str(url)))

def count_error_words(url):
  """Count error-related keyword hits in `url`.

  The argument is coerced with ``str()`` and scanned case-insensitively.

  Returns:
    int: number of non-overlapping matches.
  """
  keywords = (
    r'error',
    r'errorMsg',
    r'errorID',
    r'incorrect',
    r'fail',
    r'failed',
    r'failure',
    r'exception',
    r'stack',
    r'trace',
    r'debug',
    r'warning',
    r'fatal',
    r'crash',
    r'invalid',
  )
  regex = re.compile('|'.join(keywords), flags=re.IGNORECASE)
  return sum(1 for _ in regex.finditer(str(url)))

def count_malware_words(url):
  """Count malware-related keyword hits in `url`.

  The argument is coerced with ``str()`` and scanned case-insensitively.

  Returns:
    int: number of non-overlapping matches.
  """
  keywords = (
    r'malware',
    r'ransomware',
    r'phishing',
    r'exploit',
    r'virus',
    r'trojan',
    r'backdoor',
    r'spyware',
    r'rootkit',
    r'worm',
    r'adware',
    r'keylogger',
    r'botnet',
    r'payload',
    r'inject',
    r'injected',
    r'hacker',
    r'attack',
    r'exploit',
    r'breach',
  )
  regex = re.compile('|'.join(keywords), flags=re.IGNORECASE)
  return sum(1 for _ in regex.finditer(str(url)))

def count_danger_characters(url):
  """Count occurrences of potentially dangerous characters/sequences in `url`.

  The argument is coerced with ``str()``. Each token below is counted with
  ``str.count`` (non-overlapping) and the counts are summed.

  Note: the backslash and double-quote tokens are written as ordinary
  (non-raw) literals. In a raw string, ``r"\\\\"`` is *two* backslashes and
  ``r"\\""`` is backslash-plus-quote, so a lone ``\\`` or ``"`` was never
  counted.

  Returns:
    int: total number of dangerous tokens found.
  """
  characters = (
    "'",
    "--",
    ";",
    "\\",   # a single backslash
    '"',    # a single double quote
    "<",
    ">",
    "(",
    ")",
    "&",
    "|",
  )
  url_str = str(url)
  return sum(url_str.count(token) for token in characters)

def count_obfuscation_code_words(url):
  """Count code-obfuscation related keyword hits in `url`.

  The argument is coerced with ``str()`` and scanned case-insensitively for
  encoding/decoding helper names and similar patterns.

  Returns:
    int: number of non-overlapping matches.
  """
  keywords = (
    r'encode',
    r'decode',
    r'base64',
    r'hex',
    r'urlencode',
    r'urldecode',
    r'escape',
    r'unescape',
    r'obfuscate',
    r'xor',
    r'rot13',
    r'chr\(',
    r'char\(',
    r'fromCharCode',
    r'eval\(',
  )
  regex = re.compile('|'.join(keywords), flags=re.IGNORECASE)
  return sum(1 for _ in regex.finditer(str(url)))

def count_dir_words(url):
  """Count references to sensitive directories / path traversal in `url`.

  The argument is coerced with ``str()`` and scanned case-insensitively.
  Matches do not overlap, so the slash consumed by ``../`` cannot also start
  a following ``/etc/`` match.

  Returns:
    int: number of non-overlapping matches.
  """
  keywords = (
    r'\.\./',
    r'\.\.\\',
    r'/etc/',
    r'/bin/',
    r'/tmp/',
    r'/var/',
    r'/home/',
    r'/root/',
    r'proc/',
    r'dev/',
    r'boot/',
    r'usr/',
    r'lib/',
    r'sbin/',
  )
  regex = re.compile('|'.join(keywords), flags=re.IGNORECASE)
  return sum(1 for _ in regex.finditer(str(url)))

def count_dot(url):
  """Return the number of '.' characters in `url`."""
  return url.count('.')

def count_http(url):
  """Return how many non-overlapping times 'http' occurs in `url` (case-sensitive)."""
  needle = 'http'
  return url.count(needle)

def count_percentage_symbol(url):
  """Return the number of '%' characters in `url`."""
  percent_sign = '%'
  return url.count(percent_sign)

def count_question_symbol(url):
  """Return the number of '?' characters in `url`."""
  question_mark = '?'
  return url.count(question_mark)

def count_hyphen(url):
  """Return the number of hyphen ('-') characters in `url`."""
  hyphen = '-'
  return url.count(hyphen)

def count_equal(url):
  """Return the number of equals signs ('=') in `url`."""
  equals_sign = '='
  return url.count(equals_sign)

def url_length(url):
  """Return the length of `url` after coercing it with ``str()``."""
  as_text = str(url)
  return len(as_text)

def digit_count(url):
  """Return how many characters of `url` satisfy ``str.isnumeric()``."""
  return sum(1 for ch in url if ch.isnumeric())

def letter_count(url):
  """Return how many characters of `url` satisfy ``str.isalpha()``."""
  return len([ch for ch in url if ch.isalpha()])

def count_special_characters(url):
  """Count the characters of `url` that are neither ASCII alphanumerics nor whitespace.

  Implemented by deleting every ``[a-zA-Z0-9\\s]`` character and measuring
  what is left.
  """
  return len(re.sub(r'[a-zA-Z0-9\s]', '', url))

def is_encoded(url):
  """Flag whether `url` looks percent-encoded (contains a '%' character).

  Returns:
    int: 1 if a '%' is present, 0 otherwise.
  """
  return 1 if '%' in url.lower() else 0

def unusual_character_ratio(url):
  """Return the share of "unusual" characters in `url`.

  A character is unusual when it is not an ASCII alphanumeric, whitespace,
  hyphen, dot or underscore. The result is ``unusual / len(url)``, or 0 for
  an empty string.
  """
  length = len(url)
  leftover = re.sub(r'[a-zA-Z0-9\s\-._]', '', url)
  if length > 0:
    return len(leftover) / length
  return 0

In [None]:
# Every extractor takes one string and returns one number; each is applied
# element-wise to a text column below.
extract_features_functions: List[Callable] = [
  count_sql_words,
  count_xss_words,
  count_command_words,
  count_auth_words,
  count_error_words,
  count_malware_words,
  count_danger_characters,
  count_obfuscation_code_words,
  count_dir_words,
  count_dot,
  count_http,
  count_percentage_symbol,
  count_question_symbol,
  count_hyphen,
  count_equal,
  url_length,
  digit_count,
  letter_count,
  count_special_characters,
  is_encoded,
  unusual_character_ratio
]

# Choose which text columns to featurise.
# The source columns used to be swapped: the 'request_*' features were
# computed from 'content' and the 'content_*' features from 'request_url'.
# The prefixes now match their source column. The flags are flipped with
# them, so the model keeps receiving the same content-derived features,
# now under the correct 'content_*' names.
extract_request_features = False
extract_content_features = True
request_columns_name = []
content_columns_name = []
for func in extract_features_functions:
  if extract_request_features:
    feat_name = f"request_{func.__name__}"
    df_selected[feat_name] = df_selected['request_url'].apply(func)
    request_columns_name.append(feat_name)
  if extract_content_features:
    feat_name = f"content_{func.__name__}"
    df_selected[feat_name] = df_selected['content'].apply(func)
    content_columns_name.append(feat_name)

display(df_selected.head())

**Codificación de Variables Categóricas**: `Class` y `Method`

In [None]:
# One encoder per column. Refitting a single LabelEncoder on 'Method'
# overwrites the classes_ learned for 'Class', after which the Class codes
# can no longer be mapped back to their labels.
class_encoder = LabelEncoder()
method_encoder = LabelEncoder()
df_selected['Class_encoded']  = class_encoder.fit_transform(df_selected['Class'])
df_selected['Method_encoded'] = method_encoder.fit_transform(df_selected['Method'])

# Backward compatibility: `encoder` used to end up fitted on 'Method'.
encoder = method_encoder

print(f"Number of unique values for 'Class_encoded':  {df_selected['Class_encoded'].nunique() }")
print(f"Number of unique values for 'Method_encoded': {df_selected['Method_encoded'].nunique()}")
# LabelEncoder assigns codes following the sorted order of the labels; print
# the mapping so later plots and reports can be checked against it.
print(f"'Class' label -> code: {dict(zip(class_encoder.classes_, range(len(class_encoder.classes_))))}")

In [None]:
# List the columns currently present in df_selected.
df_selected.columns

In [None]:
#all(df_selected['content_length'] == df_selected['content_url_length'])

In [None]:
# Model inputs: the two base columns followed by every generated feature
# column (request-derived first, then content-derived).
X_labels = [
  'Method_encoded',
  'content_length',
  *request_columns_name,
  *content_columns_name,
]
# Target column.
y_label = 'Class_encoded'

In [None]:
# 70/30 hold-out split with a fixed seed, stratified on the target column.
X_train, X_test, y_train, y_test = train_test_split(
  df_selected.loc[:, X_labels],
  df_selected.loc[:, y_label],
  stratify=df_selected.loc[:, y_label],
  test_size=0.3,
  random_state=43,
)

In [None]:
# Random Forest trained on the full feature set.
RF_model = RandomForestClassifier(random_state=1000).fit(X_train, y_train)

# Feature selection driven by the forest, using the "mean" threshold, then
# project both partitions onto the selected columns.
selector = SelectFromModel(RF_model, threshold="mean").fit(X_train, y_train)
X_train_selected, X_test_selected = (
  selector.transform(part) for part in (X_train, X_test)
)

# RBF-kernel SVM trained only on the selected features.
SVM_model = SVC(kernel='rbf', C=2, gamma='scale')
SVM_model.fit(X_train_selected, y_train)

In [None]:
# Score the SVM on the held-out rows, restricted to the selected features.
y_pred = SVM_model.predict(X_test_selected)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.2f}")

In [None]:
# Random Forest evaluation on the held-out split.
y_pred = RF_model.predict(X_test)
# ROC AUC is a ranking metric and needs a continuous score. Column 1 of
# predict_proba is the probability of the class with the greater label, which
# is what roc_auc_score expects in the binary case. Feeding it hard 0/1
# predictions (as before) collapses the curve to a single operating point.
y_score = RF_model.predict_proba(X_test)[:, 1]

# Recover the class names in encoded order from the data itself. The codes
# come from LabelEncoder, which numbers labels in sorted order, so a
# hard-coded ['Normal', 'Anomalous'] is only right if that happens to match.
class_names = (
  df_selected[[y_label, 'Class']]
  .drop_duplicates()
  .sort_values(y_label)['Class']
  .astype(str)
  .tolist()
)

# The weighted averages are taken over every class. Restricting them to
# np.unique(y_pred) would drop any class the model never predicts and
# inflate the scores.
print(f"""Resultados de Random Forest 
MAE:          {mean_absolute_error(y_test, y_pred)}
Accuracy:     {accuracy_score(y_test, y_pred)} 
Precision:    {precision_score(y_test, y_pred, average='weighted')}
Recall:       {recall_score(y_test, y_pred, average='weighted')}
F1-Score:     {f1_score(y_test, y_pred, average='weighted')}
ROC AUC:      {roc_auc_score(y_test, y_score)}
""")

error_rate = (y_pred != y_test).mean()
print("Test error: {:.1%}".format(error_rate))

print(classification_report(
  y_test,
  y_pred,
  labels=list(range(len(class_names))),
  target_names=class_names
))

In [None]:
# Tick labels in encoded order (code 0 first), read back from the data. The
# codes come from LabelEncoder, which numbers labels in sorted order, so a
# hard-coded ['Normal', 'Anomalous'] could label the axes the wrong way round.
label = (
  df_selected[[y_label, 'Class']]
  .drop_duplicates()
  .sort_values(y_label)['Class']
  .astype(str)
  .tolist()
)
# Rows are actual classes and columns are predicted classes; `labels` pins
# the row/column order to the encoded order used for the tick labels.
# NOTE(review): `y_pred` is whatever the previous evaluation cell left
# behind; the title assumes that was the Random Forest. Confirm.
cm = confusion_matrix(y_test, y_pred, labels=list(range(len(label))))
cm = pd.DataFrame(cm, index=label, columns=label)

plt.figure(figsize=(10, 10))
sns.heatmap(cm, cmap="Blues", linecolor='black', linewidth=1, annot=True, fmt='', xticklabels=label, yticklabels=label)
plt.title("Random Forest")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

**Observaciones**: 
- Aplicar las funciones de extracción de características al content y tomarlas como características mejora mucho el modelo 
- *Problema*: ¿cómo se puede extraer el `content` a partir de los access logs?