In [None]:
import re 
import math 

import numpy as np
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt 

from typing import List, Callable 

from parse import parse 

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import  SelectFromModel
from sklearn.model_selection import (
  StratifiedKFold,
  train_test_split,
  GridSearchCV
)
from sklearn.ensemble import (
  RandomForestClassifier, 
)
from sklearn.metrics import (
  precision_score,
  accuracy_score,
  recall_score, 
  f1_score, 
  roc_auc_score, 
  mean_absolute_error,
  confusion_matrix, 
  classification_report
)
from sklearn.linear_model import SGDClassifier
from sklearn.svm import (
  LinearSVC,
  SVC
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier 
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier

# Relative path to the HttpParamsDataset payload CSV.
# NOTE(review): not referenced anywhere in the visible code; presumably it is
# loaded in another cell -- confirm.
log_prep_path = '../data/test_dataset/HttpParamsDataset/payload_full.csv' 

In [None]:
def count_sql_words(url):
  """Count SQL keywords in the input (SQL injection indicator).

  The input is converted with ``str()`` and scanned case-insensitively.
  Keywords are matched as plain substrings, so a keyword embedded in a
  longer word (for example ``SET`` inside ``reset``) is counted too.

  Args:
    url: Value to inspect; any object is accepted.

  Returns:
    int: Number of non-overlapping keyword matches.
  """
  keywords = (
    r'SELECT', r'FROM', r'WHERE', r'DELETE', r'DROP', r'CREATE',
    r'TABLE', r'LIKE', r'UNION', r'INSERT', r'UPDATE', r'ALTER',
    r'INTO', r'VALUES', r'SET', r'JOIN', r'GRANT', r'REVOKE',
  )
  sql_regex = re.compile('|'.join(keywords), flags=re.IGNORECASE)
  text = str(url)
  return sum(1 for _ in sql_regex.finditer(text))

def count_xss_words(url):
  """Count Cross-Site Scripting (XSS) related tokens in the input.

  The input is converted with ``str()`` and scanned case-insensitively.
  Alternatives are tried in the order listed, left to right, so the text
  ``document.cookie`` yields two matches (``document.`` then ``cookie``).

  Args:
    url: Value to inspect; any object is accepted.

  Returns:
    int: Number of non-overlapping token matches.
  """
  tokens = (
    r'script', r'alert', r'javascript', r'onerror', r'onload',
    r'onunload', r'prompt', r'confirm', r'eval', r'expression',
    r'function\(', r'xmlhttprequest', r'xhr', r'window\.',
    r'document\.', r'iframe', r'src=', r'cookie',
    r'document\.cookie', r'set-cookie', r'click', r'mouseover',
  )
  xss_regex = '|'.join(tokens)
  found = re.findall(xss_regex, str(url), flags=re.IGNORECASE)
  return len(found)

def count_command_words(url):
  """Count tokens related to command execution in the input.

  The input is converted with ``str()`` and scanned case-insensitively.
  Tokens are matched as substrings, so short ones such as ``sh`` also
  match inside longer words (for example ``push``).

  Args:
    url: Value to inspect; any object is accepted.

  Returns:
    int: Number of non-overlapping token matches.
  """
  tokens = (
    r'cmd', r'dir', r'shell', r'exec', r'cat', r'etc', r'tmp',
    r'bin', r'bash', r'sh', r'python', r'perl', r'ruby', r'php',
    r'\.exe', r'\.php', r'\.js', r'\.py', r'\.pl', r'\.rb',
    r'system\(', r'popen\(', r'proc_open\(', r'passthru\(',
  )
  command_regex = re.compile('|'.join(tokens), flags=re.IGNORECASE)
  hits = 0
  for _ in command_regex.finditer(str(url)):
    hits += 1
  return hits

def count_auth_words(url):
  """Count authentication-related tokens in the input.

  The input is converted with ``str()`` and scanned case-insensitively
  using substring matching.

  Args:
    url: Value to inspect; any object is accepted.

  Returns:
    int: Number of non-overlapping token matches.
  """
  tokens = (
    r'admin', r'administrator', r'password', r'login', r'pwd',
    r'credential', r'user', r'username', r'passwd', r'secret',
    r'token', r'session', r'auth', r'authentication', r'key',
  )
  auth_regex = '|'.join(tokens)
  text = str(url)
  return sum(1 for _ in re.finditer(auth_regex, text, flags=re.IGNORECASE))

def count_error_words(url):
  """Count error-related tokens in the input.

  The input is converted with ``str()`` and scanned case-insensitively
  using substring matching.

  Args:
    url: Value to inspect; any object is accepted.

  Returns:
    int: Number of non-overlapping token matches.
  """
  tokens = (
    r'error', r'errorMsg', r'errorID', r'incorrect', r'fail',
    r'failed', r'failure', r'exception', r'stack', r'trace',
    r'debug', r'warning', r'fatal', r'crash', r'invalid',
  )
  error_regex = re.compile('|'.join(tokens), flags=re.IGNORECASE)
  found = error_regex.findall(str(url))
  return len(found)

def count_malware_words(url):
  """Count malware-related tokens in the input.

  The input is converted with ``str()`` and scanned case-insensitively
  using substring matching.

  Args:
    url: Value to inspect; any object is accepted.

  Returns:
    int: Number of non-overlapping token matches.
  """
  tokens = (
    r'malware', r'ransomware', r'phishing', r'exploit', r'virus',
    r'trojan', r'backdoor', r'spyware', r'rootkit', r'worm',
    r'adware', r'keylogger', r'botnet', r'payload', r'inject',
    r'injected', r'hacker', r'attack', r'exploit', r'breach',
  )
  malware_regex = re.compile('|'.join(tokens), flags=re.IGNORECASE)
  text = str(url)
  return sum(1 for _ in malware_regex.finditer(text))

def count_danger_characters(url):
  """Count potentially dangerous characters/sequences in the input.

  Counted tokens: single quote, ``--``, semicolon, backslash, double
  quote, ``<``, ``>``, ``(``, ``)``, ``&`` and ``|``. The input is
  converted with ``str()`` first.

  Args:
    url: Value to inspect; any object is accepted.

  Returns:
    int: Sum of the non-overlapping occurrence counts of every token.
  """
  # Regular (non-raw) literals are used for the backslash and the double
  # quote: inside a raw string the escape is kept verbatim, which would make
  # the tokens two characters long and miss the single characters.
  tokens = ("'", "--", ";", "\\", '"', "<", ">", "(", ")", "&", "|")
  text = str(url)
  return sum(text.count(token) for token in tokens)

def count_obfuscation_code_words(url):
  """Count tokens that hint at code obfuscation techniques.

  The input is converted with ``str()`` and scanned case-insensitively
  using substring matching.

  Args:
    url: Value to inspect; any object is accepted.

  Returns:
    int: Number of non-overlapping token matches.
  """
  tokens = (
    r'encode', r'decode', r'base64', r'hex', r'urlencode',
    r'urldecode', r'escape', r'unescape', r'obfuscate',
    r'xor', r'rot13', r'chr\(', r'char\(', r'fromCharCode',
    r'eval\(',
  )
  obfuscation_regex = '|'.join(tokens)
  found = re.findall(obfuscation_regex, str(url), flags=re.IGNORECASE)
  return len(found)

def count_dir_words(url):
  """Count references to sensitive directories and path traversal.

  The input is converted with ``str()`` and scanned case-insensitively.
  Matches do not overlap: a slash consumed by one match cannot start the
  next one, so ``/var/tmp/`` counts once.

  Args:
    url: Value to inspect; any object is accepted.

  Returns:
    int: Number of non-overlapping matches.
  """
  tokens = (
    r'\.\./', r'\.\.\\', r'/etc/', r'/bin/', r'/tmp/', r'/var/',
    r'/home/', r'/root/', r'proc/', r'dev/', r'boot/', r'usr/',
    r'lib/', r'sbin/',
  )
  dir_regex = re.compile('|'.join(tokens), flags=re.IGNORECASE)
  text = str(url)
  return sum(1 for _ in dir_regex.finditer(text))

def count_dot(url):
  """Count the dots in the input.

  The input is converted with ``str()`` first, like the word-counting
  extractors do, so non-string values (for example NaN) do not raise.

  Args:
    url: Value to inspect; any object is accepted.

  Returns:
    int: Number of ``.`` characters.
  """
  return str(url).count('.')

def count_http(url):
  """Count occurrences of the lowercase substring ``http`` in the input.

  The match is case-sensitive and also hits ``https``. The input is
  converted with ``str()`` first so non-string values do not raise.

  Args:
    url: Value to inspect; any object is accepted.

  Returns:
    int: Number of non-overlapping occurrences of ``http``.
  """
  return str(url).count('http')

def count_percentage_symbol(url):
  """Count the percent signs in the input.

  The input is converted with ``str()`` first so non-string values do not
  raise.

  Args:
    url: Value to inspect; any object is accepted.

  Returns:
    int: Number of ``%`` characters.
  """
  return str(url).count('%')

def count_question_symbol(url):
  """Count the question marks in the input.

  The input is converted with ``str()`` first so non-string values do not
  raise.

  Args:
    url: Value to inspect; any object is accepted.

  Returns:
    int: Number of ``?`` characters.
  """
  return str(url).count('?')

def count_hyphen(url):
  """Count the hyphens (``-``) in the input.

  The input is converted with ``str()`` first so non-string values do not
  raise.

  Args:
    url: Value to inspect; any object is accepted.

  Returns:
    int: Number of ``-`` characters.
  """
  return str(url).count('-')

def count_equal(url):
  """Count the equal signs (``=``) in the input.

  The input is converted with ``str()`` first so non-string values do not
  raise.

  Args:
    url: Value to inspect; any object is accepted.

  Returns:
    int: Number of ``=`` characters.
  """
  return str(url).count('=')

def url_length(url):
  """Return the length of the input's string representation.

  Args:
    url: Value to measure; it is converted with ``str()`` first.

  Returns:
    int: Number of characters in ``str(url)``.
  """
  text = str(url)
  return len(text)

def digit_count(url):
  """Count the numeric characters in the input.

  The input is converted with ``str()`` first so non-string values (which
  are not iterable character by character) do not raise. A character is
  counted when ``str.isnumeric()`` is true for it.

  Args:
    url: Value to inspect; any object is accepted.

  Returns:
    int: Number of numeric characters.
  """
  return sum(1 for ch in str(url) if ch.isnumeric())

def letter_count(url):
  """Count the alphabetic characters in the input.

  The input is converted with ``str()`` first so non-string values do not
  raise; note that this means a NaN is seen as the text ``nan``. A
  character is counted when ``str.isalpha()`` is true for it.

  Args:
    url: Value to inspect; any object is accepted.

  Returns:
    int: Number of alphabetic characters.
  """
  return sum(1 for ch in str(url) if ch.isalpha())

def count_special_characters(url):
  """Count special characters in the input.

  A character is special when it is neither an ASCII letter, an ASCII
  digit nor whitespace. The input is converted with ``str()`` first so
  non-string values do not make the regex call raise.

  Args:
    url: Value to inspect; any object is accepted.

  Returns:
    int: Number of special characters.
  """
  # Strip everything that is NOT special; what remains is what we count.
  remaining = re.sub(r'[a-zA-Z0-9\s]', '', str(url))
  return len(remaining)

def is_encoded(url):
  """Detect whether the input looks percent-encoded (contains ``%``).

  The input is converted with ``str()`` first so non-string values do not
  raise.

  Args:
    url: Value to inspect; any object is accepted.

  Returns:
    int: 1 if the input contains a ``%`` character, 0 otherwise.
  """
  # No case folding is needed: '%' has no upper/lower-case variant.
  return int('%' in str(url))

def unusual_character_ratio(url):
  """Return the share of unusual characters relative to the total length.

  A character is unusual when it is not an ASCII letter, an ASCII digit,
  whitespace, a hyphen, a dot or an underscore. The input is converted
  with ``str()`` first so non-string values do not raise.

  Args:
    url: Value to inspect; any object is accepted.

  Returns:
    float: ``unusual / total`` in the range [0, 1]; 0.0 for empty input.
  """
  text = str(url)
  if not text:
    # Avoid dividing by zero on empty input.
    return 0.0
  unusual = re.sub(r'[a-zA-Z0-9\s\-._]', '', text)
  return len(unusual) / len(text)

# Feature extractors applied, in this order, to each text column by the loop
# that follows. Each function's ``__name__`` is used there to build the name
# of the resulting DataFrame column.
extract_features_functions: List[Callable] = [
  count_sql_words,
  count_xss_words, 
  count_command_words, 
  count_auth_words, 
  count_error_words,
  count_malware_words,
  count_danger_characters,
  count_obfuscation_code_words,
  count_dir_words,
  count_dot,
  count_http,
  count_percentage_symbol,
  count_question_symbol,
  count_hyphen,
  count_equal,
  url_length,
  digit_count,
  letter_count,
  count_special_characters,
  is_encoded,
  unusual_character_ratio
]

# Toggles that select which text columns get feature columns.
extract_request_features = True
extract_content_features = True

# Names of the generated feature columns, collected per source column.
request_columns_name = []
content_columns_name = []

# NOTE(review): df_selected is expected to be a DataFrame defined in an
# earlier cell with 'request_url' and 'content' columns -- confirm.
for func in extract_features_functions:
  if extract_request_features:
    feat_name = f"request_{func.__name__}"
    # "request_*" features are computed from the request URL column (the
    # source columns used to be swapped with the "content_*" ones).
    df_selected[feat_name] = df_selected['request_url'].apply(func)
    request_columns_name.append(feat_name)
  if extract_content_features:
    feat_name = f"content_{func.__name__}"
    # "content_*" features are computed from the content column.
    df_selected[feat_name] = df_selected['content'].apply(func)
    content_columns_name.append(feat_name)

display(df_selected.head())