In [None]:
#Capa 1: Heurísticas tipo ModSecurity (no ML)

import pandas as pd
import numpy as np
import re
from collections import Counter

# ==============================
# Load the base dataset
# ==============================
# Adjust the path if necessary (a bare file name is resolved against the
# current working directory).
df = pd.read_csv('access_log_structured.csv')


### Análisis de `user-agent` sospechosos

In [None]:
# Frequency table of the user-agent strings, one row per distinct value.
df_user_agent = df['user_agent'].value_counts().to_frame()

# Show the most frequent user agents and the resulting column labels.
display(df_user_agent.head(), df_user_agent.columns)

In [None]:
# Lower-case substrings that identify a bot/crawler/spider in a user-agent.
# Kept at module level so the collection is built once, not on every call.
_KNOWN_BOT_MARKERS = (
    'googlebot',
    'bingbot',
    'yandexbot',
    'applebot',
    'duckduckbot',
    'baiduspider',
    'sogou',
    'bytespider',
    'amazonbot',
    'gptbot',
    'chatgpt-user',
    'oai-searchbot',
    'claudebot',
    'google-cloudvertexbot',
    'google-extended',
    'perplexitybot',
    'meta-externalagent',
    'meta-webindexer',
    'tiktokspider',
    'openai.com-bot',
    'google.bot',
    # Uncommon, but found in the access log file
    'thinkbot',
    'petalbot',
    # Not a bot, but treated as one because its behaviour is not allowed
    'securitytxtresearch',
)

# Generic token of the form "<name>.<com|org|net|io>-bot" or "<name>.<tld>.bot".
_BOT_DOMAIN_PATTERN = re.compile(r'[a-z0-9.-]+\.(?:com|org|net|io)[-.]bot')


def is_bot(user_agent: str) -> bool:
    """Detect whether a user-agent string belongs to a bot/crawler/spider.

    The check is case-insensitive and succeeds when the user-agent contains
    one of the known bot markers or a "domain.tld-bot" style token.

    Args:
        user_agent: Raw user-agent value. Anything that is not a non-empty
            string (``None``, ``NaN`` coming from pandas, numbers, ``""``)
            is treated as "not a bot" instead of raising.

    Returns:
        ``True`` if the user-agent looks like a bot, ``False`` otherwise.
    """
    # Missing values arrive from pandas as float NaN, which is truthy, so a
    # plain truthiness test is not enough to protect the .lower() call.
    if not isinstance(user_agent, str) or not user_agent:
        return False

    ua = user_agent.lower()

    if any(marker in ua for marker in _KNOWN_BOT_MARKERS):
        return True

    return _BOT_DOMAIN_PATTERN.search(ua) is not None

In [None]:
# Flag every request whose user-agent matches the bot heuristics.
df['is_bot'] = df['user_agent'].apply(is_bot)

# Preview the annotated frame together with the bot / non-bot split.
display(df.head(), df['is_bot'].value_counts())

# Boolean mask selecting the rows flagged as bots.
bot_mask = (df['is_bot'] == True)

# Mark those rows in the 'anomaly' column (rows outside the mask are not written).
df.loc[bot_mask, 'anomaly'] = 1


### Dataset interno: IPs


In [None]:


# Thresholds used by the per-IP heuristics below.
HIGH_RATE_FACTOR = 5        # "high rate" = more than 5x the mean requests per IP
ERROR_RATE_THRESHOLD = 0.4  # fraction of error statuses considered suspicious
BAN_SCORE_THRESHOLD = 2     # number of triggered heuristics required to ban an IP
IP_FEATURE_COLUMNS = ['anomaly_score_ip', 'is_banned']

# Per-IP aggregates: request volume, distinct requests and error share.
ip_stats = df.groupby('ip_client').agg(
    total_requests=('ip_client', 'count'),
    unique_urls=('request', 'nunique'),
    # Fraction of rows whose status, rendered as text, starts with '4' or '5'.
    error_rate=('status', lambda x: np.mean(x.astype(str).str.startswith(('4', '5')))),
).reset_index()

mean_requests = ip_stats['total_requests'].mean()
ip_stats['high_rate'] = ip_stats['total_requests'] > HIGH_RATE_FACTOR * mean_requests

# One point per triggered heuristic (high request rate, high error rate).
ip_stats['anomaly_score_ip'] = (
    ip_stats['high_rate'].astype(int)
    + (ip_stats['error_rate'] > ERROR_RATE_THRESHOLD).astype(int)
)
ip_stats['is_banned'] = ip_stats['anomaly_score_ip'] >= BAN_SCORE_THRESHOLD

# Drop copies of the feature columns left by a previous run of this cell.
# Without this, re-running makes merge() emit 'anomaly_score_ip_x'/'_y' and the
# plain 'anomaly_score_ip' column that later cells read would disappear.
df = (
    df.drop(columns=IP_FEATURE_COLUMNS, errors='ignore')
      .merge(ip_stats[['ip_client', *IP_FEATURE_COLUMNS]], on='ip_client', how='left')
)



### Clasificación heurística final

In [None]:

def classify(row):
    """Label one request row as 'Anomaly' or 'Normal' from its heuristic scores.

    Adds the row's 'anomaly_score_ip' and 'anomaly_score_ua' values (a missing
    key counts as 0) and returns 'Anomaly' when the total reaches 2.

    NOTE(review): no cell visible in this notebook creates an
    'anomaly_score_ua' column, so unless it is added elsewhere this term is
    always 0 and the user-agent check never influences the label. Confirm.

    Args:
        row: Mapping-like object exposing ``.get(key, default)``, e.g. a
            DataFrame row (Series) or a dict.

    Returns:
        'Anomaly' if the combined score is >= 2, otherwise 'Normal'.
    """
    score_fields = ('anomaly_score_ip', 'anomaly_score_ua')
    total = sum(row.get(field, 0) for field in score_fields)
    if total >= 2:
        return 'Anomaly'
    return 'Normal'

# Run the row-wise classifier on every request; axis=1 hands each row to
# classify() and the returned strings become the 'heuristic_label' column.
df['heuristic_label'] = df.apply(classify, axis=1)


### Limpieza de columnas y resultados

In [None]:

# Build the cleaned frame without the columns listed below; names that are
# not present in df are skipped silently because of errors='ignore'.
df_clean = df.drop(
    ['ip_client', 'user_agent', 'ident', 'referrer', 'is_bot'],
    axis='columns',
    errors='ignore',
)

# Distribution of the final heuristic labels.
print(df_clean['heuristic_label'].value_counts())
