In [None]:
import numpy as np 
import pandas as pd
from datetime import datetime
from typing import List, Callable
import matplotlib.pyplot as plt 
import seaborn as sns 

# Librerías para el procesamiento de access logs 
import re 
from parse import parse 
from lars.apache import ApacheSource, COMBINED, ApacheWarning

# Manejo de advertencias del sistema, usada para capturar las líneas que no pueden parsearse por problemas de lars (ApacheWarning)
import warnings

# Plot styling: matplotlib style sheet plus the seaborn colour palette
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')

# log_prep_path is the CSV loaded into `df` below; log_mlab_path is where the
# frame is written by the final cell once manual labeling is done.
log_prep_path = '../data/target/access_log_master.csv'
log_mlab_path = '../data/target/access_log_master_manual_labeling.csv'
df = pd.read_csv(log_prep_path)

In [None]:
# Rows whose 'anomaly' value is -1 are the ones still waiting for a label
labeled_data = df.loc[df['anomaly'].ne(-1)]
unlabeled_data = df.loc[df['anomaly'].eq(-1)]
display(df['anomaly'].value_counts())

### Etiquetado Manual Simple 
Etiquetar los datos de forma manual, indicando si son normales o presentan alguna anomalía. Sin embargo, este enfoque presenta un problema: debido a la gran cantidad de datos, y a que muchos de ellos presentan patrones semejantes, etiquetarlos todos uno por uno resultaría complicado y requeriría mucho tiempo.

In [None]:
import pandas as pd
from IPython.display import clear_output

def manual_labeling(unlabeled_data:pd.DataFrame, columns_selected:List[str], _max:int=20):
  for idx in unlabeled_data.index[:_max]:
    clear_output(wait=True) 
    data = unlabeled_data.loc[idx] 
    
    # Mostrar información del log no etiquetado
    max_len_columns_name = max([len(col_name) for col_name in columns_selected])
    for col_name in columns_selected:
      print(f"{col_name:>{max_len_columns_name}}: {data[col_name]}")
    
    option = input("Options (0: Normal, 1: Anomaly, q: Quit): ").strip().lower()
    
    match option:
      case "0":
        df.at[idx, "anomaly"] = 0
      case "1":
        df.at[idx, "anomaly"] = 1
      case "q":
        break 
  return df

# Review up to 100 unlabeled rows, showing these request fields for each one
df_manual = manual_labeling(
  unlabeled_data,
  _max=100,
  columns_selected=['ip_client', 'status', 'size', 'user_agent', 'method', 'url', 'protocol'],
)

In [None]:
display(df_manual.head(10))
# Copy every decided label (anything other than -1) from df_manual into df
idx_update = df_manual.loc[df_manual['anomaly'].ne(-1)].index
df.loc[idx_update, 'anomaly'] = df_manual['anomaly'].loc[idx_update]

### Etiquetado Manual por Lotes y Patrones Comunes

In [None]:
# Rows whose 'anomaly' value is -1 are the ones still waiting for a label
labeled_data = df.loc[df['anomaly'].ne(-1)]
unlabeled_data = df.loc[df['anomaly'].eq(-1)]
display(df['anomaly'].value_counts())

In [None]:
# Grouping key: one (method, url) pair per unlabeled row.
# 'status' and 'protocol' are deliberately left out of the key.
unlabeled_data = unlabeled_data.assign(
  n_tuple=lambda frame: list(zip(frame['method'], frame['url']))
)

In [None]:
# FULL TUPLES: Status, Method, URL, Protocol
def manual_labeling_by_tuples(
    unlabeled_df: pd.DataFrame, 
    columns_selected: List[str],
    tuple_col: str = "n_tuple",
    max_values: int = 5
  ):
  "Etiqueta n-tuplas formadas por status, method, url, protocol"
  # Contar frecuencia de n-tuplas
  tuple_counts = unlabeled_df[tuple_col].value_counts()
  labels_map = {}

  total = len(tuple_counts)
  
  for i, (tup, freq) in enumerate(tuple_counts.items(), 1):
    clear_output(wait=True) 
    
    subset = unlabeled_df[unlabeled_df[tuple_col] == tup]
    status, method, url, protocol = tup
    
    # Mostrar información
    print(f"  N-tupla {i}/{total}")
    print(f" Frecuencia: {freq} logs")
    print(f" Características")
    print(f"  - status:   {status}")
    print(f"  - method:   {method}")
    print(f"  - URL:      {url}")
    print(f"  - protocol: {protocol}")
    
    print("\nContexto Adicional:")
    for col in columns_selected:
      if col in ["status", "method", "url", "protocol", tuple_col]:
        continue
      
      print(f"\n {col}:")
      vc = subset[col].value_counts().head(max_values)

      for val, count in vc.items():
        print(f"  {val} -> {count}")
    
    option = input("Options (0: Normal, 1: Anomaly, q: Quit): ").strip().lower()
    
    match option:
      case "0":
        labels_map[tup] = 0
      case "1":
        labels_map[tup] = 1
      case "q":
        break

  return labels_map

# Label tuple groups, showing the 3 most common user agents as context
tuple_labels = manual_labeling_by_tuples(
  unlabeled_data, columns_selected=['user_agent'], tuple_col='n_tuple', max_values=3
)

In [None]:
# Partial tuples: the grouping key is only (method, url)
def manual_labeling_by_tuples(
    unlabeled_df: pd.DataFrame, 
    columns_selected: List[str],
    tuple_col: str = "n_tuple",
    max_values: int = 5
  ):
  """Interactively label groups of rows sharing the same (method, url) pair.

  Pairs found in `tuple_col` are presented from most to least frequent. For
  each pair, the most common values of the context columns are printed and
  the user answers "0" (normal), "1" (anomaly) or "q" (stop). Any other
  answer skips the pair.

  Args:
    unlabeled_df: frame holding `tuple_col` and the context columns.
    columns_selected: context columns to summarise; "method", "url" and
      `tuple_col` are ignored if listed.
    tuple_col: column containing the (method, url) pairs.
    max_values: number of most frequent values shown per context column.

  Returns:
    dict mapping each decided pair to 0 or 1.
  """
  counts_by_pair = unlabeled_df[tuple_col].value_counts()
  n_groups = len(counts_by_pair)
  hidden_cols = ("method", "url", tuple_col)
  decisions = {}

  for position, (pair, n_logs) in enumerate(counts_by_pair.items(), start=1):
    clear_output(wait=True)

    rows = unlabeled_df.loc[unlabeled_df[tuple_col] == pair]
    http_method, path = pair

    # Header describing the pair under review
    print(
      f"  N-tupla {position}/{n_groups}",
      f" Frecuencia: {n_logs} logs",
      f" Características",
      f"  - method:   {http_method}",
      f"  - URL:      {path}",
      "\nContexto Adicional:",
      sep="\n",
    )

    # Most common values of every context column within this group
    for col in (name for name in columns_selected if name not in hidden_cols):
      print(f"\n {col}:")
      top_values = rows[col].value_counts().iloc[:max_values]
      for val, count in top_values.items():
        print(f"  {val} -> {count}")

    answer = input("Options (0: Normal, 1: Anomaly, q: Quit): ").strip().lower()

    if answer == "q":
      break
    if answer in ("0", "1"):
      decisions[pair] = int(answer)

  return decisions

# Label (method, url) groups, showing the 3 most common protocols and statuses
tuple_labels = manual_labeling_by_tuples(
  unlabeled_data, columns_selected=['protocol', 'status'], tuple_col='n_tuple', max_values=3
)

In [None]:
# Inspect unlabeled requests to '/inicio/' whose 'size' exceeds 30000
filtering_data = unlabeled_data.loc[
  unlabeled_data['url'].eq('/inicio/') & unlabeled_data['size'].gt(30000)
]
display(filtering_data)

In [None]:
# Propagate each tuple-level decision to every unlabeled row sharing that tuple.
# The check on 'n_tuple' keeps the cell safe to re-run after the helper column
# has already been dropped.
if tuple_labels and 'n_tuple' in unlabeled_data.columns:
  temp_label = unlabeled_data['n_tuple'].map(tuple_labels)

  # Rows whose tuple received no decision map to NaN and stay untouched
  mask = temp_label.notna()
  # map() returns floats because of the NaNs; write the labels back as integers
  unlabeled_data.loc[mask, 'anomaly'] = temp_label[mask].astype(int)

  labeled_count = mask.sum()

  print("=" * 50)
  print("Etiquetado por tuplas")
  print("=" * 50)
  print(f"Registros etiquetados automáticamente: {labeled_count}")
  print("\nDistribución de etiquetas:")
  print(unlabeled_data.loc[mask, 'anomaly'].value_counts())

# Drop helper columns even when nothing was labeled, so they never reach df_manual
unlabeled_data = unlabeled_data.drop(columns=['n_tuple', 'temp_label'], errors='ignore')

df_manual = pd.concat([labeled_data, unlabeled_data]).sort_index()

# Copy every decided label (anything other than -1) into the master frame
idx_update = df_manual[df_manual['anomaly'] != -1].index
df.loc[idx_update, 'anomaly'] = df_manual.loc[idx_update, 'anomaly']

print("Distribución final en el dataset:")
display(df['anomaly'].value_counts())

### Etiquetado Manual por Lotes y Patrones Comunes (Inverse Freq) 

In [None]:
display(df['anomaly'].value_counts())
# Rows whose 'anomaly' value is -1 are the ones still waiting for a label
labeled_data = df.loc[df['anomaly'].ne(-1)]
unlabeled_data = df.loc[df['anomaly'].eq(-1)]
# Label distribution restricted to rows with status_category equal to 200
filtering_data = df.loc[df['status_category'].eq(200)]
display(filtering_data['anomaly'].value_counts())

In [None]:
# From here on only rows with status_category equal to 200 are considered
filtering_data = df.loc[df['status_category'].eq(200)]
labeled_data = filtering_data.loc[filtering_data['anomaly'].ne(-1)]
unlabeled_data = filtering_data.loc[filtering_data['anomaly'].eq(-1)]

In [None]:
# Grouping key: one (method, url) pair per unlabeled row.
# 'status' and 'protocol' are deliberately left out of the key.
unlabeled_data = unlabeled_data.assign(
  n_tuple=lambda frame: list(zip(frame['method'], frame['url']))
)

In [None]:
def manual_labeling_by_tuples(
    unlabeled_df: pd.DataFrame,
    columns_selected: List[str],
    tuple_col: str = "n_tuple",
    max_values: int = 5,
    reverse_freq: bool = True  # Nuevo parámetro
  ):
  "Etiqueta n-tuplas en orden inverso de frecuencia (menos frecuentes primero)"
  # Contar frecuencia de n-tuplas
  tuple_counts = unlabeled_df[tuple_col].value_counts()

  # Ordenar por frecuencia: ascendente (menos frecuentes primero) si reverse_freq=True
  if reverse_freq:
    tuple_counts = tuple_counts.iloc[::-1]

  labels_map = {}
  total = len(tuple_counts)

  for i, (tup, freq) in enumerate(tuple_counts.items(), 1):
    clear_output(wait=True)

    subset = unlabeled_df[unlabeled_df[tuple_col] == tup]
    method, url = tup

    # Mostrar información con color coding
    print(f" N-tupla {i}/{total}")
    print(f" Frecuencia: {freq} logs")
    print(
        f" Percentil: {((i-1)/total*100):.1f}% (más {'frecuente' if not reverse_freq else 'rara'})")
    print(f" Características:")
    print(f"  - method:   {method}")
    print(f"  - URL:      {url}")

    print("\nContexto Adicional:")
    for col in columns_selected:
      if col in ["method", "url", tuple_col]:
        continue

      print(f"\n {col}:")
      vc = subset[col].value_counts().head(max_values)

      if len(vc) > 0:
        for val, count in vc.items():
          print(f"  {val} -> {count}")
      else:
        print("  (sin valores únicos)")

    # Opciones mejoradas
    print("\n" + "="*50)
    option = input(
        "Options (0: Normal, 1: Anomaly, b: Batch, s: Skip, q: Quit): ").strip().lower()

    match option:
      case "0":
        labels_map[tup] = 0
      case "1":
        labels_map[tup] = 1
      case "b":  # Modo batch
        batch_labels = input(
            "Etiquetar batch (ej: '0 0 1 0' para 4 siguientes): ").split()
        for j, batch_label in enumerate(batch_labels):
          if i + j <= total:
            next_tup = tuple_counts.index[i + j - 1]
            labels_map[next_tup] = int(batch_label)
        i += len(batch_labels) - 1
      case "s":
        continue
      case "q":
        break

  return labels_map

# Label (method, url) groups, showing the 3 most common protocols and statuses
tuple_labels = manual_labeling_by_tuples(
  unlabeled_data, columns_selected=['protocol', 'status'], tuple_col='n_tuple', max_values=3
)

In [None]:
# Propagate each tuple-level decision to every unlabeled row sharing that tuple.
# The check on 'n_tuple' keeps the cell safe to re-run after the helper column
# has already been dropped.
if tuple_labels and 'n_tuple' in unlabeled_data.columns:
  temp_label = unlabeled_data['n_tuple'].map(tuple_labels)

  # Rows whose tuple received no decision map to NaN and stay untouched
  mask = temp_label.notna()
  # map() returns floats because of the NaNs; write the labels back as integers
  unlabeled_data.loc[mask, 'anomaly'] = temp_label[mask].astype(int)

  labeled_count = mask.sum()

  print("=" * 50)
  print("Etiquetado por tuplas")
  print("=" * 50)
  print(f"Registros etiquetados automáticamente: {labeled_count}")
  print("\nDistribución de etiquetas:")
  print(unlabeled_data.loc[mask, 'anomaly'].value_counts())

# Drop helper columns even when nothing was labeled, so they never reach df_manual
unlabeled_data = unlabeled_data.drop(columns=['n_tuple', 'temp_label'], errors='ignore')

df_manual = pd.concat([labeled_data, unlabeled_data]).sort_index()

# Copy every decided label (anything other than -1) into the master frame
idx_update = df_manual[df_manual['anomaly'] != -1].index
df.loc[idx_update, 'anomaly'] = df_manual.loc[idx_update, 'anomaly']

print("Distribución final en el dataset:")
display(df['anomaly'].value_counts())

### Guardar Dataset

In [None]:
# Write the labeled frame to log_mlab_path; index=False leaves the row index out of the CSV
df.to_csv(log_mlab_path, index=False)