In [None]:
import numpy as np 
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt 
import seaborn as sns 

# Librerías para el procesamiento de access logs 
import re 
from parse import parse 
from lars.apache import ApacheSource, COMBINED, ApacheWarning

# Manejo de advertencias del sistema, usada para capturar las líneas que no pueden parsearse por problemas de lars (ApacheWarning)
import warnings

# Global plot styling: matplotlib style sheet plus the seaborn colour palette
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')

# Relative path of the access log that the cells below open and parse
log_file_path = '../data/test_dataset/WebServerAccessLog/access.log'

In [None]:
# Set to None to load every line, or to an integer to cap how many lines are read
n_samples = None

with open(log_file_path, 'r') as f:
  if n_samples is not None:
    # zip() stops at whichever runs out first: the cap or the end of the file
    sample_lines = [line.strip() for _, line in zip(range(n_samples), f)]
  else:
    # Load every line
    sample_lines = [line.strip() for line in f]

print(f"Sample log lines (Total: {len(sample_lines)}):")

# Only the first 10 lines are shown. Pad the counter to the widest index that is
# actually printed; deriving it from n_samples gave a width of len("None") == 4
# whenever the cap was disabled.
preview_lines = sample_lines[:10]
index_width = len(str(len(preview_lines)))
for i, line in enumerate(preview_lines, 1):
  print(f"{i:>{index_width}}) {line}")

In [None]:
# Accumulators for this cell: parsed rows (list), rows whose field extraction
# raised (dict), and lines reported through ApacheWarning (dict)
parsed_logs, conflicting_logs, problematic_logs = [], {}, {}

# Keep the raw lines in memory so a line's text can be looked up by its number
with open(log_file_path, 'r') as f:
  all_lines = list(f)

# (output key, attribute name) pairs: which attributes of the request URL object
# are copied into each record, and under which key
url_fields = [
  ('request_url_scheme', 'scheme'),
  ('request_url_netloc', 'netloc'),
  ('request_url_path_str', 'path_str'),
  ('request_url_params', 'params'),
  ('request_url_query_str', 'query_str'),
  ('request_url_fragment', 'fragment')
]

def extract_url_fields(url_obj):
  """Copy the attributes listed in `url_fields` from `url_obj` into a dict.

  Returns a dict with one entry per `url_fields` pair, keyed by the output
  key. A value is None when the attribute is missing, falsy (e.g. an empty
  string), or when reading it raises.
  """
  extracted = {}
  for output_key, attribute in url_fields:
    try:
      # `or None` collapses every falsy value (empty string, 0, ...) to None
      found = getattr(url_obj, attribute, None) or None
    except Exception:
      # Best effort: an attribute that fails to evaluate is treated as absent
      found = None
    extracted[output_key] = found
  return extracted

# Parse the access log with lars while recording every warning raised during
# iteration; the recorded ApacheWarning entries become problematic_logs below.
with warnings.catch_warnings(record=True) as w:
  warnings.simplefilter("always", ApacheWarning)

  with open(log_file_path) as f:
    with ApacheSource(f, log_format=COMBINED) as source:
      for i, row in enumerate(source, 1):
        try:
          # Resolve the request once; None when the row has no usable request
          request = getattr(row, 'request', None)

          record = {
            "remote_host": row.remote_host,
            "ident": row.ident,
            "remote_user": row.remote_user,
            "time": row.time,
            "request_method": request.method if request else None,
            "request_protocol": request.protocol if request else None,
            "status": row.status,
            "size": row.size,
            "req_Referer": row.req_Referer,
            "req_User_agent": row.req_User_agent,
          }

          if request and request.url:
            # Extract the URL components when the request carries a URL
            record.update(extract_url_fields(request.url))
          else:
            # No URL: still emit every URL column so all records share one schema
            record.update({field_name: None for field_name, _ in url_fields})

          parsed_logs.append(record)
        except Exception as e:
          # `i` counts the rows yielded by lars, not the lines of the file: lines
          # reported through ApacheWarning are never yielded, so indexing
          # all_lines with `i` drifts after the first of them. Offset by the
          # warnings recorded so far to recover the file line number.
          # NOTE(review): assumes lars emits exactly one ApacheWarning per
          # skipped line -- confirm against the lars version in use.
          skipped_so_far = sum(
            1 for caught in w if issubclass(caught.category, ApacheWarning)
          )
          line_num = i + skipped_so_far
          conflicting_logs[line_num] = {
            'error': str(e),
            'line_content': all_lines[line_num - 1].strip() if line_num <= len(all_lines) else "no disponible"
          }

  # Turn each recorded ApacheWarning into an entry keyed by the line number
  # quoted in its message ("Line <n>: ...")
  for warning in w:
    if not issubclass(warning.category, ApacheWarning):
      continue
    msg = str(warning.message)
    match = re.search(r'Line (\d+):', msg)
    if match:
      line_num = int(match.group(1))
      if line_num <= len(all_lines):
        problematic_logs[line_num] = {
          'line_content': all_lines[line_num - 1].strip(),
          'warning_message': msg,
          'category': warning.category.__name__,
        }

total_problematic_logs = len(problematic_logs)
total_conflicting_logs = len(conflicting_logs)
total_parsed_logs = len(parsed_logs)
total_logs = total_parsed_logs + total_problematic_logs + total_conflicting_logs

# Summary: absolute count and share of each outcome (share is 0 for an empty log)
print(f"Total de logs procesados: {total_logs}")
for label, count in (
  ("Problematic Logs", total_problematic_logs),
  ("Conflicting Logs", total_conflicting_logs),
  ("Parsed Logs", total_parsed_logs),
):
  share = count / total_logs * 100 if total_logs != 0 else 0
  print(f"{label}: {count} ({share:.2f}%)")

**Problema**: Como son muchos datos, la carga de los logs presenta una alta complejidad.

In [None]:
# Tabular view of the parsed rows; show the first and the last five of them
df_parsed = pd.DataFrame(parsed_logs)
display(df_parsed.head(5), df_parsed.tail(5))

### Análisis de Logs Problemáticos (`problematic_logs`)

In [None]:
# Largest line number among the problematic logs, used to right-align the
# listing. default=0 keeps the cell runnable when there are none (max() of an
# empty dict raises ValueError otherwise).
max_idx = max(problematic_logs, default=0)

logs = list(problematic_logs.items())

print(f"Número Total de Logs: {len(problematic_logs)}")
idx_width = len(str(max_idx))
for idx, details in logs:
  line_content = details['line_content']
  print(f"{idx:>{idx_width}}) {line_content}")

In [None]:
# Second-chance parsing of the lines reported as problematic, using a `parse`
# template (`parse` is imported in the first cell).
# NOTE(review): `{status:d}` and `{size:d}` only match integers, so a line whose
# size is logged as "-" will not match -- confirm whether such lines occur.
pattern = '{ip_client} {ident} {auth_user} [{timestamp}] {request_http} {status:d} {size:d} "{referrer}" "{agent}"'

problematic_parsed_logs = []
for idx, details in problematic_logs.items():
  line_content = details['line_content']
  try:
    parsed = parse(pattern, line_content.strip())
    if parsed is None:
      # parse() signals "no match" by returning None rather than raising
      raise ValueError("line does not match the fallback pattern")
    problematic_parsed_logs.append(parsed.named)
  except Exception as e:
    print(f"Line {idx}: {line_content}")
    print(f"Error: {e}")

df_problematic = pd.DataFrame(problematic_parsed_logs)
if 'status' in df_problematic.columns:
  status_distribution = df_problematic['status'].value_counts().sort_index()
else:
  # Nothing could be re-parsed: the frame has no columns, so report an empty
  # distribution instead of failing with KeyError
  status_distribution = pd.Series(dtype='int64')
display(df_problematic.shape[0])
display(status_distribution)