In [None]:
import numpy as np 
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt 
import seaborn as sns 

# Run the cells of `01_access_log_parsing` first, or point this at any location that holds an access log file (Apache Combined Log Format)
log_file_path = '../data/target/access_log_master.log'

## Load Access Log File 

In [None]:
# Set to None to load every line, or to an integer to cap how many lines are read
n_samples = None

# Number of loaded lines echoed below as a quick sanity check
n_preview = 10

with open(log_file_path, 'r') as f:
  if n_samples is None:
    # Load every line of the file
    sample_lines = [line.strip() for line in f]
  else:
    # zip() stops at whichever iterable runs out first, so a file shorter
    # than n_samples simply yields fewer lines; range() is listed first so
    # no extra line is consumed from the file once the cap is reached
    sample_lines = [line.strip() for _, line in zip(range(n_samples), f)]

preview_lines = sample_lines[:n_preview]
# Pad the counter to the widest index actually printed. The previous width,
# len(str(n_samples)), evaluated to len("None") == 4 when no cap was set.
index_width = len(str(len(preview_lines)))

print(f"Sample log lines (Total: {len(sample_lines)}):")
for i, line in enumerate(preview_lines, 1):
  print(f"{i:>{index_width}}) {line}")

In [None]:
from parse import parse

# Apache Combined Log Format written as a `parse` template.
# NOTE(review): `{size:d}` only accepts digits; if the log writes "-" for
# responses without a body, those lines will land in conflicting_logs — confirm
# whether that is intended.
pattern = '{ip_client} {ident} {auth_user} [{timestamp}] "{method} {request} {protocol}" {status:d} {size:d} "{referrer}" "{agent}"'

parsed_logs = []        # one dict of named fields per line that matched
conflicting_logs = {}   # 1-based line number -> reason the line was rejected
with open(log_file_path, 'r') as f:
  for idx, line in enumerate(f, 1):
    try:
      parsed = parse(pattern, line.strip())
    except Exception as e:
      # Keep the pass best-effort: record the failure and move on
      conflicting_logs[idx] = str(e)
      continue

    # parse() signals "no match" by returning None rather than raising, so
    # check for it explicitly instead of tripping over `None.named`
    if parsed is None:
      conflicting_logs[idx] = "line does not match the expected log pattern"
      continue

    parsed_logs.append(parsed.named)

In [None]:
# Number of entries currently recorded in conflicting_logs
len(conflicting_logs)

In [None]:
# Echo only a short preview: printing every parsed record floods the cell
# output when the whole log has been loaded
n_preview = 10
preview_records = parsed_logs[:n_preview]
# Pad the counter to the widest index actually printed
index_width = len(str(len(preview_records)))
for i, record in enumerate(preview_records, 1):
  print(f"{i:>{index_width}}) {record}")

# Build the DataFrame from all parsed records and show its first rows
df_parse = pd.DataFrame(parsed_logs)
display(df_parse.head(5))

## Loading and Parsing an Access Log File Using `lars`

In [None]:
import re
import warnings
from lars.apache import ApacheSource, COMBINED, ApacheWarning

parsed_logs = []        # one flat dict per row yielded by lars
conflicting_logs = {}   # file line number -> error raised while flattening a row
problematic_logs = {}   # file line number -> details of the ApacheWarning for that line

# Read all raw lines first so offending entries can be shown next to their diagnostics
with open(log_file_path, 'r') as f:
  all_lines = f.readlines()

# record=True collects emitted warnings in `w` instead of printing them
with warnings.catch_warnings(record=True) as w:
  # "always": report every ApacheWarning, not only the first one per location
  warnings.simplefilter("always", ApacheWarning)

  with open(log_file_path) as f:
    with ApacheSource(f, log_format=COMBINED) as source:
      for i, row in enumerate(source, 1):
        try:
          record = {
            "remote_host" : row.remote_host,
            "ident" : row.ident,
            "remote_user" : row.remote_user,
            "time" : row.time,
            "request_method" : row.request.method,
            "request_url_scheme" : row.request.url.scheme,
            "request_url_netloc" : row.request.url.netloc,
            "request_url_path_str" : row.request.url.path_str,
            "request_url_params" : row.request.url.params,
            "request_url_query_str" : row.request.url.query_str,
            "request_url_fragment" : row.request.url.fragment,
            "request_protocol": row.request.protocol,
            "status" : row.status,
            "size" : row.size,
            "req_Referer" : row.req_Referer,
            "req_User_agent" : row.req_User_agent
          }
          parsed_logs.append(record)
        except Exception as e:
          # `i` counts rows yielded by lars, not lines of the file. Lines that
          # lars reports through an ApacheWarning are not yielded, so shift
          # the index by the warnings captured so far to get the file line.
          # NOTE(review): assumes lars emits exactly one ApacheWarning per
          # skipped line and reads the file lazily — confirm against lars.
          skipped = sum(1 for caught in w if issubclass(caught.category, ApacheWarning))
          line_num = i + skipped
          conflicting_logs[line_num] = {
            'error': str(e),
            'row_index': i,
            'line_content': all_lines[line_num - 1].strip() if line_num <= len(all_lines) else "no disponible"
          }

  # Turn the captured warnings into a per-line report
  for warning in w:
    if not issubclass(warning.category, ApacheWarning):
      continue
    msg = str(warning.message)
    # The line number is taken from the "Line N:" prefix of the message
    match = re.search(r'Line (\d+):', msg)
    if match is None:
      continue
    line_num = int(match.group(1))
    # Lower bound guards against 0 silently indexing the last line
    if 1 <= line_num <= len(all_lines):
      problematic_logs[line_num] = {
        'line_content': all_lines[line_num - 1].strip(),
        'warning_message': msg,
        'category': warning.category.__name__,
        'filename': warning.filename,
        'lineno': warning.lineno
      }

In [None]:
# Print the size of each diagnostics dict: problematic_logs first, then conflicting_logs
for issue_bucket in (problematic_logs, conflicting_logs):
  print(len(issue_bucket))

### TODO

- Análisis Exploratorio de Datos sobre los DataFrames
  - Distribución de Tiempo y Consulta (saber la densidad de consulta por día)
  - Distribución de `method`, `protocol`, `status`, `size`, `referer`, `user_agent`
  - 

## Exploratory Data Analysis on Parsed Access Log