In [None]:
import numpy as np 
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt 
import seaborn as sns 

# Librerías para el procesamiento de access logs 
import re 
from parse import parse 
from lars.apache import ApacheSource, COMBINED, ApacheWarning

# Manejo de advertencias del sistema, usada para capturar las líneas que no pueden parsearse por problemas de lars (ApacheWarning)
import warnings

# Path to the preprocessed access-log CSV used as input for this notebook
log_prep_path = '../data/target/access_log_master.csv'

# Style configuration for the plots
sns.set_palette('husl')
plt.style.use('seaborn-v0_8-darkgrid')

In [None]:
# Load the preprocessed access-log table and preview its first five rows
df = pd.read_csv(filepath_or_buffer=log_prep_path)
df.head()

In [None]:
# Count missing values per column before building the feature matrix
display(df.isna().sum())

In [None]:
# Columns of `df` used as inputs for the anomaly detector (order is preserved
# when the feature matrix is built).
features = [
    'size', 'status_category',
    'url_length', 'url_depth',
    'n_encoded_chars', 'n_special_chars',
]

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA 

from sklearn.ensemble import IsolationForest

In [None]:
# Standardize the selected feature columns (zero mean, unit variance) so that
# no single feature dominates because of its scale.
scaler = StandardScaler()
X = df.loc[:, features]
X_scaled = scaler.fit_transform(X)

In [None]:
# Fit a seeded Isolation Forest on the scaled features; contamination=0.01
# tells the model to flag roughly 1% of the rows as outliers.
# NOTE: despite its name, the 'anomaly_score' column stores the predicted
# labels (1 = inlier, -1 = outlier), not a continuous score.
clf = IsolationForest(contamination=0.01, random_state=42)
df['anomaly_score'] = clf.fit(X_scaled).predict(X_scaled)

In [None]:
# How many rows received each predicted label
df.loc[:, 'anomaly_score'].value_counts()

In [None]:
# Spot-check a random subset of the predicted labels.
# - n is capped at the number of rows: sampling without replacement raises a
#   ValueError when more rows are requested than exist.
# - random_state is fixed so a fresh "Restart & Run All" shows the same rows.
samples = df['anomaly_score'].sample(n=min(50, len(df)), random_state=42)
display(samples)

In [None]:
# Recode the predicted labels into a 0/1 flag: 1 (inlier) -> 0, -1 (outlier) -> 1.
# Any other value is left unchanged.
df['is_anomaly'] = df['anomaly_score'].replace({1: 0, -1: 1})

In [None]:
# Project the scaled features onto two principal components, for plotting only.
# random_state is fixed (matching the seed used for the Isolation Forest) so the
# projection is reproducible even when scikit-learn's automatic solver choice
# falls back to the randomized SVD solver.
pca = PCA(n_components=2, random_state=42)
res = pca.fit_transform(X_scaled)

# Scatter the projection, colored by the 0/1 anomaly flag.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(res[:, 0], res[:, 1], c=df['is_anomaly'], cmap='coolwarm', alpha=0.6)
ax.set_title("Anomaly Detection in Web Logs (PCA Projection)")
ax.set_xlabel("PCA 1")
ax.set_ylabel("PCA 2")
plt.show()