In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns 

# Para evitar warnings de convergencia
import warnings
from sklearn.exceptions import ConvergenceWarning

# Agregamos importaciones para validación cruzada y fine-tuning
from sklearn.model_selection import (
  train_test_split,
  StratifiedKFold,          # Validación cruzada estratificada
  GridSearchCV,
  RandomizedSearchCV,       # Para optimización de hiperparámetros
  cross_val_score,
  cross_val_predict
)
from sklearn.preprocessing import (
  StandardScaler,
  RobustScaler,
  LabelEncoder
)
from sklearn.tree import (
  DecisionTreeClassifier
)
from sklearn.ensemble import (
  RandomForestClassifier, 
  IsolationForest,
  GradientBoostingClassifier, 
  AdaBoostClassifier, 
  ExtraTreesClassifier
)
from sklearn.linear_model import (
  LogisticRegression
)
from sklearn.neighbors import (
  KNeighborsClassifier
)
from sklearn.naive_bayes import (
  GaussianNB
)
from xgboost import (
  XGBClassifier
)
from lightgbm import (
  LGBMClassifier
)
from sklearn.svm import (
  SVC
)

from sklearn.discriminant_analysis import (
  LinearDiscriminantAnalysis, 
  QuadraticDiscriminantAnalysis
)
from sklearn.neural_network import (
  MLPClassifier
)

from sklearn.metrics import (
  classification_report, 
  confusion_matrix,
  roc_auc_score,
  roc_curve,
  precision_recall_curve,
  f1_score,
  recall_score, 
  precision_score,
  accuracy_score,
  precision_recall_fscore_support   # Métricas adicionales
)
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

# Para optimización de hiperparámetros
from scipy.stats import randint, uniform, loguniform

# Global plot styling: apply the matplotlib style sheet, then set seaborn's
# default colour palette. Both calls modify global plotting state.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')

### Carga y Exploración del Dataset Combinado

Se entrenará un modelo sobre el dataset combinado de **Access-Log-Master** y **CSIC2010**.

In [None]:
# Read the combined dataset from disk.
dataset_combined_path = '../data/combined/dataset_combined_master_csic2010.csv'
df_combined = pd.read_csv(filepath_or_buffer=dataset_combined_path)

# Preview the first three rows, then the count of each `anomaly` label value.
display(
  df_combined.head(n=3),
  df_combined['anomaly'].value_counts(),
)

### Codificación de Variables

Solo es necesario codificar `method`, ya que `anomaly` ya viene codificada como $0$ (Normal) y $1$ (Anomalía).

In [None]:
# Learn the integer codes for the `method` column, then store the encoded
# values alongside the original column.
encoder = LabelEncoder().fit(df_combined['method'])
df_combined['method_encoded'] = encoder.transform(df_combined['method'])

### Preparación de Datos para Modelado

In [None]:
# Numeric URL-derived features (these are the columns that get scaled later).
num_feat = [
  # keyword counters
  'url__count_sql_words', 'url__count_xss_words', 'url__count_command_words',
  'url__count_auth_words', 'url__count_error_words', 'url__count_malware_words',
  'url__count_danger_characters', 'url__count_obfuscation_code_words',
  'url__count_dir_words',
  # symbol counters
  'url__count_dot', 'url__count_http', 'url__count_percentage_symbol',
  'url__count_question_symbol', 'url__count_hyphen', 'url__count_equal',
  # length / composition
  'url__url_length', 'url__digit_count', 'url__letter_count',
  'url__count_special_characters', 'url__is_encoded',
  'url__unusual_character_ratio',
]
# Categorical features (already integer-encoded).
cat_feat = ['method_encoded']

# Feature matrix (numeric + categorical), the categorical slice on its own,
# and the binary target.
X = df_combined.loc[:, num_feat + cat_feat]
X_cat = df_combined.loc[:, cat_feat]
y = df_combined.loc[:, 'anomaly']

### Escalado de Datos

In [None]:
#scaler = StandardScaler()
#X_scaled = scaler.fit_transform(X[num_feat])
#X_scaled_df = pd.DataFrame(X_scaled, columns=num_feat, index=X.index)

In [None]:
# Scale the numeric features with RobustScaler.
# The raw values are read from df_combined rather than from X: X is rebound
# to the scaled frame in a later cell, so fitting on X would re-scale
# already-scaled data whenever this cell is re-run.
# NOTE(review): the scaler is fitted on the full dataset before any
# cross-validation split, so validation folds contribute to the scaling
# statistics — consider moving the scaler into the CV pipeline.
scaler = RobustScaler()
X_scaled = scaler.fit_transform(df_combined[num_feat])
X_scaled_df = pd.DataFrame(X_scaled, columns=num_feat, index=df_combined.index)

In [None]:
# Build the final feature matrix: scaled numeric columns plus the (unscaled)
# encoded categorical columns. Work on a copy so that X_scaled_df keeps
# holding only the scaled numeric features and is not mutated through X.
X = X_scaled_df.copy()
X[cat_feat] = X_cat[cat_feat]
display(X.head())

### Función de Evaluación y Entrenamiento de Modelos

In [None]:
def evaluate_model_with_scaling(model, X, y, cv=5, random_state=42):
  """Evaluate a classifier with SMOTE oversampling under stratified k-fold CV.

  Despite its name, this function does not scale anything: ``X`` is used as
  given, so any scaling must already have been applied by the caller. For
  each fold, a ``SMOTE -> classifier`` pipeline is fitted on the training
  split and scored on the validation split (the pipeline applies SMOTE only
  while fitting).

  Args:
    model: Estimator with ``fit``/``predict`` and, optionally,
      ``predict_proba``. It is fitted in place on every fold, so after the
      call it holds whatever was learned on the last fold's training split.
    X: Feature DataFrame; rows are selected positionally via ``.iloc``.
    y: Label Series; rows are selected positionally via ``.iloc``. Label 1
      is treated as the positive class.
    cv: Number of stratified folds.
    random_state: Seed used for both the fold shuffling and SMOTE.

  Returns:
    Tuple ``(cv_metrics, global_metrics)``.
    ``cv_metrics`` maps accuracy/auc/precision/recall/f1 to a
    ``"mean ± std"`` string over the folds (``"N/A"`` for auc when no
    probabilities were available).
    ``global_metrics`` maps accuracy/auc/precision/recall/f1/confusion_matrix
    to values computed once over the concatenated out-of-fold predictions
    (auc is ``None`` when no probabilities were available).
  """
  skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=random_state)

  # The same pipeline object is re-fitted from scratch on each fold.
  pipeline = ImbPipeline([
    ('smote', SMOTE(random_state=random_state)),
    ('classifier', model)
  ])

  accuracies, aucs, precisions, recalls, f1_scores = [], [], [], [], []
  y_true_all, y_pred_all, y_pred_proba_all = [], [], []

  for train_idx, val_idx in skf.split(X, y):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # Fit on the training split, silencing solver convergence warnings only.
    with warnings.catch_warnings():
      warnings.filterwarnings("ignore", category=ConvergenceWarning)
      pipeline.fit(X_train, y_train)

    # Hard predictions, plus positive-class probabilities when supported.
    y_pred = pipeline.predict(X_val)
    y_pred_proba = pipeline.predict_proba(X_val)[:, 1] if hasattr(model, "predict_proba") else None

    # Per-fold metrics
    accuracies.append(accuracy_score(y_val, y_pred))
    if y_pred_proba is not None:
      aucs.append(roc_auc_score(y_val, y_pred_proba))

    precision, recall, f1, _ = precision_recall_fscore_support(
      y_val, y_pred, average='binary', pos_label=1, zero_division=0
    )
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)

    # Accumulate out-of-fold predictions for the global metrics
    y_true_all.extend(y_val)
    y_pred_all.extend(y_pred)
    if y_pred_proba is not None:
      y_pred_proba_all.extend(y_pred_proba)

  # Mean ± std across folds
  cv_metrics = {
    'accuracy': f"{np.mean(accuracies):.4f} ± {np.std(accuracies):.4f}",
    'auc': f"{np.mean(aucs):.4f} ± {np.std(aucs):.4f}" if aucs else "N/A",
    'precision': f"{np.mean(precisions):.4f} ± {np.std(precisions):.4f}",
    'recall': f"{np.mean(recalls):.4f} ± {np.std(recalls):.4f}",
    'f1': f"{np.mean(f1_scores):.4f} ± {np.std(f1_scores):.4f}"
  }

  # Global metrics over all out-of-fold predictions. Precision, recall and F1
  # come from a single call instead of three identical ones.
  global_precision, global_recall, global_f1, _ = precision_recall_fscore_support(
    y_true_all, y_pred_all, average='binary', pos_label=1, zero_division=0
  )
  global_metrics = {
    'accuracy': accuracy_score(y_true_all, y_pred_all),
    'auc': roc_auc_score(y_true_all, y_pred_proba_all) if y_pred_proba_all else None,
    'precision': global_precision,
    'recall': global_recall,
    'f1': global_f1,
    'confusion_matrix': confusion_matrix(y_true_all, y_pred_all)
  }

  return cv_metrics, global_metrics

In [None]:
# Maps each model name to its cross-validation and global metrics.
results = dict()

#### Regresión Logística

In [None]:
# Baseline logistic regression: L-BFGS solver, C=0.1, balanced class weights.
model_lr = LogisticRegression(
  solver='lbfgs',
  C=0.1,
  class_weight='balanced',
  max_iter=1000,
  random_state=42,
  verbose=0
)

# Stratified 5-fold cross-validation; store both metric sets under the model name.
cv_metrics, global_metrics = evaluate_model_with_scaling(model_lr, X, y, cv=5)
results['LogisticRegression'] = {'cv': cv_metrics, 'global': global_metrics}

# Per-fold summary, one line per metric.
print("Métricas CV (promedio ± std)")
print(
  *(f"- {metric.capitalize()}: {value}" for metric, value in cv_metrics.items()),
  sep="\n"
)

# Metrics over the concatenated out-of-fold predictions.
print(f"""Métricas totales (concatenado de todos los folds):
- Accuracy:  {global_metrics['accuracy']:.4f}
- Precision: {global_metrics['precision']:.4f}
- Recall:    {global_metrics['recall']:.4f}
- F1:        {global_metrics['f1']:.4f}
""")
# AUC is not printed for this model.

#### Clasificador Random Forest

In [None]:
# Baseline random forest: 100 unbounded-depth trees, with class weights
# recomputed on each bootstrap sample ('balanced_subsample').
model_rfc = RandomForestClassifier(
  n_estimators=100,
  max_depth=None,
  min_samples_split=5,
  min_samples_leaf=2,
  class_weight='balanced_subsample',
  random_state=42
)

# Stratified 5-fold cross-validation; store both metric sets under the model name.
cv_metrics, global_metrics = evaluate_model_with_scaling(model_rfc, X, y, cv=5)
results['RandomForestClassifier'] = {'cv': cv_metrics, 'global': global_metrics}

# Per-fold summary, one line per metric.
print("Métricas CV (promedio ± std)")
print(
  *(f"- {metric.capitalize()}: {value}" for metric, value in cv_metrics.items()),
  sep="\n"
)

# Metrics over the concatenated out-of-fold predictions, followed by the AUC.
print(f"""Métricas totales (concatenado de todos los folds):
- Accuracy:  {global_metrics['accuracy']:.4f}
- Precision: {global_metrics['precision']:.4f}
- Recall:    {global_metrics['recall']:.4f}
- F1:        {global_metrics['f1']:.4f}
""")
print(f"- AUC: {global_metrics['auc']:.4f} ")

#### Clasificador XGBoost

In [None]:
# Baseline XGBoost classifier. scale_pos_weight is the negative/positive
# ratio of the full label vector, used to balance the classes.
# NOTE(review): the evaluation helper also oversamples with SMOTE, so the
# positive class may end up being compensated twice — confirm this is intended.
model_xgboost = XGBClassifier(
  n_estimators=200,
  max_depth=6,
  learning_rate=0.1,
  subsample=0.8,
  colsample_bytree=0.8,
  scale_pos_weight=len(y[y == 0]) / len(y[y == 1]),
  eval_metric='logloss',
  random_state=42
)

# Stratified 5-fold cross-validation; store both metric sets under the model name.
cv_metrics, global_metrics = evaluate_model_with_scaling(model_xgboost, X, y, cv=5)
results['XGBClassifier'] = {'cv': cv_metrics, 'global': global_metrics}

# Per-fold summary, one line per metric.
print("Métricas CV (promedio ± std)")
print(
  *(f"- {metric.capitalize()}: {value}" for metric, value in cv_metrics.items()),
  sep="\n"
)

# Metrics over the concatenated out-of-fold predictions, followed by the AUC.
print(f"""Métricas totales (concatenado de todos los folds):
- Accuracy:  {global_metrics['accuracy']:.4f}
- Precision: {global_metrics['precision']:.4f}
- Recall:    {global_metrics['recall']:.4f}
- F1:        {global_metrics['f1']:.4f}
""")
print(f"- AUC: {global_metrics['auc']:.4f} ")

#### Gradient Boosting


In [None]:
# Baseline gradient boosting: 200 depth-5 trees, each fitted on an 80% row
# subsample and considering sqrt(n_features) candidates per split.
model_grad_boosting = GradientBoostingClassifier(
  n_estimators=200,
  learning_rate=0.1,
  max_depth=5,
  max_features='sqrt',
  subsample=0.8,
  min_samples_split=10,
  min_samples_leaf=5,
  random_state=42
)

# Stratified 5-fold cross-validation; store both metric sets under the model name.
cv_metrics, global_metrics = evaluate_model_with_scaling(model_grad_boosting, X, y, cv=5)
results['GradientBoosting'] = {'cv': cv_metrics, 'global': global_metrics}

# Per-fold summary, one line per metric.
print("Métricas CV (promedio ± std)")
print(
  *(f"- {metric.capitalize()}: {value}" for metric, value in cv_metrics.items()),
  sep="\n"
)

# Metrics over the concatenated out-of-fold predictions, followed by the AUC.
print(f"""Métricas totales (concatenado de todos los folds):
- Accuracy:  {global_metrics['accuracy']:.4f}
- Precision: {global_metrics['precision']:.4f}
- Recall:    {global_metrics['recall']:.4f}
- F1:        {global_metrics['f1']:.4f}
""")
print(f"- AUC: {global_metrics['auc']:.4f} ")

#### Ada Boost

#### Decision Tree

#### SVM 

In [None]:
# Baseline linear-kernel SVM with balanced class weights. probability=True is
# required so the helper can obtain class probabilities for the AUC; the
# solver is capped at 1000 iterations.
model_svm = SVC(
  kernel='linear',
  C=1.0,
  class_weight='balanced',
  probability=True,
  max_iter=1000,
  random_state=42,
  verbose=0
)

# Stratified 5-fold cross-validation; store both metric sets under the model name.
cv_metrics, global_metrics = evaluate_model_with_scaling(model_svm, X, y, cv=5)
results['SVM'] = {'cv': cv_metrics, 'global': global_metrics}

# Per-fold summary, one line per metric.
print("Métricas CV (promedio ± std)")
print(
  *(f"- {metric.capitalize()}: {value}" for metric, value in cv_metrics.items()),
  sep="\n"
)

# Metrics over the concatenated out-of-fold predictions, followed by the AUC.
print(f"""Métricas totales (concatenado de todos los folds):
- Accuracy:  {global_metrics['accuracy']:.4f}
- Precision: {global_metrics['precision']:.4f}
- Recall:    {global_metrics['recall']:.4f}
- F1:        {global_metrics['f1']:.4f}
""")
print(f"- AUC: {global_metrics['auc']:.4f} ")


#### KNN

In [None]:
# k-NN classifier: 5 neighbours, distance-weighted votes, Minkowski p=2,
# using all available cores. This cell only defines the model; it is not
# evaluated here.
model_knn = KNeighborsClassifier(
  n_neighbors=5,
  weights='distance',
  p=2,
  algorithm='auto',
  leaf_size=30,
  n_jobs=-1
)



#### Naive Bayes 



#### MLPClassifier

### Fine-Tuning con RandomizedSearchCV

#### Optimización de Regresión Logística

In [None]:
# Search space as two sub-spaces: one with an L2 penalty (C sampled
# log-uniformly over [1e-3, 1e2]) and one with no penalty (no C entry).
lr_param_dist = [
  {  # penalty='l2'
    'classifier__penalty': ['l2'],
    'classifier__C': loguniform(1e-3, 1e2),
    'classifier__solver': ['lbfgs', 'newton-cg', 'sag', 'saga'],
    'classifier__max_iter': [2000, 3000, 5000],
    'classifier__class_weight': [None, 'balanced']
  },
  {  # penalty=None (no regularisation)
    'classifier__penalty': [None],
    'classifier__solver': ['lbfgs', 'newton-cg', 'sag', 'saga'],
    'classifier__max_iter': [2000, 3000, 5000],
    'classifier__class_weight': [None, 'balanced']
  }
]

# SMOTE followed by the classifier, so oversampling happens inside each CV fit.
lr_pipeline = ImbPipeline(steps=[
  ('smote', SMOTE(random_state=42)),
  ('classifier', LogisticRegression(random_state=42))
])

# 20 sampled configurations scored by F1 on 3 stratified folds (3 folds for
# speed); the best configuration is refitted on all the data.
lr_random = RandomizedSearchCV(
  estimator=lr_pipeline,
  param_distributions=lr_param_dist,
  n_iter=20,
  scoring='f1',
  cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=42),
  refit=True,
  random_state=42,
  n_jobs=-1,
  verbose=0
)

# Run the search with convergence and user warnings filtered out.
with warnings.catch_warnings():
  warnings.filterwarnings("ignore", category=ConvergenceWarning)
  warnings.filterwarnings("ignore", category=UserWarning)
  lr_random.fit(X, y)

print(f"""Resultados del Fine-Tuning de Hiperparámetros de Regresión Logística (Optimización de Regresión Logística)
Mejores Parámetros LR:  {lr_random.best_params_}
Mejor F1-Score:         {lr_random.best_score_:.4f}
""")

#### Optimización del Clasificador Random Forest

In [None]:
# Random-forest search space: integer ranges are sampled with randint, the
# remaining hyperparameters are drawn from the listed options.
rf_param_dist = {
  'classifier__n_estimators': randint(100, 500),
  'classifier__max_depth': [10, 20, 30, None],
  'classifier__min_samples_split': randint(2, 20),
  'classifier__min_samples_leaf': randint(1, 10),
  'classifier__max_features': ['sqrt', 'log2', None],
  'classifier__class_weight': [None, 'balanced', 'balanced_subsample']
}

# SMOTE followed by the classifier, so oversampling happens inside each CV fit.
rf_pipeline = ImbPipeline([
  ('smote', SMOTE(random_state=42)),
  ('classifier', RandomForestClassifier(random_state=42))
])

# 15 sampled configurations scored by F1 on 3 stratified folds; the best
# configuration is refitted on all the data.
rf_random = RandomizedSearchCV(
  rf_pipeline,
  param_distributions=rf_param_dist,
  n_iter=15,
  cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=42),
  scoring='f1',
  refit=True,
  n_jobs=-1,
  verbose=0,
  random_state=42
)

# Run the search
rf_random.fit(X, y)

# The parameter line is labelled "RF" so the report matches the model tuned here.
print(f"""Resultados del Fine-Tuning de Hiperparámetros del Clasificador Random Forest (Optimización del RF Classifier)
Mejores Parámetros RF:  {rf_random.best_params_}
Mejor F1-Score:         {rf_random.best_score_:.4f}
""")

### Comparación y Visualización de Resultados

In [None]:
# Imported here because this is the only cell that needs it.
from sklearn.base import clone

# Tuned models, plus XGBoost with its hand-set configuration (it was already
# fairly well configured, so it is not tuned).
optimized_models = {
  'LogisticRegression_Optimized': lr_random.best_estimator_,
  'RandomForest_Optimized': rf_random.best_estimator_,
  'XGBoost_Default': model_xgboost
}

final_results = {}
for name, model in optimized_models.items():
  print(f" {name}:")
  # For pipelines, evaluate only the classifier step: the helper builds its
  # own SMOTE pipeline around it.
  base_estimator = model.named_steps['classifier'] if hasattr(model, 'named_steps') else model
  # Evaluate an unfitted clone. The helper fits whatever it receives in place
  # on every fold, which would otherwise overwrite the refitted
  # best_estimator_ (the object persisted later) with a model trained on the
  # last fold only.
  cv_metrics, global_metrics = evaluate_model_with_scaling(
    clone(base_estimator),
    X, y, cv=5
  )
  final_results[name] = {'cv': cv_metrics, 'global': global_metrics}

  print(f"- F1-score (CV): {cv_metrics['f1']}")
  print(f"- F1-score (Global): {global_metrics['f1']:.4f}")

In [None]:
# Collect the values to plot. The F1 panel reads the 'f1' entry and the AUC
# panel reads the 'auc' entry, so each panel shows what its title says.
model_names = list(results.keys())
f1_scores = [results[name]['global']['f1'] for name in model_names]
# Keep names and scores paired so bars stay correctly labelled even when a
# model has no AUC (stored as None).
auc_model_names = [name for name in model_names if results[name]['global']['auc'] is not None]
auc_scores = [results[name]['global']['auc'] for name in auc_model_names]

# Side-by-side comparison charts
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Panel 1: global F1 score
axes[0].bar(model_names, f1_scores, color=['#FF6B6B', '#4ECDC4', '#45B7D1'], alpha=0.8)
axes[0].set_title('Comparación de F1-Score (Global)', fontsize=14, fontweight='bold')
axes[0].set_ylabel('F1-Score', fontsize=12)
axes[0].set_ylim(0, 1)
axes[0].grid(True, alpha=0.3, axis='y')

# Value labels above the bars
for i, v in enumerate(f1_scores):
  axes[0].text(i, v + 0.02, f'{v:.3f}', ha='center', fontweight='bold')

# Panel 2: global AUC (only models that produced probabilities)
axes[1].bar(auc_model_names, auc_scores, color=['#FF6B6B', '#4ECDC4', '#45B7D1'], alpha=0.8)
axes[1].set_title('Comparación de AUC (Global)', fontsize=14, fontweight='bold')
axes[1].set_ylabel('AUC', fontsize=12)
axes[1].set_ylim(0, 1)
axes[1].grid(True, alpha=0.3, axis='y')

# Value labels above the bars
for i, v in enumerate(auc_scores):
  axes[1].text(i, v + 0.02, f'{v:.3f}', ha='center', fontweight='bold')

plt.tight_layout()
plt.show()


### Guardar Mejor Modelo

In [None]:
import joblib
import datetime

# Pick the model with the highest global F1 among the re-evaluated candidates.
best_model_name = max(final_results, key=lambda x: final_results[x]['global']['f1'])
best_model = optimized_models[best_model_name]

print(f"\n" + "="*60)
print(f"MEJOR MODELO: {best_model_name}")
print(f"F1-Score: {final_results[best_model_name]['global']['f1']:.4f}")
print("="*60)

# Persist the model together with the preprocessing objects, all sharing one
# timestamp so the three files can be matched up later.
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
joblib.dump(best_model, f'../models/best_model_{timestamp}.pkl')
joblib.dump(scaler, f'../models/scaler_{timestamp}.pkl')
# The LabelEncoder fitted on `method` is bound to `encoder` in this notebook.
joblib.dump(encoder, f'../models/method_encoder_{timestamp}.pkl')

print(f"\nModelo guardado como: best_model_{timestamp}.pkl")
print(f"Escalador guardado como: scaler_{timestamp}.pkl")
print(f"Encoder de métodos guardado como: method_encoder_{timestamp}.pkl")