# Evaluación HDFS
---

In [None]:
import json
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt
import joblib
from sklearn.metrics import classification_report, roc_curve, auc, confusion_matrix

In [None]:
# Output locations used by the rest of the notebook. The plots directory is
# created eagerly (including parents) so later savefig calls find it.
meta_path = Path('outputs', 'detect_meta.json')
plots_dir = Path('outputs', 'plots')
plots_dir.mkdir(exist_ok=True, parents=True)

In [None]:
# Load the ground-truth labels (saved via joblib).
# NOTE(review): joblib.load unpickles its input; only point it at trusted files.
y_true = joblib.load(filename='data/processed/y.joblib')

In [None]:
# Load the detection outputs: per-sample MSE scores and anomaly predictions.
mse = np.load('outputs/mse_scores.npy')
predictions = np.load('outputs/anomaly_predictions.npy')

# The evaluation below pairs the three sequences element-wise, so they must
# have equal length. Raise explicitly rather than assert: assert statements
# are removed when Python runs with -O, which would let a mismatch through.
if not (len(y_true) == len(mse) == len(predictions)):
    raise ValueError(
        "Length mismatch between y_true, mse, and predictions: "
        f"{len(y_true)} vs {len(mse)} vs {len(predictions)}."
    )

In [None]:
# Recover the threshold that detection stored in the metadata file so the
# plots mark the same cut-off. With no metadata file, `meta` is empty and
# `threshold` stays None (the key lookup falls back to None as well).
meta = json.loads(meta_path.read_text(encoding='utf-8')) if meta_path.exists() else {}
threshold = meta.get('computed_threshold')

## Evaluación

In [None]:
# Text summary of the classification metrics, printed with four decimals.
report = classification_report(y_true, predictions, digits=4)
print(report)

# Confusion matrix of ground-truth labels against predictions.
cm = confusion_matrix(y_true, predictions)
print("Confusion matrix:\n", cm)

In [None]:
# ROC analysis with the reconstruction error as the anomaly score: a larger
# MSE ranks a sample as more anomalous (label 1 is the positive class).
fpr, tpr, thresh = roc_curve(y_true, mse)
roc_auc = auc(fpr, tpr)

# Draw on an explicit Figure/Axes pair instead of the implicit pyplot state.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(fpr, tpr, label=f'AUC = {roc_auc:.3f}')
ax.plot([0, 1], [0, 1], 'k--')  # dashed diagonal as a visual reference
ax.set_title('Curva ROC - Detección de Anomalías')
ax.set_xlabel('Tasa de Falsos Positivos')
ax.set_ylabel('Tasa de Verdaderos Positivos')
ax.legend()
fig.tight_layout()
fig.savefig(plots_dir / 'roc_curve.png', dpi=120)
plt.show()

In [None]:
# Reconstruction-error distribution, split by ground-truth class.
# Coerce the labels to a flat NumPy array first: if y_true is a plain list,
# `y_true == 0` evaluates to a single bool rather than an element-wise mask,
# and the selections below would not pick the intended samples.
labels = np.asarray(y_true).ravel()

# Share one set of bin edges, computed over all scores, between both
# histograms. With an independent `bins=50` per call, each class gets its own
# range and bar width, so the overlaid counts are not comparable.
bin_edges = np.histogram_bin_edges(mse, bins=50)

plt.figure(figsize=(10, 6))
plt.hist(mse[labels == 0], bins=bin_edges, alpha=0.5, label='Normales')
plt.hist(mse[labels == 1], bins=bin_edges, alpha=0.5, label='Anomalías')

# Only mark the threshold when one was recovered from the metadata.
if threshold is not None:
    # Explicit colour: under Matplotlib's default style the line would
    # otherwise share the first histogram's colour.
    plt.axvline(threshold, color='k', linestyle='--', label=f'Umbral: {threshold:.4f}')

plt.title('Distribución de Errores de Reconstrucción (MSE)')
plt.xlabel('MSE')
plt.ylabel('Frecuencia')
plt.yscale('log')  # log counts keep the long tail visible
plt.legend()
plt.tight_layout()
plt.savefig(plots_dir / 'mse_distribution.png', dpi=120)
plt.show()