# Network Anomaly Detection

This notebook demonstrates unsupervised anomaly detection on telecom network KPI data
using Isolation Forest. We identify anomalous cell behavior from time-series metrics
and evaluate detection quality against ground-truth labels.

## 1. Setup & Configuration

In [ ]:
# Suppress every warning for the rest of the session. The filter is installed
# before the library imports below, so warnings raised at import time are
# hidden as well.
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Seaborn context, grid style and colour palette used by all later figures.
sns.set_context("notebook")
sns.set_style("whitegrid")
sns.set_palette("husl")

# Matplotlib defaults: figure size and resolution (dots per inch).
plt.rcParams['figure.figsize'] = (12, 6)
plt.rcParams['figure.dpi'] = 100

In [ ]:
# Notebook-wide constants: the reproducibility seed and the dataset location.
RANDOM_STATE = 42
DATA_PATH = "../data/synthetic_data.parquet"

# Seed NumPy's global random number generator.
np.random.seed(RANDOM_STATE)

# Put ../src at the front of the module search path.
# NOTE(review): presumably this is where the `feature_engineer` and `model`
# modules imported by later cells live - confirm against the repo layout.
import sys

sys.path.insert(0, "../src")

print("Environment ready.")

## 2. Data Loading & Validation

In [ ]:
# Load the dataset from the configured parquet file.
df = pd.read_parquet(DATA_PATH)

# Report the dimensions and the column names.
shape, column_names = df.shape, list(df.columns)
print(f"Dataset shape: {shape}")
print(f"Columns: {column_names}")

# Last expression of the cell: the notebook renders the first rows.
df.head()

In [ ]:
# Ground-truth column: the first column whose lower-cased name contains
# "anomaly" or "label". Indexing with [0] raises IndexError when none matches.
label_candidates = [
    name for name in df.columns
    if any(token in name.lower() for token in ('anomaly', 'label'))
]
anomaly_col = label_candidates[0]
print(f"Label column: {anomaly_col}")

# Share of each label value, rounded to four decimals.
label_shares = df[anomaly_col].value_counts(normalize=True).round(4)
print("\nAnomaly rate:")
print(label_shares)

# Summing the label column counts the rows flagged as anomalous.
n_anomalies = df[anomaly_col].sum()
print(f"\nAnomaly count: {n_anomalies} / {len(df)}")

In [ ]:
# Cell identifier column: the first column whose lower-cased name contains "cell".
cell_candidates = [name for name in df.columns if 'cell' in name.lower()]
cell_col = cell_candidates[0]
print(f"Cell column: {cell_col}")

# Number of distinct cells, then the ten cells with the most records.
n_cells = df[cell_col].nunique()
records_per_cell = df[cell_col].value_counts()
print(f"Number of unique cells: {n_cells}")
print("\nRecords per cell (top 10):")
print(records_per_cell.head(10))

In [ ]:
# Basic validation: per-column null counts, computed once and reused below.
missing_per_column = df.isnull().sum()

print("Missing values:")
# Show only the columns that actually contain nulls.
print(missing_per_column[missing_per_column > 0])
if missing_per_column.sum() == 0:
    print("No missing values found.")

print(f"\nData types:\n{df.dtypes}")

## 3. Exploratory Data Analysis

In [ ]:
# Time-series plot for a sample cell.
# Candidate time columns: names containing 'time' or 'date'. A separate
# 'timestamp' check is unnecessary because such names already contain 'time'.
time_col = [c for c in df.columns if 'time' in c.lower() or 'date' in c.lower()]
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
kpi_cols = [c for c in numeric_cols if c != anomaly_col]

# Use the cell with the most records as the example.
sample_cell = df[cell_col].value_counts().index[0]
cell_data = df[df[cell_col] == sample_cell].copy()

if time_col:
    # Sort chronologically and renumber the rows. The plots below use the row
    # index as x, so without the renumbering the sort would not affect where
    # points are drawn and an unsorted source would produce a zig-zag line.
    cell_data = cell_data.sort_values(time_col[0]).reset_index(drop=True)

# At most three KPI panels, stacked vertically and sharing the x axis.
n_panels = min(3, len(kpi_cols))
if n_panels == 0:
    # plt.subplots cannot build a grid with zero rows.
    print("No numeric KPI columns available to plot.")
else:
    # squeeze=False always returns a 2-D array, so one panel needs no special case.
    fig, axes = plt.subplots(n_panels, 1, figsize=(14, 10), sharex=True, squeeze=False)
    # The anomaly mask is the same for every KPI, so build it once.
    anomaly_mask = cell_data[anomaly_col] == 1
    for ax, col in zip(axes[:, 0], kpi_cols):
        ax.plot(cell_data.index, cell_data[col], linewidth=0.8, label=col)
        # Overlay the ground-truth anomalies as red markers.
        ax.scatter(cell_data.index[anomaly_mask], cell_data.loc[anomaly_mask, col],
                   color='red', s=20, zorder=5, label='Anomaly')
        ax.set_ylabel(col)
        ax.legend(loc='upper right')
    # `ax` is now the bottom panel; with sharex=True only it needs the label.
    ax.set_xlabel('Index')
    fig.suptitle(f'Time-Series KPIs for Cell: {sample_cell}', fontsize=14)
    plt.tight_layout()
    plt.show()

In [ ]:
# Anomaly distribution
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# value_counts() orders by frequency, so sort by label value and derive the
# tick text and bar colours from the labels actually present. This keeps
# "Normal"/"Anomaly" attached to the right bars even when anomalies outnumber
# normal rows or only one class exists.
label_counts = df[anomaly_col].value_counts().sort_index()
is_anomaly = [value == 1 for value in label_counts.index]
label_counts.plot(
    kind='bar', ax=axes[0],
    color=['crimson' if flag else 'steelblue' for flag in is_anomaly],
)
axes[0].set_title('Anomaly Label Distribution')
axes[0].set_xlabel('Label')
axes[0].set_ylabel('Count')
axes[0].set_xticklabels(['Anomaly' if flag else 'Normal' for flag in is_anomaly], rotation=0)

# Anomaly rate per cell, highest first; the dashed line is the mean across cells.
cell_anomaly_rate = df.groupby(cell_col)[anomaly_col].mean().sort_values(ascending=False)
axes[1].bar(range(len(cell_anomaly_rate)), cell_anomaly_rate.values, color='coral')
axes[1].set_title('Anomaly Rate per Cell')
axes[1].set_xlabel('Cell Index')
axes[1].set_ylabel('Anomaly Rate')
axes[1].axhline(y=cell_anomaly_rate.mean(), color='black', linestyle='--', label='Mean')
axes[1].legend()

plt.tight_layout()
plt.show()

In [ ]:
# KPI distributions by label (normal vs anomaly)
plot_cols = kpi_cols[:6]
n_plots = len(plot_cols)
n_rows = (n_plots + 2) // 3  # ceil(n_plots / 3): three histograms per row

if n_plots == 0:
    # A zero-row grid is invalid, so there is nothing to draw.
    print("No KPI columns available to plot.")
else:
    # squeeze=False always yields a 2-D array; flattening it gives one uniform
    # sequence of axes whatever the row count. Wrapping the axes array in a
    # list for the single-plot case made axes[0] an array, not an Axes.
    fig, axes = plt.subplots(n_rows, 3, figsize=(16, 4 * n_rows), squeeze=False)
    axes = axes.flatten()

    for ax, col in zip(axes, plot_cols):
        for label, color in zip([0, 1], ['steelblue', 'crimson']):
            subset = df.loc[df[anomaly_col] == label, col]
            # density=True normalises each class separately so the shapes stay
            # comparable despite the class imbalance.
            ax.hist(subset, bins=50, alpha=0.6, color=color,
                    label='Anomaly' if label == 1 else 'Normal', density=True)
        ax.set_title(f'{col}')
        ax.legend()

    # Hide the unused panels in the last row.
    for ax in axes[n_plots:]:
        ax.set_visible(False)

    fig.suptitle('KPI Distributions by Label', fontsize=14, y=1.02)
    plt.tight_layout()
    plt.show()

## 4. Feature Engineering

In [ ]:
from feature_engineer import FeatureEngineer

# Fit the project's feature engineer on the raw frame and transform it in one call.
fe = FeatureEngineer()
df_features = fe.fit_transform(df)

# Compare the column counts before and after engineering.
n_raw, n_engineered = len(df.columns), len(df_features.columns)
print(f"Feature matrix shape: {df_features.shape}")
print(f"Original columns: {n_raw} -> Engineered columns: {n_engineered}")

In [ ]:
# Engineered features are the columns of df_features that are absent from df.
baseline_columns = df.columns
new_cols = [name for name in df_features.columns if name not in baseline_columns]

print(f"New engineered features ({len(new_cols)}):")
for feature_name in new_cols:
    print(f"  - {feature_name}")

# Last expression of the cell: the notebook renders the first rows.
df_features.head()

In [ ]:
# Prepare feature matrix (exclude label, identifier, time and non-numeric columns).
# The cell-level analysis further down writes 'anomaly_score' and
# 'predicted_anomaly' onto df_features. Excluding them here keeps a re-run of
# this cell from feeding the model's own outputs back in as features. On a
# fresh run those columns do not exist yet, so the result is unchanged.
derived_cols = ['anomaly_score', 'predicted_anomaly']
exclude_cols = [anomaly_col, cell_col] + time_col + derived_cols
feature_cols = [
    c for c in df_features.select_dtypes(include=[np.number]).columns
    if c not in exclude_cols
]

# Copy so later edits to X cannot touch df_features.
X = df_features[feature_cols].copy()
# Ground truth is kept for evaluation only; it is not part of X.
y_true = df_features[anomaly_col].values

print(f"Feature matrix X: {X.shape}")
print(f"Ground truth labels y: {y_true.shape}")
print(f"Features used: {feature_cols}")

## 5. Model Training

In [ ]:
from model import IsolationForestModel

# Build the project's Isolation Forest wrapper with the notebook-wide seed.
model = IsolationForestModel(random_state=RANDOM_STATE)

# Two status lines: a confirmation, then the model's string representation.
print(
    "Isolation Forest model initialized.",
    f"Model parameters: {model}",
    sep="\n",
)

In [ ]:
# Train unsupervised - no labels provided.
# Only the feature matrix X is passed to fit(); y_true is not an argument.
model.fit(X)
print("Model training complete (unsupervised - no labels used).")

In [ ]:
# Score and classify every row of the matrix the model was fitted on.
anomaly_scores = model.decision_function(X)
anomaly_preds = model.predict(X)

# predict() marks anomalies with -1 and normal rows with 1; remap to
# 1 = anomaly, 0 = normal so the predictions line up with the 0/1 labels.
anomaly_preds_binary = (anomaly_preds == -1).astype(int)

score_low, score_high = anomaly_scores.min(), anomaly_scores.max()
n_flagged, n_total = anomaly_preds_binary.sum(), len(anomaly_preds_binary)
flagged_share = anomaly_preds_binary.mean()

print(f"Anomaly scores range: [{score_low:.4f}, {score_high:.4f}]")
print(f"Predicted anomalies: {n_flagged} / {n_total}")
print(f"Predicted anomaly rate: {flagged_share:.4f}")

## 6. Evaluation & Metrics

In [ ]:
from sklearn.metrics import (
    classification_report, f1_score, precision_score,
    recall_score, roc_auc_score, roc_curve, confusion_matrix
)

# Per-class report of the binarised predictions against the ground truth.
report = classification_report(
    y_true, anomaly_preds_binary, target_names=['Normal', 'Anomaly']
)
print("Classification Report (Isolation Forest vs Ground Truth):")
print("=" * 60)
print(report)

# Headline metrics for the anomaly (positive) class; later cells reuse them.
f1 = f1_score(y_true, anomaly_preds_binary)
precision = precision_score(y_true, anomaly_preds_binary)
recall = recall_score(y_true, anomaly_preds_binary)

# Each name plus its colon is left-padded to 12 characters so the values align.
for metric_name, metric_value in (
    ("F1 Score", f1),
    ("Precision", precision),
    ("Recall", recall),
):
    print(f"{metric_name + ':':<12}{metric_value:.4f}")

In [ ]:
# Left panel: score histograms split by true label. Right panel: confusion matrix.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
score_ax, cm_ax = axes

# (label value, colour, legend text) for each class.
class_styles = [(0, 'steelblue', 'Normal'), (1, 'crimson', 'Anomaly')]
for label_value, shade, display_name in class_styles:
    score_ax.hist(
        anomaly_scores[y_true == label_value],
        bins=60, alpha=0.6, color=shade, label=display_name, density=True,
    )
score_ax.set_title('Anomaly Score Distribution by True Label')
score_ax.set_xlabel('Anomaly Score (lower = more anomalous)')
score_ax.set_ylabel('Density')
score_ax.legend()

# Confusion matrix of the binarised predictions, annotated with integer counts.
class_names = ['Normal', 'Anomaly']
cm = confusion_matrix(y_true, anomaly_preds_binary)
sns.heatmap(
    cm, annot=True, fmt='d', cmap='Blues', ax=cm_ax,
    xticklabels=class_names, yticklabels=class_names,
)
cm_ax.set_title('Confusion Matrix')
cm_ax.set_xlabel('Predicted')
cm_ax.set_ylabel('Actual')

plt.tight_layout()
plt.show()

In [ ]:
# ROC analysis on the continuous scores.
# Lower scores mean "more anomalous", while roc_curve expects larger values for
# the positive class, so the scores are sign-flipped first.
roc_scores = -anomaly_scores
fpr, tpr, thresholds = roc_curve(y_true, roc_scores)
auc_score = roc_auc_score(y_true, roc_scores)

fig, ax = plt.subplots(figsize=(8, 6))
model_legend = f'Isolation Forest (AUC = {auc_score:.4f})'
ax.plot(fpr, tpr, color='darkorange', lw=2, label=model_legend)
# Diagonal reference line for a random classifier.
ax.plot([0, 1], [0, 1], color='gray', lw=1, linestyle='--', label='Random')
ax.set(
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    title='ROC Curve - Anomaly Detection',
)
ax.legend(loc='lower right')
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()
print(f"ROC AUC Score: {auc_score:.4f}")

## 7. Interpretation

In [ ]:
from sklearn.inspection import permutation_importance
from sklearn.metrics import f1_score


def _anomaly_f1(estimator, X_eval, y_eval):
    """Return the anomaly-class F1 of `estimator` on (X_eval, y_eval).

    The model's predict() marks anomalies with -1 and normal rows with 1,
    whereas the ground-truth labels use 1 for anomalies and 0 for normal rows.
    The built-in 'f1' scorer compares the two encodings directly, which makes
    the target look multiclass and raises under average='binary'. Remapping
    the predictions first puts both sides on the same 0/1 encoding.
    """
    flagged = (np.asarray(estimator.predict(X_eval)) == -1).astype(int)
    return f1_score(y_eval, flagged, zero_division=0)


# Permutation importance: the drop in anomaly-class F1 when one feature column
# is shuffled, averaged over n_repeats shuffles per feature.
perm_result = permutation_importance(
    model, X, y_true,
    n_repeats=10,
    random_state=RANDOM_STATE,
    scoring=_anomaly_f1,
)

# One row per feature, most important first.
perm_df = pd.DataFrame({
    'feature': feature_cols,
    'importance_mean': perm_result.importances_mean,
    'importance_std': perm_result.importances_std
}).sort_values('importance_mean', ascending=False)

print("Feature Importance (Permutation):")
print(perm_df.head(15).to_string(index=False))

In [ ]:
# Horizontal bar chart of the highest-ranked features (at most 15).
top_n = min(15, len(perm_df))
top_features = perm_df.head(top_n)
bar_positions = range(top_n)

fig, ax = plt.subplots(figsize=(10, 8))
# Bar length is the mean importance; the error bar is its standard deviation.
ax.barh(
    bar_positions,
    top_features['importance_mean'].values,
    xerr=top_features['importance_std'].values,
    color='steelblue',
    alpha=0.8,
)
ax.set_yticks(bar_positions)
ax.set_yticklabels(top_features['feature'].values)
# Flip the y axis so the first (most important) feature is drawn at the top.
ax.invert_yaxis()
ax.set(
    xlabel='Mean Importance (Permutation)',
    title='Top Feature Importances for Anomaly Detection',
)

plt.tight_layout()
plt.show()

In [ ]:
# Attach the model outputs to the feature frame so they can be grouped by cell.
df_features['anomaly_score'] = anomaly_scores
df_features['predicted_anomaly'] = anomaly_preds_binary

# Per-cell summary, most anomalous (lowest mean score) first.
cell_scores = (
    df_features
    .groupby(cell_col)
    .agg(
        mean_score=('anomaly_score', 'mean'),
        min_score=('anomaly_score', 'min'),
        anomaly_rate=('predicted_anomaly', 'mean'),
        true_anomaly_rate=(anomaly_col, 'mean'),
    )
    .sort_values('mean_score')
)

fig, axes = plt.subplots(1, 2, figsize=(16, 6))
score_ax, rate_ax = axes

# Left: mean score per cell; cells whose true anomaly rate exceeds 10% are red.
colors = [
    'crimson' if true_rate > 0.1 else 'steelblue'
    for true_rate in cell_scores['true_anomaly_rate']
]
score_ax.bar(range(len(cell_scores)), cell_scores['mean_score'], color=colors, alpha=0.8)
score_ax.set_title('Mean Anomaly Score per Cell')
score_ax.set_xlabel('Cell (sorted by score)')
score_ax.set_ylabel('Mean Anomaly Score')

# Right: predicted versus true anomaly rate per cell, with the y = x reference
# line drawn from the origin to the largest true rate.
max_true_rate = cell_scores['true_anomaly_rate'].max()
rate_ax.scatter(
    cell_scores['true_anomaly_rate'], cell_scores['anomaly_rate'],
    alpha=0.7, s=60, color='darkorange',
)
rate_ax.plot([0, max_true_rate], [0, max_true_rate], 'k--', alpha=0.5, label='Perfect')
rate_ax.set_xlabel('True Anomaly Rate')
rate_ax.set_ylabel('Predicted Anomaly Rate')
rate_ax.set_title('Predicted vs True Anomaly Rate per Cell')
rate_ax.legend()

plt.tight_layout()
plt.show()

## 8. Business Insights & Conclusions

In [ ]:
# Threshold sensitivity analysis for early warning.
# The percentile list is defined once so the thresholds and the loop below
# cannot drift apart.
percentiles = [1, 2, 5, 10, 15, 20]
thresholds_to_test = np.percentile(anomaly_scores, percentiles)

print("Threshold Tuning Analysis:")
print("=" * 70)
print(f"{'Percentile':>12} {'Threshold':>12} {'Precision':>12} {'Recall':>12} {'F1':>12}")
print("-" * 70)

results = []
for pct, thr in zip(percentiles, thresholds_to_test):
    # Flag every row scoring strictly below the percentile cut-off.
    preds = (anomaly_scores < thr).astype(int)
    # zero_division=0 already returns 0 when nothing is flagged, so no
    # separate empty-prediction branch is needed.
    p = precision_score(y_true, preds, zero_division=0)
    r = recall_score(y_true, preds, zero_division=0)
    f = f1_score(y_true, preds, zero_division=0)
    results.append({'percentile': pct, 'threshold': thr, 'precision': p, 'recall': r, 'f1': f})
    # The "th" suffix goes inside the 12-wide field so rows align with the header.
    pct_label = f"{pct}th"
    print(f"{pct_label:>12} {thr:>12.4f} {p:>12.4f} {r:>12.4f} {f:>12.4f}")

results_df = pd.DataFrame(results)

In [ ]:
# Plot precision, recall and F1 against the percentile used as the cut-off.
fig, ax = plt.subplots(figsize=(10, 6))

# (results_df column, matplotlib format string, legend text) per curve.
curve_specs = [
    ('precision', 'o-', 'Precision'),
    ('recall', 's-', 'Recall'),
    ('f1', 'D-', 'F1 Score'),
]
for column, line_format, legend_text in curve_specs:
    ax.plot(results_df['percentile'], results_df[column], line_format,
            label=legend_text, linewidth=2)

ax.set(
    xlabel='Score Percentile Threshold',
    ylabel='Metric Value',
    title='Anomaly Detection: Precision-Recall Trade-off by Threshold',
)
ax.legend()
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [ ]:
# Closing summary: headline metrics computed in earlier cells, followed by
# fixed findings and recommendations. Nothing is computed in this cell.
summary_lines = [
    "Early Warning System Insights:",
    "=" * 60,
    "",
    "Model Performance Summary:",
    f"  - ROC AUC: {auc_score:.4f}",
    f"  - Default F1: {f1:.4f} (Precision: {precision:.4f}, Recall: {recall:.4f})",
    "",
    "Key Findings:",
    "  - The Isolation Forest identifies anomalies without requiring labeled data,",
    "    making it suitable for real-time deployment where labels are unavailable.",
    "  - Top contributing features suggest monitoring these KPIs for early warning.",
    "  - Cell-level analysis reveals some cells are more prone to anomalies,",
    "    indicating potential infrastructure or coverage issues.",
    "",
    "Recommendations:",
    "  1. Deploy with conservative threshold (high recall) for safety-critical alerts.",
    "  2. Use tiered alerting: high-confidence anomalies trigger immediate action,",
    "     borderline cases flagged for human review.",
    "  3. Retrain periodically as network patterns evolve seasonally.",
    "  4. Focus monitoring on top features identified by permutation importance.",
    "  5. Investigate cells with consistently high anomaly scores for root-cause analysis.",
]
# One print call; joining with newlines gives the same output as one print per line.
print("\n".join(summary_lines))

### Summary

This notebook demonstrated an unsupervised anomaly detection pipeline for telecom network data:

- **Data**: Loaded and validated synthetic network KPI data with ground-truth anomaly labels.
- **EDA**: Explored time-series behavior, anomaly distributions, and KPI patterns.
- **Features**: Applied feature engineering to enrich the raw KPI signals.
- **Model**: Trained an Isolation Forest in a fully unsupervised manner (no labels used during training).
- **Evaluation**: Assessed detection quality against ground truth using F1, precision, recall, and ROC AUC.
- **Interpretation**: Identified key features driving anomaly detection and analyzed cell-level patterns.
- **Business Value**: Threshold tuning enables early warning systems with configurable precision-recall trade-offs.