## 1. Imports and Configuration

In [None]:
# Standard Library
import os
from datetime import datetime

# Third-Party Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import lightgbm as lgb

# Scikit-Learn - Model Selection
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold

# Scikit-Learn - Classifiers
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

# Scikit-Learn - Metrics
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, roc_curve, confusion_matrix, classification_report
)

# Scikit-Learn - Preprocessing
from sklearn.preprocessing import StandardScaler

# Visualization Settings
sns.set(style="whitegrid")
%matplotlib inline

# Define paths for input and output
FEATURE_PATH = os.path.join("data", "features", "combined_engineered_features.csv")
MODELS_DIR = os.path.join("data", "models")
os.makedirs(MODELS_DIR, exist_ok=True)

## 2. Data Loading and Preprocessing

In [None]:
# Load the feature-engineered dataset
df = pd.read_csv(FEATURE_PATH)
print(f"Dataset shape: {df.shape}")

# Validate presence of required label column.
# An explicit raise is used instead of `assert`, which is stripped under `python -O`.
if "label1" not in df.columns:
    raise ValueError("label1 column not found in dataset")

# Extract features and create binary target variable
# Target: label1 (Attack=1, Benign=0); the comparison is case-insensitive.
# Any label column that is absent from the file is skipped (errors='ignore').
X = df.drop(columns=["label1", "label2", "label3", "label4", "label_full"], errors='ignore')
y = df["label1"].apply(lambda x: 1 if str(x).lower() == "attack" else 0)

# Every value that is not exactly "attack" maps to 0, so a differently spelled
# label would silently yield a single-class target. Fail here with a clear
# message rather than later inside model fitting.
if y.nunique() < 2:
    raise ValueError(
        "Binary target has a single class after mapping label1; "
        f"observed label1 values: {sorted(df['label1'].astype(str).unique())[:10]}"
    )

print(f"Feature matrix shape: {X.shape}")
print("\nClass distribution:")
print(y.value_counts().rename({0: "Benign", 1: "Attack"}))

## 3. Train-Test Split

In [None]:
# Single seed reused by the split, the CV folds and the estimators
RANDOM_STATE = 42

# Hold out 20% of the rows; stratifying on y keeps the class ratio
# the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=RANDOM_STATE,
)

for split_name, split_features in (("Training", X_train), ("Test", X_test)):
    print(f"{split_name} set shape: {split_features.shape}")
print(f"\nTraining set class distribution:\n{y_train.value_counts()}")

## 4. Metrics Computation Functions

In [None]:
def compute_metrics(y_true, y_pred, y_prob=None):
    """
    Compute comprehensive binary classification metrics.

    Labels are expected to be encoded as 0 (negative) and 1 (positive).

    Parameters:
    -----------
    y_true : array-like
        True labels
    y_pred : array-like
        Predicted labels
    y_prob : array-like, optional
        Predicted probabilities (or scores) for the positive class

    Returns:
    --------
    dict : Dictionary with keys "accuracy", "precision", "recall",
        "specificity", "f1", "roc_auc" and "confusion_matrix".
        "roc_auc" is NaN when `y_prob` is not given or when `y_true`
        contains a single class (the AUC is undefined in that case).
        "confusion_matrix" is always a 2x2 array ordered [[tn, fp], [fn, tp]].
    """
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, zero_division=0)
    rec = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)

    # Pin the label order so the matrix is 2x2 even when only one class shows
    # up in y_true/y_pred; otherwise the 4-way unpacking below would fail.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])

    # Extract confusion matrix components (only tn/fp are needed below)
    tn, fp, _fn, _tp = cm.ravel()

    # Calculate specificity (true negative rate); 0.0 when there are no negatives
    spec = tn / (tn + fp) if (tn + fp) > 0 else 0.0

    # Calculate ROC-AUC only when scores are provided and both classes are
    # present in y_true; otherwise report NaN.
    has_both_classes = len(np.unique(y_true)) > 1
    if y_prob is not None and has_both_classes:
        roc_auc = roc_auc_score(y_true, y_prob)
    else:
        roc_auc = np.nan

    return {
        "accuracy": acc,
        "precision": prec,
        "recall": rec,
        "specificity": spec,
        "f1": f1,
        "roc_auc": roc_auc,
        "confusion_matrix": cm
    }

## 5. Model Definition and Hyperparameter Grids

In [None]:
# Registry of candidate models. Each entry maps a short key to:
#   "estimator"  - an unfitted classifier instance
#   "param_grid" - the hyperparameter search space handed to RandomizedSearchCV
#                  (a dict, or a list of dicts when some combinations are invalid)
model_defs = {}

# ============================================================================
# LINEAR MODELS
# ============================================================================

# Logistic Regression - Fast baseline linear classifier
# The search space is split in two because not every penalty/solver pair is
# valid: "lbfgs" does not support the elastic-net penalty, and elastic-net
# additionally requires "l1_ratio". Sampling from a single flat grid would
# produce candidates that fail to fit.
_log_reg_common = {
    "C": [0.1, 1, 10, 100],
    "class_weight": [None, "balanced"],
}
model_defs["log_reg"] = {
    "estimator": LogisticRegression(max_iter=5000, random_state=RANDOM_STATE),
    "param_grid": [
        {
            "penalty": ["l2"],
            "solver": ["lbfgs", "saga"],
            **_log_reg_common,
        },
        {
            "penalty": ["elasticnet"],
            "solver": ["saga"],
            "l1_ratio": [0.2, 0.5, 0.8],
            **_log_reg_common,
        },
    ]
}

# ============================================================================
# TREE-BASED MODELS
# ============================================================================

# Random Forest - Ensemble of decision trees
model_defs["rf"] = {
    "estimator": RandomForestClassifier(random_state=RANDOM_STATE, n_jobs=-1),
    "param_grid": {
        "n_estimators": [50, 100, 200],
        "max_depth": [10, 20, 30, None],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 4],
        "max_features": ["sqrt"],
        "class_weight": [None, "balanced"]
    }
}

# LightGBM - Fast gradient boosting classifier
# subsample_freq=1 turns row bagging on at every iteration; without a non-zero
# bagging frequency LightGBM ignores "bagging_fraction", which would make that
# search dimension a no-op.
model_defs["lgb"] = {
    "estimator": lgb.LGBMClassifier(
        random_state=RANDOM_STATE,
        n_jobs=-1,
        verbose=-1,
        is_unbalance=True,
        subsample_freq=1
    ),
    "param_grid": {
        "n_estimators": [100, 200, 300],
        "max_depth": [5, 10, 15, -1],
        "learning_rate": [0.01, 0.05, 0.1],
        "num_leaves": [20, 30, 40, 50],
        "min_data_in_leaf": [10, 20, 30],
        "feature_fraction": [0.8, 0.9, 1.0],
        "bagging_fraction": [0.8, 0.9, 1.0]
    }
}

# ============================================================================
# NEURAL NETWORK MODELS
# ============================================================================

# Multi-Layer Perceptron - Neural network classifier
# MLPClassifier has no `n_jobs` parameter; passing one raises TypeError at
# construction, so parallelism is left to the surrounding search.
model_defs["mlp"] = {
    "estimator": MLPClassifier(random_state=RANDOM_STATE, max_iter=500, early_stopping=True),
    "param_grid": {
        "hidden_layer_sizes": [(64,), (128,), (64, 32), (128, 64)],
        "activation": ["relu", "tanh"],
        "alpha": [0.0001, 0.001, 0.01],
        "learning_rate": ["constant", "adaptive"],
        "batch_size": [32, 64]
    }
}

# ============================================================================
# DISTANCE-BASED & PROBABILISTIC MODELS
# ============================================================================

# K-Nearest Neighbors - Distance-based instance classifier
model_defs["knn"] = {
    "estimator": KNeighborsClassifier(),
    "param_grid": {
        "n_neighbors": [3, 5, 7, 9, 11],
        "weights": ["uniform", "distance"],
        "metric": ["euclidean", "manhattan"]
    }
}

# Gaussian Naive Bayes - Probabilistic classifier
model_defs["nb"] = {
    "estimator": GaussianNB(),
    "param_grid": {
        "var_smoothing": np.logspace(-10, -6, 5)
    }
}

# ============================================================================
# MODEL TRAINING CONFIGURATION
# ============================================================================

# Keys of model_defs to train, in execution order
MODELS_TO_RUN = ["log_reg", "rf", "lgb", "mlp", "knn", "nb"]

print(f"Models to train: {MODELS_TO_RUN}")
print("Optimization method: RandomizedSearchCV")
print("Cross-validation folds: 3")
print("Iterations per model: 10")
print(f"Total models: {len(MODELS_TO_RUN)}")

## 6. Model Training and Evaluation Function

In [None]:
def train_and_evaluate_model(model_key, model_def, X_train, X_test, y_train, y_test,
                             base_dir, cv_folds=3, n_iter=10):
    """
    Tune one model with RandomizedSearchCV, score it on the test set and
    save its confusion-matrix and ROC-curve plots.

    Parameters:
    -----------
    model_key : str
        Model identifier; also used as the sub-folder and file-name prefix
    model_def : dict
        Holds the "estimator" to tune and its "param_grid" search space
    X_train, X_test : array-like
        Training and test feature matrices
    y_train, y_test : array-like
        Training and test labels
    base_dir : str
        Directory under which a `<model_key>/` folder is created for plots
    cv_folds : int
        Number of stratified cross-validation folds
    n_iter : int
        Number of parameter settings sampled by RandomizedSearchCV

    Returns:
    --------
    tuple : (results_dict, best_model, predictions, probabilities, metrics)
        `results_dict` is a flat summary row (best params as a string, CV F1
        and the test metrics); `metrics` is the full dict from
        compute_metrics, including the confusion matrix.
    """
    base_estimator = model_def["estimator"]
    search_space = model_def["param_grid"]

    out_dir = os.path.join(base_dir, model_key)
    os.makedirs(out_dir, exist_ok=True)

    banner = "=" * 60
    print(f"\n{banner}")
    print(f"Training {model_key.upper()}")
    print(f"{banner}")

    if not search_space:
        # Nothing to tune: fit the estimator as configured, no CV score available
        best_model = base_estimator
        best_model.fit(X_train, y_train)
        best_params, cv_f1 = {}, np.nan
    else:
        # Randomized search over the space, ranked by F1 on stratified folds
        folds = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=RANDOM_STATE)
        search = RandomizedSearchCV(
            base_estimator,
            param_distributions=search_space,
            n_iter=n_iter,
            scoring="f1",
            cv=folds,
            n_jobs=-1,
            random_state=RANDOM_STATE,
            verbose=1
        )
        search.fit(X_train, y_train)
        best_model = search.best_estimator_
        best_params = search.best_params_
        cv_f1 = search.best_score_

    print(f"Best parameters: {best_params}")
    print(f"Cross-validation F1 score: {cv_f1:.4f}")

    # Hard class predictions on the held-out data
    y_pred = best_model.predict(X_test)

    # Positive-class scores for ROC analysis, in order of preference:
    # predicted probabilities, min-max scaled decision values, hard labels.
    if hasattr(best_model, "predict_proba"):
        y_prob = best_model.predict_proba(X_test)[:, 1]
    elif hasattr(best_model, "decision_function"):
        raw_scores = best_model.decision_function(X_test)
        lowest = raw_scores.min()
        # The small epsilon guards against division by zero for constant scores
        y_prob = (raw_scores - lowest) / (raw_scores.max() - lowest + 1e-10)
    else:
        y_prob = y_pred.astype(float)

    metrics = compute_metrics(y_test, y_pred, y_prob)

    # --- Confusion matrix heatmap -------------------------------------------
    class_names = ["Benign", "Attack"]
    cm_path = os.path.join(out_dir, f"{model_key}_confusion_matrix.png")
    plt.figure(figsize=(5, 4))
    sns.heatmap(metrics["confusion_matrix"], annot=True, fmt="d", cmap="Blues",
                xticklabels=class_names, yticklabels=class_names,
                cbar_kws={'label': 'Count'})
    plt.title(f"{model_key.upper()} - Confusion Matrix")
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.tight_layout()
    plt.savefig(cm_path, dpi=300, bbox_inches='tight')
    plt.close()

    # --- ROC curve ----------------------------------------------------------
    roc_path = os.path.join(out_dir, f"{model_key}_roc_curve.png")
    fpr, tpr, _ = roc_curve(y_test, y_prob)
    roc_auc = roc_auc_score(y_test, y_prob)
    plt.figure(figsize=(6, 5))
    plt.plot(fpr, tpr, color='darkorange', lw=2.5, label=f"AUC = {roc_auc:.4f}")
    plt.plot([0, 1], [0, 1], "k--", lw=1.5, label="Random Classifier")
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel("False Positive Rate", fontsize=11)
    plt.ylabel("True Positive Rate", fontsize=11)
    plt.title(f"ROC Curve - {model_key.upper()}", fontsize=12)
    plt.legend(loc="lower right")
    plt.grid(alpha=0.3)
    plt.tight_layout()
    plt.savefig(roc_path, dpi=300, bbox_inches='tight')
    plt.close()

    print(f"Visualizations saved for {model_key}")

    # Flat summary row for the comparison table; the fitted model and raw
    # outputs are returned alongside so the caller can persist or plot them.
    summary_row = {
        "model": model_key,
        "best_params": str(best_params),
        "cv_f1_score": cv_f1,
    }
    for metric_name in ("accuracy", "precision", "recall", "specificity", "f1", "roc_auc"):
        summary_row[f"test_{metric_name}"] = metrics[metric_name]

    return summary_row, best_model, y_pred, y_prob, metrics

## 7. Main Training Loop

In [None]:
# Every run writes into its own timestamped folder so earlier results are kept
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
RUN_DIR = os.path.join("binary_classification", f"results_{timestamp}")
os.makedirs(RUN_DIR, exist_ok=True)
print(f"Results will be saved to: {RUN_DIR}\n")

# Per-model outputs, keyed by model identifier (results_list holds summary rows)
results_list = []
models_dict, y_preds_dict, y_probs_dict, metrics_dict = {}, {}, {}, {}

# Tune and evaluate each selected model in turn
for key in MODELS_TO_RUN:
    outcome = train_and_evaluate_model(
        key, model_defs[key], X_train, X_test, y_train, y_test, RUN_DIR,
        cv_folds=3,
        n_iter=10,
    )
    (summary_row, models_dict[key], y_preds_dict[key],
     y_probs_dict[key], metrics_dict[key]) = outcome
    results_list.append(summary_row)

banner = "=" * 60
print(f"\n{banner}")
print("MODEL TRAINING COMPLETED SUCCESSFULLY")
print(f"{banner}\n")

# One row per model; persisted as CSV and shown in the notebook
metrics_df = pd.DataFrame(results_list)
metrics_csv = os.path.join(RUN_DIR, "01_metrics_summary_all_models.csv")
metrics_df.to_csv(metrics_csv, index=False)
print(f"Metrics summary saved: {metrics_csv}\n")
display(metrics_df)

# The winner is the model with the highest F1 on the test set
best_row_label = metrics_df["test_f1"].idxmax()
best_model_key = metrics_df.at[best_row_label, "model"]
best_model_f1 = metrics_df.at[best_row_label, "test_f1"]
print(f"\n{banner}")
print(f"BEST MODEL: {best_model_key.upper()}")
print(f"Test F1-Score: {best_model_f1:.4f}")
print(f"{banner}\n")

## 8. Best Model Detailed Report

In [None]:
# Extract best model results
best_model_obj = models_dict[best_model_key]
y_pred_best = y_preds_dict[best_model_key]
y_prob_best = y_probs_dict[best_model_key]
metrics_best = metrics_dict[best_model_key]

# Hyperparameters actually selected by the search for the winning model.
# They are stored (as a string) in the metrics summary table; the report must
# show these rather than the full search grid from model_defs.
best_params_text = metrics_df.loc[
    metrics_df["model"] == best_model_key, "best_params"
].iloc[0]

# Save best model to its results directory
best_model_path = os.path.join(RUN_DIR, f"best_binary_model_{best_model_key}.pkl")
joblib.dump(best_model_obj, best_model_path)
print(f"Best model saved: {best_model_path}\n")

# Also save to production directory for deployment
# (fixed file name: each run overwrites the previous production model)
best_model_prod_path = os.path.join(MODELS_DIR, "best_binary_classification_model.pkl")
joblib.dump(best_model_obj, best_model_prod_path)
print(f"Best model also saved to production: {best_model_prod_path}\n")

# Generate classification report for best model
class_report = classification_report(y_test, y_pred_best,
                                     target_names=["Benign", "Attack"],
                                     digits=4)

# Create detailed report text
report_text = f"""
{'='*70}
BINARY INTRUSION DETECTION - BEST MODEL REPORT
{'='*70}

Execution Timestamp: {timestamp}
Best Model: {best_model_key.upper()}
Results Location: {best_model_path}
Production Location: {best_model_prod_path}

{'='*70}
MODEL PERFORMANCE METRICS
{'='*70}

Accuracy:       {metrics_best['accuracy']:.4f}
Precision:      {metrics_best['precision']:.4f}
Recall:         {metrics_best['recall']:.4f}
Specificity:    {metrics_best['specificity']:.4f}
F1-Score:       {metrics_best['f1']:.4f}
ROC-AUC:        {metrics_best['roc_auc']:.4f}

Confusion Matrix:
{metrics_best['confusion_matrix']}

{'='*70}
CLASSIFICATION REPORT
{'='*70}

{class_report}

{'='*70}
BEST MODEL HYPERPARAMETERS
{'='*70}

{best_params_text}

{'='*70}
"""

# Save report to file (explicit encoding so the output does not depend on the
# platform's default locale)
report_path = os.path.join(RUN_DIR, "02_best_model_report.txt")
with open(report_path, "w", encoding="utf-8") as f:
    f.write(report_text)

print("Best model report generated and saved")
print(report_text)

## 9. Model Comparison Visualizations

In [None]:
# ---------------------------------------------------------------------------
# 1. Bar chart per metric (2x3 grid); the best model is drawn in red
# ---------------------------------------------------------------------------
metrics_to_plot = ["test_accuracy", "test_precision", "test_recall",
                   "test_specificity", "test_f1", "test_roc_auc"]
colors = ['#FF6B6B' if name == best_model_key else '#4ECDC4'
          for name in metrics_df["model"]]

fig, axes = plt.subplots(2, 3, figsize=(15, 10))
fig.suptitle("Model Performance Comparison", fontsize=16, fontweight='bold')

# axes.flat walks the grid row by row, matching the order of metrics_to_plot
for ax, metric in zip(axes.flat, metrics_to_plot):
    short_name = metric.replace("test_", "")
    bars = ax.bar(metrics_df["model"], metrics_df[metric], color=colors, alpha=0.8, edgecolor='black')
    ax.set_ylabel(short_name, fontsize=10)
    ax.set_title(short_name.upper(), fontsize=11, fontweight='bold')
    ax.set_ylim([0, 1.05])
    ax.grid(axis='y', alpha=0.3)

    # Print each bar's value just above it
    for bar in bars:
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width()/2., height,
                f'{height:.3f}', ha='center', va='bottom', fontsize=9)

    plt.setp(ax.xaxis.get_majorticklabels(), rotation=45, ha='right')

plt.tight_layout()
comparison_path = os.path.join(RUN_DIR, "03_models_comparison_metrics.png")
plt.savefig(comparison_path, dpi=300, bbox_inches='tight')
plt.close()
print("Saved: models_comparison_metrics.png")

# ---------------------------------------------------------------------------
# 2. All ROC curves on one figure; the best model gets a solid, thicker line
# ---------------------------------------------------------------------------
plt.figure(figsize=(10, 8))
for model_key in MODELS_TO_RUN:
    is_best = model_key == best_model_key
    y_prob = y_probs_dict[model_key]
    fpr, tpr, _ = roc_curve(y_test, y_prob)
    auc = roc_auc_score(y_test, y_prob)

    plt.plot(fpr, tpr,
             lw=3 if is_best else 1.5,
             linestyle='-' if is_best else '--',
             alpha=1.0 if is_best else 0.7,
             label=f"{model_key.upper()} (AUC={auc:.4f})")

plt.plot([0, 1], [0, 1], 'k--', lw=2, label='Random Classifier')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate", fontsize=12)
plt.ylabel("True Positive Rate", fontsize=12)
plt.title("ROC Curves - All Models Comparison", fontsize=14, fontweight='bold')
plt.legend(loc="lower right", fontsize=10)
plt.grid(alpha=0.3)
plt.tight_layout()
roc_comparison_path = os.path.join(RUN_DIR, "04_roc_curves_comparison.png")
plt.savefig(roc_comparison_path, dpi=300, bbox_inches='tight')
plt.close()
print("Saved: roc_curves_comparison.png")

# ---------------------------------------------------------------------------
# 3. Horizontal ranking by test F1 (ascending sort puts the best bar on top)
# ---------------------------------------------------------------------------
sorted_df = metrics_df.sort_values("test_f1", ascending=True)
colors_rank = ['#FF6B6B' if name == best_model_key else '#95E1D3'
               for name in sorted_df["model"]]

fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.barh(sorted_df["model"], sorted_df["test_f1"], color=colors_rank, edgecolor='black', alpha=0.85)
ax.set_xlabel("F1-Score", fontsize=12)
ax.set_title("Models Ranked by F1-Score", fontsize=14, fontweight='bold')
ax.set_xlim([0, 1.05])
ax.grid(axis='x', alpha=0.3)

# Annotate each bar with its score at the bar's right end
for bar in bars:
    width = bar.get_width()
    ax.text(width, bar.get_y() + bar.get_height()/2.,
            f' {width:.4f}', ha='left', va='center', fontsize=10, fontweight='bold')

plt.tight_layout()
ranking_path = os.path.join(RUN_DIR, "05_f1_score_ranking.png")
plt.savefig(ranking_path, dpi=300, bbox_inches='tight')
plt.close()
print("Saved: f1_score_ranking.png")

# ---------------------------------------------------------------------------
# 4. Heatmap of every test metric (rows) against every model (columns)
# ---------------------------------------------------------------------------
metrics_for_heatmap = metrics_df.set_index("model")[metrics_to_plot]

fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(metrics_for_heatmap.T, annot=True, fmt='.4f', cmap='RdYlGn',
            cbar_kws={'label': 'Score'}, ax=ax, linewidths=0.5)
ax.set_title("Model Metrics Heatmap", fontsize=14, fontweight='bold')
plt.tight_layout()
heatmap_path = os.path.join(RUN_DIR, "06_metrics_heatmap.png")
plt.savefig(heatmap_path, dpi=300, bbox_inches='tight')
plt.close()
print("Saved: metrics_heatmap.png")

print("\nAll comparison visualizations generated successfully")

## 10. Execution Summary and Results Verification

In [None]:
rule = "=" * 70

print(f"\n{rule}")
print("RESULTS DIRECTORY STRUCTURE")
print(f"{rule}\n")

# Print the results folder as an indented tree with a size next to each file.
# Nesting depth is the number of path separators left after removing the
# RUN_DIR prefix from the directory being visited.
for current_dir, _subdirs, filenames in os.walk(RUN_DIR):
    depth = current_dir.replace(RUN_DIR, '').count(os.sep)
    print(f"{' ' * 2 * depth}{os.path.basename(current_dir)}/")
    file_indent = ' ' * 2 * (depth + 1)
    for filename in filenames:
        n_bytes = os.path.getsize(os.path.join(current_dir, filename))
        if n_bytes > 1024:
            size_label = f"{n_bytes / 1024:.1f}KB"
        else:
            size_label = f"{n_bytes}B"
        print(f'{file_indent}{filename} ({size_label})')

print(f"\n{rule}")
print("EXECUTION SUMMARY")
print(f"{rule}\n")

# Assemble the summary from parts: fixed header, one section per model
# folder that exists on disk, then a closing rule.
summary_parts = [f"""
BINARY CLASSIFICATION MODEL TRAINING SUMMARY

Total Models Trained: {len(MODELS_TO_RUN)}
Models: {', '.join([m.upper() for m in MODELS_TO_RUN])}

Best Performing Model: {best_model_key.upper()}
Best F1-Score: {best_model_f1:.4f}

Results Location: {os.path.abspath(RUN_DIR)}

Generated Files:
- 01_metrics_summary_all_models.csv: Comprehensive metrics for all models
- 02_best_model_report.txt: Detailed analysis of the best model
- 03_models_comparison_metrics.png: Performance metrics comparison
- 04_roc_curves_comparison.png: ROC curves for all models
- 05_f1_score_ranking.png: Model ranking by F1-score
- 06_metrics_heatmap.png: Metrics heatmap visualization

Model-Specific Outputs:
"""]

for model_key in MODELS_TO_RUN:
    model_dir = os.path.join(RUN_DIR, model_key)
    if not os.path.exists(model_dir):
        continue
    summary_parts.append(f"\n  {model_key.upper()}:\n")
    summary_parts.extend(f"    - {artifact}\n" for artifact in sorted(os.listdir(model_dir)))

summary_parts.append(f"\n{rule}\n")
summary_text = "".join(summary_parts)

# Persist the summary next to the other run artifacts, then echo it
summary_path = os.path.join(RUN_DIR, "00_EXECUTION_SUMMARY.txt")
with open(summary_path, "w") as f:
    f.write(summary_text)

print(summary_text)
print(f"Summary saved to: {summary_path}")