# Baseline Traffic and Features

In [2]:
import sys
from pathlib import Path
import pandas as pd

sys.path.append(str(Path("..").resolve()))
from pipeline.P1_Ingest.prepare_conn import prepare_conn_data
from pipeline.P2_Features.feature_catalog import generate_feature_catalog
from pipeline.P2_Features.generate_conn_features import (
    generate_conn_features,
    compute_feature_stats,
)

from pipeline.P3_Training.baseline_profiling import build_baseline_profile
from pipeline.P3_Training.model_train import train_baseline_model
from pipeline.P4_Detection.detection import phase4_detect, separation_verdict



# Phase 1: Ingest Baseline only
baseline_df, baseline_path = prepare_conn_data("../data/raw/baseline.csv","../data/clean")

# # Phase 2: Feature Generation: Baseline only
baseline_features= generate_conn_features(baseline_df,output_path="../data/features/baseline_features.parquet")

# # Phase 2.5: Feature Stats and Catalog: Baseline Only
baseline_features_stats_df = compute_feature_stats(baseline_features)
generate_feature_catalog(features_df=baseline_features,
                         stats_df=baseline_features_stats_df,
                         window="5min",
                         feature_set="conn_v1")



# Phase 3: Profiling and Modelling
build_baseline_profile(features_df=baseline_features,output_dir="../data/artifacts")

MODEL_VERSION = "1.0"
train_baseline_model(baseline_features=baseline_features,
                     feature_columns=baseline_features.columns,
                     model_dir="../models/",MODEL_VERSION=MODEL_VERSION)


(IsolationForest(contamination=0.05, n_estimators=300, n_jobs=-1,
                 random_state=42),
                                                connections  total_bytes  \
 src_ip                    window                                          
 0.0.0.0                   2026-01-08 23:15:00            1          0.0   
                           2026-01-08 23:20:00            2          0.0   
                           2026-01-08 23:25:00            3          0.0   
                           2026-01-08 23:30:00            1          0.0   
                           2026-01-08 23:35:00            3          0.0   
 ...                                                    ...          ...   
 fe80::dea6:32ff:fee1:7e28 2026-01-09 06:10:00            2        360.0   
                           2026-01-09 06:15:00            5       1280.0   
                           2026-01-09 06:25:00            2        180.0   
                           2026-01-09 06:30:00            3    

# Attack Data and Feature Generation

In [3]:

# P1:: Ingest
df_attack,_ = prepare_conn_data(raw_csv_path="../data/raw/attack_conn.csv",output_dir="../data/clean")

# P2: Feature Generation
attack_features = generate_conn_features(df_attack,output_path="../data/features/attack_features.parquet")

# Load Baseline features from file
baseline_features = pd.read_parquet("../data/features/baseline_features.parquet")

# Detection
MODEL = f"zeek_iforest_v{MODEL_VERSION}.joblib"
detection_summary = phase4_detect(
    model_path=f"../models/{MODEL}",
    baseline_features_path="../data/features/baseline_features.parquet",
    attack_features_path="../data/features/attack_features.parquet",
    feature_columns=attack_features.columns,
    output_dir="../data/artifacts/"
)

verdict = separation_verdict(detection_summary["attack_baseline_ratio"].iloc[0])
print("Final Verdict: ",verdict)

detection_summary

Final Verdict:  No meaningful separation


Unnamed: 0,baseline_mean,attack_mean,baseline_tail_pct,attack_tail_pct,attack_baseline_ratio
0,0.16,0.137,4.95,5.495,1.11


In [7]:
from pipeline.P5_Operations.detection import generate_report

# Generate Final Report
detection_summary = Path("../data/artifacts/detection_summary.csv")
output_report = Path("../data/reports/output.json")
generate_report(detection_summary,output_report)


[âœ“] Phase 5 report generated: ../data/reports/output.json
