In [None]:
from skopt import BayesSearchCV
from skopt.space import Real, Categorical
from sklearn.pipeline import Pipeline
import numpy as np
from logistic_scoring_transformer import LogisticAlertScorer

In [None]:
# Engineered risk-factor columns (sample set). This list is handed to
# LogisticAlertScorer as `feature_cols` when the pipeline is assembled.
feature_cols = [
    "ioc_hit", "ids_alert", "fw_block",
    "suspicious_port_cat", "geo_anomaly", "uncommon_subnet",
]

In [None]:
# Sample end-to-end pipeline: four upstream steps (normalize, IP enrichment,
# port categorization, alert features) followed by the logistic alert scorer
# as the final step.
# NOTE(review): EventNormalizeTransformer, IPInfoAsyncTransformer,
# PortCategorizer, AlertFeatureTransformer, IPINFO_TOKEN and ioc_table are not
# imported or defined in the cells shown here -- confirm they are set up
# before this cell runs on a fresh kernel.
pipeline = Pipeline(steps=[
    ("normalize", EventNormalizeTransformer()),
    (
        "ip_enrich",
        IPInfoAsyncTransformer(
            ip_columns=["src_ip", "dst_ip"],
            token=IPINFO_TOKEN,
        ),
    ),
    (
        "port_cat",
        PortCategorizer(port_column=["src_port", "dst_port"]),
    ),
    (
        "alert_features",
        AlertFeatureTransformer(ioc_df=ioc_table),
    ),
    (
        "logistic_scorer",
        LogisticAlertScorer(feature_cols=feature_cols),
    ),
])
pipeline  # bare last expression so the notebook renders the pipeline

In [None]:
# Search space for Bayesian hyperparameter optimization. Keys use the
# "<step>__<param>" form so each value is routed to the "logistic_scorer"
# step of the pipeline.
# NOTE(review): the `thresholds` categories are dicts, which are unhashable --
# confirm the installed skopt `Categorical` accepts them; if not, the scorer
# would need to take a hashable encoding of the thresholds instead.
search_spaces = {
    # Sampled on a log scale between 1e-3 and 10.
    "logistic_scorer__C": Real(low=1e-3, high=10, prior="log-uniform"),
    "logistic_scorer__class_weight": Categorical(
        categories=["balanced", None],
    ),
    # Three candidate high/medium cut-off pairs.
    "logistic_scorer__thresholds": Categorical(
        categories=[
            {"high": 0.9, "medium": 0.6},
            {"high": 0.8, "medium": 0.5},
            {"high": 0.7, "medium": 0.4},
        ],
    ),
}

In [None]:
# Bayesian search that wraps the whole pipeline, so every sampled candidate
# from `search_spaces` is evaluated end to end.
opt = BayesSearchCV(
    pipeline,
    search_spaces,
    # Evaluation: F1 scoring with cv=5.
    scoring="f1",
    cv=5,
    # Search budget and parallelism.
    n_iter=32,
    n_jobs=-1,
    # Fixed seed for the search.
    random_state=42,
)

In [None]:
# Run the Bayesian search on X_train / y_train, then report the best
# hyperparameters found and the corresponding score (the search is configured
# with scoring="f1").
# NOTE(review): X_train and y_train are not defined in the cells shown here --
# confirm they are created before this cell runs.
opt.fit(X_train, y_train)
print("Best Params:", opt.best_params_)
print("Best Score:", opt.best_score_)

In [None]:
# Keep the estimator the search selected as best; since `opt` wraps
# `pipeline`, this is presumably the refitted pipeline -- TODO confirm.
best_model = opt.best_estimator_

In [None]:
# Score new logs: apply the selected model's transform() to `new_logs` and
# keep the result in `final`.
# NOTE(review): new_logs is not defined in the cells shown here -- confirm it
# is loaded before this cell runs.
final = best_model.transform(new_logs)