# AIOps: sklearn models baseline


## Additional functions

In [1]:
import pandas as pd
import numpy as np
import re
from typing import List, Optional, Tuple, Dict, Any
from dataclasses import dataclass

def read_csv_dataset(file_path, sep=','):
    return pd.read_csv(file_path, sep=sep)

def convert_timestamps_to_datetime(df, timestamp_column_name='timestamp', timestamp_unit='s'):
    df['datetime'] = pd.to_datetime(df[timestamp_column_name], unit=timestamp_unit)
    return df

def get_data(file_path, sep=',', timestamp_column_name='timestamp', timestamp_unit='s'):
    df = read_csv_dataset(file_path, sep=sep)
    if timestamp_column_name in df.columns:
        df = convert_timestamps_to_datetime(df, timestamp_column_name, timestamp_unit)
    print(f"Dataset shape: {df.shape}")
    return df

def normalize_log(line: str) -> str:
    return re.sub(r"\d+", "<NUM>", str(line))

## Main functions for clustering and prediction

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (classification_report, roc_auc_score, average_precision_score,
                             precision_recall_curve, f1_score)

import hdbscan
from hdbscan import approximate_predict

@dataclass
class WindowConfig:
    # cluster agg frequency
    freq: str = "1min"     
    # history length 
    history_len: int = 5 
    pre_failure_lead: pd.Timedelta = pd.Timedelta("30min")
    post_incident_ignore: pd.Timedelta = pd.Timedelta("0min")

@dataclass
class ClusterConfig:
    metric: str = "euclidean"
    min_cluster_size: int = 800
    min_samples: int = 400
    cluster_selection_method: str = "eom"
    cluster_selection_epsilon: float = 0.03
    prediction_data: bool = True
    fit_max_points: Optional[int] = None

@dataclass
class ModelConfig:
    random_state: int = 42
    class_weight: str = "balanced"
    max_iter: int = 1000

def make_labels_by_time(times: pd.Series, incidents: pd.DataFrame, lead: pd.Timedelta,
                        service: Optional[pd.Series] = None) -> pd.Series:
    times = pd.to_datetime(times)
    y = pd.Series(0, index=times.index, dtype=int)
    if service is None or "service" not in incidents.columns:
        inc_times = pd.to_datetime(incidents["datetime"]).sort_values().values
        for t0 in inc_times:
            mask = (times > (t0 - lead)) & (times <= t0)
            y.loc[mask] = 1
    else:
        for srv, grp in incidents.groupby("service"):
            inc_times = pd.to_datetime(grp["datetime"]).sort_values().values
            srv_mask = (service == srv).values
            t_srv = times[srv_mask]
            idx_srv = times.index[srv_mask]
            y_srv = pd.Series(0, index=idx_srv, dtype=int)
            for t0 in inc_times:
                mask = (t_srv > (t0 - lead)) & (t_srv <= t0)
                y_srv.loc[mask] = 1
            y.loc[idx_srv] = y_srv.values
    return y

def fit_hdbscan_on_train(embeddings: np.ndarray, cfg: ClusterConfig) -> hdbscan.HDBSCAN:
    clusterer = hdbscan.HDBSCAN(
        metric=cfg.metric,
        min_cluster_size=cfg.min_cluster_size,
        min_samples=cfg.min_samples,
        cluster_selection_method=cfg.cluster_selection_method,
        cluster_selection_epsilon=cfg.cluster_selection_epsilon,
        prediction_data=cfg.prediction_data,
    )
    if cfg.fit_max_points is not None and embeddings.shape[0] > cfg.fit_max_points:
        idx = np.random.RandomState(0).choice(embeddings.shape[0], cfg.fit_max_points, replace=False)
        clusterer.fit(embeddings[idx])
    else:
        clusterer.fit(embeddings)
    return clusterer

def predict_clusters(clusterer: hdbscan.HDBSCAN, embeddings: np.ndarray):
    labels, strengths = approximate_predict(clusterer, embeddings)
    return labels, strengths

def resample_and_pivot_counts(df: pd.DataFrame, time_col: str = "datetime", cluster_col: str = "cluster",
                              freq: str = "1min", service_col: Optional[str] = None) -> pd.DataFrame:
    df2 = df.copy()
    df2[time_col] = pd.to_datetime(df2[time_col])
    group_keys = [pd.Grouper(key=time_col, freq=freq), cluster_col]
    index_cols = [time_col]
    if service_col and service_col in df2.columns:
        group_keys = [service_col] + group_keys
        index_cols = [service_col, time_col]
    per_bucket = df2.groupby(group_keys).size().reset_index(name="count")
    pivot = per_bucket.pivot_table(index=index_cols, columns=cluster_col, values="count", fill_value=0, aggfunc="sum").reset_index()
    pivot.columns = [str(c) for c in pivot.columns]
    return pivot

def add_history_features(pivot: pd.DataFrame, time_col: str = "datetime", service_col: Optional[str] = None,
                         history_len: int = 5) -> pd.DataFrame:
    df = pivot.copy()
    df[time_col] = pd.to_datetime(df[time_col])
    cluster_cols = [c for c in df.columns if c not in ([time_col] + ([service_col] if service_col else []))]
    def _add_roll(g: pd.DataFrame) -> pd.DataFrame:
        g = g.sort_values(time_col)
        for c in cluster_cols:
            g[f"{c}_mean{history_len}"] = g[c].rolling(window=history_len, min_periods=1).mean()
            g[f"{c}_diff1"] = g[c].diff().fillna(0)
        return g
    if service_col and service_col in df.columns:
        df = df.groupby(service_col, group_keys=False).apply(_add_roll)
    else:
        df = _add_roll(df)
    return df

@dataclass
class Split:
    train_start: pd.Timestamp
    train_end: pd.Timestamp
    test_start: pd.Timestamp
    test_end: pd.Timestamp

def build_time_splits(times: pd.Series, n_splits: int = 3, min_train_period: str = "14D", test_period: str = "7D") -> List[Split]:
    times = pd.to_datetime(times).sort_values()
    start, end = times.min(), times.max()
    splits: List[Split] = []
    train_end = start + pd.Timedelta(min_train_period)
    for _ in range(n_splits):
        test_start = train_end
        test_end = min(test_start + pd.Timedelta(test_period), end)
        if test_start >= test_end:
            break
        splits.append(Split(start, train_end, test_start, test_end))
        train_end = test_end  # expanding
    return splits

def pick_threshold_by_f1(y_true: np.ndarray, y_score: np.ndarray) -> float:
    prec, rec, thr = precision_recall_curve(y_true, y_score)
    if len(thr) == 0:
        return 0.5
    f1s = (2 * prec[:-1] * rec[:-1]) / np.clip(prec[:-1] + rec[:-1], 1e-9, None)
    best = int(np.nanargmax(f1s))
    return float(thr[best])

@dataclass
class EventMetrics:
    detection_rate: float
    mean_lead_minutes: float
    false_alarms_per_day: float
    threshold: float

def evaluate_events(df_scores: pd.DataFrame, incidents: pd.DataFrame, window_cfg: WindowConfig,
                    service_col: Optional[str] = None, time_col: str = "datetime", proba_col: str = "proba",
                    threshold: float = 0.5) -> EventMetrics:
    d = df_scores.copy()
    d[time_col] = pd.to_datetime(d[time_col])
    d = d.sort_values(time_col)
    detections = 0
    leads: List[float] = []
    total_days = (d[time_col].max() - d[time_col].min()).total_seconds() / 86400.0
    alerts = d[d[proba_col] >= threshold].copy()
    is_true_alert = pd.Series(False, index=alerts.index)
    if service_col and service_col in d.columns and "service" in incidents.columns:
        for srv, inc_grp in incidents.groupby("service"):
            d_srv = d[d[service_col] == srv]
            alerts_srv = alerts[alerts[service_col] == srv]
            for _, row in inc_grp.iterrows():
                t0 = pd.to_datetime(row["datetime"])
                win_mask = (d_srv[time_col] > (t0 - window_cfg.pre_failure_lead)) & (d_srv[time_col] <= t0)
                win_alerts = d_srv[win_mask & (d_srv[proba_col] >= threshold)]
                if not win_alerts.empty:
                    detections += 1
                    first_alert = win_alerts[time_col].min()
                    leads.append((t0 - first_alert).total_seconds() / 60.0)
                idx_true = d_srv.index[win_mask & (d_srv[proba_col] >= threshold)]
                is_true_alert.loc[alerts_srv.index.intersection(idx_true)] = True
    else:
        for _, row in incidents.iterrows():
            t0 = pd.to_datetime(row["datetime"])
            win_mask = (d[time_col] > (t0 - window_cfg.pre_failure_lead)) & (d[time_col] <= t0)
            win_alerts = d[win_mask & (d[proba_col] >= threshold)]
            if not win_alerts.empty:
                detections += 1
                first_alert = win_alerts[time_col].min()
                leads.append((t0 - first_alert).total_seconds() / 60.0)
            idx_true = d.index[win_mask & (d[proba_col] >= threshold)]
            is_true_alert.loc[alerts.index.intersection(idx_true)] = True
    false_alarms = int((~is_true_alert).sum())
    return EventMetrics(
        detection_rate = detections / max(len(incidents), 1),
        mean_lead_minutes = float(np.mean(leads)) if leads else 0.0,
        false_alarms_per_day = false_alarms / max(total_days, 1e-9),
        threshold = threshold,
    )

def train_eval_timewindow(df_logs: pd.DataFrame, incidents: pd.DataFrame,
                          window_cfg: WindowConfig = WindowConfig(),
                          cluster_cfg: ClusterConfig = ClusterConfig(),
                          model_cfg: ModelConfig = ModelConfig(),
                          service_col: Optional[str] = None,
                          time_col: str = "datetime",
                          embedding_col: str = "embedding",
                          n_splits: int = 3, min_train_period: str = "14D", test_period: str = "7D") -> Dict[str, Any]:
    df = df_logs.copy()
    df[time_col] = pd.to_datetime(df[time_col])
    df = df.sort_values(time_col)
    splits = build_time_splits(df[time_col], n_splits=n_splits, min_train_period=min_train_period, test_period=test_period)
    if not splits:
        raise ValueError("Not enough data for the requested splits.")
    results = []
    chosen_thr: Optional[float] = None
    for k, sp in enumerate(splits):
        train_mask = (df[time_col] >= sp.train_start) & (df[time_col] < sp.train_end)
        test_mask  = (df[time_col] >= sp.test_start) & (df[time_col] < sp.test_end)
        train_df = df.loc[train_mask].copy()
        test_df  = df.loc[test_mask].copy()
        X_train_emb = np.vstack(train_df[embedding_col].to_list())
        print('hdbscan fitting fold', k+1)
        clusterer = fit_hdbscan_on_train(X_train_emb, cluster_cfg)
        print('hdbscan predicting fold', k+1)
        train_labels, train_strengths = predict_clusters(clusterer, X_train_emb)
        train_df["cluster"] = train_labels
        train_df["cluster_strength"] = train_strengths
        X_test_emb = np.vstack(test_df[embedding_col].to_list())
        test_labels, test_strengths = predict_clusters(clusterer, X_test_emb)
        test_df["cluster"] = test_labels
        test_df["cluster_strength"] = test_strengths
        pivot_train = resample_and_pivot_counts(train_df, time_col=time_col, cluster_col="cluster", freq=window_cfg.freq, service_col=service_col)
        pivot_test  = resample_and_pivot_counts(test_df,  time_col=time_col, cluster_col="cluster", freq=window_cfg.freq, service_col=service_col)
        feat_train = add_history_features(pivot_train, time_col=time_col, service_col=service_col, history_len=window_cfg.history_len)
        feat_test  = add_history_features(pivot_test,  time_col=time_col, service_col=service_col, history_len=window_cfg.history_len)
        y_train = make_labels_by_time(feat_train[time_col], incidents, lead=window_cfg.pre_failure_lead,
                                      service=feat_train[service_col] if service_col and service_col in feat_train.columns else None)
        y_test  = make_labels_by_time(feat_test[time_col],  incidents, lead=window_cfg.pre_failure_lead,
                                      service=feat_test[service_col]  if service_col and service_col in feat_test.columns else None)
        drop_cols = [time_col] + ([service_col] if service_col and service_col in feat_train.columns else [])
        print('training classifier fold', k+1)
        X_train = feat_train.drop(columns=drop_cols)
        X_test  = feat_test.drop(columns=drop_cols)
        pipe = Pipeline([
            ("scaler", StandardScaler(with_mean=False)),
            ("clf", LogisticRegression(max_iter=model_cfg.max_iter, class_weight=model_cfg.class_weight, random_state=model_cfg.random_state))
        ])
        pipe.fit(X_train, y_train)
        proba_train = pipe.predict_proba(X_train)[:, 1]
        proba_test  = pipe.predict_proba(X_test)[:, 1]
        if chosen_thr is None:
            chosen_thr = pick_threshold_by_f1(y_train.values, proba_train)
        pred_test = (proba_test >= chosen_thr).astype(int)
        roc = roc_auc_score(y_test, proba_test) if len(np.unique(y_test)) > 1 else np.nan
        pr_auc = average_precision_score(y_test, proba_test) if len(np.unique(y_test)) > 1 else np.nan
        f1 = f1_score(y_test, pred_test) if len(np.unique(y_test)) > 1 else np.nan
        rep = classification_report(y_test, pred_test, digits=3, zero_division=0)
        df_scores = feat_test[[time_col]].copy()
        df_scores["proba"] = proba_test
        if service_col and service_col in feat_test.columns:
            df_scores[service_col] = feat_test[service_col].values
        em = evaluate_events(df_scores, incidents, window_cfg, service_col=service_col, time_col=time_col, proba_col="proba", threshold=chosen_thr)
        results.append({
            "fold": k+1,
            "train_range": (sp.train_start, sp.train_end),
            "test_range": (sp.test_start, sp.test_end),
            "roc_auc": roc,
            "pr_auc": pr_auc,
            "f1_at_thr": f1,
            "thr": chosen_thr,
            "event_detection_rate": em.detection_rate,
            "mean_lead_minutes": em.mean_lead_minutes,
            "false_alarms_per_day": em.false_alarms_per_day,
            "classification_report": rep,
        })
    return {"results": results, "chosen_threshold": chosen_thr, "window_cfg": window_cfg, "cluster_cfg": cluster_cfg}


## Data loading

In [6]:
# path to files
INCIDENTS_CSV = '/Users/arinagoncharova/Documents/diploma/EDA/Aiops-Dataset/groundtruth/groundtruth-2022-05-03.csv'
LOGS_CSV = '/Users/arinagoncharova/Documents/diploma/EDA/Aiops-Dataset/data/2022-05-03/log/all/log_filebeat-testbed-log-service.csv'

failures_03_05_df = get_data(INCIDENTS_CSV)
service_failures_03_05_df = failures_03_05_df[failures_03_05_df['level'] == 'service'].copy()

# normalize service column name (use 'cmdb_id' if exists)
service_col_name = None
if 'cmdb_id' in service_failures_03_05_df.columns:
    service_col_name = 'cmdb_id'
else:
    for cand in ['smbd_id', 'сmbd_id']:
        if cand in service_failures_03_05_df.columns:
            service_failures_03_05_df = service_failures_03_05_df.rename(columns={cand: 'cmdb_id'})
            service_col_name = 'cmdb_id'
            break

incidents = service_failures_03_05_df.copy()
need_cols = ['datetime'] + ([service_col_name] if service_col_name else [])
incidents = incidents[need_cols].dropna().reset_index(drop=True)
print(incidents.head())

logs_03_05_service_df = get_data(LOGS_CSV)
logs_df = logs_03_05_service_df.copy()
if service_col_name and service_col_name not in logs_df.columns:
    for cand in ['smbd_id', 'сmbd_id']:
        if cand in logs_df.columns:
            logs_df = logs_df.rename(columns={cand: 'cmdb_id'})
            break
assert 'value' in logs_df.columns, f"Ожидается колонка 'value' в логах. Нашли: {list(logs_df.columns)}"
assert 'datetime' in logs_df.columns, "Логи должны содержать временную метку (см. get_data)."
print(logs_df.head())

Dataset shape: (50, 5)
             datetime                cmdb_id
0 2022-05-02 19:31:46  productcatalogservice
1 2022-05-02 20:32:34  recommendationservice
2 2022-05-02 22:12:54               frontend
3 2022-05-02 23:14:17  recommendationservice
4 2022-05-02 23:52:18            cartservice
Dataset shape: (5444332, 6)
                 log_id   timestamp        cmdb_id  \
0  Cp6Bt38B8vQa58bZsQau  1651507200     frontend-1   
1  EZ6Bt38B8vQa58bZqQWr  1651507200  cartservice-2   
2  FZ6Bt38B8vQa58bZqQWr  1651507200  cartservice-2   
3  Fp6Bt38B8vQa58bZqQWr  1651507200  cartservice-2   
4  F56Bt38B8vQa58bZqQWr  1651507200  cartservice-2   

                              log_name  \
0     log_frontend-service_application   
1  log_cartservice-service_application   
2  log_cartservice-service_application   
3  log_cartservice-service_application   
4  log_cartservice-service_application   

                                               value            datetime  
0         severity: debug,

## Logs embeddings calculation

In [None]:
import os, hashlib
from sentence_transformers import SentenceTransformer

# embeddings cache for unique normalized strings
CACHE_PATH = "/Users/arinagoncharova/Documents/diploma/repo/data/AIOps/preprocessed_while_modelling/e5_small_embeddings/embeddings_cache.parquet"

def norm_hash(s: str) -> str:
    import hashlib
    return hashlib.sha1(s.encode('utf-8')).hexdigest()

def load_cache():
    import pandas as pd, os
    if os.path.exists(CACHE_PATH):
        return pd.read_parquet(CACHE_PATH)
    return pd.DataFrame(columns=["norm_hash", "normalized_value", "embedding"])

def save_cache(df_cache):
    df_cache = df_cache.drop_duplicates("norm_hash")
    df_cache.to_parquet(CACHE_PATH, index=False)

def compute_embeddings_with_cache(df_logs: pd.DataFrame, text_col: str = "value",
                                  model_name: str = "intfloat/e5-small-v2",
                                  batch_size: int = 16):
    model = SentenceTransformer(model_name)
    df = df_logs.copy()
    df["normalized_value"] = df[text_col].fillna("").map(normalize_log)
    df["norm_hash"] = df["normalized_value"].map(norm_hash)
    cache = load_cache()
    known = set(cache["norm_hash"].tolist())
    uniq = df[["norm_hash", "normalized_value"]].drop_duplicates()
    to_compute = uniq[~uniq["norm_hash"].isin(known)]
    if not to_compute.empty:
        texts = to_compute["normalized_value"].tolist()
        embs = model.encode(texts, normalize_embeddings=True, show_progress_bar=True)
        add = to_compute.copy().reset_index(drop=True)
        add["embedding"] = list(embs)
        cache = pd.concat([cache, add[["norm_hash","normalized_value","embedding"]]], ignore_index=True)
        save_cache(cache)
    df = df.merge(cache[["norm_hash","embedding"]], on="norm_hash", how="left")
    return df

# cache usage
if 'embedding' not in logs_df.columns:
    df_with_emb = compute_embeddings_with_cache(logs_df, text_col='value', model_name='intfloat/e5-small-v2', batch_size=2048)
    logs_df = df_with_emb

keep_cols = ['datetime', 'embedding'] + ([service_col_name] if service_col_name else [])
df_logs = logs_df[keep_cols].copy().reset_index(drop=True)
df_logs.head()


Batches:   0%|          | 0/2640 [00:00<?, ?it/s]

Unnamed: 0,datetime,embedding,cmdb_id
0,2022-05-02 16:00:00,"[-0.06920783, 0.077275746, 0.019308727, 0.0091...",frontend-1
1,2022-05-02 16:00:00,"[-0.04647663, 0.028303048, 0.011438644, 0.0161...",cartservice-2
2,2022-05-02 16:00:00,"[-0.08472934, 0.028683404, 0.036607213, -0.018...",cartservice-2
3,2022-05-02 16:00:00,"[-0.063579135, 0.025242442, 0.0009641656, 0.01...",cartservice-2
4,2022-05-02 16:00:00,"[-0.08472934, 0.028683404, 0.036607213, -0.018...",cartservice-2


In [3]:
LOGS_CLUSTERED_PATH = '/Users/arinagoncharova/Documents/diploma/repo/data/AIOps/preprocessed_while_modelling/e5_small_embeddings/embedded_logs_2022-05-03.parquet'


In [None]:
df_logs.to_parquet(LOGS_CLUSTERED_PATH)

NameError: name 'df_logs' is not defined

In [4]:
df_logs = pd.read_parquet(LOGS_CLUSTERED_PATH)

In [7]:
# fast mode for checking
SPEED_MODE = True 

if SPEED_MODE:
    # shorten logs -> take 10% of logs
    df_logs = df_logs.sample(frac=0.1, random_state=42).sort_values("datetime")

    # time window simplification
    wcfg = WindowConfig(freq="5min", history_len=3, pre_failure_lead=pd.Timedelta("10min"))

    # lighten clustering
    ccfg = ClusterConfig(
        min_cluster_size=300,
        min_samples=150,
        cluster_selection_method="eom",
        cluster_selection_epsilon=0.03,
        # only 10k points are trained
        fit_max_points=10000,         
    )

    mcfg = ModelConfig(random_state=42, class_weight="balanced", max_iter=1000)

    # shorter windows
    n_splits = 1
    min_train_period = "6H"
    test_period = "6H"

else:
    # original parameters
    wcfg = WindowConfig(freq="1min", history_len=5, pre_failure_lead=pd.Timedelta("30min"))
    ccfg = ClusterConfig(min_cluster_size=800, min_samples=400, cluster_selection_method="eom", cluster_selection_epsilon=0.03)
    mcfg = ModelConfig(random_state=42, class_weight="balanced", max_iter=1000)
    n_splits = 1
    min_train_period = "12H"
    test_period = "12H"

res = train_eval_timewindow(
    df_logs=df_logs,
    incidents=incidents,
    window_cfg=wcfg,
    cluster_cfg=ccfg,
    model_cfg=mcfg,
    service_col=service_col_name,
    n_splits=n_splits,
    min_train_period=min_train_period,
    test_period=test_period,
)


  train_end = start + pd.Timedelta(min_train_period)
  test_end = min(test_start + pd.Timedelta(test_period), end)


hdbscan fitting fold 1




hdbscan predicting fold 1


  df = df.groupby(service_col, group_keys=False).apply(_add_roll)
  df = df.groupby(service_col, group_keys=False).apply(_add_roll)


training classifier fold 1


In [8]:
print("Chosen threshold:", res['chosen_threshold'])
for r in res['results']:
    print(
        f"Fold {r['fold']}\n"
        f"Train {r['train_range']}\n"
        f"Test {r['test_range']}\n"
        f"ROC-AUC = {r['roc_auc']:.3f}\n"
        f"PR-AUC = {r['pr_auc']:.3f} F1@thr = {r['f1_at_thr']:.3f}\n"
        f"Detection rate% = {100*r['event_detection_rate']:.1f}%\n"
        f"MTTD = {r['mean_lead_minutes']:.1f}m \n"
        f"FA/day={r['false_alarms_per_day']:.2f}"
    )

Chosen threshold: 0.4543204362564163
Fold 1
Train (Timestamp('2022-05-02 16:00:00'), Timestamp('2022-05-02 22:00:00'))
Test (Timestamp('2022-05-02 22:00:00'), Timestamp('2022-05-03 04:00:00'))
ROC-AUC = 0.502
PR-AUC = 0.164 F1@thr = 0.259
Detection rate% = 28.6%
MTTD = 8.1m 
FA/day=3581.75


In [9]:
from datetime import timedelta

# calculate time splits the same way as train_eval_timewindow
splits = build_time_splits(
    df_logs["datetime"],
    n_splits=1,              
    min_train_period="12H",
    test_period="12H"
)

# find number of incidents in each train/test range
def count_incidents_in_range(inc_df, start, end):
    times = pd.to_datetime(inc_df["datetime"])
    mask = (times >= start) & (times < end)
    return mask.sum()

for i, sp in enumerate(splits, 1):
    n_train = count_incidents_in_range(incidents, sp.train_start, sp.train_end)
    n_test  = count_incidents_in_range(incidents, sp.test_start,  sp.test_end)
    print(f"Fold {i}:")
    print(f"Train: {sp.train_start} - {sp.train_end} has incidents = {n_train}")
    print(f"Test: {sp.test_start} - {sp.test_end} has incidents = {n_test}")


Fold 1:
Train: 2022-05-02 16:00:00 - 2022-05-03 04:00:00 has incidents = 10
Test: 2022-05-03 04:00:00 - 2022-05-03 15:59:59 has incidents = 11


  train_end = start + pd.Timedelta(min_train_period)
  test_end = min(test_start + pd.Timedelta(test_period), end)


## Experimenting with models

In [11]:
import numpy as np
import pandas as pd
from sklearn.metrics import precision_recall_curve

def build_single_split_features(
    df_logs, incidents, window_cfg, cluster_cfg,
    service_col=None, time_col="datetime", embedding_col="embedding",
    min_train_period="12H", test_period="12H"
):
    splits = build_time_splits(df_logs[time_col], n_splits=1,
                               min_train_period=min_train_period, test_period=test_period)
    assert splits, "Не получилось построить сплит — мало данных"
    sp = splits[0]

    df = df_logs.sort_values(time_col).copy()
    train_df = df[(df[time_col] >= sp.train_start) & (df[time_col] < sp.train_end)].copy()
    test_df  = df[(df[time_col] >= sp.test_start)  & (df[time_col] < sp.test_end)].copy()

    Xtr = np.vstack(train_df[embedding_col].to_list())
    clusterer = hdbscan.HDBSCAN(
        metric=cluster_cfg.metric,
        min_cluster_size=cluster_cfg.min_cluster_size,
        min_samples=cluster_cfg.min_samples,
        cluster_selection_method=cluster_cfg.cluster_selection_method,
        cluster_selection_epsilon=cluster_cfg.cluster_selection_epsilon,
        prediction_data=True,
        approx_min_span_tree=True,
    )
    if cluster_cfg.fit_max_points and Xtr.shape[0] > cluster_cfg.fit_max_points:
        idx = np.random.RandomState(0).choice(Xtr.shape[0], cluster_cfg.fit_max_points, replace=False)
        clusterer.fit(Xtr[idx])
    else:
        clusterer.fit(Xtr)

    from hdbscan import approximate_predict
    tr_labels, tr_strength = approximate_predict(clusterer, Xtr)
    train_df["cluster"] = tr_labels

    Xte = np.vstack(test_df[embedding_col].to_list())
    te_labels, te_strength = approximate_predict(clusterer, Xte)
    test_df["cluster"] = te_labels

    pivot_train = resample_and_pivot_counts(train_df, time_col=time_col, cluster_col="cluster",
                                            freq=window_cfg.freq, service_col=service_col)
    pivot_test  = resample_and_pivot_counts(test_df,  time_col=time_col, cluster_col="cluster",
                                            freq=window_cfg.freq, service_col=service_col)

    feat_train = add_history_features(pivot_train, time_col=time_col, service_col=service_col,
                                      history_len=window_cfg.history_len)
    feat_test  = add_history_features(pivot_test,  time_col=time_col, service_col=service_col,
                                      history_len=window_cfg.history_len)

    y_train = make_labels_by_time(
        feat_train[time_col], incidents, lead=window_cfg.pre_failure_lead,
        service=feat_train[service_col] if service_col and service_col in feat_train.columns else None
    )
    y_test = make_labels_by_time(
        feat_test[time_col], incidents, lead=window_cfg.pre_failure_lead,
        service=feat_test[service_col] if service_col and service_col in feat_test.columns else None
    )

    drop_cols = [time_col] + ([service_col] if service_col and service_col in feat_train.columns else [])
    feature_cols = [c for c in feat_train.columns if c not in drop_cols]

    return (feat_train, y_train, feat_test, y_test, feature_cols)


In [None]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, FunctionTransformer
from sklearn.metrics import roc_auc_score, average_precision_score, f1_score, precision_recall_curve, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

# LightGBM is optional (to not restart notebook with missing package)
try:
    from lightgbm import LGBMClassifier
    _HAS_LGBM = True
except Exception:
    _HAS_LGBM = False

def make_models_dict():
    models = {
        "LogisticRegression": Pipeline([
            ("scaler", StandardScaler(with_mean=False)),
            ("clf", LogisticRegression(max_iter=1000, class_weight="balanced", random_state=42))
        ]),
        "DecisionTree": DecisionTreeClassifier(
            max_depth=5, class_weight="balanced", random_state=42
        ),
        "RandomForest": RandomForestClassifier(
            n_estimators=200, random_state=42, n_jobs=-1, class_weight="balanced_subsample"
        ),
        "NaiveBayes": Pipeline([
            ("scaler", StandardScaler(with_mean=False)),
            ("to_dense", FunctionTransformer(lambda X: X.toarray() if hasattr(X, "toarray") else X)),
            ("clf", GaussianNB())
        ]),
        "KNN": Pipeline([
            ("scaler", StandardScaler(with_mean=False)),
            ("clf", KNeighborsClassifier(n_neighbors=5))
        ]),
    }
    if _HAS_LGBM:
        models["LightGBM"] = LGBMClassifier(
            n_estimators=300, learning_rate=0.05, random_state=42
        )
    return models

def _predict_proba_or_score(model, Xtr, Xte):
    if hasattr(model, "predict_proba"):
        proba_tr = model.predict_proba(Xtr)[:, 1]
        proba_te = model.predict_proba(Xte)[:, 1]
    else:
        dec_tr = model.decision_function(Xtr)
        dec_te = model.decision_function(Xte)
        mm = MinMaxScaler()
        proba_tr = mm.fit_transform(dec_tr.reshape(-1,1)).ravel()
        proba_te = mm.transform(dec_te.reshape(-1,1)).ravel()
    return proba_tr, proba_te

def _pick_thr_by_f1(y_true, y_score):
    prec, rec, thr = precision_recall_curve(y_true, y_score)
    if len(thr) == 0:
        return 0.5
    f1s = (2*prec[:-1]*rec[:-1]) / np.clip(prec[:-1]+rec[:-1], 1e-9, None)
    return float(thr[int(np.nanargmax(f1s))])

def compare_custom_models(Xtr, ytr, Xte, yte, feature_names, pick_by="pr_auc"):
    models = make_models_dict()
    rows, per_model = [], {}

    for name, mdl in models.items():
        mdl.fit(Xtr, ytr)
        proba_tr, proba_te = _predict_proba_or_score(mdl, Xtr, Xte)
        thr = _pick_thr_by_f1(ytr, proba_tr)
        pred_te = (proba_te >= thr).astype(int)

        roc = roc_auc_score(yte, proba_te) if len(np.unique(yte))>1 else np.nan
        pra = average_precision_score(yte, proba_te) if len(np.unique(yte))>1 else np.nan
        f1  = f1_score(yte, pred_te) if len(np.unique(yte))>1 else np.nan
        rep = classification_report(yte, pred_te, digits=3, zero_division=0)

        # importances
        importances, kind = None, None
        try:
            base = mdl.named_steps["clf"] if hasattr(mdl, "named_steps") and "clf" in mdl.named_steps else mdl
            if hasattr(base, "feature_importances_"):
                importances = base.feature_importances_
                kind = "model_feature_importances_"
            elif hasattr(base, "coef_"):
                importances = np.abs(base.coef_).ravel()
                kind = "|coef_|"
        except Exception:
            pass

        fi_df = None
        if importances is not None and len(importances) == len(feature_names):
            fi_df = pd.DataFrame({"feature": feature_names, "importance": importances}).sort_values(
                "importance", ascending=False).reset_index(drop=True)

        rows.append({"model": name, "roc_auc": roc, "pr_auc": pra, "f1_at_thr": f1, "thr": thr})
        per_model[name] = {
            "classifier": mdl, "proba_test": proba_te, "pred_test": pred_te,
            "thr": thr, "classification_report": rep,
            "feature_importance": fi_df, "importance_kind": kind
        }

    rep_df = pd.DataFrame(rows).sort_values(pick_by, ascending=False)
    best_name = rep_df.iloc[0]["model"]
    return rep_df, best_name, per_model[best_name], per_model


In [13]:
_feat = build_single_split_features(
    df_logs=df_logs, incidents=incidents, window_cfg=wcfg, cluster_cfg=ccfg,
    service_col=service_col_name, min_train_period="12H", test_period="12H"
)
feat_train, y_train, feat_test, y_test, feature_cols = _feat

Xtr = feat_train[feature_cols].values
Xte = feat_test[feature_cols].values

rep_df2, best_name2, best_out2, all_out2 = compare_custom_models(
    Xtr, y_train.values, Xte, y_test.values, feature_cols, pick_by="pr_auc"
)
print("Best simple model is ", best_name2, " with a threshold:", best_out2["thr"])
rep_df2


  train_end = start + pd.Timedelta(min_train_period)
  test_end = min(test_start + pd.Timedelta(test_period), end)
  df = df.groupby(service_col, group_keys=False).apply(_add_roll)
  df = df.groupby(service_col, group_keys=False).apply(_add_roll)


[LightGBM] [Info] Number of positive: 446, number of negative: 2773
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003116 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2367
[LightGBM] [Info] Number of data points in the train set: 3219, number of used features: 48
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.138552 -> initscore=-1.827366
[LightGBM] [Info] Start training from score -1.827366
Best simple model is  DecisionTree  with a threshold: 0.5029221180195244




Unnamed: 0,model,roc_auc,pr_auc,f1_at_thr,thr
1,DecisionTree,0.509037,0.152403,0.258718,0.502922
4,KNN,0.502824,0.150663,0.146998,0.4
2,RandomForest,0.506905,0.149347,0.074074,0.555026
5,LightGBM,0.499549,0.148276,0.046921,0.3217
0,LogisticRegression,0.504323,0.145907,0.247131,0.476146
3,NaiveBayes,0.500702,0.145745,0.260286,4.2e-05


In [16]:
# Detailed analysis of the best model's predictions over time
timeline = feat_test[["datetime"]].copy()
timeline["y_true"] = y_test.values.astype(int)
timeline["proba"]  = best_out2["proba_test"]
timeline["pred"]   = best_out2["pred_test"]

fires = timeline[timeline["pred"] == 1].copy()

lead = wcfg.pre_failure_lead
inc_test = incidents[(incidents["datetime"] >= timeline["datetime"].min()) &
                     (incidents["datetime"] <= timeline["datetime"].max())].copy()

inc_test["first_fire"] = pd.NaT
inc_test["lead_minutes"] = np.nan
for i, row in inc_test.iterrows():
    t0 = pd.to_datetime(row["datetime"])
    mask = (timeline["datetime"] > t0 - lead) & (timeline["datetime"] <= t0) & (timeline["pred"] == 1)
    if mask.any():
        t_first = timeline.loc[mask, "datetime"].min()
        inc_test.at[i, "first_fire"] = t_first
        inc_test.at[i, "lead_minutes"] = (t0 - t_first).total_seconds()/60.0

detected = inc_test[inc_test["first_fire"].notna()].copy()
missed   = inc_test[inc_test["first_fire"].isna()].copy()

true_alert_mask = pd.Series(False, index=fires.index)
for _, row in inc_test.iterrows():
    t0 = pd.to_datetime(row["datetime"])
    win = (fires["datetime"] > t0 - lead) & (fires["datetime"] <= t0)
    true_alert_mask |= win
false_alarms = fires[~true_alert_mask].copy()

print("Alerts total:", len(fires))
print("Detected incidents:", len(detected), "of", len(inc_test))
print("False alarms:", len(false_alarms))

fi = best_out2["feature_importance"]
if fi is not None:
    print("Source of feature importances:", best_out2["importance_kind"])
    display(fi.head(20))

timeline.to_csv("results/pred_timeline_best.csv", index=False)
detected[["datetime","first_fire","lead_minutes"]].to_csv("results/incidents_detected.csv", index=False)
missed[["datetime"]].to_csv("results/incidents_missed.csv", index=False)
false_alarms[["datetime"]].to_csv("results/false_alarms.csv", index=False)
if fi is not None:
    fi.to_csv("results/feature_importance_best.csv", index=False)


Alerts total: 2841
Detected incidents: 11 of 11
False alarms: 2407
Source of feature importances: model_feature_importances_


Unnamed: 0,feature,importance
0,4_mean3,0.19556
1,0_mean3,0.194145
2,13_mean3,0.184261
3,14_mean3,0.170463
4,6_mean3,0.169264
5,6_diff1,0.03724
6,8_mean3,0.023939
7,8,0.012718
8,13_diff1,0.01241
9,4_diff1,0.0


## Full train and test

In [None]:
import joblib
from pathlib import Path

# get features
feat_train, y_train, feat_test, y_test, feature_cols = build_single_split_features(
    df_logs=df_logs,
    incidents=incidents,                
    window_cfg=wcfg,
    cluster_cfg=ccfg,
    service_col=service_col_name,
    min_train_period="12H",
    test_period="12H",
)

Xtr = feat_train[feature_cols].values
Xte = feat_test[feature_cols].values

# compare and select the best model
rep_l, best_l_name, best_l_out, all_l_out = compare_custom_models(
    Xtr, y_train.values, Xte, y_test.values, feature_cols, pick_by="pr_auc"
)
print("Best model (logs):", best_l_name, "| threshold:", best_l_out["thr"])
display(rep_l)

# prediction artifacts
timeline = feat_test[["datetime"]].copy()
timeline["y_true"] = y_test.values.astype(int)
timeline["proba"]  = best_l_out["proba_test"]
timeline["pred"]   = best_l_out["pred_test"]

lead = wcfg.pre_failure_lead
inc = incidents.copy()
inc["datetime"] = pd.to_datetime(inc["datetime"])

inc_test = inc[(inc["datetime"] >= timeline["datetime"].min()) &
               (inc["datetime"] <= timeline["datetime"].max())].copy()
inc_test["first_fire"] = pd.NaT
inc_test["lead_minutes"] = np.nan

for i, row in inc_test.iterrows():
    t0 = pd.to_datetime(row["datetime"])
    mask = (timeline["datetime"] > t0 - lead) & (timeline["datetime"] <= t0) & (timeline["pred"] == 1)
    if mask.any():
        t_first = timeline.loc[mask, "datetime"].min()
        inc_test.at[i, "first_fire"]   = t_first
        inc_test.at[i, "lead_minutes"] = (t0 - t_first).total_seconds()/60.0

detected = inc_test[inc_test["first_fire"].notna()].copy()
missed   = inc_test[inc_test["first_fire"].isna()].copy()

fires = timeline[timeline["pred"] == 1].copy()
true_alert_mask = pd.Series(False, index=fires.index)
for _, row in inc_test.iterrows():
    t0 = pd.to_datetime(row["datetime"])
    true_alert_mask |= ((fires["datetime"] > t0 - lead) & (fires["datetime"] <= t0))
false_alarms = fires[~true_alert_mask].copy()

print(f"Detected incidents: {len(detected)} / {len(inc_test)}")
print(f"False alarms: {len(false_alarms)}")


# with open(outdir / "logs_eval_summary.json", "w", encoding="utf-8") as f:
#     json.dump({
#         "best_model": best_l_name,
#         "threshold": float(best_l_out["thr"]),
#         "train_range": [str(feat_train["datetime"].min()), str(feat_train["datetime"].max())],
#         "test_range":  [str(feat_test["datetime"].min()),  str(feat_test["datetime"].max())],
#         "lead_minutes": int(lead.total_seconds() // 60),
#         "n_features": len(feature_cols)
#     }, f, ensure_ascii=False, indent=2)




  train_end = start + pd.Timedelta(min_train_period)
  test_end = min(test_start + pd.Timedelta(test_period), end)


[LightGBM] [Info] Number of positive: 298, number of negative: 422
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000656 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2596
[LightGBM] [Info] Number of data points in the train set: 720, number of used features: 48
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.413889 -> initscore=-0.347912
[LightGBM] [Info] Start training from score -0.347912
Best model (logs): RandomForest | threshold: 0.645




Unnamed: 0,model,roc_auc,pr_auc,f1_at_thr,thr
2,RandomForest,0.691889,0.638665,0.43309,0.645
5,LightGBM,0.658606,0.603442,0.356589,0.98681
4,KNN,0.631265,0.550797,0.540171,0.4
3,NaiveBayes,0.543128,0.520245,0.457746,1.404484e-13
1,DecisionTree,0.602232,0.491916,0.52549,0.5861111
0,LogisticRegression,0.520292,0.416721,0.481203,0.3922576


Detected incidents: 6 / 11
False alarms: 30


In [None]:
# save results
outdir = Path("results/only_logs_simplified")
best_clf = all_l_out[best_l_name]["classifier"]
joblib.dump(best_clf, outdir / "best_logs_model.joblib")

timeline.to_csv(outdir / "logs_pred_timeline.csv", index=False)
detected[["datetime","first_fire","lead_minutes"]].to_csv(outdir / "logs_incidents_detected.csv", index=False)
missed[["datetime"]].to_csv(outdir / "logs_incidents_missed.csv", index=False)
false_alarms[["datetime"]].to_csv(outdir / "logs_false_alarms.csv", index=False)
rep_l.to_csv(outdir / "logs_models_report.csv", index=False)

fi_df = all_l_out[best_l_name]["feature_importance"]
if fi_df is not None:
    fi_df.to_csv(outdir / "logs_feature_importance.csv", index=False)


In [None]:
import pandas as pd
import numpy as np


df_logs["datetime"] = pd.to_datetime(df_logs["datetime"])
tmin = df_logs["datetime"].min()
tmax = df_logs["datetime"].max()
span = tmax - tmin

print("Logs interval:", tmin, "- ", tmax, "| Duration:", span)

uniq_minutes = df_logs["datetime"].dt.floor("min").nunique()
print("Unique minutes:", uniq_minutes)

# incidents info
if not incidents.empty:
    print("Total incidents:", len(incidents), 
          "| min:", pd.to_datetime(incidents["datetime"]).min(),
          "| max:", pd.to_datetime(incidents["datetime"]).max())


Logs interval: 2022-05-02 16:00:00 -  2022-05-03 15:59:58 | Duration: 0 days 23:59:58
Unique minutes: 1434
Total incidents: 21 | min: 2022-05-02 18:02:09 | max: 2022-05-03 14:53:16


## Only metrics

In [17]:
import os
metrics_root = "/Users/arinagoncharova/Documents/diploma/EDA/Aiops-Dataset/data/2022-05-03/metric"

all_metrics = []

for root, dirs, files in os.walk(metrics_root):
    for f in files:
        if f.endswith(".csv"):
            file_path = os.path.join(root, f)
            try:
                df = pd.read_csv(file_path)
                all_metrics.append(df)
            except Exception as e:
                print(f"Error while reading {file_path}: {e}")

# combine all metrics into a single dataframe
metrics_df = pd.concat(all_metrics, ignore_index=True)
print(f"Total metrics: {len(metrics_df):,}")
metrics_df.head()

Total metrics: 7,599,784


Unnamed: 0,timestamp,cmdb_id,kpi_name,value,service,rr,sr,mrt,count
0,1651507200,istio-egressgateway-7bfdcc9d86-g2d4q,istio_agent_startup_duration_seconds,1.346764,,,,,
1,1651507200,istio-ingressgateway-565bffd4d-rn678,istio_agent_startup_duration_seconds,1.529217,,,,,
2,1651507260,istio-egressgateway-7bfdcc9d86-g2d4q,istio_agent_startup_duration_seconds,1.346764,,,,,
3,1651507260,istio-ingressgateway-565bffd4d-rn678,istio_agent_startup_duration_seconds,1.529217,,,,,
4,1651507320,istio-egressgateway-7bfdcc9d86-g2d4q,istio_agent_startup_duration_seconds,1.346764,,,,,


In [None]:
import re
import pandas as pd
import numpy as np

def normalize_service_name(s: str) -> str:
    if pd.isna(s):
        return None
    s = str(s).strip().lower()
    # strip protocol suffixes and port-ish tails
    s = re.sub(r"-(grpc|http|https)$", "", s)
    # optional: strip trailing '-svc' if your data uses that (disabled by default)
    # s = re.sub(r"-svc$", "", s)
    return s

# --- 2) Prepare metrics (already wide with rr/sr/mrt/count)
metrics_wide = metrics_use.copy()
# ensure datetime
metrics_wide["datetime"] = pd.to_datetime(metrics_wide["datetime"])
# ensure a service column, then normalize to 'service_base'
if "service" not in metrics_wide.columns:
    if "cmdb_id" in metrics_wide.columns:
        metrics_wide = metrics_wide.rename(columns={"cmdb_id": "service"})
    else:
        metrics_wide["service"] = "GLOBAL"
metrics_wide["service_base"] = metrics_wide["service"].map(normalize_service_name)

# numeric
for c in ["rr","sr","mrt","count"]:
    if c in metrics_wide.columns:
        metrics_wide[c] = pd.to_numeric(metrics_wide[c], errors="coerce")

# aggregate per minute × normalized service
metrics_high_agg = (
    metrics_wide
    .groupby([pd.Grouper(key="datetime", freq="1min"), "service_base"], as_index=False)
    .agg({"rr": "mean", "sr": "mean", "mrt": "mean", "count": "sum"})
    .rename(columns={"service_base":"service"})
)

print("Aggregated metrics shape:", metrics_high_agg.shape)
print("Services (metrics, normalized) sample:", metrics_high_agg["service"].dropna().unique()[:10])

# prepare incidents with the same normalized service
inc = service_failures_03_05_df.copy()
inc["datetime"] = pd.to_datetime(inc["datetime"])

# find which col stores service on incidents and normalize into 'service'
if "service" in inc.columns:
    inc["service"] = inc["service"].map(normalize_service_name)
elif "cmdb_id" in inc.columns:
    inc["service"] = inc["cmdb_id"].map(normalize_service_name)
else:
    # if no per-service info, keep global (labels will be global)
    inc["service"] = None

print("Services (incidents, normalized) sample:", inc["service"].dropna().unique()[:10])

# history features + labeling on the wide table
from dataclasses import dataclass
from typing import List, Optional

@dataclass
class WindowConfig:
    freq: str = "1min"
    history_len: int = 5
    pre_failure_lead: pd.Timedelta = pd.Timedelta("30min")

@dataclass
class Split:
    train_start: pd.Timestamp
    train_end: pd.Timestamp
    test_start: pd.Timestamp
    test_end: pd.Timestamp

def build_time_splits(times: pd.Series, n_splits=1, min_train_period="12H", test_period="12H") -> List[Split]:
    times = pd.to_datetime(times).sort_values()
    start, end = times.min(), times.max()
    splits = []
    train_end = start + pd.Timedelta(min_train_period)
    for _ in range(n_splits):
        test_start = train_end
        test_end = min(test_start + pd.Timedelta(test_period), end)
        if test_start >= test_end: break
        splits.append(Split(start, train_end, test_start, test_end))
        train_end = test_end
    return splits

def add_history_features(pivot: pd.DataFrame, time_col="datetime", service_col: Optional[str]=None, history_len=5) -> pd.DataFrame:
    df = pivot.copy()
    df[time_col] = pd.to_datetime(df[time_col])
    base_cols = [time_col] + ([service_col] if service_col and service_col in df.columns else [])
    metric_cols = [c for c in df.columns if c not in base_cols]
    def _add(g: pd.DataFrame):
        g = g.sort_values(time_col)
        for c in metric_cols:
            g[f"{c}_mean{history_len}"] = g[c].rolling(window=history_len, min_periods=1).mean()
            g[f"{c}_diff1"] = g[c].diff().fillna(0)
        return g
    if service_col and service_col in df.columns:
        df = df.groupby(service_col, group_keys=False).apply(_add)
    else:
        df = _add(df)
    return df

def make_labels_by_time(times: pd.Series, incidents: pd.DataFrame, lead: pd.Timedelta,
                        service: Optional[pd.Series]=None) -> pd.Series:
    times = pd.to_datetime(times)
    y = pd.Series(0, index=times.index, dtype=int)
    inc = incidents.copy()
    # will use 'service' if present & notna; else global
    has_service = "service" in inc.columns and inc["service"].notna().any()
    if service is not None and has_service:
        service = service.astype(str)
        for srv, grp in inc.dropna(subset=["service"]).groupby("service"):
            mask_srv = (service == str(srv))
            if not mask_srv.any(): continue
            t_srv = times[mask_srv]
            y_srv = pd.Series(0, index=t_srv.index, dtype=int)
            for t0 in pd.to_datetime(grp["datetime"]).sort_values():
                win = (t_srv > (t0 - lead)) & (t_srv <= t0)
                if win.any(): y_srv.loc[win] = 1
            y.loc[y_srv.index] = y_srv.values
    else:
        for t0 in pd.to_datetime(inc["datetime"]).sort_values():
            win = (times > (t0 - lead)) & (times <= t0)
            if win.any(): y.loc[win.index[win]] = 1
    return y

def build_features_from_wide(
    df: pd.DataFrame,
    incidents: pd.DataFrame,
    wcfg: WindowConfig,
    time_col="datetime",
    service_col="service",
    min_train_period="12H",
    test_period="12H"
):
    splits = build_time_splits(df[time_col], n_splits=1,
                               min_train_period=min_train_period, test_period=test_period)
    assert splits, "Not enough data for requested windows"
    sp = splits[0]
    train_df = df[(df[time_col] >= sp.train_start) & (df[time_col] < sp.train_end)].copy()
    test_df  = df[(df[time_col] >= sp.test_start)  & (df[time_col] < sp.test_end)].copy()

    feat_train = add_history_features(train_df, time_col=time_col, service_col=service_col, history_len=wcfg.history_len)
    feat_test  = add_history_features(test_df,  time_col=time_col, service_col=service_col, history_len=wcfg.history_len)

    y_train = make_labels_by_time(
        feat_train[time_col], incidents, wcfg.pre_failure_lead,
        service=feat_train[service_col] if service_col in feat_train.columns else None
    )
    y_test = make_labels_by_time(
        feat_test[time_col], incidents, wcfg.pre_failure_lead,
        service=feat_test[service_col] if service_col in feat_test.columns else None
    )

    drop_cols = [time_col, service_col]
    feature_cols = [c for c in feat_train.columns if c not in drop_cols]
    Xtr = feat_train[feature_cols].values
    Xte = feat_test[feature_cols].values
    return feat_train, y_train, feat_test, y_test, feature_cols, Xtr, Xte, sp

# --- 6) Build features & labels using NORMALIZED services
wcfg_metrics = WindowConfig(freq="1min", history_len=5, pre_failure_lead=pd.Timedelta("30min"))

feat_train_m, y_train_m, feat_test_m, y_test_m, feature_cols_m, Xtr_m, Xte_m, sp_m = build_features_from_wide(
    df=metrics_high_agg,     # aggregated by minute × normalized service
    incidents=inc,           # incidents with normalized 'service'
    wcfg=wcfg_metrics,
    time_col="datetime",
    service_col="service",   # <- important: use normalized service
    min_train_period="12H", test_period="12H"
)

print("Train window:", sp_m.train_start, "→", sp_m.train_end,
      "| positives:", int(y_train_m.sum()), "/", len(y_train_m))
print("Test  window:", sp_m.test_start,  "→", sp_m.test_end,
      "| positives:", int(y_test_m.sum()),  "/", len(y_test_m))


Aggregated metrics shape: (14392, 6)
Services (metrics, normalized) sample: ['adservice' 'cartservice' 'checkoutservice' 'currencyservice'
 'emailservice' 'frontend' 'paymentservice' 'productcatalogservice'
 'recommendationservice' 'shippingservice']
Services (incidents, normalized) sample: ['productcatalogservice' 'recommendationservice' 'frontend' 'cartservice'
 'emailservice' 'adservice' 'currencyservice' 'checkoutservice']
Train window: 2022-05-02 16:00:00 → 2022-05-03 04:00:00 | positives: 310 / 7200
Test  window: 2022-05-03 04:00:00 → 2022-05-03 15:59:00 | positives: 320 / 7182


  train_end = start + pd.Timedelta(min_train_period)
  test_end = min(test_start + pd.Timedelta(test_period), end)
  df = df.groupby(service_col, group_keys=False).apply(_add)
  df = df.groupby(service_col, group_keys=False).apply(_add)


In [27]:
rep_m, best_m_name, best_m_out, all_m_out = compare_custom_models(
    Xtr_m, y_train_m.values, Xte_m, y_test_m.values, feature_cols_m, pick_by="pr_auc"
)
print("Best model is", best_m_name, "with threshold:", best_m_out["thr"])
rep_m


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[LightGBM] [Info] Number of positive: 310, number of negative: 6890
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001030 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2261
[LightGBM] [Info] Number of data points in the train set: 7200, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.043056 -> initscore=-3.101254
[LightGBM] [Info] Start training from score -3.101254
Best model is RandomForest with threshold: 0.54




Unnamed: 0,model,roc_auc,pr_auc,f1_at_thr,thr
2,RandomForest,0.536422,0.070254,0.0,0.54
5,LightGBM,0.538944,0.063036,0.038567,0.328538
1,DecisionTree,0.50114,0.061567,0.060139,0.822392
0,LogisticRegression,0.564314,0.048689,0.0,0.665648
4,KNN,0.524034,0.046982,0.037879,0.4
3,NaiveBayes,0.377689,0.034226,0.0,1.0


## Logs + metrics

In [None]:
import re
import numpy as np
import pandas as pd
from dataclasses import dataclass
from typing import List, Optional

@dataclass
class WindowConfig:
    freq: str = "1min"
    history_len: int = 5
    pre_failure_lead: pd.Timedelta = pd.Timedelta("30min")

@dataclass
class Split:
    train_start: pd.Timestamp
    train_end: pd.Timestamp
    test_start: pd.Timestamp
    test_end: pd.Timestamp

def build_time_splits_from_range(start: pd.Timestamp, end: pd.Timestamp,
                                 min_train_period="12H", test_period="12H") -> List[Split]:
    start = pd.to_datetime(start); end = pd.to_datetime(end)
    if start >= end:
        return []
    train_end = start + pd.Timedelta(min_train_period)
    test_start = train_end
    test_end = min(test_start + pd.Timedelta(test_period), end)
    if test_start >= test_end:
        return []
    return [Split(start, train_end, test_start, test_end)]

# normalization and  minimal feature makers
def normalize_service_name(s: str) -> Optional[str]:
    if s is None or (isinstance(s, float) and pd.isna(s)): return None
    s = str(s).strip().lower()
    s = re.sub(r"-(grpc|http|https)$", "", s)   
    # s = re.sub(r"-svc$", "", s) 
    return s

# try/except ensures we reuse them.
try:
    resample_and_pivot_counts
except NameError:
    def resample_and_pivot_counts(df, time_col="datetime", cluster_col="cluster", freq="1min", service_col=None):
        d = df.copy()
        d[time_col] = pd.to_datetime(d[time_col])
        keys = [pd.Grouper(key=time_col, freq=freq), cluster_col]
        idx = [time_col]
        if service_col and service_col in d.columns:
            keys = [service_col] + keys
            idx = [service_col, time_col]
        per_bucket = d.groupby(keys).size().reset_index(name="cnt")
        pivot = per_bucket.pivot_table(index=idx, columns=cluster_col, values="cnt",
                                       aggfunc="sum", fill_value=0).reset_index()
        # rename cluster columns to str
        pivot.columns = [str(c) for c in pivot.columns]
        return pivot

try:
    add_history_features
except NameError:
    def add_history_features(pivot: pd.DataFrame, time_col="datetime", service_col: Optional[str]=None, history_len=5) -> pd.DataFrame:
        df = pivot.copy()
        df[time_col] = pd.to_datetime(df[time_col])
        base = [time_col] + ([service_col] if service_col and service_col in df.columns else [])
        feat_cols = [c for c in df.columns if c not in base]
        def _add(g: pd.DataFrame):
            g = g.sort_values(time_col)
            for c in feat_cols:
                g[f"{c}_mean{history_len}"] = g[c].rolling(history_len, min_periods=1).mean()
                g[f"{c}_diff1"] = g[c].diff().fillna(0)
            return g
        if service_col and service_col in df.columns:
            df = df.groupby(service_col, group_keys=False).apply(_add)
        else:
            df = _add(df)
        return df

try:
    make_labels_by_time
except NameError:
    def make_labels_by_time(times: pd.Series, incidents: pd.DataFrame, lead: pd.Timedelta,
                            service: Optional[pd.Series]=None) -> pd.Series:
        times = pd.to_datetime(times)
        y = pd.Series(0, index=times.index, dtype=int)
        inc = incidents.copy()
        has_service = "service" in inc.columns and inc["service"].notna().any()
        if service is not None and has_service:
            service = service.astype(str)
            for srv, grp in inc.dropna(subset=["service"]).groupby("service"):
                mask_srv = (service == str(srv))
                if not mask_srv.any(): continue
                t_srv = times[mask_srv]
                y_srv = pd.Series(0, index=t_srv.index, dtype=int)
                for t0 in pd.to_datetime(grp["datetime"]).sort_values():
                    win = (t_srv > (t0 - lead)) & (t_srv <= t0)
                    if win.any(): y_srv.loc[win] = 1
                y.loc[y_srv.index] = y_srv.values
        else:
            for t0 in pd.to_datetime(inc["datetime"]).sort_values():
                win = (times > (t0 - lead)) & (times <= t0)
                if win.any():
                    y.loc[win.index[win]] = 1
        return y

# prepare metrics (wide -> per-minute agg by normalized service)
#  metrics_df with columns: timestamp|datetime, service or cmdb_id, rr/sr/mrt/count
metrics_df = metrics_df.copy()
if "datetime" not in metrics_df.columns:
    metrics_df["datetime"] = pd.to_datetime(metrics_df["timestamp"], unit="s")
else:
    metrics_df["datetime"] = pd.to_datetime(metrics_df["datetime"])
metrics_df = metrics_df.sort_values("datetime")

# ensure service, normalize to base
if "service" not in metrics_df.columns:
    if "cmdb_id" in metrics_df.columns:
        metrics_df = metrics_df.rename(columns={"cmdb_id":"service"})
    else:
        metrics_df["service"] = "GLOBAL"
metrics_df["service"] = metrics_df["service"].map(normalize_service_name)

# numeric cast
for c in ["rr","sr","mrt","count"]:
    if c in metrics_df.columns:
        metrics_df[c] = pd.to_numeric(metrics_df[c], errors="coerce")

# aggregate per minute × service
metrics_agg = (
    metrics_df
    .groupby([pd.Grouper(key="datetime", freq="1min"), "service"], as_index=False)
    .agg({"rr":"mean", "sr":"mean", "mrt":"mean", "count":"sum"})
)

# prepare logs (embeddings -> HDBSCAN clusters → per-minute counts)
# df_logs with columns: 'datetime', service_col_name, 'embedding' (array-like)
df_logs = df_logs.copy()
df_logs["datetime"] = pd.to_datetime(df_logs["datetime"])

# normalize service on logs to the same 'service' col
if "service" not in df_logs.columns:
    if 'service' in df_logs.columns:
        pass
    else:
        # use your known service col name variable; if not present, fallback to no service dimension
        svc_col = service_col_name if 'service_col_name' in globals() else None
        if svc_col and svc_col in df_logs.columns:
            df_logs = df_logs.rename(columns={svc_col: "service"})
        elif "cmdb_id" in df_logs.columns:
            df_logs = df_logs.rename(columns={"cmdb_id":"service"})
        else:
            df_logs["service"] = "GLOBAL"
df_logs["service"] = df_logs["service"].map(normalize_service_name)

# HDBSCAN clustering on train only, then approximate_predict on test
import hdbscan
from hdbscan import approximate_predict

def build_log_features_with_split(df_logs, sp: Split, window_cfg: WindowConfig,
                                  cluster_cfg=None, time_col="datetime", embedding_col="embedding", service_col="service"):
    # slice
    df = df_logs.sort_values(time_col)
    train_df = df[(df[time_col] >= sp.train_start) & (df[time_col] < sp.train_end)].copy()
    test_df  = df[(df[time_col] >= sp.test_start)  & (df[time_col] < sp.test_end)].copy()

    # fit HDBSCAN on train
    Xtr = np.vstack(train_df[embedding_col].to_list())
    clusterer = hdbscan.HDBSCAN(
        metric=getattr(cluster_cfg, "metric", "euclidean"),
        min_cluster_size=getattr(cluster_cfg, "min_cluster_size", 800),
        min_samples=getattr(cluster_cfg, "min_samples", 400),
        cluster_selection_method=getattr(cluster_cfg, "cluster_selection_method", "eom"),
        cluster_selection_epsilon=getattr(cluster_cfg, "cluster_selection_epsilon", 0.03),
        prediction_data=True,
        approx_min_span_tree=True,
    )
    fit_max = getattr(cluster_cfg, "fit_max_points", None)
    if fit_max and Xtr.shape[0] > fit_max:
        idx = np.random.RandomState(0).choice(Xtr.shape[0], fit_max, replace=False)
        clusterer.fit(Xtr[idx])
    else:
        clusterer.fit(Xtr)

    tr_labels, _ = approximate_predict(clusterer, Xtr)
    train_df["cluster"] = tr_labels

    Xte = np.vstack(test_df[embedding_col].to_list())
    te_labels, _ = approximate_predict(clusterer, Xte)
    test_df["cluster"] = te_labels

    # resample -> pivot counts per minute×service
    pivot_train = resample_and_pivot_counts(train_df, time_col=time_col, cluster_col="cluster",
                                            freq=window_cfg.freq, service_col=service_col)
    pivot_test  = resample_and_pivot_counts(test_df,  time_col=time_col, cluster_col="cluster",
                                            freq=window_cfg.freq, service_col=service_col)

    # add history features
    feat_train = add_history_features(pivot_train, time_col=time_col, service_col=service_col,
                                      history_len=window_cfg.history_len)
    feat_test  = add_history_features(pivot_test,  time_col=time_col, service_col=service_col,
                                      history_len=window_cfg.history_len)
    return feat_train, feat_test

# build 1 aligned time split (intersection of logs & metrics ranges)
log_min, log_max = df_logs["datetime"].min(), df_logs["datetime"].max()
met_min, met_max = metrics_agg["datetime"].min(), metrics_agg["datetime"].max()
common_start = max(log_min, met_min)
common_end   = min(log_max, met_max)

min_train_period = "12H"
test_period = "12H"
splits = build_time_splits_from_range(common_start, common_end, min_train_period, test_period)
if not splits:
    raise ValueError("Not enough overlapping data between logs and metrics for the requested windows.")
sp = splits[0]
print("Aligned split:")
print("  TRAIN:", sp.train_start, "→", sp.train_end)
print("  TEST :", sp.test_start,  "→", sp.test_end)

# calculate features
wcfg = WindowConfig(freq="1min", history_len=5, pre_failure_lead=pd.Timedelta("30min"))

# logs -> features in aligned windows
feat_train_l, feat_test_l = build_log_features_with_split(df_logs, sp, wcfg, cluster_cfg=None,
                                                          time_col="datetime", embedding_col="embedding", service_col="service")

# metrics -> just slice per window and add history
def slice_by_split(df, sp, time_col="datetime"):
    return (df[(df[time_col] >= sp.train_start) & (df[time_col] < sp.train_end)].copy(),
            df[(df[time_col] >= sp.test_start)  & (df[time_col] < sp.test_end)].copy())

met_train, met_test = slice_by_split(metrics_agg, sp, time_col="datetime")
feat_train_m = add_history_features(met_train, time_col="datetime", service_col="service", history_len=wcfg.history_len)
feat_test_m  = add_history_features(met_test,  time_col="datetime", service_col="service", history_len=wcfg.history_len)

# merge LOG + metrics features on ["datetime","service"] and relabel
base_keys = ["datetime","service"]
train_merged = pd.merge(feat_train_l, feat_train_m, on=base_keys, how="outer").sort_values("datetime").fillna(0)
test_merged  = pd.merge(feat_test_l,  feat_test_m,  on=base_keys, how="outer").sort_values("datetime").fillna(0)

# normalize incidents & services
inc = service_failures_03_05_df.copy()
inc["datetime"] = pd.to_datetime(inc["datetime"])
if "service" in inc.columns:
    inc["service"] = inc["service"].map(normalize_service_name)
elif "cmdb_id" in inc.columns:
    inc["service"] = inc["cmdb_id"].map(normalize_service_name)
else:
    inc["service"] = None  # fall back to global if no service info

# labels on merged grid
y_train_c = make_labels_by_time(
    train_merged["datetime"], inc, wcfg.pre_failure_lead,
    service=train_merged["service"] if "service" in train_merged.columns else None
)
y_test_c = make_labels_by_time(
    test_merged["datetime"], inc, wcfg.pre_failure_lead,
    service=test_merged["service"] if "service" in test_merged.columns else None
)

# train/evaluate sklearn models on merged features
# build X matrices
drop_cols = base_keys
feature_cols_c = [c for c in train_merged.columns if c not in drop_cols]
Xtr_c = train_merged[feature_cols_c].values
Xte_c = test_merged[feature_cols_c].values

# compare_custom_models must already be defined (we added it earlier in your notebook)
rep_c, best_c_name, best_c_out, all_c_out = compare_custom_models(
    Xtr_c, y_train_c.values, Xte_c, y_test_c.values, feature_cols_c, pick_by="pr_auc"
)
print("Best model (logs+metrics):", best_c_name, "| threshold:", best_c_out["thr"])
display(rep_c)

# prediction results: detected/missed/false alarms
timeline = test_merged[["datetime"]].copy()
timeline["y_true"] = y_test_c.values.astype(int)
timeline["proba"]  = best_c_out["proba_test"]
timeline["pred"]   = best_c_out["pred_test"]

lead = wcfg.pre_failure_lead
inc_test = inc[(inc["datetime"] >= timeline["datetime"].min()) & (inc["datetime"] <= timeline["datetime"].max())].copy()
inc_test["first_fire"] = pd.NaT
inc_test["lead_minutes"] = np.nan
for i, row in inc_test.iterrows():
    t0 = pd.to_datetime(row["datetime"])
    mask = (timeline["datetime"] > t0 - lead) & (timeline["datetime"] <= t0) & (timeline["pred"] == 1)
    if mask.any():
        t_first = timeline.loc[mask, "datetime"].min()
        inc_test.at[i, "first_fire"] = t_first
        inc_test.at[i, "lead_minutes"] = (t0 - t_first).total_seconds()/60.0

detected = inc_test[inc_test["first_fire"].notna()].copy()
missed   = inc_test[inc_test["first_fire"].isna()].copy()
fires = timeline[timeline["pred"] == 1].copy()
true_alert_mask = pd.Series(False, index=fires.index)
for _, row in inc_test.iterrows():
    t0 = pd.to_datetime(row["datetime"])
    true_alert_mask |= ((fires["datetime"] > t0 - lead) & (fires["datetime"] <= t0))
false_alarms = fires[~true_alert_mask].copy()

print(f"Detected incidents: {len(detected)} / {len(inc_test)} | False alarms: {len(false_alarms)}")

# save artifacts
timeline.to_csv("combo_pred_timeline.csv", index=False)
detected[["datetime","first_fire","lead_minutes"]].to_csv("combo_incidents_detected.csv", index=False)
missed[["datetime"]].to_csv("combo_incidents_missed.csv", index=False)
false_alarms[["datetime"]].to_csv("combo_false_alarms.csv", index=False)


  train_end = start + pd.Timedelta(min_train_period)
  test_end = min(test_start + pd.Timedelta(test_period), end)


Aligned split:
  TRAIN: 2022-05-02 16:00:00 → 2022-05-03 04:00:00
  TEST : 2022-05-03 04:00:00 → 2022-05-03 15:59:00


  df = df.groupby(service_col, group_keys=False).apply(_add)
  df = df.groupby(service_col, group_keys=False).apply(_add)
  df = df.groupby(service_col, group_keys=False).apply(_add)
  df = df.groupby(service_col, group_keys=False).apply(_add)


[LightGBM] [Info] Number of positive: 310, number of negative: 19179
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007654 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5273
[LightGBM] [Info] Number of data points in the train set: 19489, number of used features: 137
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.015906 -> initscore=-4.124999
[LightGBM] [Info] Start training from score -4.124999
Best model (logs+metrics): LightGBM | threshold: 0.27674758026313834




Unnamed: 0,model,roc_auc,pr_auc,f1_at_thr,thr
5,LightGBM,0.84335,0.061668,0.048913,0.276748
2,RandomForest,0.692565,0.056548,0.0,0.55
0,LogisticRegression,0.845122,0.045202,0.0,0.8358
3,NaiveBayes,0.836919,0.045001,0.086126,1.0
4,KNN,0.564238,0.023319,0.036,0.4
1,DecisionTree,0.588426,0.023316,0.075,0.868967


Detected incidents: 6 / 11 | False alarms: 28
