In [1]:
# Experiment Settings, logger, plotter
from utils.exp_logger import Logger
from utils.exp_metrics_plotter import MetricsPlotter
from utils.utils import set_settings
from utils.exp_config import get_config
config = get_config('OurModelConfig')
set_settings(config)

<module 'module.name' from '/home/rtx4090/code/python/current/Iregular/configs/OurModelConfig.py'> OurModelConfig
✅ All __pycache__ folders removed


OurModelConfig(classification=False, ablation=0, try_exp=1, ts_var=1, input_size=21, bs=512, lr=0.001, decay=0.0001, loss_func='MSELoss', optim='Adam', epochs=50, patience=10, verbose=50, device='cuda', monitor_metric='MAE', use_amp=False, monitor_reverse=False, path='./data', task='bench201', dataset='weather', predict_target='y', eval_set=True, shuffle=False, scaler_method='minmax', spliter_ratio='7:1:2', sample_method='ours', seq_len=96, pred_len=192, logger='zyx', model='ours', d_model=96, num_layers=3, retrain=True, seed=0, rounds=5, runid=0, debug=False, record=True, hyper_search=False, continue_train=False, data_dropout=0.3, att_method='self', num_heads=4, att_bias=False, thresh=0.3, pc_alpha=0.05, causal_lr=0.05, pre_gate=0.8, sub_method='DirectLiNGAM', golem_epoch=5000)

In [None]:
config.thresh = 0.3
config.pc_alpha = 0.05
config.causal_lr = 0.05
config.pre_gate = 0.80
config.sub_method = 'DirectLiNGAM'
config.golem_epoch = 5000

In [5]:
from castle.algorithms import ICALiNGAM
from castle.algorithms import DirectLiNGAM
from castle.algorithms import PC
from castle.algorithms import Notears
from castle.algorithms import GraNDAG
from castle.algorithms import GOLEM
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import numpy as np 
import time

from models.hcd import HCD
from models.sada import SADA


def get_causal_matrix(data, method, config):
    data = data.astype(np.float32)
    data = StandardScaler().fit_transform(data)
    start = time.time()
    if method == 'PC':
        model = PC(alpha=config.pc_alpha)
    if method == 'ICALiNGAM':
        model = ICALiNGAM(thresh=config.thresh)
    if method == 'DirectLiNGAM':
        model = DirectLiNGAM(thresh=config.thresh)
        
    if method == 'Notears':
        model = Notears(w_threshold=config.thresh)
    if method == 'GraNDAG':
        model = GraNDAG(input_dim=data.shape[1], device_type='gpu')
    if method == 'GOLEM':
        model = GOLEM(num_iter=config.golem_epoch, graph_thres=config.thresh, device_type='gpu', learning_rate=config.lr)
        
    if method == 'SADA':
        model = SADA(theta=10, alpha=0.05, k=10, max_cond=3, sub_method="pc", thresh=config.thresh, pc_alpha=config.pc_alpha)
    if method == 'HCD':
        model = HCD(pre_gate=config.pre_gate, thresh=config.thresh, method=config.sub_method)
        
    model.learn(data)
    end=time.time()
    execute_time = end - start
    print(f"Method {method} Done. Execution time = {execute_time}")
    return model.causal_matrix, execute_time

import numpy as np 
import pandas as pd 
df = pd.read_csv('data/timeseries/weather.csv').to_numpy()[:, 1:].astype(np.float32)
df = df[:760]
df.shape

(760, 21)

In [6]:
causal_matrix, execute_time = get_causal_matrix(df, 'SADA', config)
print("Edges:", int((causal_matrix > 0).sum()))

Method SADA Done. Execution time = 0.32186102867126465
Edges: 38


In [7]:
def run_all_methods(data, config):
    """
    data: np.ndarray [n, d]
    config: 超参数配置对象
    返回: {method_name: {"causal_matrix": W, "time": t}}
    """
    methods = [
        'PC', 'ICALiNGAM', 'DirectLiNGAM',
        'SADA', 'HCD', 
        'Notears', 'GraNDAG', 'GOLEM',
    ]
    results = {}
    for m in methods:   # 按字典序排序
        try:
            W, t = get_causal_matrix(data, m, config)
            results[m] = {"causal_matrix": W, "time": t}
        except Exception as e:
            results[m] = {"error": str(e)}
    return results

results = run_all_methods(df, config)

Method PC Done. Execution time = 0.22101783752441406




Method ICALiNGAM Done. Execution time = 1.166490077972412
Method DirectLiNGAM Done. Execution time = 0.7869257926940918
Method SADA Done. Execution time = 0.3693225383758545


2025-10-27 16:13:36,774 - /home/rtx4090/anaconda3/lib/python3.12/site-packages/castle/algorithms/gradient/notears/linear.py[line:195] - INFO: [start]: n=760, d=21, iter_=100, h_=1e-08, rho_=1e+16


Method HCD Done. Execution time = 2.032595634460449


2025-10-27 16:13:36,983 - /home/rtx4090/anaconda3/lib/python3.12/site-packages/castle/algorithms/gradient/notears/linear.py[line:206] - INFO: [iter 0] h=3.923e-01, loss=10.500, rho=1.0e+00
2025-10-27 16:13:37,166 - /home/rtx4090/anaconda3/lib/python3.12/site-packages/castle/algorithms/gradient/notears/linear.py[line:206] - INFO: [iter 1] h=2.639e-01, loss=3.513, rho=1.0e+00
2025-10-27 16:13:37,266 - /home/rtx4090/anaconda3/lib/python3.12/site-packages/castle/algorithms/gradient/notears/linear.py[line:206] - INFO: [iter 1] h=1.189e-01, loss=4.206, rho=1.0e+01
2025-10-27 16:13:37,381 - /home/rtx4090/anaconda3/lib/python3.12/site-packages/castle/algorithms/gradient/notears/linear.py[line:206] - INFO: [iter 1] h=4.050e-02, loss=11.130, rho=1.0e+02
2025-10-27 16:13:37,449 - /home/rtx4090/anaconda3/lib/python3.12/site-packages/castle/algorithms/gradient/notears/linear.py[line:206] - INFO: [iter 2] h=2.360e-02, loss=3.959, rho=1.0e+02
2025-10-27 16:13:37,564 - /home/rtx4090/anaconda3/lib/pyth

Method Notears Done. Execution time = 30.69205355644226


Training Iterations: 100%|██████████| 10000/10000 [09:30<00:00, 17.53it/s]
2025-10-27 16:23:38,344 - /home/rtx4090/anaconda3/lib/python3.12/site-packages/castle/algorithms/gradient/notears/torch/golem.py[line:119] - INFO: GPU is available.
2025-10-27 16:23:38,345 - /home/rtx4090/anaconda3/lib/python3.12/site-packages/castle/algorithms/gradient/notears/torch/golem.py[line:190] - INFO: Started training for 5000 iterations.
2025-10-27 16:23:38,390 - /home/rtx4090/anaconda3/lib/python3.12/site-packages/castle/algorithms/gradient/notears/torch/golem.py[line:203] - INFO: [Iter 0] score=101.617, likelihood=101.617, h=0.0e+00


Method GraNDAG Done. Execution time = 570.8765504360199


2025-10-27 16:23:44,984 - /home/rtx4090/anaconda3/lib/python3.12/site-packages/castle/algorithms/gradient/notears/torch/golem.py[line:203] - INFO: [Iter 5000] score=78.093, likelihood=74.469, h=2.4e-01


Method GOLEM Done. Execution time = 6.642951488494873


In [8]:
for k, v in results.items():
    print("==========" , k , "==========")
    if "error" in v:
        print("Error:", v["error"])
    else:
        print("Time:", v["time"])
        print("Edges:", int((v["causal_matrix"] > 0).sum()))

Time: 0.22101783752441406
Edges: 41
Time: 1.166490077972412
Edges: 93
Time: 0.7869257926940918
Edges: 51
Time: 0.3693225383758545
Edges: 38
Time: 2.032595634460449
Edges: 71
Time: 30.69205355644226
Edges: 20
Time: 570.8765504360199
Edges: 14
Time: 6.642951488494873
Edges: 66


In [None]:
# 基于分而治之算法
causal_matrix, execute_time = get_causal_matrix(df, 'HCD', config)
causal_matrix, execute_time = get_causal_matrix(df, 'SADA', config)
# 基于统计学算法的
causal_matrix, execute_time = get_causal_matrix(df, 'PC', config)
causal_matrix, execute_time = get_causal_matrix(df, 'ICALiNGAM', config)
causal_matrix, execute_time = get_causal_matrix(df, 'DirectLiNGAM', config)
# 基于训练的
causal_matrix, execute_time = get_causal_matrix(df, 'Notears', config)
causal_matrix, execute_time = get_causal_matrix(df, 'GraNDAG', config)
causal_matrix, execute_time = get_causal_matrix(df, 'GOLEM', config)

The data must be a pandas.DataFrame that consists of multivariate time series metrics data. We require the data to have a column named time that stores the timestep. Each other column stores a time series for metrics data with the name format of <service>_<metric>. For example, the column cart_cpu stores the CPU utilization of service cart.

In [4]:
import pandas as pd 
import numpy as np 
df = pd.read_csv('./data/microservice/fse-ob/paymentservice_cpu/1/data.csv')
# for item in df.keys():
    # print(item)
df.shape

(721, 419)

In [9]:
import pandas as pd 
import numpy as np 
df = pd.read_csv('./data/microservice/fse-ob/cartservice_cpu/1/data.csv')
for item in df.keys():
    print(item)

time
adservice_container-cpu-system-seconds-total
cartservice_container-cpu-system-seconds-total
checkoutservice_container-cpu-system-seconds-total
currencyservice_container-cpu-system-seconds-total
emailservice_container-cpu-system-seconds-total
frontend_container-cpu-system-seconds-total
main_container-cpu-system-seconds-total
paymentservice_container-cpu-system-seconds-total
productcatalogservice_container-cpu-system-seconds-total
recommendationservice_container-cpu-system-seconds-total
redis_container-cpu-system-seconds-total
shippingservice_container-cpu-system-seconds-total
adservice_container-cpu-usage-seconds-total
cartservice_container-cpu-usage-seconds-total
checkoutservice_container-cpu-usage-seconds-total
currencyservice_container-cpu-usage-seconds-total
emailservice_container-cpu-usage-seconds-total
frontend_container-cpu-usage-seconds-total
main_container-cpu-usage-seconds-total
paymentservice_container-cpu-usage-seconds-total
productcatalogservice_container-cpu-usage-sec

In [10]:
df

Unnamed: 0,time,adservice_container-cpu-system-seconds-total,cartservice_container-cpu-system-seconds-total,checkoutservice_container-cpu-system-seconds-total,currencyservice_container-cpu-system-seconds-total,emailservice_container-cpu-system-seconds-total,frontend_container-cpu-system-seconds-total,main_container-cpu-system-seconds-total,paymentservice_container-cpu-system-seconds-total,productcatalogservice_container-cpu-system-seconds-total,...,192-168-16-130-9100_node-network-receive-drop-total,192-168-23-73-9100_node-network-receive-drop-total,192-168-46-122-9100_node-network-receive-drop-total,192-168-49-59-9100_node-network-receive-drop-total,192-168-77-1-9100_node-network-receive-drop-total,192-168-16-130-9100_node-network-transmit-drop-total,192-168-23-73-9100_node-network-transmit-drop-total,192-168-46-122-9100_node-network-transmit-drop-total,192-168-49-59-9100_node-network-transmit-drop-total,192-168-77-1-9100_node-network-transmit-drop-total
0,1692568979,0.292056,2.148679,0.280954,1.188069,0.141014,4.392528,0.278753,0.114518,1.822222,...,0,0,0,0,0,0,0,0,0,0
1,1692568980,0.292056,2.148679,0.245229,1.016835,0.149284,4.392528,0.278753,0.114518,2.083624,...,0,0,0,0,0,0,0,0,0,0
2,1692568981,0.292056,2.148679,0.245229,0.997034,0.149284,4.392528,0.278753,0.114518,2.042502,...,0,0,0,0,0,0,0,0,0,0
3,1692568982,0.292056,2.148679,0.245229,1.157778,0.149284,4.392528,0.278753,0.114518,2.001381,...,0,0,0,0,0,0,0,0,0,0
4,1692568983,0.292056,2.148679,0.245229,1.157778,0.149284,4.426871,0.278753,0.113214,1.960260,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
716,1692569695,0.291806,1.189656,0.115356,1.107055,0.229369,3.280973,0.303696,0.089986,1.489169,...,0,0,0,0,0,0,0,0,0,0
717,1692569696,0.291806,1.189656,0.115356,1.107055,0.229369,3.218028,0.303696,0.089986,1.489169,...,0,0,0,0,0,0,0,0,0,0
718,1692569697,0.291806,1.189656,0.115356,1.107055,0.229369,3.571763,0.303696,0.089986,1.489169,...,0,0,0,0,0,0,0,0,0,0
719,1692569698,0.217838,1.189656,0.115356,1.107055,0.229369,3.571763,0.303696,0.094668,1.489169,...,0,0,0,0,0,0,0,0,0,0


In [11]:
df.shape

(721, 427)

In [12]:
data = df.to_numpy()
data

array([[1.693e+09, 2.921e-01, 2.149e+00, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [1.693e+09, 2.921e-01, 2.149e+00, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [1.693e+09, 2.921e-01, 2.149e+00, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       ...,
       [1.693e+09, 2.918e-01, 1.190e+00, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [1.693e+09, 2.178e-01, 1.190e+00, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [1.693e+09, 2.579e-01, 1.190e+00, ..., 0.000e+00, 0.000e+00,
        0.000e+00]])

In [18]:
# Experiment Settings, logger, plotter
from utils.exp_logger import Logger
from utils.exp_metrics_plotter import MetricsPlotter
from utils.utils import set_settings
from utils.exp_config import get_config
config = get_config('OurModelConfig')
set_settings(config)

import pandas as pd 
import numpy as np 
from models.causal import get_causal_matrix
df = pd.read_csv('./data/microservice/fse-ob/cartservice_cpu/1/data.csv')
data = df.to_numpy().astype(np.float32)
causal_matrix, cost = get_causal_matrix(data, 'GOLEM', config)

2025-10-27 17:10:18,816 - /home/rtx4090/anaconda3/lib/python3.12/site-packages/castle/algorithms/gradient/notears/torch/golem.py[line:119] - INFO: GPU is available.
2025-10-27 17:10:18,825 - /home/rtx4090/anaconda3/lib/python3.12/site-packages/castle/algorithms/gradient/notears/torch/golem.py[line:190] - INFO: Started training for 5000 iterations.
2025-10-27 17:10:18,828 - /home/rtx4090/anaconda3/lib/python3.12/site-packages/castle/algorithms/gradient/notears/torch/golem.py[line:203] - INFO: [Iter 0] score=2600.230, likelihood=2600.230, h=0.0e+00


<module 'module.name' from '/home/rtx4090/code/python/current/Iregular/configs/OurModelConfig.py'> OurModelConfig
✅ All __pycache__ folders removed


2025-10-27 17:12:22,431 - /home/rtx4090/anaconda3/lib/python3.12/site-packages/castle/algorithms/gradient/notears/torch/golem.py[line:203] - INFO: [Iter 5000] score=1992.812, likelihood=1908.815, h=5.0e+00


Method GOLEM Done. Execution time = 126.09758496284485


In [1]:
# Experiment Settings, logger, plotter
from utils.exp_logger import Logger
from utils.exp_metrics_plotter import MetricsPlotter
from utils.utils import set_settings
from utils.exp_config import get_config
config = get_config('OurModelConfig')
set_settings(config)

import pandas as pd 
import numpy as np 
from models.causal import get_causal_matrix
df = pd.read_csv('./data/microservice/fse-ob/cartservice_cpu/1/data.csv')
data = df.to_numpy().astype(np.float32)
config.sub_method = 'GOLEM'
causal_matrix, cost = get_causal_matrix(data, 'HCD', config)

2025-10-27 17:19:17,086 - /home/rtx4090/anaconda3/lib/python3.12/site-packages/castle/backend/__init__.py[line:36] - INFO: You can use `os.environ['CASTLE_BACKEND'] = backend` to set the backend(`pytorch` or `mindspore`).
2025-10-27 17:19:17,108 - /home/rtx4090/anaconda3/lib/python3.12/site-packages/castle/algorithms/__init__.py[line:36] - INFO: You are using ``pytorch`` as the backend.


<module 'module.name' from '/home/rtx4090/code/python/current/Iregular/configs/OurModelConfig.py'> OurModelConfig
✅ All __pycache__ folders removed


  c /= stddev[:, None]
  c /= stddev[None, :]
2025-10-27 17:19:45,163 - /home/rtx4090/anaconda3/lib/python3.12/site-packages/castle/algorithms/gradient/notears/torch/golem.py[line:119] - INFO: GPU is available.
2025-10-27 17:19:45,218 - /home/rtx4090/anaconda3/lib/python3.12/site-packages/castle/algorithms/gradient/notears/torch/golem.py[line:190] - INFO: Started training for 5000 iterations.
2025-10-27 17:19:45,465 - /home/rtx4090/anaconda3/lib/python3.12/site-packages/castle/algorithms/gradient/notears/torch/golem.py[line:203] - INFO: [Iter 0] score=519.870, likelihood=519.870, h=0.0e+00
2025-10-27 17:20:03,457 - /home/rtx4090/anaconda3/lib/python3.12/site-packages/castle/algorithms/gradient/notears/torch/golem.py[line:203] - INFO: [Iter 5000] score=519.870, likelihood=519.870, h=0.0e+00
2025-10-27 17:20:03,751 - /home/rtx4090/anaconda3/lib/python3.12/site-packages/castle/algorithms/gradient/notears/torch/golem.py[line:119] - INFO: GPU is available.
2025-10-27 17:20:03,753 - /home/rt

Method HCD Done. Execution time = 21.62129222590749


In [None]:
# Experiment Settings, logger, plotter
from utils.exp_logger import Logger
from utils.exp_metrics_plotter import MetricsPlotter
from utils.utils import set_settings
from utils.exp_config import get_config
config = get_config('OurModelConfig')
set_settings(config)

import pandas as pd 
import numpy as np 
from models.causal import get_causal_matrix
df = pd.read_csv('./data/microservice/fse-ob/cartservice_cpu/1/data.csv')
config.sub_method = 'GOLEM'
causal_matrix, cost = get_causal_matrix(data, 'SADA', config)

In [12]:
import pandas as pd
import os
from baro.utility import read_data  # 可复用BARO的工具函数

root_address = './data/microservice/fse-ob/cartservice_cpu/1/'
# 读取数据（含预处理）
data = read_data(root_address + "data.csv")  # 自动处理缺失值、标准化指标名等

# 读取故障注入时间（用于确定异常时间点）
with open(os.path.join(root_address, "inject_time.txt")) as f:
    inject_time = int(f.readline().strip())  # 故障注入时间（异常开始时间）
anomalies = [inject_time]  # 异常时间点列表

In [15]:
# You can put the code here to a file named test.py
from baro.anomaly_detection import bocpd
from baro.root_cause_analysis import robust_scorer
from baro.utility import download_data, read_data

# read data from data.csv
root_address = './data/microservice/fse-ob/cartservice_cpu/1/'
data = read_data(root_address + "data.csv")

# perform anomaly detection 
anomalies = bocpd(data) 
print("Anomalies are detected at timestep:", anomalies[0])

# perform root cause analysis
root_causes = robust_scorer(data, anomalies=anomalies)["ranks"]

# print the top 5 root causes
print("Top 5 root causes:", root_causes[:5])

Anomalies are detected at timestep: 330
Top 5 root causes: ['recommendationservice_container-memory-usage-bytes', 'recommendationservice_container-memory-working-set-bytes', 'recommendationservice_container-memory-rss', 'redis_container-fs-writes-bytes-total', 'redis_container-fs-reads-bytes-total']


In [1]:
import os 
os.listdir('data/microservice/fse-ob')

['cartservice_mem',
 'paymentservice_delay',
 'productcatalogservice_delay',
 'checkoutservice_cpu',
 'productcatalogservice_cpu',
 'paymentservice_loss',
 'paymentservice_mem',
 'productcatalogservice_loss',
 'checkoutservice_mem',
 'checkoutservice_delay',
 'productcatalogservice_mem',
 'paymentservice_cpu',
 'cartservice_cpu',
 'currencyservice_mem',
 'currencyservice_cpu',
 'currencyservice_loss',
 'checkoutservice_loss',
 'cartservice_delay',
 'currencyservice_delay',
 'cartservice_loss']