In [1]:
import numpy as np
import pandas as pd
import graphviz
import lingam
from lingam.utils import make_dot, print_causal_directions, print_dagc

# Record the library versions used for this run so results can be reproduced.
print([np.__version__, pd.__version__, graphviz.__version__, lingam.__version__])

# Show arrays with 3 decimals and without scientific notation.
np.set_printoptions(precision=3, suppress=True)
# Seed NumPy's global random number generator.
np.random.seed(0)

['1.26.4', '2.2.2', '0.20.3', '2.0.4']


In [4]:
from os import listdir
from os.path import isfile, join
import numpy as np
import pandas as pd
import lingam
import time

# Define the function that processes a single dataset.
def process_dataset(X, threshold=0, k_value=0, output_dir='test_results_before', lags=2):
    """Fit VARLiNGAM on ``X`` and save a binarised causality matrix as CSV.

    The absolute values of all adjacency matrices exposed by the fitted
    model are summed element-wise; entries strictly greater than
    ``threshold`` become 1, all others 0. The 0/1 matrix is written to
    ``<output_dir>/causality_matrix_k<k_value>.csv`` without index or header.

    Parameters
    ----------
    X : object with ``.shape`` accepted by ``VARLiNGAM.fit``
        Input data; ``shape[0]`` is reported as the sample count and
        ``shape[1]`` as the feature count.
    threshold : float, default 0
        Cut-off applied to the summed absolute coefficients.
    k_value : int, default 0
        Only used to name the output file.
    output_dir : str, default 'test_results_before'
        Folder for the CSV; created if it does not exist.
    lags : int, default 2
        Number of lags passed to ``lingam.VARLiNGAM``.

    Returns
    -------
    dict
        Keys ``"features"``, ``"n_samples"`` and ``"total excution time"``
        (seconds spent in ``model.fit``). The spelling of the last key is
        kept unchanged because it is the column name in saved result files.
    """
    import os  # local import: the notebook only imports selected names from os

    n_samples = X.shape[0]
    n_features = X.shape[1]

    # Prepare the destination before fitting: the fit can run for hours, and a
    # missing folder would otherwise only surface once the work is done.
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, f'causality_matrix_k{k_value}.csv')

    model = lingam.VARLiNGAM(lags=lags)

    # perf_counter is monotonic, so the measurement is immune to clock changes.
    start_time = time.perf_counter()
    model.fit(X)
    execution_time = time.perf_counter() - start_time

    # Sum |coefficient| over every matrix in the stack (axis 0), then binarise.
    summed = np.abs(np.asarray(model.adjacency_matrices_)).sum(axis=0)
    result = (summed > threshold).astype(int)

    pd.DataFrame(result).to_csv(output_path, index=False, header=False)

    return {
        "features": n_features,
        "n_samples": n_samples,
        "total excution time": execution_time
    }


# Define the function that processes all datasets.
def process_all_datasets(output_file, threshold=0, data_path='data/large-data/sp500.csv',
                         k_start=400, k_max=400):
    """Run ``process_dataset`` on growing column subsets and save a summary.

    The CSV at ``data_path`` is loaded once. Starting at ``k_start`` and
    doubling while ``k <= k_max``, the first ``k`` columns are passed to
    ``process_dataset``. The per-run dicts are collected into a DataFrame,
    written to ``output_file`` (no index) and returned. With the defaults
    exactly one run with ``k = 400`` is performed.

    Parameters
    ----------
    output_file : str or path-like (or anything ``DataFrame.to_csv`` accepts)
        Destination of the summary CSV. For string/path destinations the
        parent directory is created if missing.
    threshold : float, default 0
        Forwarded to ``process_dataset``.
    data_path : str, default 'data/large-data/sp500.csv'
        CSV file to load.
    k_start : int, default 400
        First column count to evaluate; must be >= 1.
    k_max : int, default 400
        Largest column count to evaluate (inclusive).

    Returns
    -------
    pandas.DataFrame
        One row per processed ``k``.

    Raises
    ------
    ValueError
        If ``k_start`` is smaller than 1 (doubling would never terminate).
    """
    import os  # local import: the notebook only imports selected names from os

    if k_start < 1:
        raise ValueError("k_start must be a positive integer")

    # Make sure the summary can be written before starting the long runs.
    if isinstance(output_file, (str, os.PathLike)):
        parent_dir = os.path.dirname(os.fspath(output_file))
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    X = pd.read_csv(data_path)

    results = []
    k = k_start
    while k <= k_max:
        # iloc slicing silently yields fewer columns if X has fewer than k.
        X_k = X.iloc[:, :k]
        results.append(process_dataset(X_k, threshold, k_value=k))
        k *= 2

    results_df = pd.DataFrame(results)
    results_df.to_csv(output_file, index=False)

    return results_df


# Destination of the per-run timing summary written by process_all_datasets.
output_file = 'test_results_before/sp500_cpu_before_400.csv'

# Process all datasets and save the results.
# NOTE: the recorded output of this cell reports about 42,698 s for the
# 400-feature run, so re-executing it is expensive.
results_df = process_all_datasets(output_file, threshold=0)
print(results_df)


Estimate VAR coefficients time: 1.9209682941436768 seconds
Measure method: pwling
search causal order Execution time: 41271.6442091465 seconds
estimate adjacency matrix Execution time: 99.44349265098572 seconds
   features  n_samples  total excution time
0       400       2604         42698.192989


In [3]:

# NOTE(review): the results_df built by process_all_datasets in this notebook
# only has the columns "features", "n_samples" and "total excution time";
# no "f1score" column is produced by the visible code. This cell's execution
# count (3) is lower than that of the cell defining results_df (4), so the
# output shown here presumably came from a different results_df held in
# kernel state. Confirm the source before relying on it; on a fresh
# Restart & Run All this lookup is expected to raise KeyError.
print(np.mean(results_df["f1score"]))
print(np.std(results_df["f1score"]))

0.6197515738247849
0.13946696550768203
