In [1]:
import numpy as np
import matplotlib.pyplot as plt
import os
import urllib.request
from matplotlib.font_manager import fontManager
import matplotlib
import pandas as pd
import properscoring as ps
from scipy.stats import t
import gc

In [15]:
def calculate_metrics(y_true, y_pred_median, y_pred_samples):
    """Compute point-forecast errors and CRPS, overall and per horizon step.

    Args:
        y_true: Ground-truth array. It is sliced as ``y_true[:, i, :]``, so it
            is 3-D with the forecast step on axis 1.
        y_pred_median: Point forecast, indexed the same way as ``y_true``.
        y_pred_samples: 4-D sample array sliced as ``[:, :, i, :]``; axis 1 is
            moved to the front and handed to ``ps.crps_ensemble`` with
            ``axis=0`` (presumably the ensemble-member axis - confirm against
            the producer of the samples). Must offer ``.astype``.

    Returns:
        dict with float entries ``'mae'``, ``'rmse'``, ``'rae'``, ``'mape'``
        (in percent; ``0.0`` when every target is zero), ``'crps'`` and a
        ``'horizon_metrics'`` list holding one record per forecast step with
        keys ``Horizon`` (``'t+1'``, ``'t+2'``, ...), ``MAE``, ``RMSE``,
        ``RAE``, ``MAPE`` and ``CRPS``.
    """
    # NOTE(review): float16 keeps only ~3 significant digits and overflows
    # above 65504; presumably chosen to reduce memory - confirm this is
    # acceptable for the CRPS values.
    samples_f16 = y_pred_samples.astype(np.float16)

    def point_scores(truth, median):
        """Return (mae, rmse, rae, mape) of ``median`` against ``truth``."""
        residual = median - truth
        abs_residual = np.abs(residual)
        mae = float(np.mean(abs_residual))
        rmse = float(np.sqrt(np.mean(residual**2)))
        # Relative absolute error against the mean predictor; the epsilon
        # guards against a constant target.
        spread = np.sum(np.abs(np.mean(truth) - truth))
        rae = float(np.sum(abs_residual) / (spread + 1e-9))
        # MAPE is only defined where the target is non-zero.
        nonzero = truth != 0
        if np.any(nonzero):
            mape = float(np.mean(np.abs((median[nonzero] - truth[nonzero]) / truth[nonzero])) * 100)
        else:
            mape = 0.0
        return mae, rmse, rae, mape

    def mean_crps(truth, members_first):
        """Average ensemble CRPS with the member axis leading."""
        return float(ps.crps_ensemble(truth, members_first, axis=0).mean())

    overall_mae, overall_rmse, overall_rae, overall_mape = point_scores(y_true, y_pred_median)
    metrics = {
        'mae': overall_mae,
        'rmse': overall_rmse,
        'rae': overall_rae,
        'mape': overall_mape,
    }
    gc.collect()
    metrics['crps'] = mean_crps(y_true, samples_f16.transpose(1, 0, 2, 3))

    # One row per forecast step (axis 1 of the targets).
    rows = []
    for step in range(y_true.shape[1]):
        truth_step = y_true[:, step, :]
        median_step = y_pred_median[:, step, :]
        samples_step = samples_f16[:, :, step, :]
        step_scores = point_scores(truth_step, median_step)
        step_crps = mean_crps(truth_step, samples_step.transpose(1, 0, 2))
        rows.append([f't+{step+1}', *step_scores, step_crps])

    horizon_table = pd.DataFrame(rows, columns=['Horizon', 'MAE', 'RMSE', 'RAE', 'MAPE', 'CRPS'])
    metrics['horizon_metrics'] = horizon_table.to_dict('records')
    return metrics

def print_metrics(metrics):
    """Print a metrics dictionary to stdout as a plain-text report.

    Args:
        metrics: Mapping that must contain ``'mae'``, ``'rmse'``, ``'rae'``,
            ``'mape'`` and ``'crps'``. If ``'horizon_metrics'`` is present it
            is wrapped in a DataFrame and printed as a table. If ``'dm_stat'``
            is present, ``'p_value'`` is read as well and a Diebold-Mariano
            summary is printed with a conclusion based on ``p_value < 0.05``.

    Returns:
        None.
    """

    def report_items():
        # Items are produced lazily, so each value is looked up only when its
        # line is about to be printed.
        yield "\n--- Overall Metrics ---"
        yield f"MAE:  {metrics['mae']:.4f}"
        yield f"RMSE: {metrics['rmse']:.4f}"
        yield f"RAE:  {metrics['rae']:.4f}"
        yield f"MAPE: {metrics['mape']:.2f}%"
        yield f"CRPS: {metrics['crps']:.4f}"
        yield "-----------------------\n"

        if 'horizon_metrics' in metrics:
            horizon_table = pd.DataFrame(metrics['horizon_metrics'])
            yield "--- Horizon-wise Metrics ---"
            yield horizon_table
            yield "----------------------------\n"

        if 'dm_stat' not in metrics:
            return

        yield "--- Significance Test (Diebold-Mariano) ---"
        yield "Comparing Your Model against Naive Baseline:"
        yield f"DM Statistic: {metrics['dm_stat']:.4f}, P-value: {metrics['p_value']:.7f}"
        is_significant = metrics['p_value'] < 0.05
        yield (
            "Conclusion: Your model is STATISTICALLY SIGNIFICANTLY BETTER than the Naive baseline (p < 0.05)."
            if is_significant
            else "Conclusion: No statistical evidence that your model is better than the Naive baseline (p >= 0.05)."
        )
        yield "--------------------------------------------\n"

    for item in report_items():
        print(item)

def dm_test(e1, e2, h=12, crit="MAD"):
    """One-sided Diebold-Mariano test on two series of forecast errors.

    The loss differential is ``d = L(e1) - L(e2)``, where ``L`` is the
    absolute error (``crit="MAD"``) or the squared error (``crit="MSE"``).
    The statistic is ``mean(d) / sqrt(var_d / n)``. ``var_d`` is the lag-0
    autocovariance of ``d`` plus twice the autocovariances at lags
    ``1 .. h-1``, each weighted by ``1 - lag / (h + 1)``, floored at
    ``1e-12``.

    Args:
        e1: 1-D array-like of forecast errors of the first model.
        e2: 1-D array-like of forecast errors of the second model, same
            length as ``e1``.
        h: Forecast horizon; autocovariances up to lag ``h - 1`` enter the
            variance. Lags beyond ``n - 1`` contribute nothing, so ``h`` may
            exceed the series length.
        crit: ``"MAD"`` or ``"MSE"``.

    Returns:
        ``(dm_stat, p_value)`` as floats. ``p_value`` is the upper-tail
        probability ``P(T > dm_stat)`` of a Student t distribution with
        ``n - 1`` degrees of freedom, so a small p-value means the loss of
        ``e1`` is larger than the loss of ``e2``.

    Raises:
        ValueError: If ``crit`` is not supported, if the inputs are not 1-D
            series of equal length, or if fewer than two observations are
            given.
    """
    # NOTE(review): because the p-value is the upper tail, reading
    # "p < 0.05" as "the model beats the baseline" only holds when the
    # baseline errors are passed as e1 - confirm the argument order at the
    # call site.
    e1 = np.asarray(e1, dtype=np.float64)
    e2 = np.asarray(e2, dtype=np.float64)
    if e1.ndim != 1 or e2.ndim != 1:
        raise ValueError(
            f"dm_test expects 1-D error series, got shapes {e1.shape} and {e2.shape}."
        )
    if e1.shape != e2.shape:
        raise ValueError(
            f"Error series must have the same length, got {e1.shape[0]} and {e2.shape[0]}."
        )
    n = e1.shape[0]
    if n < 2:
        raise ValueError("dm_test needs at least two observations.")

    if crit == "MAD":
        d = np.abs(e1) - np.abs(e2)
    elif crit == "MSE":
        d = np.square(e1) - np.square(e2)
    else:
        # Reject unsupported loss criteria explicitly.
        raise ValueError(f"Unknown criterion: {crit}. Supported values are 'MAD' or 'MSE'.")

    d_mean = np.mean(d)
    d_centered = d - d_mean

    # Only lags 0 .. h-1 are needed, so compute those autocovariances directly
    # (O(n*h)) instead of a full cross-correlation (O(n^2)). The biased
    # estimator (division by n) is zero for lags >= n, hence the clamp.
    max_lag = min(int(h) - 1, n - 1)
    var_d = np.dot(d_centered, d_centered) / n
    # NOTE(review): the weights use h + 1 in the denominator while the sum
    # stops at lag h - 1; kept as-is to preserve existing results - confirm
    # the intended kernel/truncation.
    for lag in range(1, max_lag + 1):
        acov_lag = np.dot(d_centered[lag:], d_centered[:n - lag]) / n
        var_d += 2 * (1 - lag / (h + 1)) * acov_lag
    # Guard against a zero or negative variance estimate.
    var_d = max(var_d, 1e-12)

    dm_stat = d_mean / np.sqrt(var_d / n)
    # The survival function keeps precision for tiny tail probabilities where
    # 1 - cdf would round to exactly 0.
    p_value = t.sf(dm_stat, df=n - 1)
    return float(dm_stat), float(p_value)



In [21]:

# Artifacts of one evaluation run. The run tag is defined once so the point
# predictions and the sample array always come from the same run.
RESULTS_DIR = "results"
RUN_TAG = "20251220_223951_best_post_test"

model_predictions = np.load(os.path.join(RESULTS_DIR, f"pred_{RUN_TAG}.npy"))
ground_truths = np.load(os.path.join(RESULTS_DIR, "truths.npy"))
model_samples = np.load(os.path.join(RESULTS_DIR, f"samples_{RUN_TAG}.npy"))

# Disabled baseline comparison, kept for reference. As written it would load
# baseline predictions, rotate their last axis by one position (last slice
# moved to the front), and truncate baseline and model arrays to each
# other's length along axis 0.
# NOTE(review): presumably the input for a Diebold-Mariano comparison - confirm
# before re-enabling.
# baseline_predictions = np.load("urbanev/TimeXer_predictions.npy")
# baseline_predictions = np.concatenate([baseline_predictions[:, :, -1:], baseline_predictions], axis=-1)[:, :, :-1]
# baseline_predictions = baseline_predictions[:model_predictions.shape[0]]
# model_predictions = model_predictions[:baseline_predictions.shape[0]]
# model_samples = model_samples[:baseline_predictions.shape[0]]


In [22]:
# Score the loaded run against the ground truth and print the report.
final_metrics = calculate_metrics(
    y_true=ground_truths,
    y_pred_median=model_predictions,
    y_pred_samples=model_samples,
)
print_metrics(metrics=final_metrics)


--- Overall Metrics ---
MAE:  0.0234
RMSE: 0.0388
RAE:  0.1885
MAPE: 13.92%
CRPS: 0.0183
-----------------------

--- Horizon-wise Metrics ---
   Horizon       MAE      RMSE       RAE       MAPE      CRPS
0      t+1  0.025992  0.041488  0.209052  16.393502  0.020389
1      t+2  0.022469  0.036165  0.180751  14.324091  0.017875
2      t+3  0.021760  0.035450  0.175055  13.647938  0.017225
3      t+4  0.022517  0.037053  0.181142  13.903193  0.017770
4      t+5  0.022939  0.038178  0.184530  13.859428  0.018124
5      t+6  0.023290  0.039152  0.187380  13.909088  0.018372
6      t+7  0.023081  0.038767  0.185713  13.534498  0.018141
7      t+8  0.022591  0.037889  0.181772  12.939841  0.017565
8      t+9  0.021032  0.034953  0.169206  12.154234  0.016319
9     t+10  0.021024  0.034413  0.169107  11.977214  0.016207
10    t+11  0.023839  0.039159  0.191690  13.378477  0.018450
11    t+12  0.030655  0.050583  0.246400  16.965298  0.023538
----------------------------




--- Overall Metrics ---
MAE:  0.0073
RMSE: 0.0155
RAE:  0.0590
MAPE: 4.23%
CRPS: 0.0054
-----------------------

--- Horizon-wise Metrics ---
   Horizon       MAE      RMSE       RAE      MAPE      CRPS
0      t+1  0.007375  0.014278  0.059321  4.464262  0.005396
1      t+2  0.008380  0.017637  0.067415  4.811353  0.006177
2      t+3  0.008010  0.017471  0.064443  4.670246  0.005896
3      t+4  0.008270  0.018171  0.066532  4.691518  0.006078
4      t+5  0.007972  0.017354  0.064132  4.523544  0.005842
5      t+6  0.008014  0.017209  0.064478  4.481112  0.005815
6      t+7  0.007687  0.016377  0.061850  4.287670  0.005569
7      t+8  0.007440  0.015899  0.059865  4.197359  0.005423
8      t+9  0.006854  0.014420  0.055142  3.923718  0.004996
9     t+10  0.006715  0.013969  0.054017  3.874601  0.004912
10    t+11  0.006176  0.012120  0.049665  3.631556  0.004543
11    t+12  0.005160  0.008646  0.041478  3.203855  0.003759
----------------------------

