In [1]:
import sys
from importlib import metadata

# Record the interpreter and package versions this notebook was run with.
# importlib.metadata (standard library) replaces the deprecated pkg_resources API,
# which emits a deprecation warning on import.

# Get Python version: first whitespace-separated token of sys.version, e.g. "3.11.8"
python_version = sys.version.split(" ")[0]

# Define the packages (distribution names as known to the installer)
packages = ["numpy", "pandas", "scipy", "scikit-learn", "dill", "joblib", "psutil", "pip", "surmise"]

# Get package versions; packages that are not installed are reported rather than raising
package_versions = {}
for pkg in packages:
    try:
        package_versions[pkg] = metadata.version(pkg)
    except metadata.PackageNotFoundError:
        package_versions[pkg] = "Not installed"

print(f"Python version: {python_version}")
for pkg, version in package_versions.items():
    print(f"{pkg}: {version}")

Python version: 3.11.8
numpy: 1.26.4
pandas: 2.2.1
scipy: 1.12.0
scikit-learn: 1.5.1
dill: 0.3.8
joblib: 1.3.2
psutil: 5.9.8
pip: 24.0
surmise: 0.0.0


  import pkg_resources


# Emulator training and testing (no k-fold cross-validation)

In [1]:
import sys, os
sys.path.append(os.path.abspath('../../surmise/emulationmethods'))
sys.path.append(os.path.abspath('../..'))

import numpy as np
import matplotlib.pyplot as plt
import dill
import gzip

# import surmise
from surmise.emulation import emulator
from AKSGP import Emulator as emulator_AKSGP
from PCGP_scikit import Emulator as PCGP_scikit

import logging

# Root logging setup: INFO and above, timestamped, tagged with the emitting logger's name.
LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
LOG_DATEFMT = '%Y-%m-%d %H:%M:%S'

logging.basicConfig(format=LOG_FORMAT, datefmt=LOG_DATEFMT, level=logging.INFO)

# Logger used by the notebook cells themselves
logger = logging.getLogger(__name__)


# Load training data: design points (X) plus the per-point mean and standard
# deviation files (Ymean, Ystd), all read as whitespace-delimited text.
train_dir = 'simulation_data/Pb_Pb_2760_Grad/train'
X, Ymean, Ystd = (
    np.loadtxt(os.path.join(train_dir, fname))
    for fname in ('X.txt', 'Ymean.txt', 'Ystd.txt')
)

# for quick tests: keep only the first `numdesignpt` rows and first `numobs` columns
numdesignpt = 100
numobs = 4
design_rows = slice(numdesignpt)
obs_cols = slice(numobs)
X = X[design_rows, :]
Ymean = Ymean[design_rows, obs_cols]
Ystd = Ystd[design_rows, obs_cols]


in AKSGP emulator class called


### Train and save emulators

In [3]:
# One integer index per column of Ymean (0, 1, ..., n_columns - 1); passed as the
# `x` argument when constructing and querying the surmise emulators.
xloc = np.arange(Ymean.shape[1])  # refers to the observable indices

In [4]:
# Train every emulator listed in `methods` on the training arrays loaded above and
# write each one to 'emulator_<method>.dill.gz' in the current working directory.

methods = ['PCGP', 'PCGP_scikit', 'PCSK', 'AKSGP']  # specify only the emulators you want to train

emus = {}
for method in methods:

    if method in ('PCGP', 'PCSK'):
        # The two surmise methods share one constructor call and differ only in `args`.
        if method == 'PCGP':
            prior = {'min': np.min(Ymean.T), 'max': np.max(Ymean.T)}
            args = {'prior': prior}
        else:
            args = {'simsd': Ystd.T}
        emus[method] = emulator(x=xloc, theta=X, f=Ymean.T, method=method, args=args)

    elif method == 'PCGP_scikit':
        # Register the emulator before fitting so it is in `emus` even if fit() raises.
        pcgp = PCGP_scikit(X=X, Y=Ymean, npc = 10)
        emus[method] = pcgp
        pcgp.fit(nrestarts=10)

    elif method == 'AKSGP':
        aksgp = emulator_AKSGP(X=X, Y_mean=Ymean, Y_std=Ystd)
        emus[method] = aksgp
        aksgp.fit(kernel='AKS', nrestarts=10, seed=None)

    else:
        # Nothing was trained for this name, so there is nothing to save either.
        logger.error(f"Unknown method '{method}'.\n")
        continue

    # Saving the emulators after training; a failed save is logged and the loop moves on.
    filename = f'emulator_{method}.dill.gz'
    try:
        with gzip.open(filename, 'wb') as out_file:
            dill.dump(emus[method], out_file)
        logger.info(f"Emulator '{method}' trained and saved.\n")
    except Exception as err:
        logger.error(f"Failed to save emulator '{method}': {err}\n")
            


2024-09-06 14:17:41 - __main__ - INFO - Emulator 'PCGP' trained and saved.

2024-09-06 14:17:43 - PCGP_scikit - INFO - GP 0 score : 0.9552706589610446
2024-09-06 14:17:43 - PCGP_scikit - INFO - GP 1 score : 0.8651365452599868
2024-09-06 14:17:43 - PCGP_scikit - INFO - GP 2 score : 0.8763665551580173
2024-09-06 14:17:43 - PCGP_scikit - INFO - GP 3 score : 0.37207779030865606
2024-09-06 14:17:43 - __main__ - INFO - Emulator 'PCGP_scikit' trained and saved.

2024-09-06 14:17:46 - __main__ - INFO - Emulator 'PCSK' trained and saved.

2024-09-06 14:17:46 - AKSGP - INFO - Automatic kernel selection opted. Best kernel for each output dimension will be selected from the list of kernels:
   ['Matern12', 'Matern32', 'Matern52', 'RBF']

2024-09-06 14:17:46 - AKSGP - INFO - Shape of training arrays: (90, 17), (90, 4), (90, 4)
2024-09-06 14:17:46 - AKSGP - INFO - Shape of validation arrays: (10, 17), (10, 4), (10, 4)
2024-09-06 14:17:46 - AKSGP - INFO - Training GPs with all available kernels...
20

### Load saved emulators

In [5]:
# Reload previously saved emulators from 'emulator_<method>.dill.gz'.
# Emulators whose file is absent (e.g. only a subset was trained) are skipped with a
# warning instead of aborting the cell; downstream cells then work on what was loaded.
# NOTE(security): dill.load can execute arbitrary code while unpickling; only load
# emulator files that were produced by this notebook or come from a trusted source.
emulators = {}
for method in ['PCGP', 'PCSK', 'AKSGP', 'PCGP_scikit']:
    filename = f'emulator_{method}.dill.gz'
    if not os.path.exists(filename):
        logger.warning(f"Saved emulator file '{filename}' not found; skipping '{method}'.")
        continue
    with gzip.open(filename, 'rb') as f:
        emulators[method] = dill.load(f)

### Compute metrics

In [6]:
import pandas as pd
from metrics import *
from sklearn.preprocessing import StandardScaler

# Load testing data
test_dir = 'simulation_data/Pb_Pb_2760_Grad/test'

X_test = np.loadtxt(os.path.join(test_dir, 'X.txt'))
Ymean_test = np.loadtxt(os.path.join(test_dir, 'Ymean.txt'))
Ystd_test = np.loadtxt(os.path.join(test_dir, 'Ystd.txt'))

# Apply the quick-test truncation to inputs AND outputs. Previously only the outputs
# were cut to `numdesignpt` rows, so a test set with more rows than that would have
# produced predictions (one per row of X_test) that no longer line up with Ymean_test.
X_test = X_test[:numdesignpt]
Ymean_test = Ymean_test[:numdesignpt,:numobs]
Ystd_test = Ystd_test[:numdesignpt,:numobs]

def metrics_cal(means1, var1, means2, var2):
    """Evaluate three Gaussian discrepancy measures element by element.

    For every index (i, j) the pair (means1[i, j], var1[i, j]) is compared with
    (means2[i, j], var2[i, j]) using kl_divergence_gaussian,
    hellinger_distance_gaussian and wasserstein_distance_gaussian, in that order.
    The scalar variances are handed to those helpers through their Cov1/Cov2
    arguments.

    Parameters
    ----------
    means1, var1 : array indexable as [i, j]
        Means and variances of the first set of Gaussians; means1.shape fixes
        the shape of the outputs.
    means2, var2 : array indexable as [i, j]
        Means and variances of the second set; must cover the same indices.

    Returns
    -------
    tuple of numpy.ndarray
        (kl_div, hellinger_dist, wasserstein_dist), each of shape means1.shape.
    """
    out_shape = means1.shape
    kl_div, hellinger_dist, wasserstein_dist = (np.zeros(out_shape) for _ in range(3))

    # Visit every (row, column) entry and score that single pair of 1-D Gaussians
    for row in range(out_shape[0]):
        for col in range(out_shape[1]):
            pair = dict(
                mu1=means1[row, col],
                Cov1=var1[row, col],
                mu2=means2[row, col],
                Cov2=var2[row, col],
            )
            kl_div[row, col] = kl_divergence_gaussian(**pair)
            hellinger_dist[row, col] = hellinger_distance_gaussian(**pair)
            wasserstein_dist[row, col] = wasserstein_distance_gaussian(**pair)

    return kl_div, hellinger_dist, wasserstein_dist


# Standardise the test outputs column by column. The scaler is fitted on the test
# means themselves and is reused below to bring emulator predictions onto the same
# scale; standard deviations are only divided by the per-column scale (no shift).
scaler_Y = StandardScaler().fit(Ymean_test)
Ymean_test = scaler_Y.transform(Ymean_test)
Ystd_test = np.divide(Ystd_test, scaler_Y.scale_)



# Evaluate every loaded emulator on the test set and collect one summary record each.
results = []

# Variance of the standardised test outputs; the same for every emulator, so it is
# computed once instead of on every loop iteration.
Yvar_test = np.square(Ystd_test)

for method, emu in emulators.items():

    if method in {'PCGP', 'PCSK'}:
        # surmise emulators return a prediction object; mean/var are transposed to
        # match the layout of Ymean_test (presumably test point x observable -- TODO confirm).
        pred = emu.predict(x=xloc, theta=X_test)
        predmean = pred.mean().T
        predvar = pred.var().T

    elif method in {'AKSGP', 'PCGP_scikit'}:
        # These emulators return (mean, standard deviation) directly.
        predmean, predstd = emu.predict(X_test)
        predvar = np.square(predstd)

    else:
        # Without this guard an unrecognised emulator would silently be scored with
        # the previous iteration's predictions (or fail with a NameError on the first).
        logger.error(f"Unknown method '{method}'; no metrics computed.\n")
        continue

    # Bring predictions onto the standardised scale of Ymean_test: means are shifted
    # and scaled, standard deviations are only scaled.
    predmean = scaler_Y.transform(predmean)
    predvar = np.square(np.sqrt(predvar)/scaler_Y.scale_)

    print(f"Training cpu time {method}: {emu.traintotalcputime}")
    print(f"Training wall-clock time {method}: {emu.trainwallclocktime}\n")
    print(f"Prediction cpu time {method}: {emu.predicttotalcputime}")
    print(f"Prediction wall-clock time {method}: {emu.predictwallclocktime}\n")

    EC = intervalstats(Ymean_test, predmean, predvar)
    RMSE = rmse(Ymean_test, predmean)
    NRMSE = normalized_rmse(Ymean_test, predmean)
    KLdiv, HD, WD = metrics_cal(predmean, predvar, Ymean_test, Yvar_test)

    # Store the results in the list; values are pre-formatted strings, and the
    # element-wise distances are averaged over all entries.
    results.append({
        'Method': method,
        'Empirical Coverage': ', '.join(['{:.6f}'.format(val) for val in EC]),
        'RMSE': '{:.6f}'.format(RMSE),
        'NRMSE': '{:.6f}'.format(NRMSE),
        'KL Divergence': '{:.6f}'.format(np.mean(KLdiv)),
        'Hellinger Distance': '{:.6f}'.format(np.mean(HD)),
        'Wasserstein Distance': '{:.6f}'.format(np.mean(WD))
    })

# Convert the list of results into a DataFrame
results_df = pd.DataFrame(results)


2024-09-06 14:16:14 - numexpr.utils - INFO - NumExpr defaulting to 8 threads.


Training cpu time AKSGP: 9.17000000000553
Training wall-clock time AKSGP: 2.563896894454956

Prediction cpu time AKSGP: 0.0
Prediction wall-clock time AKSGP: 0.001956939697265625

Training cpu time PCGP_scikit: 6.930000000000291
Training wall-clock time PCGP_scikit: 1.5556249618530273

Prediction cpu time PCGP_scikit: 0.0
Prediction wall-clock time PCGP_scikit: 0.0017108917236328125



In [6]:

# Print the results as a fixed-width text table.
# Header and rows share ONE width specification; previously the rows used different
# widths than the header, so the values drifted out from under their column titles.
table_columns = [
    ('Method', 12),
    ('Empirical Coverage', 27),
    ('RMSE', 14),
    ('NRMSE', 12),
    ('KL Divergence', 15),
    ('Hellinger Distance', 20),
    ('Wasserstein Distance', 20),
]

# Format and print the header
header = ' '.join(f"{name:<{width}}" for name, width in table_columns)
print(header)
print('-' * len(header))

# Format and print each row (all cells in results_df are already formatted strings)
for index, row in results_df.iterrows():
    print(' '.join(f"{row[name]:<{width}}" for name, width in table_columns))


Method       Empirical Coverage          RMSE           NRMSE        KL Divergence   Hellinger Distance   Wasserstein Distance
------------------------------------------------------------------------------------------------------------------------------
PCGP         0.887977, 0.829358        0.278953        0.060936        29.258718         0.622252             0.266622            
PCSK         0.876442, 0.852003        0.292602        0.063949        32.442426         0.631018             0.280686            
AKSGP        0.847410, 0.704204        0.272827        0.059591        24.122800         0.621285             0.254739            
PCGP_scikit  0.972239, 1.452532        0.298755        0.065328        67.216975         0.668979             0.386162            
