In [1]:
import sys
from importlib import metadata

# Record the interpreter version (first whitespace-separated token of sys.version).
python_version = sys.version.split(" ")[0]

# Distributions whose installed versions are reported below.
packages = ["numpy", "pandas", "scipy", "scikit-learn", "dill", "joblib", "psutil", "pip", "surmise"]

# Look up each installed version; packages that are missing are reported as
# "Not installed" instead of aborting the cell.
# importlib.metadata is the standard-library replacement for pkg_resources,
# whose import emits a deprecation warning.
package_versions = {}
for pkg in packages:
    try:
        package_versions[pkg] = metadata.version(pkg)
    except metadata.PackageNotFoundError:
        package_versions[pkg] = "Not installed"

print(f"Python version: {python_version}")
for pkg, version in package_versions.items():
    print(f"{pkg}: {version}")

Python version: 3.11.8
numpy: 1.26.4
pandas: 2.2.1
scipy: 1.12.0
scikit-learn: 1.5.1
dill: 0.3.8
joblib: 1.3.2
psutil: 5.9.8
pip: 24.0
surmise: 0.0.0


  import pkg_resources


# Emulator training and testing: no k-fold

In [2]:
import sys, os
sys.path.append(os.path.abspath('../../surmise/emulationmethods'))
sys.path.append(os.path.abspath('../../surmise'))
sys.path.append(os.path.abspath('../..'))

import numpy as np
import matplotlib.pyplot as plt
import dill
import gzip

from emulation import emulator
from AKSGP import Emulator as emulator_AKSGP
from PCGP_scikit import Emulator as PCGP_scikit

import logging

# Set up root logging so that INFO-level records (including those emitted by
# the emulator modules) are shown with a timestamp, logger name and level.
log_settings = {
    'level': logging.INFO,
    'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    'datefmt': '%Y-%m-%d %H:%M:%S',
}
logging.basicConfig(**log_settings)

# Logger used by the notebook cells themselves.
logger = logging.getLogger(__name__)


# Location of the training design (X) and the simulated output means / standard deviations
train_dir = 'simulation_data/Pb_Pb_2760_Grad/train'

# For quick tests: keep only the first `numdesignpt` design points and the
# first `numobs` observables.
numdesignpt = 300
numobs = 4

# Read each text file and truncate it straight away.
X = np.loadtxt(os.path.join(train_dir, 'X.txt'))[:numdesignpt, :]
Ymean = np.loadtxt(os.path.join(train_dir, 'Ymean.txt'))[:numdesignpt, :numobs]
Ystd = np.loadtxt(os.path.join(train_dir, 'Ystd.txt'))[:numdesignpt, :numobs]


### Train and save emulators

In [3]:
# One integer label per retained observable (column of Ymean); passed to the
# emulators as the `x` locations.
num_observables = Ymean.shape[1]
xloc = np.arange(num_observables)

In [4]:
# Train every emulator listed in `methods` and save each one to
# `emulator_<method>.dill.gz` in the current working directory.

methods = ['PCGP', 'PCGP_scikit', 'PCSK', 'AKSGP']  # specify only the emulators you want to train

emus = {}
for method in methods:

    if method == 'PCGP':
        # Prior bounds are taken from the range of the training means.
        prior = {'min': np.min(Ymean.T), 'max': np.max(Ymean.T)}
        args = {'prior': prior}
        emus[method] = emulator(x=xloc, theta=X, f=Ymean.T, method=method, args=args)

    elif method == 'PCSK':
        # PCSK is additionally given the simulation standard deviations.
        args = {'simsd': Ystd.T}
        emus[method] = emulator(x=xloc, theta=X, f=Ymean.T, method=method, args=args)

    elif method == 'AKSGP':
        emus[method] = emulator_AKSGP(X=X, Y_mean=Ymean, Y_std=Ystd)
        # NOTE(review): seed=None presumably makes the fit non-reproducible
        # between runs; confirm against the AKSGP implementation.
        emus[method].fit(kernel='AKS', nrestarts=10, seed=None)

    elif method == 'PCGP_scikit':
        emus[method] = PCGP_scikit(X=X, Y=Ymean, npc = 10)
        emus[method].fit(nrestarts=10)

    else:
        logger.error(f"Unknown method '{method}'.\n")
        continue

    # Save the trained emulator. The archive is written to a temporary file
    # first and only moved into place once the dump has completed, so a failed
    # save never leaves a truncated 'emulator_<method>.dill.gz' behind for a
    # later load to pick up.
    filename = f'emulator_{method}.dill.gz'
    tmp_filename = f'{filename}.tmp'
    try:
        with gzip.open(tmp_filename, 'wb') as f:
            dill.dump(emus[method], f)
        os.replace(tmp_filename, filename)
        logger.info(f"Emulator '{method}' trained and saved.\n")
    except Exception as e:
        # Best effort: report the failure and carry on with the next method.
        logger.error(f"Failed to save emulator '{method}': {e}\n")
        if os.path.exists(tmp_filename):
            os.remove(tmp_filename)
            


2024-09-09 20:53:35 - __main__ - INFO - Emulator 'PCGP' trained and saved.

2024-09-09 20:53:52 - PCGP_scikit - INFO - GP 0 score : 0.9644108910971223
2024-09-09 20:53:52 - PCGP_scikit - INFO - GP 1 score : 0.8688748887367096
2024-09-09 20:53:52 - PCGP_scikit - INFO - GP 2 score : 0.21184906788210045
2024-09-09 20:53:52 - PCGP_scikit - INFO - GP 3 score : 0.1795463331345818
2024-09-09 20:53:52 - __main__ - INFO - Emulator 'PCGP_scikit' trained and saved.

2024-09-09 20:54:05 - __main__ - INFO - Emulator 'PCSK' trained and saved.

2024-09-09 20:54:05 - AKSGP - INFO - Automatic kernel selection opted. Best kernel for each output dimension will be selected from the list of kernels:
   ['Matern12', 'Matern32', 'Matern52', 'RBF']

2024-09-09 20:54:05 - AKSGP - INFO - Shape of training arrays: (270, 17), (270, 4), (270, 4)
2024-09-09 20:54:05 - AKSGP - INFO - Shape of validation arrays: (30, 17), (30, 4), (30, 4)
2024-09-09 20:54:05 - AKSGP - INFO - Training GPs with all available kernels...

### Load saved emulators

In [5]:
# Load the saved emulators from 'emulator_<method>.dill.gz'.
# Archives that do not exist (e.g. because only a subset of methods was
# trained) are skipped with a warning instead of aborting the cell.
# SECURITY: dill.load, like pickle.load, can execute arbitrary code while
# deserializing -- only load archives that you created yourself.
emulators = {}
for method in ['PCGP', 'PCGP_scikit', 'PCSK', 'AKSGP']:
    filename = f'emulator_{method}.dill.gz'
    if not os.path.exists(filename):
        logger.warning(f"Saved emulator '{filename}' not found; skipping '{method}'.")
        continue
    with gzip.open(filename, 'rb') as f:
        emulators[method] = dill.load(f)

### Compute metrics

In [6]:
import pandas as pd
from metrics import *
from sklearn.preprocessing import StandardScaler

# Load testing data
test_dir = 'simulation_data/Pb_Pb_2760_Grad/test'

X_test = np.loadtxt(os.path.join(test_dir, 'X.txt'))
Ymean_test = np.loadtxt(os.path.join(test_dir, 'Ymean.txt'))
Ystd_test = np.loadtxt(os.path.join(test_dir, 'Ystd.txt'))

# Apply the quick-test truncation to the inputs as well as the outputs, so that
# X_test keeps one row per row of Ymean_test / Ystd_test even when the test
# set holds more than `numdesignpt` points.
X_test = X_test[:numdesignpt, :]
Ymean_test = Ymean_test[:numdesignpt,:numobs]
Ystd_test = Ystd_test[:numdesignpt,:numobs]

def metrics_cal(means1, var1, means2, var2):
    """Evaluate three Gaussian discrepancy measures entry by entry.

    For every index ``(i, j)`` of ``means1`` the scalars ``means1[i, j]``,
    ``var1[i, j]``, ``means2[i, j]`` and ``var2[i, j]`` are passed (as ``mu1``,
    ``Cov1``, ``mu2``, ``Cov2``) to ``kl_divergence_gaussian``,
    ``hellinger_distance_gaussian`` and ``wasserstein_distance_gaussian``.

    Parameters
    ----------
    means1, var1, means2, var2 : 2-D arrays
        Indexed with ``[i, j]``; ``means1`` determines the loop bounds, so the
        other three must cover at least the same index range.

    Returns
    -------
    tuple of numpy.ndarray
        ``(kl_div, hellinger_dist, wasserstein_dist)``, each with the shape of
        ``means1``.
    """
    n_rows, n_cols = means1.shape[0], means1.shape[1]

    # Output arrays, one per measure
    kl_div = np.zeros(means1.shape)
    hellinger_dist = np.zeros(means1.shape)
    wasserstein_dist = np.zeros(means1.shape)

    # Visit every (row, column) position in row-major order
    for idx in np.ndindex(n_rows, n_cols):
        pair = {
            'mu1': means1[idx],
            'Cov1': var1[idx],
            'mu2': means2[idx],
            'Cov2': var2[idx],
        }
        kl_div[idx] = kl_divergence_gaussian(**pair)
        hellinger_dist[idx] = hellinger_distance_gaussian(**pair)
        wasserstein_dist[idx] = wasserstein_distance_gaussian(**pair)

    return kl_div, hellinger_dist, wasserstein_dist


# Fit the scaler on the test means, standardize them with it, and divide the
# test standard deviations by the same per-column scale factors.
scaler_Y = StandardScaler().fit(Ymean_test)
Ymean_test = scaler_Y.transform(Ymean_test)
Ystd_test = Ystd_test / scaler_Y.scale_



# The test-set variances do not depend on the emulator, so compute them once.
Yvar_test = np.square(Ystd_test)

# Initialize a list to store the results (one dict per emulator)
results = []

for method, emu in emulators.items():

    if method in {'PCGP', 'PCSK'}:
        # These emulators are queried with (x, theta); their mean/variance are
        # transposed, mirroring the `f=Ymean.T` orientation used at training.
        pred = emu.predict(x=xloc, theta=X_test)
        predmean = pred.mean().T
        predvar = pred.var().T

    elif method in {'AKSGP', 'PCGP_scikit'}:
        # These emulators return (mean, standard deviation) directly.
        predmean, predstd = emu.predict(X_test)
        predvar = np.square(predstd)

    else:
        # Without this guard an unrecognised emulator would silently be scored
        # with the previous iteration's predictions.
        logger.error(f"Unknown method '{method}'.\n")
        continue

    # Bring the predictions onto the same standardized scale as Ymean_test.
    predmean = scaler_Y.transform(predmean)
    predvar = np.square(np.sqrt(predvar)/scaler_Y.scale_)

    EC = intervalstats(Ymean_test, predmean, predvar)
    RMSE = rmse(Ymean_test, predmean)
    NRMSE = normalized_rmse(Ymean_test, predmean)
    KLdiv, HD, WD = metrics_cal(predmean, predvar, Ymean_test, Yvar_test)

    # Store the results in the list. Values are pre-formatted strings because
    # the summary table prints them with fixed-width string alignment.
    results.append({
        'Method': method,
        '95% Coverage': '{:.6f}'.format(EC[0]),
        'PI Width': '{:.6f}'.format(EC[1]),
        'RMSE': '{:.6f}'.format(RMSE),
        'NRMSE': '{:.6f}'.format(NRMSE),
        'KL Divergence': '{:.6f}'.format(np.mean(KLdiv)),
        'Hellinger Distance': '{:.6f}'.format(np.mean(HD)),
        'Wasserstein Distance': '{:.6f}'.format(np.mean(WD)),
        'Training time (s)': '{:.6f}'.format(emu.trainwallclocktime),
        'Prediction time (s)': '{:.6f}'.format(emu.predictwallclocktime),
    })

# Convert the list of results into a DataFrame
results_df = pd.DataFrame(results)


2024-09-09 20:54:27 - numexpr.utils - INFO - NumExpr defaulting to 8 threads.


In [7]:
# Emulator names, in the row order of the results table; they become the
# column headings of the transposed printout.
methods = results_df['Method'].values

# Header: a 30-character label column followed by 15-character method columns
header = f"{'Metric':<30}" + " ".join(f"{name:<15}" for name in methods)
separator = '-' * len(header)
print(header)
print(separator)

# Metrics to print, one row each
metrics = [
    '95% Coverage',
    'PI Width',
    'RMSE',
    'NRMSE',
    'KL Divergence',
    'Hellinger Distance',
    'Wasserstein Distance',
    'Training time (s)',
    'Prediction time (s)',
]

for metric in metrics:
    cells = [f"{results_df.loc[pos, metric]:<15}" for pos, _ in enumerate(methods)]
    print(f"{metric:<30}" + " ".join(cells))

    # A rule separates the accuracy metrics from the timing rows
    if metric == 'Wasserstein Distance':
        print(separator)

print(separator)

Metric                        PCGP            PCGP_scikit     PCSK            AKSGP          
---------------------------------------------------------------------------------------------
95% Coverage                  0.908602        0.967742        0.873656        0.884409       
PI Width                      0.727235        0.952633        0.811478        0.749150       
RMSE                          0.215053        0.243941        0.248784        0.229805       
NRMSE                         0.051420        0.058327        0.059494        0.055017       
KL Divergence                 20.942500       32.780621       29.081159       24.569754      
Hellinger Distance            0.675628        0.707023        0.696260        0.681411       
Wasserstein Distance          0.235710        0.295790        0.270833        0.249846       
---------------------------------------------------------------------------------------------
Training time (s)             7.761672        16.943299     