# Sensitivity Analysis

## Using AutoEmulate

Start by importing the necessary modules
* To use AutoEmulate on an M1 Mac, you need to install LightGBM via conda: `conda install lightgbm`

 * Alternatively, install the AutoEmulate branch that does not depend on LightGBM directly from GitHub: `pip install git+https://github.com/alan-turing-institute/autoemulate.git@remove-lightgbm`

In [50]:
%load_ext autoreload 
%autoreload 2
import joblib
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
from autoemulate.compare import AutoEmulate
from SALib.sample import saltelli
from SALib.analyze import sobol

!pwd

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
/Users/aalexander-ikwue/Documents/GitHub/SensitivityAnalysis/Tutorials


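Specify the number of samples (`n_runs`) to be used within the emulator. This value also selects which input and output files are loaded below, because it is part of their file names.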

In [63]:
# Number of runs; this value is interpolated into the data file and directory names used below.
n_runs = 50

# Directory the notebook is executed from; the results written at the end are placed under it.
main_path = os.getcwd()


'/Users/aalexander-ikwue/Documents/GitHub/SensitivityAnalysis/Tutorials'

Load the input dataset and the previously saved objects (the scaler and the PCA-reduced outputs), and then match the indices of the PCA-reduced data (`y`) with the original input data (`X`) so that both contain the same rows.

In [52]:
# Input-parameter table for the chosen number of runs, indexed by its "Index" column.
input_X = f"../Tutorials/data/input/input_parameters_{n_runs}.csv"
x_data_frame = pd.read_csv(input_X, index_col="Index")

# Keep every column up to and including the first one named 'T';
# if no column is named 'T', all columns are kept.
all_columns = list(x_data_frame.columns)
n_keep = all_columns.index('T') + 1 if 'T' in all_columns else len(all_columns)
relevant_columns = all_columns[:n_keep]

# NOTE: joblib.load unpickles arbitrary Python objects, so these files must come
# from a trusted source.
pca_dir = f'../Tutorials/data/output_pca_{n_runs}'
scaler_loaded = joblib.load(f'{pca_dir}/scaler.pkl')
pca_loaded = joblib.load(f'{pca_dir}/pca.pkl')

# The object stored in pca.pkl is used directly as the emulation target.
# Its `.index` selects the matching rows of the input table.
y = pca_loaded

inputs_subset = x_data_frame[relevant_columns]
X = inputs_subset.loc[y.index]

In [53]:
# Directory in which the fitted emulators are saved.
output_emulation = f'../Tutorials/data/output_emulation_{n_runs}'

# Create the directory (and any missing parents) with the standard library instead of
# shelling out to `mkdir -p`: this works on every platform, is not affected by spaces
# or shell metacharacters in the path, and raises an exception if creation fails
# rather than returning an exit status that is easy to overlook.
os.makedirs(output_emulation, exist_ok=True)

0

In [54]:
# Emulator families that are compared for every output component.
candidate_models = (
    "NeuralNetSk",
    "SecondOrderPolynomial",
    "SupportVectorMachines",
    "GradientBoosting",
    "RandomForest",
)

# Fit, compare and save one emulator per column of y.
for i in range(y.shape[1]):
    print(f' Component {i + 1}')

    # Single output column used as the target for this emulator.
    target = y.values[:, i]

    em = AutoEmulate()
    em.setup(
        X,
        target,
        param_search=True,
        param_search_type='bayes',
        param_search_iters=5,
        model_subset=list(candidate_models),
    )

    # The value returned by compare() is superseded just below by the refitted model.
    best_model = em.compare()
    em.print_results()
    best_model = em.refit_model(em.best_model)

    # Saved files are numbered from 0, whereas the printed component label starts at 1.
    em.save_model(best_model, path=f"{output_emulation}/best_emulator_component_{i}")


 Component 1


Unnamed: 0,Values
Simulation input shape (X),"(39, 17)"
Simulation output shape (y),"(39,)"
# hold-out set samples (test_set_size),8
Do hyperparameter search (param_search),True
Type of hyperparameter search (search_type),bayes
# sampled parameter settings (param_search_iters),5
Scale data before fitting (scale),True
Scaler (scaler),StandardScaler
Dimensionality reduction before fitting (reduce_dim),False
Dimensionality reduction method (dim_reducer),PCA


Initializing:   0%|          | 0/5 [00:00<?, ?it/s]

Unnamed: 0,model,short,r2,rmse
0,SupportVectorMachines,svm,0.0084,9.1153
1,NeuralNetSk,nns,-0.0884,7.589
2,SecondOrderPolynomial,sop,-0.1257,9.7283
3,GradientBoosting,gb,-0.5457,10.9619
4,RandomForest,rf,-1.7026,9.4447


 Component 2


Unnamed: 0,Values
Simulation input shape (X),"(39, 17)"
Simulation output shape (y),"(39,)"
# hold-out set samples (test_set_size),8
Do hyperparameter search (param_search),True
Type of hyperparameter search (search_type),bayes
# sampled parameter settings (param_search_iters),5
Scale data before fitting (scale),True
Scaler (scaler),StandardScaler
Dimensionality reduction before fitting (reduce_dim),False
Dimensionality reduction method (dim_reducer),PCA


Initializing:   0%|          | 0/5 [00:00<?, ?it/s]

Unnamed: 0,model,short,r2,rmse
0,NeuralNetSk,nns,0.3094,1.0691
1,SupportVectorMachines,svm,0.1335,1.3276
2,SecondOrderPolynomial,sop,-0.1114,1.4212
3,RandomForest,rf,-0.4745,1.4018
4,GradientBoosting,gb,-0.6329,1.5138


 Component 3


Unnamed: 0,Values
Simulation input shape (X),"(39, 17)"
Simulation output shape (y),"(39,)"
# hold-out set samples (test_set_size),8
Do hyperparameter search (param_search),True
Type of hyperparameter search (search_type),bayes
# sampled parameter settings (param_search_iters),5
Scale data before fitting (scale),True
Scaler (scaler),StandardScaler
Dimensionality reduction before fitting (reduce_dim),False
Dimensionality reduction method (dim_reducer),PCA


Initializing:   0%|          | 0/5 [00:00<?, ?it/s]

Unnamed: 0,model,short,r2,rmse
0,SupportVectorMachines,svm,-0.0591,1.1374
1,RandomForest,rf,-0.139,1.1786
2,SecondOrderPolynomial,sop,-0.4057,1.3844
3,NeuralNetSk,nns,-0.5376,1.1846
4,GradientBoosting,gb,-0.5911,1.2195


 Component 4


Unnamed: 0,Values
Simulation input shape (X),"(39, 17)"
Simulation output shape (y),"(39,)"
# hold-out set samples (test_set_size),8
Do hyperparameter search (param_search),True
Type of hyperparameter search (search_type),bayes
# sampled parameter settings (param_search_iters),5
Scale data before fitting (scale),True
Scaler (scaler),StandardScaler
Dimensionality reduction before fitting (reduce_dim),False
Dimensionality reduction method (dim_reducer),PCA


Initializing:   0%|          | 0/5 [00:00<?, ?it/s]

Unnamed: 0,model,short,r2,rmse
0,NeuralNetSk,nns,0.2339,0.6596
1,GradientBoosting,gb,-0.0094,0.7606
2,RandomForest,rf,-0.1535,0.7072
3,SupportVectorMachines,svm,-0.2953,0.7756
4,SecondOrderPolynomial,sop,-0.3716,0.8478


In [55]:
# Reload the saved emulators for components 0-3, in order.
# NOTE: joblib.load unpickles arbitrary Python objects; only load files you trust.
emulator_dir = f'../Tutorials/data/output_emulation_{n_runs}'
pca1_emulator, pca2_emulator, pca3_emulator, pca4_emulator = [
    joblib.load(f'{emulator_dir}/best_emulator_component_{k}')
    for k in range(4)
]

In [56]:
# Path of the input-parameter CSV for the selected number of runs.
input_X = f"../Tutorials/data/input/input_parameters_{n_runs}.csv"

# Read the table, using its "Index" column as the row index.
input_data = pd.read_csv(
    input_X,
    index_col="Index",
)

In [57]:
# Keep every column up to and including the first one named 'T';
# if no column is named 'T', all columns are kept.
input_columns = list(input_data.columns)
if 'T' in input_columns:
    n_relevant = input_columns.index('T') + 1
else:
    n_relevant = len(input_columns)
relevant_columns = input_columns[:n_relevant]

In [58]:
# describe() reports per-column summary statistics; only its 'min' and 'max' rows are kept.
column_ranges = input_data[relevant_columns].describe().loc[['min', 'max']]

# Problem definition passed to the SALib sampler and analyser:
# number of variables, their names, and one [min, max] row per described column.
problem = dict(
    num_vars=len(relevant_columns),
    names=relevant_columns,
    bounds=column_ranges.T.values,
)

In [59]:
# Base sample size N handed to the Saltelli sampler. With second-order terms enabled,
# SALib documents the result as having N * (2 * num_vars + 2) rows.
n_base_samples = 1024
param_values = saltelli.sample(problem, n_base_samples, calc_second_order=True)

# Display the shape of the generated sample matrix.
param_values.shape

(36864, 17)

In [60]:
# Evaluate each emulator on the sampled parameter sets, keeping the component order.
Y_pca1, Y_pca2, Y_pca3, Y_pca4 = [
    emulator.predict(param_values)
    for emulator in (pca1_emulator, pca2_emulator, pca3_emulator, pca4_emulator)
]

In [73]:
# Run the Sobol analysis (including second-order terms) on each emulator's predictions.
# The list order follows the component order.
sobol_indices_pca = [
    sobol.analyze(problem, emulator_output, calc_second_order=True)
    for emulator_output in (Y_pca1, Y_pca2, Y_pca3, Y_pca4)
]

In [75]:
# Write the Sobol indices of every analysed component to CSV files under
# <main_path>/results/pca<k>/, where k is counted from 1.
# Iterating over the list itself (instead of a hard-coded range(4)) keeps this cell
# correct if the number of analysed components changes.
for i, component_indices in enumerate(sobol_indices_pca):
    result_dir = f'{main_path}/results/pca{i+1}'

    # os.makedirs replaces `os.system('mkdir -p ...')`: the shell version splits the
    # path on spaces (main_path is the current working directory, which may contain
    # them), is not portable, and fails silently.
    os.makedirs(result_dir, exist_ok=True)

    # First-order indices (S1), sorted from largest to smallest.
    S1 = pd.DataFrame(component_indices['S1'], index=relevant_columns, columns=['S1'])
    S1 = S1.sort_values('S1', ascending=False)
    S1.to_csv(f'{result_dir}/s1_{n_runs}.csv')

    # Total-order indices (ST), sorted from largest to smallest.
    ST = pd.DataFrame(component_indices['ST'], index=relevant_columns, columns=['ST'])
    ST = ST.sort_values('ST', ascending=False)
    ST.to_csv(f'{result_dir}/st_{n_runs}.csv')

    # Cumulative sum of the sorted ST values, normalised by its final value so the
    # last row equals 1.
    ST_running_total = ST.cumsum()
    ST_cumsum = ST_running_total / ST_running_total.iloc[-1]
    ST_cumsum.to_csv(f'{result_dir}/st_cumsum_{n_runs}.csv')