In [None]:
from gsa_framework.models.test_functions import Morris4
from gsa_framework.sensitivity_analysis.correlations import Correlations
from gsa_framework.sensitivity_analysis.saltelli_sobol import SaltelliSobol
from gsa_framework.sensitivity_analysis.gradient_boosting import GradientBoosting
from gsa_framework.sensitivity_analysis.delta import Delta
from gsa_framework.convergence_robustness_validation import Validation
from gsa_framework.convergence_robustness_validation import Convergence
from pathlib import Path
import time
from gsa_framework.utils import read_hdf5_array

if __name__ == "__main__":

    path_base = Path('/data/user/kim_a/paper_gsa/')

    # 1. Models
    num_params = 5000
    num_influential = num_params // 100
    iterations_validation = 2000
    write_dir = path_base / "{}_morris4".format(num_params)
    model = Morris4(num_params=num_params, num_influential=num_influential)
#     gsa_seed = 3407
    gsa_seed = 6000814
    validation_seed = 7043
    num_influential_validation = 2*num_influential

    fig_format = ["pickle"]  # can have elements "pdf", "html", "pickle"

    # TODO Choose which GSA to perform
    flag_sobol = 0
    flag_correlation = 1
    flag_xgboost = 0
    flag_delta = 0

    if flag_sobol:
        # Sobol branch: the sample budget is 100 model runs per parameter.
        iterations = 100 * num_params
        # NOTE(review): unlike the other branches of this cell, no `seed` is passed here -- confirm this is intended.
        gsa = SaltelliSobol(iterations=iterations, model=model, write_dir=write_dir)
        # S_dict = gsa.generate_gsa_indices()
        S_dict = gsa.perform_gsa()
        first = S_dict["First order"]
        total = S_dict["Total order"]
#         gsa.plot_sa_results(
#             S_dict,
#             S_dict_analytical=model.S_dict_analytical,
#             fig_format=fig_format,
#         )

        # Validation step: uses the total-order indices and `num_influential_validation`
        # (twice `num_influential`) to obtain `influential_Y`, and times the whole step.
        t0 = time.time()
        val = Validation(
            model=model,
            iterations=iterations_validation,
            seed=validation_seed,
            default_x_rescaled=None,
            write_dir=write_dir,
        )
        tag = "TotalIndex"
        influential_Y = val.get_influential_Y_from_gsa(total, num_influential_validation, tag=tag)
        t1 = time.time()
        print("Total validation time  -> {:8.3f} s \n".format(t1 - t0))
        val.plot_histogram_Y_all_Y_inf(
            influential_Y, num_influential_validation, tag=tag, fig_format=fig_format
        )
        
#         conv = Convergence(
#             gsa.filepath_Y,
#             gsa.num_params,
#             gsa.generate_gsa_indices,
#             gsa.gsa_label,
#             write_dir,
#             num_steps=100,
#         )
#         conv.run_convergence(parameter_inds=parameter_inds, fig_format=fig_format)

    if flag_correlation:
        # Correlation branch: 4 model runs per parameter, sampled with `gsa_seed`.
        iterations = 4 * num_params
        gsa = Correlations(
            iterations=iterations,
            model=model,
            write_dir=write_dir,
            seed=gsa_seed,
        )
        S_dict = gsa.perform_gsa()
        # Both coefficient sets are extracted; the (currently commented-out) validation
        # below this branch is written against `spearman`.
        pearson = S_dict["pearson"]
        spearman = S_dict["spearman"]
#         gsa.plot_sa_results(S_dict, S_boolean=model.S_boolean, fig_format=fig_format)

#         t0 = time.time()
#         val = Validation(
#             model=model,
#             iterations=iterations_validation,
#             seed=validation_seed,
#             default_x_rescaled=None,
#             write_dir=write_dir,
#         )
#         tag = "SpearmanIndex"
#         influential_Y = val.get_influential_Y_from_gsa(
#             spearman, num_influential_validation, tag=tag
#         )
#         t1 = time.time()
#         print("Total validation time  -> {:8.3f} s \n".format(t1 - t0))
#         val.plot_histogram_Y_all_Y_inf(
#             influential_Y, num_influential_validation, tag=tag, fig_format=fig_format
#         )

#         conv = Convergence(
#             gsa.filepath_Y,
#             gsa.num_params,
#             gsa.generate_gsa_indices,
#             gsa.gsa_label,
#             write_dir,
#             num_steps=100,
#         )
#         conv.run_convergence(
#             parameter_inds=parameter_inds,
#             fig_format=fig_format,
#         )

    if flag_delta:
        # Delta branch: 8 model runs per parameter, no bootstrap resampling.
        iterations = 8 * num_params
        num_resamples = 0
        # This cell imports `Delta` (not `DeltaMoment`), so that is the class that must be
        # instantiated here; the previous name raised NameError as soon as the flag was set.
        gsa = Delta(
            iterations=iterations,
            model=model,
            write_dir=write_dir,
            num_resamples=num_resamples,
            seed=gsa_seed,
        )
        S_dict = gsa.perform_gsa()
        # The confidence entry is dropped before plotting; tolerate results that do not
        # carry it instead of failing with KeyError.
        S_dict.pop('delta_conf', None)
        delta = S_dict['delta']
        gsa.plot_sa_results(
            S_dict,
            S_boolean=model.S_boolean,
            fig_format=fig_format,
        )
        # Validation step: uses the delta indices and `num_influential_validation`
        # to obtain `influential_Y`, and times the whole step.
        t0 = time.time()
        val = Validation(
            model=model,
            iterations=iterations_validation,
            seed=validation_seed,
            default_x_rescaled=None,
            write_dir=write_dir,
        )
        tag = "DeltaIndex"
        influential_Y = val.get_influential_Y_from_gsa(
            delta, num_influential_validation, tag=tag
        )
        t1 = time.time()
        print("Total validation time  -> {:8.3f} s \n".format(t1 - t0))
        val.plot_histogram_Y_all_Y_inf(
            influential_Y, num_influential_validation, tag=tag, fig_format=fig_format
        )
        
    if flag_xgboost:
        # Gradient boosting branch: 2 model runs per parameter, 20% held out for testing.
        iterations = 2 * num_params
        test_size = 0.2
        if num_params == 1000:
            tuning_parameters = dict(
                learning_rate=0.1,
                gamma=0,
                min_child_weight=30,
                max_depth=2,
                reg_lambda=10,
                reg_alpha=0,
                n_estimators=500,
                subsample=0.6,
                colsample_bytree=0.3,
            )
        else:
            # Hyperparameters are only defined for the 1000-parameter model. Fail with an
            # explicit message rather than a NameError on `tuning_parameters` below.
            raise ValueError(
                "No xgboost tuning parameters are defined for num_params={}".format(num_params)
            )
        gsa = GradientBoosting(
            iterations=iterations,
            model=model,
            write_dir=write_dir,
            seed=gsa_seed,
            tuning_parameters=tuning_parameters,
            test_size=test_size,
            xgb_model=None,
        )

        S_dict = gsa.perform_gsa(flag_save_S_dict=True)
        print(S_dict["stat.r2"], S_dict["stat.explained_variance"])
        gsa.plot_sa_results(
            {"fscores": S_dict["fscores"]},
            fig_format=fig_format,
        )

In [None]:
gsa.iterations

# Stability correlation coefficients

In [None]:
from setups_paper_gwp import *
from copy import deepcopy
from gsa_framework.sensitivity_analysis.correlations import corrcoef_parallel_stability_spearman
from gsa_framework.models.test_functions import Morris4
from pathlib import Path

In [None]:
# Configuration for the stability (bootstrap) analysis of the correlation coefficients.
path_base = Path('/data/user/kim_a/paper_gsa/')
# read X and Y
num_params = 5000
num_influential = num_params // 100
write_dir = path_base / "{}_morris4".format(num_params)
model = Morris4(num_params=num_params, num_influential=num_influential)
# gsa_seed = 3407
gsa_seed = 6000814
fig_format = ["pickle"]  # can have elements "pdf", "html", "pickle"

iter_corr = 4*num_params
gsa = Correlations(
    iterations=iter_corr,
    model=model,
    write_dir=write_dir,
    seed=gsa_seed,
)

# Inputs and outputs are read from the files referenced by `gsa`
# (presumably written by an earlier `perform_gsa()` run -- verify they exist).
X_rescaled = read_hdf5_array(gsa.filepath_X_rescaled)
Y = read_hdf5_array(gsa.filepath_Y).flatten()

num_steps = 50
num_bootstrap = 60

# Convergence class
conv = Convergence(
    gsa.filepath_Y,
    gsa.num_params,
    gsa.generate_gsa_indices,
    gsa.gsa_label,
    gsa.write_dir,
    num_steps=num_steps,
)

# Per-step intermediate results are cached in their own directory.
write_dir_stability = gsa.write_dir / 'stability_intermediate_{}'.format(gsa.gsa_label)
write_dir_stability.mkdir(parents=True, exist_ok=True)
# Generate random seeds
# One seed per (convergence step, bootstrap replicate); the upper bound is 2**31 - 1.
np.random.seed(gsa.seed)
stability_seeds = np.random.randint(
    low=0,
    high=2147483647,
    size=(len(conv.iterations_for_convergence), num_bootstrap),
)

In [None]:
%%time
# Compute (or load from cache) the bootstrapped Spearman coefficients for every
# convergence step. Two cache levels are used: one pickle with all steps, and one
# pickle per step in `write_dir_stability`.
filename_S = "stability.S.{}.{}.{}Step{}.{}.{}.pickle".format(
    gsa.gsa_label, gsa.sampling_label, gsa.iterations, conv.iterations_step, num_bootstrap, gsa.seed,
)
filepath_S = gsa.write_dir / "arrays" / filename_S
if filepath_S.exists():
    print("--> {} already exists".format(filename_S))
    S_dict_stability = read_pickle(filepath_S)
else:
    S_dict_stability = {}
    for i, iterations_current in enumerate(conv.iterations_for_convergence):
        print("{}".format(iterations_current))
        filename_S_current = "S.{}Step{}.{}.{}.pickle".format(
            iterations_current, conv.iterations_step, num_bootstrap, gsa.seed
        )
        filepath_S_current = write_dir_stability / filename_S_current
        if filepath_S_current.exists():
            print("--> {} already exists".format(filename_S_current))
            S_dict = read_pickle(filepath_S_current)
        else:
            # Collect one row per bootstrap replicate and stack once at the end,
            # instead of re-copying the growing array on every replicate.
            S_rows = []
            for j in range(num_bootstrap):
                stability_seed = stability_seeds[i, j]
                np.random.seed(stability_seed)
                choice = np.random.choice(np.arange(gsa.iterations), iterations_current, replace=False)
                Y_current = Y[choice]
                X_current = X_rescaled[choice, :]
                S_current = corrcoef_parallel_stability_spearman(Y_current, X_current)['spearman']
                S_rows.append(S_current)
            # The leading empty (0, num_params) array keeps shape and dtype identical to
            # the incremental stacking this replaces, also when there are no replicates.
            S_array = np.vstack([np.zeros([0, num_params]), *S_rows])
            S_dict = {iterations_current: {"spearman": S_array}}
            write_pickle(S_dict, filepath_S_current)
        S_dict_stability.update(S_dict)
    write_pickle(S_dict_stability, filepath_S)



# Dask

In [None]:
from dask.distributed import Client, LocalCluster
from dask_jobqueue import SLURMCluster
from pathlib import Path
import os
import dask

In [None]:
# Select the dask cluster backend from the machine label.
which_pc = "merlin_paper_gsa"
if 'merlin' in which_pc:
    # SLURM-backed cluster; worker files and logs share one directory.
    path_dask_logs = Path('/data/user/kim_a/dask_logs')
    path_dask_logs.mkdir(parents=True, exist_ok=True)
    cluster = SLURMCluster(
        cores=8,
        memory="160GB",
        walltime='23:00:00',
        interface='ib0',
        local_directory=path_dask_logs.as_posix(),
        log_directory=path_dask_logs.as_posix(),
        queue="daily",
    )
elif 'local' in which_pc:
    cluster = LocalCluster(memory_limit='7GB')
else:
    # Without this branch `cluster` stays undefined and the failure only shows up
    # later, as a NameError in the cell that creates the client.
    raise ValueError(
        "Unknown machine label '{}': expected it to contain 'merlin' or 'local'".format(which_pc)
    )

In [None]:
client = Client(cluster)

In [None]:
# Number of dask workers requested from the cluster. `n_workers` is also read by the
# scheduling cell further down (together with `num_bootstrap`) to size task batches.
n_workers = 60
cluster.scale(n_workers)

In [None]:
client

In [None]:
# client.close()
# cluster.close() 

# Stability delta and xgboost

In [None]:
from setups_paper_gwp import *
from copy import deepcopy
from gsa_framework.models.test_functions import Morris4
from gsa_framework.sensitivity_methods.delta import delta_indices_stability
from gsa_framework.sensitivity_methods.gradient_boosting import xgboost_indices_stability
from gsa_framework.sensitivity_analysis.delta import Delta
from gsa_framework.sensitivity_analysis.gradient_boosting import GradientBoosting
from gsa_framework.convergence_robustness_validation import Convergence
from gsa_framework.utils import *
from gsa_framework.sampling.get_samples import latin_hypercube_samples
from pathlib import Path
import time
import warnings
warnings.filterwarnings("ignore")

path_base = Path('/data/user/kim_a/paper_gsa/')
setup_xgbo = setup_xgbo_morris4

## with DASK

In [None]:
def compute_per_worker_delt(num_params, iterations_current, stability_seed):
    """Compute (or load) the delta indices of one bootstrap replicate.

    The result is cached in ``S.step{iterations_current}.seed{stability_seed}.pickle``
    inside ``gsa_delt.write_dir_stability``; when that file exists it is returned
    without touching the inputs.

    Parameters
    ----------
    num_params : int
        Number of model parameters; the full design has ``8 * num_params`` rows.
    iterations_current : int
        Number of rows drawn from the full design for this replicate.
    stability_seed : int
        Seed of the replicate; selects the rows and names the cache files.

    Returns
    -------
    dict
        Result of ``delta_indices_stability`` (or the cached pickle content).
    """
    iter_delt = 8*num_params
    # `path_base` is passed explicitly, like in every other `setup_delt` call of this section.
    gsa_delt = setup_delt(num_params, iter_delt, setup_morris4_model, path_base)
    filepath_S = gsa_delt.write_dir_stability / "S.step{}.seed{}.pickle".format(iterations_current, stability_seed)
    # Check the cache first: regenerating the full sample just to discard it is wasteful.
    if filepath_S.exists():
        print("{} already exists".format(filepath_S.name))
        return read_pickle(filepath_S)

    # The resampled outputs are expected to have been written by the scheduling cell.
    filepath_Y = gsa_delt.write_dir_stability / "Y.step{}.seed{}.pickle".format(iterations_current, stability_seed)
    Y = read_pickle(filepath_Y).flatten()
    # Regenerate the full input sample and select the replicate's rows with the same seed.
    X = latin_hypercube_samples(gsa_delt.iterations, gsa_delt.num_params, seed=gsa_delt.seed)
    np.random.seed(stability_seed)
    # NOTE(review): rows are drawn with replace=False here; the Y file must have been
    # produced with the same seed AND the same `replace` setting, otherwise X and Y rows
    # do not correspond -- confirm against the cell that writes the Y files.
    choice = np.random.choice(np.arange(gsa_delt.iterations), iterations_current, replace=False)
    Xr = gsa_delt.model.rescale(X[choice, :])
    del X  # free the full sample before the computation
    # Uses the function imported at the top of this section
    # (`delta_moment_stability` is not imported anywhere in the notebook).
    S_dict = delta_indices_stability(
        Y, Xr, num_resamples=gsa_delt.num_resamples, seed=stability_seed
    )
    write_pickle(S_dict, filepath_S)
    return S_dict

def compute_per_worker_xgbo(num_params, iterations_current, stability_seed):
    """Compute (or load) the xgboost importances of one bootstrap replicate.

    The result is cached in ``S.step{iterations_current}.seed{stability_seed}.pickle``
    inside ``gsa_xgbo.write_dir_stability``; when that file exists it is returned
    without touching the inputs.

    Parameters
    ----------
    num_params : int
        Number of model parameters; the full design has ``4 * num_params`` rows.
    iterations_current : int
        Number of rows drawn (with replacement) from the full design.
    stability_seed : int
        Seed of the replicate; selects the rows and names the cache files.

    Returns
    -------
    dict
        Result of ``xgboost_indices_stability`` (or the cached pickle content).
    """
    iter_xgbo = 4*num_params
    gsa_xgbo = setup_xgbo(num_params, iter_xgbo, setup_morris4_model, path_base)
    filepath_S = gsa_xgbo.write_dir_stability / "S.step{}.seed{}.pickle".format(iterations_current, stability_seed)
    # Check the cache first: regenerating the full sample just to discard it is wasteful.
    if filepath_S.exists():
        print("{} already exists".format(filepath_S.name))
        return read_pickle(filepath_S)

    # The resampled outputs are expected to have been written by the scheduling cell.
    filepath_Y = gsa_xgbo.write_dir_stability / "Y.step{}.seed{}.pickle".format(iterations_current, stability_seed)
    Y = read_pickle(filepath_Y).flatten()
    # Regenerate the full uniform input sample from the GSA seed ...
    np.random.seed(gsa_xgbo.seed)
    X = np.random.rand(iter_xgbo, num_params)
    # ... and select the replicate's rows with the replicate seed (with replacement).
    np.random.seed(stability_seed)
    choice = np.random.choice(np.arange(gsa_xgbo.iterations), iterations_current, replace=True)
    Xr = gsa_xgbo.model.rescale(X[choice, :])
    del X  # free the full sample before training
    # Uses the function imported at the top of this section
    # (`xgboost_scores_stability` is not imported anywhere in the notebook).
    S_dict = xgboost_indices_stability(
        Y,
        Xr,
        tuning_parameters=gsa_xgbo.tuning_parameters,
        test_size=gsa_xgbo.test_size,
        xgb_model=gsa_xgbo.xgb_model,
    )
    write_pickle(S_dict, filepath_S)
    return S_dict

In [None]:
# Configuration of the dask-based stability run.
num_params = 1000
iter_delt = 8*num_params
iter_xgbo = 4*num_params
gsa_delt = setup_delt(num_params, iter_delt, setup_morris4_model, path_base)
gsa_xgbo = setup_xgbo(num_params, iter_xgbo, setup_morris4_model, path_base)

num_steps = 50
num_bootstrap = 60

# Pick the GSA object and the matching per-replicate worker function.
option = 'xgboost'
if option == 'delta':
    gsa = gsa_delt
    compute_per_worker = compute_per_worker_delt
elif option == 'xgboost':
    gsa = gsa_xgbo
    compute_per_worker = compute_per_worker_xgbo
else:
    # Otherwise `gsa` / `compute_per_worker` would be undefined (or stale from an
    # earlier cell) and the following cells would run against the wrong method.
    raise ValueError("Unknown option '{}': expected 'delta' or 'xgboost'".format(option))

# Wrap the worker so that calling it only builds a lazy dask task.
task_per_worker = dask.delayed(compute_per_worker)
# task_per_worker = compute_per_worker

In [None]:
gsa_xgbo.perform_gsa()

### Pay attention to `replace=True` vs. `replace=False` in `np.random.choice`! It should be `True` in the end, but currently the results for all methods except xgboost have been computed with `False`.

In [None]:
conv = Convergence(
    gsa.filepath_Y,
    gsa.num_params,
    gsa.generate_gsa_indices,
    gsa.gsa_label,
    gsa.write_dir_convergence,
    num_steps=num_steps,
)

# One seed per (convergence step, bootstrap replicate), reproducible from `gsa.seed`.
np.random.seed(gsa.seed)
stability_seeds = np.random.randint(
    low=0,
    high=2147483647,
    size=(len(conv.iterations_for_convergence), num_bootstrap),
)

Y = read_hdf5_array(gsa.filepath_Y).flatten()

# Number of convergence steps whose replicates are submitted together. At least one,
# so that having fewer workers than bootstrap replicates does not divide by zero.
num_times = max(1, n_workers // num_bootstrap)
num_convergence_steps = len(conv.iterations_for_convergence)
model_evals = []
for batch_start in range(0, num_convergence_steps, num_times):
    batch_stop = min(batch_start + num_times, num_convergence_steps)
    model_evals_bootstrap_j_k = []
    for i in range(batch_start, batch_stop):
        iterations_current = conv.iterations_for_convergence[i]
        for j in range(num_bootstrap):
            stability_seed = stability_seeds[i, j]
            # Write Y: the resampled outputs are only generated when not cached yet.
            filepath_Y_ij = gsa.write_dir_stability / "Y.step{}.seed{}.pickle".format(iterations_current, stability_seed)
            if not filepath_Y_ij.exists():
                np.random.seed(stability_seed)
                # NOTE: the worker function re-draws the matching X rows from the same
                # seed, so it must use the same `replace` setting as this call.
                choice = np.random.choice(np.arange(gsa.iterations), iterations_current, replace=True)
                write_pickle(Y[choice], filepath_Y_ij)
            # Model evals: only schedule replicates whose result file is still missing.
            filepath_S_current = gsa.write_dir_stability / "S.step{}.seed{}.pickle".format(iterations_current, stability_seed)
            if not filepath_S_current.exists():
                model_eval = task_per_worker(num_params, iterations_current, stability_seed)
                model_evals_bootstrap_j_k.append(model_eval)
    if len(model_evals_bootstrap_j_k) > 0:
        model_evals.append(model_evals_bootstrap_j_k)
        
        
# Y = read_hdf5_array(gsa.filepath_Y).flatten()
# model_evals = []
# for i,iterations_current in enumerate(conv.iterations_for_convergence):
#     model_evals_bootstrap_j = []
#     for j in range(num_bootstrap):
#         stability_seed = stability_seeds[i,j]
#         np.random.seed(stability_seed)
#         choice = np.random.choice(np.arange(gsa.iterations), iterations_current, replace=False)
#         # Write Y
#         filepath_Y_ij = gsa.write_dir_stability / "Y.step{}.seed{}.pickle".format(iterations_current, stability_seed)
#         if not filepath_Y_ij.exists():
#             Y_ij = Y[choice]
#             write_pickle(Y_ij, filepath_Y_ij)
#         else:
# #             print("{} already exists".format(filepath_Y_ij.name))  
#             pass
#         # Model evals
#         model_eval = task_per_worker(num_params, iterations_current, stability_seed)
#         model_evals_bootstrap_j.append(model_eval)
#     model_evals.append(model_evals_bootstrap_j)

In [None]:
%%time
# Execute the delayed tasks one batch at a time; the index is printed as a progress marker.
# The return value of `dask.compute` is discarded here -- the worker functions write their
# results to pickle files themselves.
for i,model_evals_bootstrap_j_k in enumerate(model_evals):
    print(i)
    dask.compute(model_evals_bootstrap_j_k)

In [None]:
# %%time
# for i,model_evals_bootstrap_j in enumerate(model_evals):
#     print(i)
#     dask.compute(model_evals_bootstrap_j)

In [None]:
# Collect all results
def create_stability_dict_delt(num_params, iterations_for_convergence, stability_seeds):
    """Collect the per-replicate delta results into one stability dictionary.

    Parameters
    ----------
    num_params : int
        Number of model parameters (number of columns of every stacked array).
    iterations_for_convergence : sequence of int
        Convergence steps; needs at least two entries, since the step size used in
        the output file name is the difference of the first two.
    stability_seeds : array of shape (number of steps, number of bootstrap replicates)
        Seeds identifying the per-replicate result files.

    Returns
    -------
    dict or None
        ``{iterations: {"delta": array}}`` with one row per bootstrap replicate. The
        dictionary is also pickled to the ``arrays`` directory and re-read from there
        on later calls. ``None`` is returned (after printing the file name) as soon as
        a per-replicate file is missing.
    """
    iter_delt = 8*num_params
    gsa_delt = setup_delt(num_params, iter_delt, setup_morris4_model, path_base)
    iterations_step = iterations_for_convergence[1] - iterations_for_convergence[0]
    num_bootstrap = stability_seeds.shape[1]
    filename_S_stability = "stability.S.{}.{}.{}Step{}.{}.{}.pickle".format(
        gsa_delt.gsa_label, gsa_delt.sampling_label, gsa_delt.iterations, iterations_step, num_bootstrap, gsa_delt.seed,
    )
    filepath_S_stability = gsa_delt.write_dir / 'arrays' / filename_S_stability
    if filepath_S_stability.exists():
        print("{} already exists".format(filepath_S_stability.name))
        return read_pickle(filepath_S_stability)

    S_dict = {}
    for i, iterations_current in enumerate(iterations_for_convergence):
        # Collect the rows and stack once, instead of re-copying a growing array.
        S_rows = []
        for j in range(num_bootstrap):
            stability_seed = stability_seeds[i, j]
            filepath_S = gsa_delt.write_dir_stability / "S.step{}.seed{}.pickle".format(
                iterations_current, stability_seed
            )
            if not filepath_S.exists():
                print("{} does not exist".format(filepath_S.name))
                return None
            S_rows.append(read_pickle(filepath_S)['delta'])
        # The leading empty array keeps shape and dtype identical to incremental stacking.
        S_dict[iterations_current] = {"delta": np.vstack([np.zeros((0, num_params)), *S_rows])}
    write_pickle(S_dict, filepath_S_stability)
    return S_dict

# Collect all results
def create_stability_dict_xgbo(num_params, iterations_for_convergence, stability_seeds):
    """Collect the per-replicate xgboost results into one stability dictionary.

    Parameters
    ----------
    num_params : int
        Number of model parameters (number of columns of every importance array).
    iterations_for_convergence : sequence of int
        Convergence steps; needs at least two entries, since the step size used in
        the output file name is the difference of the first two.
    stability_seeds : array of shape (number of steps, number of bootstrap replicates)
        Seeds identifying the per-replicate result files.

    Returns
    -------
    dict or None
        ``{iterations: {key: array}}`` with one row per bootstrap replicate. Keys
        containing ``'stat.'`` are stacked into a single column, all other keys into
        ``num_params`` columns. The dictionary is also pickled to the ``arrays``
        directory and re-read from there on later calls. ``None`` is returned (after
        printing the file name) as soon as a per-replicate file is missing.
    """
    iter_xgbo = 4*num_params
    gsa_xgbo = setup_xgbo(num_params, iter_xgbo, setup_morris4_model, path_base)
    iterations_step = iterations_for_convergence[1] - iterations_for_convergence[0]
    num_bootstrap = stability_seeds.shape[1]
    filename_S_stability = "stability.S.{}.{}.{}Step{}.{}.{}.pickle".format(
        gsa_xgbo.gsa_label, gsa_xgbo.sampling_label, gsa_xgbo.iterations, iterations_step, num_bootstrap, gsa_xgbo.seed,
    )
    filepath_S_stability = gsa_xgbo.write_dir / 'arrays' / filename_S_stability
    if filepath_S_stability.exists():
        print("{} already exists".format(filepath_S_stability.name))
        return read_pickle(filepath_S_stability)

    S_dict = {}
    for i, iterations_current in enumerate(iterations_for_convergence):
        # key -> list of per-replicate values; stacked once after the inner loop.
        rows_per_key = {}
        for j in range(num_bootstrap):
            stability_seed = stability_seeds[i, j]
            filepath_S = gsa_xgbo.write_dir_stability / "S.step{}.seed{}.pickle".format(
                iterations_current, stability_seed
            )
            if not filepath_S.exists():
                print("{} does not exist".format(filepath_S.name))
                return None
            S_current = read_pickle(filepath_S)
            if len(rows_per_key) == 0:
                # The first replicate fixes the keys and their order: statistics first,
                # then the importance types.
                stats = [s for s in S_current.keys() if 'stat.' in s]
                importance_types = [imp for imp in S_current.keys() if 'stat.' not in imp]
                rows_per_key = {k: [] for k in stats + importance_types}
            for k, rows in rows_per_key.items():
                rows.append(S_current[k])
        # The leading empty array keeps shape and dtype identical to incremental stacking.
        S_dict[iterations_current] = {
            k: np.vstack([np.zeros((0, 1 if 'stat.' in k else num_params)), *rows])
            for k, rows in rows_per_key.items()
        }
    write_pickle(S_dict, filepath_S_stability)
    return S_dict

In [None]:
%%time
S_dict = create_stability_dict_xgbo(num_params, conv.iterations_for_convergence, stability_seeds)

## without dask

In [None]:
from setups_paper_gwp import *
from copy import deepcopy
from gsa_framework.test_functions import Morris4
from gsa_framework.sensitivity_analysis.delta_moment import delta_moment_parallel_stability
from gsa_framework.methods.delta_moment import DeltaMoment
from gsa_framework.convergence import Convergence
from pathlib import Path
import time
import warnings

In [None]:
# Configuration for the stability (bootstrap) analysis of the delta indices, run without dask.
path_base = Path('/data/user/kim_a/paper_gsa/')
# read X and Y
num_params = 5000
num_influential = num_params // 100
write_dir = path_base / "{}_morris4".format(num_params)
model = Morris4(num_params=num_params, num_influential=num_influential)
gsa_seed = 3407
fig_format = ["pickle"]  # can have elements "pdf", "html", "pickle"

iter_delt = 8*num_params
num_resamples = 1
gsa = DeltaMoment(
    iterations=iter_delt,
    model=model,
    write_dir=write_dir,
    num_resamples=num_resamples,
    seed=gsa_seed,
)

# Inputs and outputs are read from the files referenced by `gsa`
# (presumably written by an earlier `perform_gsa()` run -- verify they exist).
X_rescaled = read_hdf5_array(gsa.filepath_X_rescaled)
Y = read_hdf5_array(gsa.filepath_Y).flatten()

num_steps = 50
num_bootstrap = 60

# Convergence class
conv = Convergence(
    gsa.filepath_Y,
    gsa.num_params,
    gsa.generate_gsa_indices,
    gsa.gsa_label,
    gsa.write_dir,
    num_steps=num_steps,
)

# Per-step intermediate results are cached in their own directory.
write_dir_stability = gsa.write_dir / 'stability_intermediate_{}'.format(gsa.gsa_label)
write_dir_stability.mkdir(parents=True, exist_ok=True)
# Generate random seeds
# One seed per (convergence step, bootstrap replicate); the upper bound is 2**31 - 1.
np.random.seed(gsa.seed)
stability_seeds = np.random.randint(
    low=0,
    high=2147483647,
    size=(len(conv.iterations_for_convergence), num_bootstrap),
)

In [None]:
%%time
warnings.filterwarnings("ignore")

if __name__ == "__main__":

    # Compute (or load from cache) the bootstrapped delta indices for every convergence
    # step. Two cache levels are used: one pickle with all steps, and one pickle per
    # step in `write_dir_stability`.
    filename_S = "stability.S.{}.{}.{}Step{}.{}.{}.pickle".format(
        gsa.gsa_label, gsa.sampling_label, gsa.iterations, conv.iterations_step, num_bootstrap, gsa.seed,
    )
    filepath_S = gsa.write_dir / "arrays" / filename_S
    if filepath_S.exists():
        print("--> {} already exists".format(filename_S))
        S_dict_stability = read_pickle(filepath_S)
    else:
        S_dict_stability = {}
        for i, iterations_current in enumerate(conv.iterations_for_convergence):
            print("{}".format(iterations_current))
            filename_S_current = "S.{}Step{}.{}.{}.pickle".format(
                iterations_current, conv.iterations_step, num_bootstrap, gsa.seed
            )
            filepath_S_current = write_dir_stability / filename_S_current
            if filepath_S_current.exists():
                print("--> {} already exists".format(filename_S_current))
                S_dict = read_pickle(filepath_S_current)
            else:
                # Collect one row per bootstrap replicate and stack once at the end,
                # instead of re-copying the growing array on every replicate.
                S_rows = []
                for j in range(num_bootstrap):
                    stability_seed = stability_seeds[i, j]
                    np.random.seed(stability_seed)
                    choice = np.random.choice(np.arange(gsa.iterations), iterations_current, replace=False)
                    Y_current = Y[choice]
                    X_current = X_rescaled[choice, :]
                    S_current = delta_moment_parallel_stability(Y_current, X_current, num_resamples=num_resamples)
                    S_rows.append(S_current['delta'])
                # The leading empty (0, num_params) array keeps shape and dtype identical
                # to the incremental stacking this replaces.
                S_array = np.vstack([np.zeros([0, num_params]), *S_rows])
                S_dict = {iterations_current: {"delta": S_array}}
                write_pickle(S_dict, filepath_S_current)
            S_dict_stability.update(S_dict)
        write_pickle(S_dict_stability, filepath_S)


# Analyze bootstrap values

In [None]:
from gsa_framework.test_functions import Morris4
from gsa_framework.methods.correlations import CorrelationCoefficients
from gsa_framework.methods.saltelli_sobol import SaltelliSobol
from gsa_framework.methods.gradient_boosting import GradientBoosting
from gsa_framework.methods.delta_moment import DeltaMoment
from gsa_framework.validation import Validation
from gsa_framework.convergence import Convergence
from pathlib import Path
import time
from setups_paper_gwp import *


# Load the pickled stability dictionary of the delta indices for inspection.
if __name__ == "__main__":
    num_params = 5000
    iter_delt = 8*num_params
    # NOTE(review): other cells of this notebook call `setup_delt` with a fourth
    # `path_base` argument -- confirm the three-argument form is still supported.
    gsa = setup_delt(num_params, iter_delt, setup_morris4_model)
    num_steps = 50
    num_bootstrap = 60

    # Convergence class
    # Only `conv.iterations_step` is used below, to rebuild the stability file name.
    conv = Convergence(
        gsa.filepath_Y,
        gsa.num_params,
        gsa.generate_gsa_indices,
        gsa.gsa_label,
        gsa.write_dir,
        num_steps=num_steps,
    )

    # Same file name pattern as used when the stability results were written.
    filename_S = "stability.S.{}.{}.{}Step{}.{}.{}.pickle".format(
        gsa.gsa_label, gsa.sampling_label, gsa.iterations, conv.iterations_step, num_bootstrap, gsa.seed,
    )
    filepath_S = gsa.write_dir / "arrays" / filename_S
    S_dict = read_pickle(filepath_S)

In [None]:
# Bootstrap values (one per replicate) of the delta index in column 2000,
# taken at the convergence step keyed by 39200 iterations.
S_bootstrap = S_dict[39200]['delta'][:,2000]

In [None]:
import plotly.graph_objects as go

In [None]:
# Histogram of the bootstrap values selected above.
Y = S_bootstrap
num_bins = 14
bin_min, bin_max = min(Y), max(Y)
# `num_bins` equally spaced edges, i.e. num_bins - 1 bins.
bins_ = np.linspace(bin_min, bin_max, num_bins, endpoint=True)
freq1, bins1 = np.histogram(Y, bins=bins_)
# np.histogram returns one more edge than counts. Plot every count at the centre of
# its bin, with the bin width, instead of pairing the counts with the raw edges.
bin_centers = 0.5 * (bins1[:-1] + bins1[1:])
bin_widths = np.diff(bins1)
fig = go.Figure()
fig.add_trace(
    go.Bar(
        x=bin_centers,
        y=freq1,
        width=bin_widths,
    ),
)

# Tuning xgboost

In [None]:
%%time

from gsa_framework.test_functions import Morris4
from gsa_framework.methods.gradient_boosting import GradientBoosting
from pathlib import Path
from sklearn.model_selection import GridSearchCV, train_test_split
import numpy as np
import xgboost as xgb
from sklearn.metrics import explained_variance_score, r2_score

if __name__ == "__main__":

    path_base = Path('/data/user/kim_a/paper_gsa/')
#     path_base = Path("/Users/akim/PycharmProjects/gsa_framework/dev/write_files/")

    # 1. Models
    num_params = 10000
    num_influential = max(num_params // 100, 10)
    write_dir = path_base / "{}_morris4".format(num_params)
    model = Morris4(num_params=num_params, num_influential=num_influential)
    gsa_seed = 3407
    validation_seed = 7043

    fig_format = []  # can have elements "pdf", "html", "pickle"

    iterations = 4 * num_params
    test_size = 0.2

    option = "no tuning"
    if "tuning" in option:
        # 1. Preparations
        np.random.seed(gsa_seed)
        X = np.random.rand(iterations, num_params)
        Y = model(X)
        # 2. Prepare training and testing sets for  gradient boosting trees
        X_train, X_test, Y_train, Y_test = train_test_split(
            X,
            Y,
            test_size=test_size,
            random_state=gsa_seed,
        )

        dtrain = xgb.DMatrix(X_train, Y_train)
        X_dtest = xgb.DMatrix(X_test)

        if option == "tuning":
            ### ROUND 1 ###
            # Grid search over the hyperparameter grid below, 3-fold cross-validation.
            # xgb.train uses parameter `num_boost_round`, while XGBRegressor needs `n_estimators`. These two are the same.
            param_grid = {
                "learning_rate": [0.15],
                "gamma": [0],
                "min_child_weight": [60, 100, 140],
                "max_depth": [2],
                "reg_lambda": [0, 10],
                "reg_alpha": [0, 10],
                "n_estimators": [500, 800, 1100],
                "subsample": [0.3, 0.6],
                "colsample_bytree": [0.3, 0.6],
            }

            optimal_params = GridSearchCV(
                estimator=xgb.XGBRegressor(
                    objective="reg:squarederror",
                    seed=gsa_seed,
                ),
                param_grid=param_grid,
                # Author's rationale for the metric: explained variance takes the mean
                # error into account while r2 does not, so it is preferred over r2.
                scoring="explained_variance",
                cv=3,
            )
            optimal_params.fit(
                X_train,
                Y_train,
                early_stopping_rounds=10,
                eval_set=[(X_test, Y_test)],
                verbose=False,
            )

            print(optimal_params.best_params_)

            import pickle

            # The first run is stored as round 1; once that file exists, the result
            # goes to the round 2 file instead.
            filepath = write_dir / "arrays" / "optimal_params_round_1.pickle"
            if filepath.exists():
                filepath = write_dir / "arrays" / "optimal_params_round_2.pickle"
            with open(filepath, "wb") as f:
                pickle.dump(optimal_params, f)

        elif option == "no tuning":
            # Fit a single regressor with fixed hyperparameters and report how well it
            # explains the training and the test outputs.
            np.random.seed(None)
            reg = xgb.XGBRegressor(
                verbosity=1,  # 0 (silent), 1 (warning), 2 (info), 3 (debug)
                objective="reg:squarederror",
                seed=gsa_seed,
                learning_rate=0.2,
                gamma=0,
                min_child_weight=600,
                max_depth=2,
                reg_lambda=0,
                reg_alpha=0,
                n_estimators=1500,
                subsample=0.2,
                colsample_bytree=0.2,
            )
            reg.fit(X_train, Y_train)
            # explained_variance_score expects (y_true, y_pred) and is not symmetric:
            # the variance of the first argument is the normalization.
            ev_train = explained_variance_score(Y_train, reg.predict(X_train))
            ev_test = explained_variance_score(Y_test, reg.predict(X_test))
            print(ev_train, ev_test)

    else:
        tuning_parameters = dict(
            learning_rate=0.15,
            gamma=0,
            min_child_weight=100,
            max_depth=2,
            reg_lambda=5,
            reg_alpha=0,
            n_estimators=800,
            subsample=0.3,
            colsample_bytree=0.3,
        )
        gsa = GradientBoosting(
            iterations=iterations,
            model=model,
            write_dir=write_dir,
            seed=gsa_seed,
            tuning_parameters=tuning_parameters,
            test_size=test_size,
            xgb_model=None,
        )

        S_dict = gsa.perform_gsa(flag_save_S_dict=True)
        print(S_dict["stat.r2"], S_dict["stat.explained_variance"])
        gsa.plot_sa_results(
            {"fscores": S_dict["fscores"]},
            fig_format=fig_format,
        )


In [None]:
from setups_paper_gwp import *

In [None]:
# Run the gradient boosting GSA for the 5000-parameter Morris4 setup
# with 4 model runs per parameter.
path_base = Path('/data/user/kim_a/paper_gsa')
setup_xgbo = setup_xgbo_morris4

num_params = 5000
iter_xgbo = 4*num_params
gsa_xgbo = setup_xgbo(num_params, iter_xgbo, setup_morris4_model, path_base)
gsa_xgbo.perform_gsa()