# DASK

In [13]:
from dask.distributed import Client, LocalCluster
from dask_jobqueue import SLURMCluster
import os

In [14]:
# Select the Dask cluster backend from the machine label below.
which_pc = "merlin_paper_gsa"
if 'merlin' in which_pc:
    path_dask_logs = '/data/user/kim_a/dask_logs'
    # exist_ok=True replaces the separate exists() check and makes the cell safe to re-run.
    os.makedirs(path_dask_logs, exist_ok=True)
    # SLURM job template: per the dask_jobqueue docs, `cores` and `memory` are
    # totals per job and each job is cut into `processes` worker processes.
    cluster = SLURMCluster(cores     = 10,
                           processes = 6,
                           memory    ="80GB",
                           walltime  = '12:00:00',
                           interface ='ib0',
                           local_directory = path_dask_logs,
                           log_directory   = path_dask_logs,
                           queue="daily",
                           )
elif 'local' in which_pc:
    cluster = LocalCluster(memory_limit='7GB')
else:
    # Fail here with a clear message instead of a NameError on `cluster` in a later cell.
    raise ValueError(
        "Unknown which_pc {!r}: expected a label containing 'merlin' or 'local'".format(which_pc)
    )

In [15]:
# Connect a distributed client to `cluster`; the cluster-selection cell must have run first.
client = Client(cluster)

In [16]:
# Ask the cluster for 60 workers in total (the client summary below reports "Workers: 60").
n_workers = 60
cluster.scale(n_workers)

In [18]:
# Show the client summary: scheduler address, dashboard URL and worker/core/memory totals.
client

0,1
Client  Scheduler: tcp://192.168.196.21:35128  Dashboard: http://192.168.196.21:8787/status,Cluster  Workers: 60  Cores: 60  Memory: 799.80 GB


In [None]:
# client.close()
# cluster.close()

# Stability for delta moment

In [21]:
from gsa_framework.lca import LCAModel
from gsa_framework.methods.delta_moment import DeltaMoment
from gsa_framework.sensitivity_analysis.delta_moment import delta_moment
from gsa_framework.convergence import Convergence
from pathlib import Path
import brightway2 as bw
import time
import numpy as np
from gsa_framework.utils import write_pickle, read_hdf5_array, write_hdf5_array, read_pickle
import dask

In [22]:
def compute_per_worker(iterations_current, seed):
    """Compute, or load from disk, the delta-moment result for one stability subsample.

    The input files ``X.rescaled.step{iterations_current}.seed{seed}.hdf5`` and
    ``Y.step{iterations_current}.seed{seed}.hdf5`` are looked up in a hard-coded
    stability directory; their paths are handed to ``delta_moment`` and the
    result is pickled next to them as ``S.step{iterations_current}.seed{seed}.pickle``.

    NOTE: later sections of this notebook define other functions with the same
    name; re-run this cell before building delta-moment tasks.

    Parameters
    ----------
    iterations_current : int
        Subsample size; forwarded to ``delta_moment`` as ``iterations`` and used
        in the file names.
    seed : int
        Seed identifying the subsample; forwarded to ``delta_moment`` and used in
        the file names.

    Returns
    -------
    The object returned by ``delta_moment``. If the result file already exists,
    the object stored in it is returned instead (previously this path returned
    ``None``), so callers get a result either way.
    """
    path_base = Path('/data/user/kim_a/paper_gsa/gsa_framework_files')
    num_params = 10000
    write_dir = path_base / "lca_model_{}".format(num_params)
    # NOTE(review): hard-coded label; presumably equals the directory the driver
    # cell builds from gsa.gsa_label -- verify if the GSA settings change.
    stability_dir = write_dir / "stability_intermediate_deltaGsaNr1"
    filepath_S = stability_dir / "S.step{}.seed{}.pickle".format(iterations_current, seed)

    if filepath_S.exists():
        # Cached result: report it and hand back the stored object instead of None.
        print("{} already exists".format(filepath_S))
        return read_pickle(filepath_S)

    filepath_X_rescaled = stability_dir / "X.rescaled.step{}.seed{}.hdf5".format(iterations_current, seed)
    filepath_Y = stability_dir / "Y.step{}.seed{}.hdf5".format(iterations_current, seed)
    S_dict = delta_moment(
        filepath_Y=filepath_Y,
        filepath_X=filepath_X_rescaled,
        iterations=iterations_current,
        num_params=num_params,
        seed=seed,
        num_resamples=1,
    )
    write_pickle(S_dict, filepath_S)
    return S_dict

In [23]:
# Driver cell for the delta-moment stability run: builds the LCA model, the
# DeltaMoment GSA object and a Convergence object, draws the bootstrap seeds,
# and assembles batches of dask.delayed tasks in `model_evals_all`.
if __name__ == "__main__":

    path_base = Path('/data/user/kim_a/paper_gsa/gsa_framework_files')

    # LCA model
    bw.projects.set_current("GSA for paper")
    co = bw.Database("CH consumption 1.0")
    # First activity whose name contains "Food"; the [0] raises IndexError if none matches.
    act = [act for act in co if "Food" in act["name"]][0]
    demand = {act: 1}
    method = ("IPCC 2013", "climate change", "GTP 100a")

    # Define some variables
    num_params = 10000
    # NOTE(review): num_influential, iterations_validation, validation_seed and
    # fig_format are assigned here but not used anywhere else in this cell.
    num_influential = num_params // 100
    iterations_validation = 500
    write_dir = path_base / "lca_model_{}".format(num_params)
    model = LCAModel(demand, method, write_dir, num_params=num_params)
    gsa_seed = 3403
    validation_seed = 7043
    fig_format = ["html", "pickle"]

    iterations = 2 * num_params
    num_resamples = 1

    gsa = DeltaMoment(
        iterations=iterations,
        model=model,
        write_dir=write_dir,
        num_resamples=num_resamples,
        seed=gsa_seed,
    )
    conv = Convergence(
        gsa.filepath_Y,
        gsa.num_params,
        gsa.generate_gsa_indices,
        gsa.gsa_label,
        write_dir,
        num_steps=25,
    )
    # One row of seeds per convergence step and one column per bootstrap
    # replicate; seeding the global NumPy RNG with gsa_seed makes the draw
    # repeatable. The seeds end up in the per-task file names.
    num_bootstrap = 10
    np.random.seed(gsa_seed)
    stability_seeds = np.random.randint(
        low=0,
        high=2147483647,
        size=(len(conv.iterations_for_convergence), num_bootstrap),
    )

    stability_dir = write_dir / "stability_intermediate_{}".format(gsa.gsa_label)
    stability_dir.mkdir(parents=True, exist_ok=True)

    # The commented-out block below writes X.rescaled/Y subsample files with the
    # same names that compute_per_worker reads; presumably it was run once to
    # generate those inputs and then disabled -- TODO confirm.
#     X_rescaled = read_hdf5_array(gsa.filepath_X_rescaled)
#     Y = read_hdf5_array(gsa.filepath_Y).flatten()

#     for i, iterations_current in enumerate(conv.iterations_for_convergence):
#         print(iterations_current)
#         for seed in stability_seeds[i,:]:
#             np.random.seed(seed)
#             r = np.random.choice(np.arange(iterations), size=iterations_current, replace=False)
#             r.sort()
#             assert len(set(r)) == len(r)
#             Xcurrent = X_rescaled[r,:]
#             Ycurrent = Y[r]
#             fp_X = stability_dir / "X.rescaled.step{}.seed{}.hdf5".format(iterations_current, seed)
#             fp_Y = stability_dir / "Y.step{}.seed{}.hdf5".format(iterations_current, seed)
#             write_hdf5_array(Xcurrent, fp_X)
#             write_hdf5_array(Ycurrent, fp_Y)
    # Build one delayed task per (convergence step, seed) pair. Tasks are
    # grouped into batches of 6 steps x num_bootstrap seeds = 60 tasks (the
    # printed batch sizes); the current batch is flushed whenever a new group
    # of 6 steps starts, and the last, possibly smaller, batch after the loop.
    model_evals = []
    model_evals_all = []
    task_per_worker = dask.delayed(compute_per_worker)
    for i, iterations_current in enumerate(conv.iterations_for_convergence):
        if i%6==0 and i>0:
            print(len(model_evals))
            model_evals_all.append(model_evals)
            model_evals = []
        for seed in stability_seeds[i,:]:
            model_eval = task_per_worker(iterations_current, seed)
            model_evals.append(model_eval)

    model_evals_all.append(model_evals)

60
60
60


In [24]:
# %%time
# #Test
# compute_per_worker(800, 1800109006)

In [None]:
%%time
# Run the batches one after another: dask.compute blocks until every task in
# the batch has finished. The returned values are discarded here; each task
# pickles its own result to disk.
# NOTE: the loop variable rebinds the global name `model_evals`.
for model_evals in model_evals_all:
    print(len(model_evals))
    dask.compute(model_evals)

60
60
60


# Convergence XGBoost

In [None]:
from gsa_framework.lca import LCAModel
from gsa_framework.methods.gradient_boosting import GradientBoosting
from gsa_framework.convergence import Convergence
from pathlib import Path
import brightway2 as bw
import time
import numpy as np
from gsa_framework.utils import write_pickle, read_hdf5_array, write_hdf5_array
import dask

In [None]:
def setup(filepath_Y_worker=None,lca=None):
    """Build the GradientBoosting GSA, its Convergence helper and the output directory.

    Side effects: switches the current Brightway project to "GSA for paper" and
    creates ``convergence_intermediate_{gsa.gsa_label}`` under the model's write
    directory if it does not exist yet.

    Parameters
    ----------
    filepath_Y_worker : path-like, optional
        Path handed to ``Convergence`` as its first argument. When ``None``,
        ``gsa.filepath_Y`` is used.
    lca : optional
        Forwarded unchanged to ``LCAModel(..., lca=lca)``.

    Returns
    -------
    tuple
        ``(gsa, conv, convergence_dir)``: the ``GradientBoosting`` instance, the
        ``Convergence`` instance and the ``Path`` of the intermediate directory.
    """
    path_base = Path('/data/user/kim_a/paper_gsa/gsa_framework_files')

    # LCA model
    bw.projects.set_current("GSA for paper")
    co = bw.Database("CH consumption 1.0")
    # First activity whose name contains "Food"; the [0] raises IndexError if none matches.
    act = [act for act in co if "Food" in act["name"]][0]
    demand = {act: 1}
    method = ("IPCC 2013", "climate change", "GTP 100a")

    # Model size and output location
    num_params = 10000
    write_dir = path_base / "lca_model_{}".format(num_params)
    model = LCAModel(demand, method, write_dir, num_params=num_params,lca=lca)

    # GSA settings (the unused validation/plot variables and the duplicated
    # `iterations` assignment of the earlier version were removed).
    gsa_seed = 3403
    iterations = 2 * num_params
    num_boost_round = 400
    tuning_parameters = {
        "max_depth": 6,
        "eta": 0.1,
        "objective": "reg:squarederror",
        "n_jobs": -1,
        "refresh_leaf": True,
        "subsample": 0.6,
        "min_child_weight": 0.5,
    }
    gsa = GradientBoosting(
        iterations=iterations,
        model=model,
        write_dir=write_dir,
        seed=gsa_seed,
        tuning_parameters=tuning_parameters,
        num_boost_round=num_boost_round,
        xgb_model=None,
    )

    convergence_dir = write_dir / "convergence_intermediate_{}".format(gsa.gsa_label)
    convergence_dir.mkdir(parents=True, exist_ok=True)

    # Fall back to the GSA's own model-output file when no per-worker copy is given.
    if filepath_Y_worker is None:
        filepath_Y_worker = gsa.filepath_Y
    conv = Convergence(
        filepath_Y_worker,
        gsa.num_params,
        gsa.generate_gsa_indices,
        gsa.gsa_label,
        write_dir,
        num_steps=100,
    )
    return gsa, conv, convergence_dir

In [None]:
def compute_per_worker(iterations_current, filepath_Y_worker, lca):
    """Run one convergence step on a worker and pickle the resulting indices.

    Rebuilds the GSA/convergence objects through ``setup`` using the worker's
    own Y file and the supplied ``lca``, evaluates ``conv.gsa_func`` on the
    first ``iterations_current`` entries of ``conv.iterations_order`` and
    writes the outcome into the convergence directory. Returns nothing.
    """
    gsa, conv, convergence_dir = setup(filepath_Y_worker, lca)
    output_path = convergence_dir / conv.create_convergence_dict_filepath()

    # Leading slice of the iteration ordering that this step works on.
    leading_iterations = conv.iterations_order[:iterations_current]
    gsa_kwargs = dict(
        iterations=iterations_current,
        iterations_step=conv.iterations_step,
        selected_iterations=leading_iterations,
        flag_convergence=True,
    )
    write_pickle(conv.gsa_func(**gsa_kwargs), output_path)

In [None]:
# Driver cell for the XGBoost convergence run: queue one delayed task per
# convergence step whose result file is still missing.
if __name__ == "__main__":
    gsa, conv, convergence_dir = setup()
    task_per_worker = dask.delayed(compute_per_worker)
    Y = read_hdf5_array(gsa.filepath_Y).flatten()

    model_evals = []

    for i, iterations_current in enumerate(conv.iterations_for_convergence):
        filename = "S.{}.{}.{}Step{}.{}.pickle".format(
            gsa.gsa_label, gsa.sampling_label, iterations_current, 800, 3403
        )
        filepath = convergence_dir / filename

        # Skip steps that already have a result file, plus the steps 200 and
        # 400 which are always excluded; just report the file name for them.
        if filepath.exists() or iterations_current in (200, 400):
            print(filepath.name)
            continue

        # Each queued step gets its own copy of Y on disk for the worker to read.
        filepath_Y_worker = convergence_dir / "Y.{}.hdf5".format(iterations_current)
        write_hdf5_array(Y, filepath_Y_worker)
        model_eval = task_per_worker(iterations_current, filepath_Y_worker, gsa.model.lca)
        model_evals.append(model_eval)

In [None]:
# %%time
# # Test
# gsa, conv, convergence_dir = setup()
# filepath_Y_worker = convergence_dir / "Y.{}.hdf5".format(200)
# compute_per_worker(200, filepath_Y_worker, gsa.model.lca)

In [None]:
%%time
# Execute the queued convergence tasks and block until they finish.
# NOTE(review): the [1:] slice skips the first queued task -- confirm this is intentional.
res = dask.compute(model_evals[1:])

In [None]:
# Number of delayed convergence tasks that were queued.
len(model_evals)

In [None]:
# Rebuild the LCA model at notebook top level (no `lca` argument is passed
# here) so that the following cell can inspect it.
path_base = Path('/data/user/kim_a/paper_gsa/gsa_framework_files')

# LCA model
bw.projects.set_current("GSA for paper")
co = bw.Database("CH consumption 1.0")
# First activity whose name contains "Food"; the [0] raises IndexError if none matches.
act = [act for act in co if "Food" in act["name"]][0]
demand = {act: 1}
method = ("IPCC 2013", "climate change", "GTP 100a")

# Define some variables
num_params = 10000
# NOTE(review): num_influential and iterations_validation are not used in this cell.
num_influential = num_params // 100
iterations_validation = 500
write_dir = path_base / "lca_model_{}".format(num_params)

model = LCAModel(demand, method, write_dir, num_params=num_params)

In [None]:
# Display the `static_output` attribute of the model built in the previous cell.
model.static_output

# Stability for XGBoost

In [None]:
from gsa_framework.lca import LCAModel
from gsa_framework.methods.gradient_boosting import GradientBoosting
from gsa_framework.sensitivity_analysis.gradient_boosting import xgboost_scores
from gsa_framework.convergence import Convergence
from pathlib import Path
import brightway2 as bw
import time
import numpy as np
from gsa_framework.utils import write_pickle, read_hdf5_array, write_hdf5_array, read_pickle
import dask

In [None]:
def compute_per_worker(iterations_current, seed):
    """Compute, or load from disk, the XGBoost scores for one stability subsample.

    The input files ``X.step{iterations_current}.seed{seed}.hdf5`` and
    ``Y.step{iterations_current}.seed{seed}.hdf5`` are looked up in a hard-coded
    stability directory; their paths are handed to ``xgboost_scores`` and the
    result is pickled next to them as ``S.step{iterations_current}.seed{seed}.pickle``.

    NOTE: earlier sections of this notebook define other functions with the
    same name; this definition replaces them once the cell has run.

    Parameters
    ----------
    iterations_current : int
        Subsample size; forwarded to ``xgboost_scores`` and used in the file names.
    seed : int
        Seed identifying the subsample; only used in the file names here.

    Returns
    -------
    The object returned by ``xgboost_scores``. If the result file already
    exists, the object stored in it is returned instead (previously this path
    returned ``None``), so callers get a result either way.
    """
    path_base = Path('/data/user/kim_a/paper_gsa/gsa_framework_files')
    num_params = 10000
    write_dir = path_base / "lca_model_{}".format(num_params)
    # NOTE(review): hard-coded label; presumably equals the directory the driver
    # cell builds from gsa.gsa_label -- verify if the tuning parameters change.
    stability_dir = write_dir / "stability_intermediate_xgboostGsaN400D6E10S60"

    filepath_S = stability_dir / "S.step{}.seed{}.pickle".format(iterations_current, seed)
    if filepath_S.exists():
        # Cached result: report it and hand back the stored object instead of None.
        print("{} already exists".format(filepath_S))
        return read_pickle(filepath_S)

    filepath_X = stability_dir / "X.step{}.seed{}.hdf5".format(iterations_current, seed)
    filepath_Y = stability_dir / "Y.step{}.seed{}.hdf5".format(iterations_current, seed)
    num_boost_round = 400
    tuning_parameters = {
        "max_depth": 6,
        "eta": 0.1,
        "objective": "reg:squarederror",
        "n_jobs": -1,
        "refresh_leaf": True,
        "subsample": 0.6,
        "min_child_weight": 0.5,
    }
    S_dict = xgboost_scores(
        filepath_Y,
        filepath_X,
        iterations_current,
        tuning_parameters=tuning_parameters,
        train_test_ratio=0.8,
        num_boost_round=num_boost_round,
        xgb_model=None,
    )

    write_pickle(S_dict, filepath_S)
    return S_dict
    

In [None]:
# Driver cell for the XGBoost stability run: builds the LCA model, the
# GradientBoosting GSA object and a Convergence object, draws the bootstrap
# seeds, and assembles batches of dask.delayed tasks in `model_evals_all`.
if __name__ == "__main__":
#     path_base = Path(
#         "/Users/akim/PycharmProjects/gsa_framework/dev/write_files/paper_gsa/"
#     )
    path_base = Path('/data/user/kim_a/paper_gsa/gsa_framework_files')

    # LCA model
    bw.projects.set_current("GSA for paper")
    co = bw.Database("CH consumption 1.0")
    # First activity whose name contains "Food"; the [0] raises IndexError if none matches.
    act = [act for act in co if "Food" in act["name"]][0]
    demand = {act: 1}
    method = ("IPCC 2013", "climate change", "GTP 100a")

    # Define some variables
    num_params = 10000
    # NOTE(review): num_influential, iterations_validation, validation_seed and
    # fig_format are assigned here but not used anywhere else in this cell.
    num_influential = num_params // 100
    iterations_validation = 500
    write_dir = path_base / "lca_model_{}".format(num_params)
    model = LCAModel(demand, method, write_dir, num_params=num_params)
    gsa_seed = 3403
    validation_seed = 7043
    fig_format = ["html", "pickle"]

    iterations = 2 * num_params

    # Same booster settings as the dictionary hard-coded inside compute_per_worker.
    num_boost_round = 400
    tuning_parameters = {
        "max_depth": 6,
        "eta": 0.1,
        "objective": "reg:squarederror",
        "n_jobs": -1,
        "refresh_leaf": True,
        "subsample": 0.6,
        "min_child_weight": 0.5,
    }
    gsa = GradientBoosting(
        iterations=iterations,
        model=model,
        write_dir=write_dir,
        seed=gsa_seed,
        tuning_parameters=tuning_parameters,
        num_boost_round=num_boost_round,
        xgb_model=None,
    )

    conv = Convergence(
        gsa.filepath_Y,
        gsa.num_params,
        gsa.generate_gsa_indices,
        gsa.gsa_label,
        write_dir,
        num_steps=25,
    )
    # One row of seeds per convergence step and one column per bootstrap
    # replicate; seeding the global NumPy RNG with gsa_seed makes the draw
    # repeatable. The seeds end up in the per-task file names.
    num_bootstrap = 10
    np.random.seed(gsa_seed)
    stability_seeds = np.random.randint(
        low=0,
        high=2147483647,
        size=(len(conv.iterations_for_convergence), num_bootstrap),
    )

    stability_dir = write_dir / "stability_intermediate_{}".format(gsa.gsa_label)
    stability_dir.mkdir(parents=True, exist_ok=True)

    # The commented-out block below writes X/Y subsample files with the same
    # names that compute_per_worker reads; presumably it was run once to
    # generate those inputs and then disabled -- TODO confirm.
#     X_rescaled = read_hdf5_array(gsa.filepath_X_rescaled)
#     Y = read_hdf5_array(gsa.filepath_Y).flatten()

#     for i, iterations_current in enumerate(conv.iterations_for_convergence):
#         print(iterations_current)
#         for seed in stability_seeds[i,:]:
#             np.random.seed(seed)
#             r = np.random.choice(np.arange(iterations), size=iterations_current, replace=False)
#             r.sort()
#             assert len(set(r)) == len(r)
#             Xcurrent = X_rescaled[r,:]
#             Ycurrent = Y[r]
#             fp_X = stability_dir / "X.step{}.seed{}.hdf5".format(iterations_current, seed)
#             fp_Y = stability_dir / "Y.step{}.seed{}.hdf5".format(iterations_current, seed)
#             write_hdf5_array(Xcurrent, fp_X)
#             write_hdf5_array(Ycurrent, fp_Y)
    # Build one delayed task per (convergence step, seed) pair. Tasks are
    # grouped into batches of 6 steps x num_bootstrap seeds; the current batch
    # is flushed whenever a new group of 6 steps starts, and the last, possibly
    # smaller, batch after the loop.
    model_evals = []
    model_evals_all = []
    task_per_worker = dask.delayed(compute_per_worker)
    for i, iterations_current in enumerate(conv.iterations_for_convergence):
        if i%6==0 and i>0:
            print(len(model_evals))
            model_evals_all.append(model_evals)
            model_evals = []
        for seed in stability_seeds[i,:]:
            model_eval = task_per_worker(iterations_current, seed)
            model_evals.append(model_eval)

    model_evals_all.append(model_evals)

In [None]:
# %%time
# #Test
# compute_per_worker(800, 449190993)

In [None]:
%%time
# Run the batches one after another: dask.compute blocks until every task in
# the batch has finished. The returned values are discarded here; each task
# pickles its own result to disk.
# NOTE: the loop variable rebinds the global name `model_evals`.
for model_evals in model_evals_all:
    print(len(model_evals))
    dask.compute(model_evals)