# Generate parameterized datapackage

In [27]:
import bw2data as bd
import bw2io as bi
import bw2calc as bc
from pprint import pprint
from tqdm import tqdm
import bw2parameters as bwp
import numpy as np
import traceback
import sys
import re
from gsa_framework.utils import write_pickle, read_pickle
sys.path.append('/Users/akim/PycharmProjects/akula')
from akula.markets import DATA_DIR

In [None]:
# Guard: stop early unless the installed bw2io is at least version (0, 9, "DEV7").
# NOTE(review): this is a plain tuple comparison; if the first two elements are equal and the
# third element of ``bi.__version__`` is an int, comparing it with "DEV7" raises TypeError —
# confirm the version format.
assert bi.__version__ >= (0, 9, "DEV7")

In [None]:
from asteval import Interpreter
from numbers import Number
from bw2parameters.errors import BroadcastingError
from stats_arrays import uncertainty_choices


# Message template used when raising ``BroadcastingError``. The four placeholders are filled,
# in order, with the parameter name, its formula, the expected shape and the returned shape.
MC_ERROR_TEXT = """Formula returned array of wrong shape:
Name: {}
Formula: {}
Expected shape: {}
Returned shape: {}"""


class PatchedParameterSet(bwp.ParameterSet):
    """``ParameterSet`` whose Monte Carlo evaluation tolerates non-array formula results.

    Compared with a strict evaluation, formula results that are ``None``, plain numbers or
    other non-array objects are coerced to arrays of length ``iterations`` instead of failing.
    """

    def evaluate_monte_carlo(self, iterations=1000):
        """Evaluate each formula using Monte Carlo and variable uncertainty data, if present.

        Parameters are processed in ``self.order``. Parameters without a formula are sampled
        from their uncertainty description; formulas are evaluated with the samples of the
        parameters processed before them.

        Formula results are normalised as follows:

        * a number is broadcast to a constant array of length ``iterations``;
        * ``None`` or any other non-array result becomes an array of zeros;
        * arrays of shape ``(1, iterations)`` or ``(iterations, 1)`` are flattened.

        Any array that still does not have shape ``(iterations,)`` raises
        ``BroadcastingError``.

        Returns dictionary of ``{parameter name: numpy array}``."""
        interpreter = Interpreter()
        result = {}

        def get_rng_sample(obj):
            """Return ``iterations`` random draws for one parameter definition."""
            if isinstance(obj, np.ndarray):
                # Already a Monte Carlo sample
                return obj
            if 'uncertainty_type' not in obj:
                # Always work on a copy so the stored parameter definition is never modified,
                # whichever spelling of the key it uses.
                obj = obj.copy()
                if 'uncertainty type' in obj:
                    # Normalise the key spelling with a space to the one with an underscore
                    obj['uncertainty_type'] = obj['uncertainty type']
                else:
                    # No uncertainty given: use type 0 located at the static amount
                    # (presumably the "undefined" distribution of stats_arrays — TODO confirm)
                    obj['uncertainty_type'] = 0
                    obj['loc'] = obj['amount']
            kls = uncertainty_choices[obj['uncertainty_type']]
            return kls.bounded_random_variables(kls.from_dicts(obj), iterations).ravel()

        def fix_shape(array):
            """Coerce a formula result to shape ``(iterations,)`` where that is possible."""
            # This is new
            # NOTE(review): None and non-array results are replaced by zeros without any
            # warning; presumably None is what the interpreter returns when a formula fails
            # to evaluate — TODO confirm, as this can hide broken formulas.
            if array is None:
                return np.zeros((iterations,))
            elif isinstance(array, Number):
                return np.ones((iterations,)) * array
            elif not isinstance(array, np.ndarray):
                return np.zeros((iterations,))
            # End new section
            elif array.shape in {(1, iterations), (iterations, 1)}:
                return array.reshape((iterations,))
            else:
                return array

        for key in self.order:
            if key in self.global_params:
                interpreter.symtable[key] = result[key] = get_rng_sample(self.global_params[key])
            elif self.params[key].get('formula'):
                sample = fix_shape(interpreter(self.params[key]['formula']))
                if sample.shape != (iterations,):
                    raise BroadcastingError(MC_ERROR_TEXT.format(
                        key, self.params[key]['formula'], (iterations,), sample.shape)
                    )
                interpreter.symtable[key] = result[key] = sample
            else:
                interpreter.symtable[key] = result[key] = get_rng_sample(self.params[key])
        return result

In [None]:
# Switch to the Brightway project used throughout this notebook,
# then display the databases registered in it.
bd.projects.set_current('GSA for archetypes')
bd.databases

In [None]:
ei = bd.Database("ecoinvent 3.8 cutoff")

# One-off migration: rename the exchange field 'formula' to 'chemical formula' for every
# exchange whose output database is ecoinvent 3.8 cutoff, then record that in the database
# metadata. The trailing ``and False`` makes the condition always false, so the body below
# is currently dead code and is kept only for reference.
# Takes forever and not necessary, skip it...
if not ei.metadata.get('fixed chemical formula name') and False:
    from bw2data.backends.schema import ExchangeDataset as ED

    qs = ED.select().where(ED.output_database == "ecoinvent 3.8 cutoff")
    print("this will take a while, maybe 30 minutes")

    # ``total`` is a hard-coded row count used only for the progress bar
    for exc in tqdm(qs, total=629959):
        if 'formula' in exc.data:
            exc.data['chemical formula'] = exc.data.pop('formula')
            exc.save()

    ei.metadata['fixed chemical formula name'] = True
    bd.databases.flush()

In [None]:
# Location of the raw ecoinvent 3.8 cutoff datasets (machine specific).
# Alternative location on another machine:
# fp_ecoinvent_38 = "/Users/cmutel/Documents/lca/Ecoinvent/3.8/cutoff/datasets"
fp_ecoinvent_38 = "/Users/akim/Documents/LCA_files/ecoinvent_38_cutoff/datasets"
fp_ei = DATA_DIR / "ecoinvent.pickle"

# Build the importer (and apply its strategies) only when no pickled copy exists yet;
# otherwise reuse the pickled importer.
if not fp_ei.exists():
    eii = bi.SingleOutputEcospold2Importer(fp_ecoinvent_38, "ecoinvent 3.8 cutoff")
    eii.apply_strategies()
    write_pickle(eii, fp_ei)
else:
    eii = read_pickle(fp_ei)

In [None]:
# Survey: one tuple per activity that has at least one exchange with a formula, holding the
# number of such exchanges followed by the activity's name, reference product, location, unit.
found = set()

for act in eii.data:
    n_formulas = sum(1 for exc in act['exchanges'] if exc.get('formula'))
    if not n_formulas:
        continue
    found.add((
        n_formulas,
        act['name'],
        act['reference product'],
        act['location'],
        act['unit'],
    ))

len(found)

In [None]:
# list(sorted(found, reverse=True))[:25]

Don't trust pedigree uncertainty increases for variables

In [None]:
def drop_pedigree_uncertainty(dct):
    """Replace the pedigree-inflated ``scale`` with the value measured without pedigree.

    When ``dct`` has both ``'scale'`` and ``'scale without pedigree'``, the current scale is
    kept under ``'scale with pedigree'`` and ``'scale'`` is set to the value without
    pedigree. Dictionaries lacking either key are left untouched, so calling the function
    again on an already converted dictionary is a no-op.

    The dictionary is modified in place and also returned.
    """
    # The guard must test the key that is popped below; otherwise the swap never happens
    # on fresh data and raises KeyError on data that was already converted.
    if 'scale' in dct and 'scale without pedigree' in dct:
        dct['scale with pedigree'] = dct.pop('scale')
        dct['scale'] = dct.pop('scale without pedigree')
    return dct

Change decimal commas to decimal points, e.g. `10,42` to `10.42`.

In [None]:
# Sanity check of the decimal-comma pattern: a comma between two digits becomes a point.
test = "0,034 * 10,42"
result = re.compile(r'(\d)\,(\d)').sub(r'\1.\2', test)
assert result == '0.034 * 10.42'

Fix Python reserved words used as variable names

In [None]:
# Maps a variable name that cannot be used as-is to its replacement (a trailing underscore
# is appended). Used both for parameter names and inside formula strings.
# NOTE(review): 'load' is not a Python keyword; presumably it clashes with a name the formula
# interpreter already defines — TODO confirm.
substitutions = {
    'yield': 'yield_',
    'import': 'import_',
    'load': 'load_',
}

Apply above fixes and a few others

In [None]:
def clean_formula(string):
    """Return ``string`` rewritten so that it can be evaluated as a Python expression.

    Steps, in order:

    * strip surrounding whitespace;
    * turn ``%`` into `` / 100`` and ``^`` into `` ** ``;
    * replace ``\\r\\n`` with a space and drop remaining ``\\n``;
    * rename every identifier listed in ``substitutions``;
    * turn a comma between two digits into a decimal point.
    """
    string = (
        string.strip()
        .replace("%", " / 100")
        .replace("^", " ** ")
        .replace("\r\n", " ")
        .replace("\n", "")
    )

    for k, v in substitutions.items():
        # Match whole identifiers only. A plain substring replace would also rewrite names
        # such as ``yield_corn`` or ``payload`` (which are not renamed elsewhere) and would
        # turn ``yield_`` into ``yield__`` when a formula is cleaned a second time.
        string = re.sub(rf'\b{re.escape(k)}\b', v, string)

    # NOTE(review): this also rewrites comma-separated digit arguments such as ``max(1,2)``;
    # presumably formulas always put a space after argument commas — TODO confirm.
    string = re.sub(r'(\d)\,(\d)', r'\1.\2', string)
    return string

In [None]:
def clean_dct(dct):
    """Clean the formula and the name of one parameter dictionary, in place.

    A truthy ``'formula'`` is passed through ``clean_formula``; a ``'name'`` that is a key
    of ``substitutions`` is replaced by its substitute. Returns the same dictionary.
    """
    formula = dct.get('formula')
    if formula:
        dct['formula'] = clean_formula(formula)

    name = dct.get('name')
    if name in substitutions:
        dct['name'] = substitutions[name]

    return dct

In [None]:
def reformat_parameters(act):
    """Build the ``{name: definition}`` mapping describing all parameters of ``act``.

    Every entry of ``act['parameters']`` that has a ``'name'`` is passed through
    ``drop_pedigree_uncertainty`` and ``clean_dct`` and stored under its (possibly
    substituted) name. Every exchange with a formula gets a synthetic parameter called
    ``__exchange_<index>``; that name is also written to ``exc['parameter_name']``.
    """
    parameters = {}

    for dct in act['parameters']:
        if 'name' not in dct:
            continue
        # Resolve the key first: cleaning may rewrite dct['name'] in place
        key = substitutions.get(dct['name'], dct['name'])
        parameters[key] = clean_dct(drop_pedigree_uncertainty(dct))

    for index, exc in enumerate(act['exchanges']):
        if not exc.get('formula'):
            continue
        exc['parameter_name'] = f'__exchange_{index}'
        parameters[exc['parameter_name']] = {'formula': clean_formula(exc['formula'])}

    return parameters

In [None]:
def stochastic_parameter_set_for_activity(act, iterations=250):
    """Return Monte Carlo samples for all parameters of ``act``.

    The parameters are gathered with ``reformat_parameters`` and evaluated by
    ``PatchedParameterSet.evaluate_monte_carlo`` with the given number of ``iterations``.
    """
    return PatchedParameterSet(reformat_parameters(act)).evaluate_monte_carlo(
        iterations=iterations
    )

In [None]:
def check_that_parameters_are_reasonable(act, results, rtol=0.1):
    """Check that sampled exchange values agree with the static exchange amounts.

    For each exchange of ``act`` that has a formula, the median of its sample in
    ``results`` (looked up by ``exc['parameter_name']``) is compared with ``exc['amount']``
    using ``np.isclose`` and the relative tolerance ``rtol``.

    Returns ``True`` when every such exchange agrees. On the first disagreement the
    offending activity and exchange are printed and ``False`` is returned.
    """
    formula_exchanges = (exc for exc in act['exchanges'] if exc.get('formula'))

    for exc in formula_exchanges:
        median = np.median(results[exc['parameter_name']])
        if np.isclose(exc['amount'], median, rtol=rtol):
            continue
        print(act['name'], exc['name'], act['location'], act['unit'])
        print("\t", exc['amount'], median, exc['formula'])
        return False

    return True

In [None]:
from bw2data.backends.schema import ActivityDataset as AD

# Maps (database name, activity code) to the integer id of the activity row, for every
# activity in the biosphere3 and ecoinvent 3.8 cutoff databases.
lookup_cache = {
    (database, code): node_id
    for database, code, node_id in (
        AD.select(AD.database, AD.code, AD.id)
        .where(AD.database << ("biosphere3", "ecoinvent 3.8 cutoff"))
        .tuples()
    )
}

In [None]:
tech_data, bio_data = [], []

In [None]:
# Counters, in order: activities converted; activities whose formulas raised an error;
# activities whose sampled medians disagree with the static amounts; activities whose
# formulas reference an undefined parameter.
found, errors, unreasonable, missing = 0, 0, 0, 0

# The context manager closes both log files even if an unexpected exception
# (for example a KeyError from ``lookup_cache``) escapes the loop.
with open("error.log", "w") as error_log, \
        open("undefined_reference.log", "w") as missing_reference_log:
    for act in tqdm(eii.data):
        # Only activities with at least one formula-driven exchange are of interest
        if not any(exc.get('formula') for exc in act['exchanges']):
            continue
        try:
            params = stochastic_parameter_set_for_activity(act, iterations=25000)
            # (A leftover ``break`` here used to stop after the first activity and made
            # everything below unreachable, leaving ``tech_data`` and ``bio_data`` empty.)
            if not check_that_parameters_are_reasonable(act, params):
                unreasonable += 1
                continue

            found += 1

            for exc in act['exchanges']:
                if not exc.get('formula'):
                    continue
                # Each record is ((input id, activity id), sample array, flip flag)
                if exc['input'][0] == "ecoinvent 3.8 cutoff":
                    tech_data.append((
                        (lookup_cache[exc['input']], lookup_cache[(act['database'], act['code'])]),
                        params[exc['parameter_name']],
                        # Flip everything except production exchanges
                        exc['type'] != 'production'  # TODO Chris please check, changed this from == to !=
                    ))
                else:
                    bio_data.append((
                        (lookup_cache[exc['input']], lookup_cache[(act['database'], act['code'])]),
                        params[exc['parameter_name']],
                        False
                    ))
        except (ValueError, SyntaxError, bwp.errors.DuplicateName):
            # Formula could not be parsed or evaluated: log the dataset file and traceback
            error_log.write(act['filename'] + "\n")
            traceback.print_exc(file=error_log)
            errors += 1
        except bwp.errors.ParameterError:
            # Remaining parameter errors are logged separately as missing references
            missing_reference_log.write(act['filename'] + "\n")
            traceback.print_exc(file=missing_reference_log)
            missing += 1

In [None]:
found, errors, unreasonable, missing

In [None]:
len(tech_data), len(bio_data)

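Check for uncertain production exchanges, which could indicate bad data. The flip flag is `True` for every exchange that is *not* a production exchange, so the number of production exchanges is `len(tech_data)` minus the sum below.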

In [None]:
np.hstack([z for x, y, z in tech_data]).sum()

In [None]:
import bw_processing as bp
from fs.zipfs import ZipFS

In [None]:
# Create a zip-backed datapackage in the current working directory.
dp = bp.create_datapackage(
    fs=ZipFS("ecoinvent-parameterization.zip", write=True),
    name="ecoinvent-parameterization",
    seed=42,
)

# Technosphere resource: one row per collected exchange. Each ``tech_data`` record is
# ((input id, activity id), sample array, flip flag).
indices = np.empty(len(tech_data), dtype=bp.INDICES_DTYPE)
indices[:] = [x for x, y, z in tech_data]

dp.add_persistent_array(
    matrix="technosphere_matrix",
    data_array=np.vstack([y for x, y, z in tech_data]),
    name="ecoinvent-parameterization-tech",
    indices_array=indices,
    flip_array=np.hstack([z for x, y, z in tech_data]),
)

# Biosphere resource, built the same way from ``bio_data`` (``indices`` is reused).
indices = np.empty(len(bio_data), dtype=bp.INDICES_DTYPE)
indices[:] = [x for x, y, z in bio_data]

dp.add_persistent_array(
    matrix="biosphere_matrix",
    data_array=np.vstack([y for x, y, z in bio_data]),
    name="ecoinvent-parameterization-bio",
    indices_array=indices,
    flip_array=np.hstack([z for x, y, z in bio_data]),
)

# NOTE(review): the archive is written to the working directory, while the archived section
# of this notebook loads it from DATA_DIR — presumably the file is moved by hand; confirm.
dp.finalize_serialization()

# [archived] Check values in the parameterized datapackage

In [None]:
from pathlib import Path
import numpy as np
from fs.zipfs import ZipFS
import bw2calc as bc
import bw2data as bd
import bw_processing as bwp
import sys
sys.path.append('/Users/akim/PycharmProjects/akula')

from akula.virtual_markets import DATA_DIR

# Load the parameterized datapackage from DATA_DIR.
# NOTE(review): in this section ``bwp`` is bw_processing, whereas the generation section
# binds ``bwp`` to bw2parameters and uses ``bp`` for bw_processing. Re-running earlier cells
# after this one would pick up the wrong module.
fp_ei_parameterization = DATA_DIR / "ecoinvent-parameterization.zip"
dp_params = bwp.load_datapackage(ZipFS(fp_ei_parameterization))

In [None]:
# Select the project, the LCIA method and the databases used for the comparison.
project = "GSA for archetypes"
bd.projects.set_current(project)
method = ("IPCC 2013", "climate change", "GWP 100a", "uncertain")
me = bd.Method(method)
bs = bd.Database("biosphere3")
ei = bd.Database("ecoinvent 3.8 cutoff")
co_name = "swiss consumption 1.0"
co = bd.Database(co_name)

# Load the processed datapackage of the method and of each database.
list_ = [me, bs, ei, co]
dps = [
    bwp.load_datapackage(ZipFS(db.filepath_processed()))
    for db in list_
]

# The functional unit: the single activity with this exact name in the consumption database.
hh_average = [act for act in co if "ch hh average consumption aggregated" == act['name']]
assert len(hh_average) == 1
demand_act = hh_average[0]
# ``demand`` is keyed by the activity object and ``demand_id`` by its integer id; only
# ``demand_id`` is used by the cells visible in this notebook.
demand = {demand_act: 1}
demand_id = {demand_act.id: 1}

# Number of Monte Carlo scores collected per LCA below.
iterations = 5

In [None]:
# Baseline: Monte Carlo LCA using only the method and database datapackages.
lca = bc.LCA(
    demand_id,
    data_objs=dps,
    use_distributions=True,
    use_arrays=True,
    seed_override=11111000
)
lca.lci()
lca.lcia()

# Collect ``iterations`` scores. Presumably each step of iterating ``lca`` draws the next
# sample and recalculates the score — TODO confirm against bw2calc.
scores = [lca.score for _, _ in zip(lca, range(iterations))]
scores

In [None]:
# Same setup and seed as the baseline, with the full parameterized datapackage appended.
lca_params = bc.LCA(
    demand_id,
    data_objs=dps + [dp_params],
    use_distributions=True,
    use_arrays=True,
    seed_override=11111000,
)
lca_params.lci()
lca_params.lcia()

# Collect ``iterations`` scores for comparison with the baseline scores.
scores_params = [lca_params.score for _, _ in zip(lca_params, range(iterations))]
scores_params

In [None]:
# Keep only the resources of the biosphere group of the parameterized datapackage.
dp_params_bio = dp_params.filter_by_attribute("group", "ecoinvent-parameterization-bio")

# Same setup and seed as the baseline, with only the biosphere parameterization appended.
lca_params_bio = bc.LCA(
    demand_id,
    data_objs=dps + [dp_params_bio],
    use_distributions=True,
    use_arrays=True,
    seed_override=11111000,
)
lca_params_bio.lci()
lca_params_bio.lcia()

# Collect ``iterations`` scores for comparison with the baseline scores.
scores_params_bio = [lca_params_bio.score for _, _ in zip(lca_params_bio, range(iterations))]
scores_params_bio

In [None]:
# Keep only the resources of the technosphere group of the parameterized datapackage.
dp_params_tech = dp_params.filter_by_attribute("group", "ecoinvent-parameterization-tech")

# Same setup and seed as the baseline, with only the technosphere parameterization appended.
lca_params_tech = bc.LCA(
    demand_id,
    data_objs=dps + [dp_params_tech],
    use_distributions=True,
    use_arrays=True,
    seed_override=11111000,
)
lca_params_tech.lci()
lca_params_tech.lcia()

# Collect ``iterations`` scores for comparison with the baseline scores.
scores_params_tech = [lca_params_tech.score for _, _ in zip(lca_params_tech, range(iterations))]
scores_params_tech

In [None]:
# Fetch the indices, data and flip arrays of ecoinvent's own technosphere matrix.
dp_ei = bd.Database("ecoinvent 3.8 cutoff").datapackage()
ei_indices = dp_ei.get_resource("ecoinvent_3.8_cutoff_technosphere_matrix.indices")[0]
ei_data = dp_ei.get_resource("ecoinvent_3.8_cutoff_technosphere_matrix.data")[0]
ei_flip_raw = dp_ei.get_resource("ecoinvent_3.8_cutoff_technosphere_matrix.flip")[0]
ei_selected = []
ei_flip = []
# For every entry of the parameterized technosphere indices, take the first matching row of
# the ecoinvent indices and collect its data value and flip flag.
# Assumes ``dp_params_tech.data[0]`` is the indices array and ``data[1]`` the samples array
# — TODO confirm. The loop index ``i`` is not used.
for i, inds in enumerate(dp_params_tech.data[0]):
    # Raises IndexError when an entry has no match in the ecoinvent indices
    ei_where = np.where(ei_indices==inds)[0][0]
    ei_selected.append(ei_data[ei_where])
    ei_flip.append(ei_flip_raw[ei_where])
# First column of the parameterized samples, to compare against the collected values
params_selected = dp_params_tech.data[1][:,0]
ei_selected = np.array(ei_selected)
ei_flip = np.array(ei_flip)
indices_selected = dp_params_tech.data[0]

In [None]:
# Element-wise absolute difference between the first parameterized sample column and the
# values selected from ecoinvent.
wdiff = abs(params_selected - ei_selected)
# np.where(wdiff==min(wdiff))

In [None]:
%%time
# Graph traversal of the baseline LCA; its 'edges' entry is used for the comparison below.
res = bc.GraphTraversal().calculate(
    lca, cutoff=1e-3, max_calc=1e3
)

In [None]:
%%time
# Graph traversal of the LCA with the technosphere parameterization, same settings as the
# baseline traversal.
res_params_tech = bc.GraphTraversal().calculate(
    lca_params_tech, cutoff=1e-3, max_calc=1e3
)

In [None]:
import pandas as pd
# Put the edges of both traversals side by side. The outer merge on ('to', 'from') keeps
# edges that appear in only one of the two results. The table is written to an Excel file.
df = pd.DataFrame.from_dict(res['edges'])
df_params = pd.DataFrame.from_dict(res_params_tech['edges'])
df_both = df.merge(df_params, on=['to', 'from'], how='outer')
df_both.to_excel("sct.xlsx")

In [None]:
# Score for a single hard-coded demand with the technosphere parameterization included and
# ``use_distributions=False``.
# NOTE(review): 4916 is presumably an activity id and 1212.188043 its amount, taken from
# the traversal results — confirm.
lca_params_tech1 = bc.LCA(
    {4916: 1212.188043},
    data_objs=dps + [dp_params_tech],
    use_distributions=False,
    use_arrays=True,
    seed_override=11111000,
)
lca_params_tech1.lci()
lca_params_tech1.lcia()
lca_params_tech1.score

In [None]:
# Same hard-coded demand, but WITHOUT the parameterized datapackage.
# NOTE(review): the variable name ``lca_params_tech1`` is reused although no parameterization
# is included here; it overwrites the object created with ``dp_params_tech``.
lca_params_tech1 = bc.LCA(
    {4916: 1212.188043},
    data_objs=dps,
    use_distributions=False,
    use_arrays=True,
    seed_override=11111000,
)
lca_params_tech1.lci()
lca_params_tech1.lcia()
lca_params_tech1.score

In [None]:
# Flip array stored with the parameterized technosphere resource.
params_flip = dp_params_tech.get_resource('ecoinvent-parameterization-tech.flip')[0]

In [None]:
# Number of flipped entries selected from ecoinvent versus in the parameterized resource;
# presumably the two should be equal if the flip flags are consistent — TODO confirm.
sum(ei_flip), sum(params_flip)