In [156]:
import bw2data as bd
import bw2io as bi
from pprint import pprint
from tqdm import tqdm
import bw2parameters as bwp
import numpy as np
import traceback
import sys
import re

In [2]:
assert bi.__version__ >= (0, 9, "DEV7")

In [100]:
from asteval import Interpreter
from numbers import Number
from bw2parameters.errors import BroadcastingError
from stats_arrays import uncertainty_choices


MC_ERROR_TEXT = """Formula returned array of wrong shape:
Name: {}
Formula: {}
Expected shape: {}
Returned shape: {}"""


class PatchedParameterSet(bwp.ParameterSet):
    def evaluate_monte_carlo(self, iterations=1000):
        """Evaluate each formula using Monte Carlo and variable uncertainty data, if present.

        Formulas **must** return a one-dimensional array, or ``BroadcastingError`` is raised.

        Returns dictionary of ``{parameter name: numpy array}``."""
        interpreter = Interpreter()
        result = {}

        def get_rng_sample(obj):
            if isinstance(obj, np.ndarray):
                # Already a Monte Carlo sample
                return obj
            if 'uncertainty_type' not in obj:
                if 'uncertainty type' not in obj:
                    obj = obj.copy()
                    obj['uncertainty_type'] = 0
                    obj['loc'] = obj['amount']
                else:
                    obj['uncertainty_type'] = obj['uncertainty type']
            kls = uncertainty_choices[obj['uncertainty_type']]
            return kls.bounded_random_variables(kls.from_dicts(obj), iterations).ravel()

        def fix_shape(array):
            # This is new
            if array is None:
                return np.zeros((iterations,))
            elif isinstance(array, Number):
                return np.ones((iterations,)) * array
            elif not isinstance(array, np.ndarray):
                return np.zeros((iterations,))
            elif array.shape in {(1, iterations), (iterations, 1)}:
                return array.reshape((iterations,))
            else:
                return array

        for key in self.order:
            if key in self.global_params:
                interpreter.symtable[key] = result[key] = get_rng_sample(self.global_params[key])
            elif self.params[key].get('formula'):
                sample = fix_shape(interpreter(self.params[key]['formula']))
                if sample.shape != (iterations,):
                    raise BroadcastingError(MC_ERROR_TEXT.format(
                        key, self.params[key]['formula'], (iterations,), sample.shape)
                    )
                interpreter.symtable[key] = result[key] = sample
            else:
                interpreter.symtable[key] = result[key] = get_rng_sample(self.params[key])
        return result

In [3]:
bd.projects.set_current('GSA for archetypes')

In [4]:
ei = bd.Database("ecoinvent 3.8 cutoff")
if not ei.metadata.get('fixed chemical formula name'):
    from bw2data.backends.schema import ExchangeDataset as ED
    
    qs = ED.select().where(ED.output_database == "ecoinvent 3.8 cutoff")
    print("this will take a while, maybe 30 minutes")
    
    for exc in tqdm(qs, total=629959):
        if 'formula' in exc.data:
            exc.data['chemical formula'] = exc.data.pop('formula')
            exc.save()

    ei.metadata['fixed chemical formula name'] = True
    bd.databases.flush()

In [36]:
fp_ecoinvent_38 = "/Users/cmutel/Documents/lca/Ecoinvent/3.8/cutoff/datasets"

eii = bi.SingleOutputEcospold2Importer(fp_ecoinvent_38, "ecoinvent 3.8 cutoff")
eii.apply_strategies()

Extracting XML data from 19565 datasets
Extracted 19565 datasets in 223.61 seconds
Applying strategy: normalize_units
Applying strategy: update_ecoinvent_locations
Applying strategy: remove_zero_amount_coproducts
Applying strategy: remove_zero_amount_inputs_with_no_activity
Applying strategy: remove_unnamed_parameters
Applying strategy: es2_assign_only_product_with_amount_as_reference_product
Applying strategy: assign_single_product_as_activity
Applying strategy: create_composite_code
Applying strategy: drop_unspecified_subcategories
Applying strategy: fix_ecoinvent_flows_pre35
Applying strategy: drop_temporary_outdated_biosphere_flows
Applying strategy: link_biosphere_by_flow_uuid
Applying strategy: link_internal_technosphere_by_composite_code
Applying strategy: delete_exchanges_missing_activity
Applying strategy: delete_ghost_exchanges
Applying strategy: remove_uncertainty_from_negative_loss_exchanges
Applying strategy: fix_unreasonably_high_lognormal_uncertainties
Applying strategy:

In [37]:
found = set()

for act in eii.data:
    if any(exc.get('formula') for exc in act['exchanges']):
        found.add(
            (sum(1 for exc in act['exchanges'] if exc.get('formula')), 
             act['name'], 
             act['reference product'], 
             act['location'], 
             act['unit'])
        )
        
len(found)

8449

In [38]:
list(sorted(found, reverse=True))[:25]

[(122, 'cobalt production', 'zinc sulfide', 'GLO', 'kilogram'),
 (122, 'cobalt production', 'sulfuric acid', 'GLO', 'kilogram'),
 (122, 'cobalt production', 'sulfur', 'GLO', 'kilogram'),
 (122, 'cobalt production', 'nickel, class 1', 'GLO', 'kilogram'),
 (122, 'cobalt production', 'nickel sulfate', 'GLO', 'kilogram'),
 (122, 'cobalt production', 'nickel concentrate, 16% Ni', 'GLO', 'kilogram'),
 (122, 'cobalt production', 'ferronickel', 'GLO', 'kilogram'),
 (122, 'cobalt production', 'electrolyte, nickel-rich', 'GLO', 'cubic meter'),
 (122, 'cobalt production', 'electrolyte, copper-rich', 'GLO', 'kilogram'),
 (122,
  'cobalt production',
  'electricity, high voltage',
  'GLO',
  'kilowatt hour'),
 (122, 'cobalt production', 'copper, cathode', 'GLO', 'kilogram'),
 (122, 'cobalt production', 'copper, anode', 'GLO', 'kilogram'),
 (122,
  'cobalt production',
  'copper concentrate, sulfide ore',
  'GLO',
  'kilogram'),
 (122, 'cobalt production', 'copper cake', 'GLO', 'kilogram'),
 (122,
 

In [101]:
def drop_pedigre_uncertainty(dct):
    if 'scale' in dct and 'scale with pedigree' in dct:
        dct['scale with pedigree'] = dct.pop('scale')
        dct['scale'] = dct.pop('scale without pedigree')
    return dct

In [164]:
test = "0,034 * 10,42"
result = re.sub(r'(\d)\,(\d)', r'\1.\2', test)
assert result == '0.034 * 10.42'

In [180]:
substitutions = {
    'yield': 'yield_',
    'import': 'import_',
    'load': 'load_',
}

In [195]:
def clean_formula(string):
    string = string.strip().replace("%", " / 100").replace("^", " ** ").replace("\r\n", " ").replace("\n", "")

    for k, v in substitutions.items():
        string = string.replace(k, v)
    
    string = re.sub(r'(\d)\,(\d)', r'\1.\2', string)
    return string

In [196]:
def clean_dct(dct):
    if dct.get('formula'):
        dct['formula'] = clean_formula(dct['formula'])
    if dct.get('name') in substitutions:
        dct['name'] = substitutions[dct['name']]
    return dct

In [197]:
def reformat_parameters(act):
    parameters = {substitutions.get(dct['name'], dct['name']): clean_dct(drop_pedigre_uncertainty(dct)) for dct in act['parameters'] if 'name' in dct}
    
    for index, exc in enumerate(act['exchanges']):
        if exc.get('formula'):
            pn = f'__exchange_{index}'
            exc['parameter_name'] = pn
            parameters[pn] = {'formula': clean_formula(exc['formula'])}
    
    return parameters

In [198]:
def stochastic_parameter_set_for_activity(act, iterations=250):
    ps = PatchedParameterSet(reformat_parameters(act))
    return ps.evaluate_monte_carlo(iterations=iterations)


In [199]:
def check_that_parameters_are_reasonable(act, results, rtol=0.1):
    for exc in act['exchanges']:
        if exc.get('formula'):
            arr = results[exc['parameter_name']]
            if not np.isclose(exc['amount'], np.median(arr), rtol=rtol):
                # print(
                #     act['name'], 
                #     exc['name'],
                #     act['location'], 
                #     act['unit'],
                # )
                # print("\t", exc['amount'], np.median(arr), exc['formula'])
                return False
    return True

In [200]:
found, errors, unreasonable, missing = 0, 0, 0, 0

error_log = open("error.log", "w")
missing_reference_log = open("undefined_reference.log", "w")

for act in tqdm(eii.data):
    if any(exc.get('formula') for exc in act['exchanges']):
        try:
            params = stochastic_parameter_set_for_activity(act)
            if check_that_parameters_are_reasonable(act, params):
                found += 1
            else:
                unreasonable += 1
        except (ValueError, SyntaxError, bwp.errors.DuplicateName):
            error_log.write(act['filename'] + "\n")
            traceback.print_exc(file=error_log)
            errors += 1
        except bwp.errors.ParameterError:
            missing_reference_log.write(act['filename'] + "\n")
            traceback.print_exc(file=missing_reference_log)
            missing += 1
            
error_log.close()
missing_reference_log.close()

100%|████████████████████████████████████| 19565/19565 [01:14<00:00, 262.37it/s]


In [201]:
found, errors, unreasonable, missing

(1436, 47, 233, 6733)