# Compare similar datasets

Often, ecoinvent datasets for different regions use the same input data, but because the energy, materials, and transport mixes are split between the region-specific providers, it is difficult to tell if the same sums are present. For example, a European activity would consume 1 kWh from the ENTSO grid, while a "Rest of World" activity would consume 0.25 kWh from 4 different region-specific grids (outside Europe). The information is the same - 1 kWh consumed - but you have to think for a bit to see this. Thinking is hard, so we make a utility function that makes the computer think for us.

In [1]:
import bw2data as bd
import math
from pandas import DataFrame

In [2]:
def aggregated_dict(activity):
    """Sum the exchange amounts of ``activity``, grouped by input flow.

    Technosphere exchanges are keyed by the ``reference product`` of their
    input; biosphere exchanges are keyed by the ``name`` of their input.
    Amounts of exchanges that share a key are added together.

    Returns a dictionary of the form ``{key: summed amount}``.
    """
    totals = {}

    # Each exchange group is aggregated under a different field of its input.
    groups = (
        (activity.technosphere, 'reference product'),
        (activity.biosphere, 'name'),
    )
    for get_exchanges, field in groups:
        for exc in get_exchanges():
            key = exc.input[field]
            totals[key] = totals.get(key, 0) + exc['amount']

    return totals

In [3]:
def compare_dictionaries(one, two, rel_tol=1e-4, abs_tol=1e-9):
    """Return the set of keys on which two ``{str: float}`` dictionaries differ.

    A key is reported when it is present in only one of the dictionaries, or
    when it is present in both but the two values are not close to each other.

    Tolerance values are inputs to `math.isclose <https://docs.python.org/3/library/math.html#math.isclose>`__."""
    # Keys that exist on one side only always count as differences.
    differing = set(one) ^ set(two)

    # For shared keys, only report values outside the given tolerances.
    for key in one:
        if key not in two:
            continue
        if not math.isclose(one[key], two[key], rel_tol=rel_tol, abs_tol=abs_tol):
            differing.add(key)

    return differing

In [4]:
def find_differences(activity, rel_tol=1e-4, abs_tol=1e-9, locations=None, as_dataframe=False):
    """Given an ``Activity``, try to see if other activities in the same database (with the same name and 
    reference product) have the same input levels.
    
    Tolerance values are inputs to `math.isclose <https://docs.python.org/3/library/math.html#math.isclose>`__.
    
    Locations is an optional list of locations to include (all others are filtered out; applies only to similar activities).
    
    If differences are present, a difference dictionary is constructed, with the form:
    
    .. code-block:: python
    
        {Activity instance: {name of input flow (str): amount}}
    
    Note that this doesn't reference a specific exchange, but rather sums all exchanges with the same input 
    reference product (technosphere) or the same flow name (biosphere).
    
    The entry for ``activity`` itself collects every flow that differs from at least one similar activity. 
    The entry for each similar activity holds only the flows that differ between it and ``activity``. A flow 
    that an activity does not consume at all is left out of that activity's entry.
    
    Returns the difference dictionary, which is empty if no differences are found. If ``as_dataframe`` is true, 
    returns instead a ``pandas.DataFrame`` built from that dictionary, with one row per activity, a ``location`` 
    column, and one column per differing flow.
    
    Assumes that all similar activities produce the same amount of reference product.
    
    """
    assert isinstance(activity, bd.backends.peewee.proxies.Activity)
    
    # Filter against the ``activity`` argument; relying on a global variable
    # here would silently compare against whatever dataset that global holds.
    similar = [obj 
               for obj in bd.Database(activity['database']) 
               if obj != activity 
               and obj['reference product'] == activity['reference product']
               and obj['name'] == activity['name']
               and (not locations or obj['location'] in locations)]
    
    result = {}
    
    origin_dict = aggregated_dict(activity)
    
    for target in similar:
        target_dict = aggregated_dict(target)
        difference = compare_dictionaries(origin_dict, target_dict, rel_tol, abs_tol)
        if difference:
            # The origin entry is created lazily, so ``result`` stays empty
            # when every similar activity matches.
            if activity not in result:
                result[activity] = {}
            result[activity].update({key: value for key, value in origin_dict.items() if key in difference})
            result[target] = {key: value for key, value in target_dict.items() if key in difference}
    
    if as_dataframe:
        return DataFrame([{'location': obj['location'], **result[obj]} for obj in result])
    else:
        return result

Examples of application:

In [5]:
# Switch to the Brightway project named below (presumably the one holding the
# ecoinvent 3.7.1 import used by the following cells).
bd.projects.set_current("ecoinvent 3.7.1 bw2")

In [6]:
# Take the first dataset in the database with this name, whatever its location.
act = next(obj for obj in bd.Database("ecoinvent 3.7.1") if obj['name'] == 'polyethylene production, low density, granulate')
act

'polyethylene production, low density, granulate' (kilogram, RoW, None)

In [7]:
# Compare ``act`` against the other datasets with the same name and reference
# product, using the default tolerances and no location filter.
find_differences(act)

{}

In [8]:
# Take the first dataset with this name whose location is 'DE'.
act = next(obj for obj in bd.Database("ecoinvent 3.7.1") if obj['name'] == 'electricity production, hard coal' and obj['location'] == 'DE')
act    

'electricity production, hard coal' (kilowatt hour, DE, None)

In [9]:
# Only compare against similar datasets in the listed locations, and return the
# differing flows as a DataFrame (one row per dataset) instead of a dictionary.
find_differences(act, locations=['LV', 'PL', 'EE', 'CZ', 'LT', 'AU', 'BG', 'RU'], as_dataframe=True)

Unnamed: 0,location,"SOx retained, in hard coal flue gas desulfurisation","chlorine, liquid","NOx retained, by selective catalytic reduction",residue from cooling tower,"water, decarbonised",hard coal,hard coal ash,light fuel oil,"water, completely softened",...,Uranium-238,Lead-210,Toluene,Hexane,"Hydrocarbons, chlorinated","Hydrocarbons, aliphatic, unsaturated","Methane, monochloro-, R-40",Pentane,Boron,Thorium-228
0,DE,0.005945,9.7e-05,0.002258,-4.8e-05,1.447721,0.402466,-0.002538,0.000164,0.057909,...,3e-06,1.6e-05,1.052011e-06,0.0,0.0,2e-06,0.0,1.418767e-06,1e-06,1.100268e-06
1,BG,0.00201,,0.00073,-9.3e-05,3.396226,0.743774,-0.127019,0.000289,0.101887,...,0.00027,0.001177,1.129245e-06,9.424528e-09,2.835849e-08,2e-06,7.420755e-08,1.477358e-06,7e-06,5.603774e-05
2,RU,0.00201,,0.00073,-4.7e-05,1.710214,0.374537,-0.063962,0.000145,0.051306,...,0.000136,0.000593,5.686461e-07,4.745843e-09,1.428029e-08,1e-06,3.736817e-08,7.43943e-07,3e-06,2.821853e-05
3,EE,0.00201,,0.00073,-5.5e-05,2.005571,0.43922,-0.075008,0.00017,0.060167,...,0.000159,0.000695,6.668524e-07,5.56546e-09,1.674652e-08,1e-06,4.382173e-08,8.724234e-07,4e-06,3.309192e-05
4,AU,0.000362,,0.000222,-6e-05,2.175227,0.476375,-0.081353,0.000185,0.065257,...,0.000173,0.000754,7.232628e-07,6.036254e-09,1.816314e-08,1e-06,4.75287e-08,9.462236e-07,4e-06,3.589124e-05
5,LV,0.00201,,0.00073,-4.7e-05,1.710214,0.374537,-0.063962,0.000145,0.051306,...,0.000136,0.000593,5.686461e-07,4.745843e-09,1.428029e-08,1e-06,3.736817e-08,7.43943e-07,3e-06,2.821853e-05
6,CZ,0.004646,0.000108,,-5.4e-05,1.616766,0.487186,-0.061976,0.000183,0.064671,...,1e-06,6e-06,1.17485e-06,,,2e-06,,1.584431e-06,3e-06,9.129341e-07
