In [11]:
import wurst
import bw2data as bd
import pandas as pd

In [2]:
# Make "ei 3.8 cutoff" the current Brightway project for the cells below.
bd.projects.set_current("ei 3.8 cutoff")

In [3]:
# Extract the "ecoinvent 3.8 cutoff" database with wurst into `data`.
# NOTE(review): add_identifiers=True is presumably what supplies the 'id' keys
# that as_naive_df reads from activities and exchanges — confirm against wurst docs.
data = wurst.extract_brightway2_databases(["ecoinvent 3.8 cutoff"], add_identifiers=True)

Getting activity data


100%|██████████████████████████████████████████| 19565/19565 [00:00<00:00, 122014.46it/s]


Adding exchange data to activities


100%|█████████████████████████████████████████| 629959/629959 [00:46<00:00, 13465.05it/s]


Filling out exchange data


100%|████████████████████████████████████████████| 19565/19565 [00:02<00:00, 6933.41it/s]


In [4]:
# Show the first extracted activity to see which keys an activity and its exchanges carry.
data[0]

{'classifications': [('ISIC rev.4 ecoinvent',
   '3510:Electric power generation, transmission and distribution'),
  ('CPC', '17100: Electrical energy')],
 'comment': 'This dataset changes the names of the (internally used) electricity product of waste incineration and connects it with the external grid and the respective average energy markets (grid electricity).\nTime period:  The Annual Production volume is valid for the year 2012.',
 'location': 'GR',
 'database': 'ecoinvent 3.8 cutoff',
 'code': '00014e7e2dd160027166b7274d58b7cc',
 'name': 'electricity, from municipal waste incineration to generic market for electricity, medium voltage',
 'reference product': 'electricity, medium voltage',
 'unit': 'kilowatt hour',
 'exchanges': [{'uncertainty type': 0,
   'loc': 1.0,
   'amount': 1.0,
   'type': 'production',
   'production volume': 60000000.0,
   'product': 'electricity, medium voltage',
   'name': 'electricity, from municipal waste incineration to generic market for electricity

In [22]:
def as_naive_df(data, categorical=True):
    """Flatten activity datasets into a DataFrame with one row per exchange.

    Each row combines the consuming activity ("target_*" columns), the
    exchange's input ("source_*" columns) and the exchange itself
    ("edge_*" columns).

    Parameters
    ----------
    data : iterable of dict
        Activity datasets. Each must provide ``'exchanges'`` (an iterable of
        exchange dicts); activities that have exchanges must also provide
        ``'id'``, ``'database'`` and ``'code'``. ``'name'``,
        ``'reference product'``, ``'location'``, ``'unit'`` and ``'type'`` are
        optional (``'type'`` falls back to ``'process'``). Each exchange must
        provide ``'id'``, ``'database'``, ``'code'``, ``'amount'`` and
        ``'type'``; ``'name'``, ``'product'``, ``'location'``, ``'unit'`` and
        ``'categories'`` are optional.
    categorical : bool, default True
        If true, cast the repetitive label columns to the pandas ``category``
        dtype. ``target_id``, ``target_code``, ``source_id``, ``edge_amount``
        and ``edge_type`` keep the dtype pandas infers for them.

    Returns
    -------
    pandas.DataFrame
        One row per exchange. The 18 columns are always present, even when
        ``data`` yields no exchanges at all.

    Raises
    ------
    KeyError
        If a required key is missing from an activity or an exchange.
    """
    columns = [
        "target_id",
        "target_database",
        "target_code",
        "target_activity",
        "target_reference_product",
        "target_location",
        "target_unit",
        "target_type",
        "source_id",
        "source_database",
        "source_code",
        "source_activity",
        "source_product",
        "source_location",
        "source_unit",
        "source_categories",
        "edge_amount",
        "edge_type",
    ]
    categorical_columns = [
        "target_database",
        "target_activity",
        "target_reference_product",
        "target_location",
        "target_unit",
        "target_type",
        "source_database",
        "source_code",
        "source_activity",
        "source_product",
        "source_location",
        "source_unit",
        "source_categories",
    ]

    rows = []

    for target in data:
        exchanges = target['exchanges']
        if not exchanges:
            # Nothing to emit; skip before touching the activity's other keys.
            continue

        # These values are the same for every exchange of this activity,
        # so look them up once instead of once per exchange.
        target_fields = {
            "target_id": target['id'],
            "target_database": target['database'],
            "target_code": target['code'],
            "target_activity": target.get('name'),
            "target_reference_product": target.get('reference product'),
            "target_location": target.get('location'),
            "target_unit": target.get('unit'),
            "target_type": target.get('type', 'process'),
        }

        for edge in exchanges:
            rows.append({
                **target_fields,
                "source_id": edge['id'],
                "source_database": edge['database'],
                "source_code": edge['code'],
                "source_activity": edge.get('name'),
                "source_product": edge.get('product'),
                "source_location": edge.get('location'),
                "source_unit": edge.get('unit'),
                # Exchanges without categories get an empty string.
                "source_categories": "::".join(edge.get('categories', ('',))),
                "edge_amount": edge['amount'],
                "edge_type": edge['type'],
            })

    # Naming the columns explicitly keeps the schema intact when there are no
    # rows; otherwise the categorical cast below would fail on missing columns.
    df = pd.DataFrame(rows, columns=columns)

    if categorical:
        df = df.astype({column: "category" for column in categorical_columns})

    return df

In [25]:
# Time the conversion including the cast to categorical columns (categorical=True is the default).
%timeit as_naive_df(data)

5.85 s ± 79.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [26]:
# Time the conversion without the categorical cast, for comparison with the run above.
%timeit as_naive_df(data, False)

3.71 s ± 69.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [27]:
# Build both variants once so their memory footprint and dtypes can be compared below:
# df_compressed uses categorical columns, df_full keeps the dtypes pandas infers.
df_compressed = as_naive_df(data)
df_full = as_naive_df(data, False)

Memory usage of the two DataFrames in MB (bytes / 1024²), categorical version first, then the plain version:

In [28]:
# Sum the per-column memory_usage() of each frame (bytes) and convert to MB via 1024 ** 2.
# NOTE(review): memory_usage() defaults to deep=False, which does not count the string
# contents held by object-dtype columns, so the figure for df_full is likely an
# underestimate — consider memory_usage(deep=True) for a like-for-like comparison.
df_compressed.memory_usage().sum() / 1024 ** 2, df_full.memory_usage().sum() / 1024 ** 2

(37.66990280151367, 86.51182556152344)

In [29]:
# List the column dtypes to see which columns were converted to `category`.
df_compressed.dtypes

target_id                      int64
target_database             category
target_code                   object
target_activity             category
target_reference_product    category
target_location             category
target_unit                 category
target_type                 category
source_id                      int64
source_database             category
source_code                 category
source_activity             category
source_product              category
source_location             category
source_unit                 category
source_categories           category
edge_amount                  float64
edge_type                     object
dtype: object