### 0.0 follow setup instructions

ℹ️ use [`pylcaio.yml`](https://github.com/michaelweinold/config_conda/blob/main/pylcaio.yml) to set up working conda environment.

### 0.1. imports
#### 0.1.1. regular imports

In [1]:
# i/o
import sys
import os
from pathlib import Path
import gzip
import pickle
import git
import json
# os specific settings
import platform
# configuration
import yaml
# lca
import ecospold2matrix as e2m
import pymrio
#import brightway2 as bw
# type hints
from ecospold2matrix import ecospold2matrix
from pymrio import IOSystem
# data science
import pandas as pd
import numpy as np
# deep copy
import copy

#### 0.1.2. load configuration file

In [2]:
with open('../config.yaml', 'r') as filestream:
    config = yaml.load(filestream, Loader = yaml.FullLoader)

#### 0.1.3. local imports

In [3]:
sys.path.append(os.path.join(Path.home(), config['pylcaio'])) # required for local import of pylcaio
import pylcaio

In [4]:
from hybridize_preparation_functions import (
    identify_rest_of_world_regions,
    identify_rows
)

### 0.2. file paths
#### 0.2.1. directories

In [5]:
%%capture
# home directory
print(path_dir_home := Path.home())
print(path_dir_repo := git.Repo('.', search_parent_directories=True).working_tree_dir)
# input directory
print(path_dir_databases := os.path.join(path_dir_home, config['path_dir_databases']))
# output directories
print(path_dir_data := os.path.join(path_dir_home, config['path_dir_data']))
print(path_dir_pylcaio := os.path.join(path_dir_home, path_dir_data, config['path_dir_pylcaio']))
print(path_dir_pymrio := os.path.join(path_dir_home, path_dir_data, config['path_dir_pymrio']))
print(path_dir_e2m := os.path.join(path_dir_home, path_dir_data, config['path_dir_e2m']))

#### 0.2.2. files

In [6]:
%%capture
# databases
print(path_exiobase := os.path.join(path_dir_home, path_dir_databases, config['exiobase']))
print(path_dir_ecoinvent := os.path.join(path_dir_home, path_dir_databases, config['ecoinvent']))
# pylcaio output
print(path_pylcaio_database_loader_class_instance := os.path.join(path_dir_pylcaio, config['pylcaio_database_loader_class_instance']))
print(path_pylcaio_class_instance_before_hybrid := os.path.join(path_dir_pylcaio, config['pylcaio_class_instance_before_hybrid']))
print(path_pylcaio_class_instance_after_hybrid := os.path.join(path_dir_pylcaio, config['pylcaio_class_instance_after_hybrid']))
# pymrio output
print(path_pymrio_class_instance := os.path.join(path_dir_pymrio, config['pymrio_class_instance']))
# e2m output
print(e2m_project_name := config['e2m_project_name'])
print(path_file_e2m_pickle := os.path.join(path_dir_e2m, e2m_project_name + config['e2m_pickle_filename']))

In [7]:
%%capture
print(path_dict_io_countries_per_lca_region := os.path.join(path_dir_repo, config['path_dict_io_countries_per_lca_region']))
print(path_dict_io_countries_list_per_lca_region := os.path.join(path_dir_repo, config['path_dict_io_countries_list_per_lca_region']))
print(path_list_io_countries_and_regions := os.path.join(path_dir_repo, config['path_list_io_countries_and_regions']))
print(path_list_io_countries := os.path.join(path_dir_repo, config['path_list_io_countries']))
print(path_list_electricity_prices := os.path.join(path_dir_repo, config['path_list_electricity_prices']))

In [8]:
with open(file = path_dict_io_countries_per_lca_region, mode = 'r', encoding = 'utf-8') as filestream:
    dict_io_countries_per_lca_region: dict = json.load(fp = filestream)
with open(file = path_dict_io_countries_list_per_lca_region, mode = 'r', encoding = 'utf-8') as filestream:
    dict_io_countries_list_per_lca_region: dict = json.load(fp = filestream)
with open(file = path_list_io_countries_and_regions, mode = 'r', encoding = 'utf-8') as filestream:
    list_io_countries_and_regions: list = json.load(fp = filestream)
with open(file = path_list_io_countries, mode = 'r', encoding = 'utf-8') as filestream:
    list_io_countries: list = json.load(fp = filestream)
with open(file = path_list_electricity_prices, mode = 'r', encoding = 'utf-8') as filestream:
    electricity_prices: pd.DataFrame = pd.read_csv(
        filestream,
        header = 'infer',
        sep = ';',
        decimal = '.'
    )

In [9]:
with open(path_pylcaio_class_instance_before_hybrid, 'rb') as file_in:
    pylcaio_object_before_hybrid: pylcaio.LCAIO = pd.read_pickle(file_in)
PRO_f = pylcaio_object_before_hybrid.PRO_f

#### 1.1. function implementation

In [10]:
%%capture
from pathlib import Path
%%capture
# home directory
print(path_dir_home := Path.home())
# input directory
print(path_dir_databases := os.path.join(path_dir_home, config['path_dir_databases']))
# output directories
print(path_dir_data := os.path.join(path_dir_home, config['path_dir_data']))
print(path_dir_pylcaio := os.path.join(path_dir_home, path_dir_data, config['path_dir_pylcaio']))
print(path_dir_pymrio := os.path.join(path_dir_home, path_dir_data, config['path_dir_pymrio']))
print(path_dir_e2m := os.path.join(path_dir_home, path_dir_data, config['path_dir_e2m']))
print(path_dir_home := Path.home())
print(path_exiobase := os.path.join(path_dir_home, path_dir_databases, config['exiobase']))
print(path_dir_ecoinvent := os.path.join(path_dir_home, path_dir_databases, config['ecoinvent']))
# pylcaio output
print(path_pylcaio_database_loader_class_instance := os.path.join(path_dir_pylcaio, config['pylcaio_database_loader_class_instance']))
print(path_pylcaio_class_instance_before_hybrid := os.path.join(path_dir_pylcaio, config['pylcaio_class_instance_before_hybrid']))
print(path_pylcaio_class_instance_after_hybrid := os.path.join(path_dir_pylcaio, config['pylcaio_class_instance_after_hybrid']))
# pymrio output
print(path_pymrio_class_instance := os.path.join(path_dir_pymrio, config['pymrio_class_instance']))
# e2m output
print(e2m_project_name := config['e2m_project_name'])
print(path_file_e2m_pickle := os.path.join(path_dir_e2m, e2m_project_name + config['e2m_pickle_filename']))
with open(path_pymrio_class_instance, 'rb') as file_in:
    exiobase: pymrio.core.mriosystem.IOSystem = pd.read_pickle(file_in)

In [11]:
with open(path_pymrio_class_instance, 'rb') as file_in:
    exiobase: pymrio.core.mriosystem.IOSystem = pd.read_pickle(file_in)

In [12]:
%%timeit
df_test = identify_rest_of_world_regions(df_in = PRO_f, list_io_countries=list_io_countries, dict_io_countries_per_lca_region=dict_io_countries_per_lca_region)

68.1 ms ± 1.02 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [13]:
%%timeit
df_test = identify_rows(pylcaio_object_before_hybrid)

281 ms ± 7.89 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


PRO_f dataframe:

| product | product_group | location |
| ----- | -------------- | ------------ |
| 10 | 1 | RoW |
| 11 | 1 | US |
| 12 | 1 | CA |
| 13 | 2 | RoW |
| 14 | 2 | JP |
| 15 | 2 | US |
| 16 | 3 | FR |
| 17 | 3 | BE |
| 18 | 4 | RoW |
| 19 | 4 | US |
| 20 | 4 | CA |

should look like:

list_locations = ['US', 'CA', 'JP', 'BE', 'FR']

| product_group | location_list | rest_of_world_location_list | rest_of_world_index |
| -------------- | ----------------- | ---------- | ----|
| 1 | RoW, US, CA | JP, BE, FR | RoW_1 |
| 2 | RoW, JP, US | CA, BE, FR | RoW_2 |
| 4 | RoW, US, CA | CA, BE, FR | RoW_1 |

where for product_group == 1, RoW region is list_io_countries - [CH, AT]


In [20]:
lca_activities = PRO_f
lca_master_activities_rest_of_world: pd.DataFrame = lca_activities[lca_activities['io_geography'] == 'RoW']['activityNameId']
lca_activities_rest_of_world: pd.DataFrame = lca_activities[lca_activities['activityNameId'].isin(lca_master_activities_rest_of_world)]

In [21]:
lca_activities_rest_of_world

Unnamed: 0,activityId,productId,activityName,ISIC,price,priceUnit,EcoSpoldCategory,geography,technologyLevel,macroEconomicScenario,...,cpc,dry mass [kg]_y,wet mass [kg]_y,activityNameId,activityType,startDate,endDate,activityName_duplicate,io_geography,ProductTypeName
a96cb241-a4a9-4980-a16a-ba4b6a80175e_aeaf5266-3f9c-4074-bd34-eba76a61760c,a96cb241-a4a9-4980-a16a-ba4b6a80175e,aeaf5266-3f9c-4074-bd34-eba76a61760c,"barley grain, feed production","0111:Growing of cereals (except rice), legumin...",0.12400,EUR2005,agricultural means of production/feed,CA-QC,Current,Business-as-Usual,...,23319: Preparations used in animal feeding n.e.c.,0.86,1,2816d5b0-50fd-42fd-9bc9-aed0bd1913e7,0,1996-01-01,2018-12-31,"barley grain, feed production",CA,Cereal grains nec
6885fd40-ff73-40a4-8f71-225577ec684e_aeaf5266-3f9c-4074-bd34-eba76a61760c,6885fd40-ff73-40a4-8f71-225577ec684e,aeaf5266-3f9c-4074-bd34-eba76a61760c,"barley grain, feed production","0111:Growing of cereals (except rice), legumin...",0.12400,EUR2005,agricultural means of production/feed,RoW,Current,Business-as-Usual,...,23319: Preparations used in animal feeding n.e.c.,0.86,1,2816d5b0-50fd-42fd-9bc9-aed0bd1913e7,0,1996-01-01,2018-12-31,"barley grain, feed production",RoW,Cereal grains nec
92068396-88c7-45ed-9008-622008a299f3_0d860eb4-1a25-41b4-a821-81f5726d86e5,92068396-88c7-45ed-9008-622008a299f3,0d860eb4-1a25-41b4-a821-81f5726d86e5,"barley grain, feed production, organic","0111:Growing of cereals (except rice), legumin...",0.15900,EUR2005,agricultural means of production/feed,CH,Current,Business-as-Usual,...,23319: Preparations used in animal feeding n.e.c.,0.85,1,6a49a0c6-c114-4471-b6c4-8aa4006e4b0e,0,1996-01-01,2018-12-31,"barley grain, feed production, organic",CH,Cereal grains nec
35ddb020-9812-4808-bdfb-6845a454a73c_0d860eb4-1a25-41b4-a821-81f5726d86e5,35ddb020-9812-4808-bdfb-6845a454a73c,0d860eb4-1a25-41b4-a821-81f5726d86e5,"barley grain, feed production, organic","0111:Growing of cereals (except rice), legumin...",0.15900,EUR2005,agricultural means of production/feed,RoW,Current,Business-as-Usual,...,23319: Preparations used in animal feeding n.e.c.,0.85,1,6a49a0c6-c114-4471-b6c4-8aa4006e4b0e,0,1996-01-01,2018-12-31,"barley grain, feed production, organic",RoW,Cereal grains nec
558a3696-1009-44a7-8753-da7217f46c77_3f6dada9-2497-4e1c-9e1b-eabafa6920f8,558a3696-1009-44a7-8753-da7217f46c77,3f6dada9-2497-4e1c-9e1b-eabafa6920f8,barley production,"0111:Growing of cereals (except rice), legumin...",0.11700,EUR2005,,ES,Current,Business-as-Usual,...,"01152: Barley, other",0.86,1,aaef8a55-0d7b-47a4-a207-b601959682b7,0,2000-01-01,2018-12-31,barley production,ES,Cereal grains nec
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7ebbe8ae-04e2-4542-a005-1565a30fa587_8691615f-9152-44db-b3a1-98d7f87ae7a1,7ebbe8ae-04e2-4542-a005-1565a30fa587,8691615f-9152-44db-b3a1-98d7f87ae7a1,"packing, fibre cement product",8292:Packaging activities,0.00661,EUR2005,construction materials/others,RoW,Current,Business-as-Usual,...,85400: Packaging services,,,5bf2bfa9-559f-400f-8fab-80d6ab55860b,0,2000-01-01,2018-12-31,"packing, fibre cement product",RoW,
befb2329-0397-4bf2-8c6a-36b08cc2416a_7c2f8db1-1d7f-4528-a232-8203587d4a4d,befb2329-0397-4bf2-8c6a-36b08cc2416a,7c2f8db1-1d7f-4528-a232-8203587d4a4d,"packing, lime product",8292:Packaging activities,0.00423,EUR2005,construction materials/others,RoW,Current,Business-as-Usual,...,85400: Packaging services,,,604af3bb-f700-41e6-a743-f4179f91ee37,0,2000-01-01,2018-12-31,"packing, lime product",RoW,
aeb56927-2306-4f9d-a2d5-b5647c0a9b77_7c2f8db1-1d7f-4528-a232-8203587d4a4d,aeb56927-2306-4f9d-a2d5-b5647c0a9b77,7c2f8db1-1d7f-4528-a232-8203587d4a4d,"packing, lime product",8292:Packaging activities,0.00423,EUR2005,construction materials/others,CH,Current,Business-as-Usual,...,85400: Packaging services,,,604af3bb-f700-41e6-a743-f4179f91ee37,0,2000-01-01,2018-12-31,"packing, lime product",CH,
fd4f76d0-34a7-4816-b8ae-537ff8cdaf95_4550c693-37f5-465b-99d1-3f3497380dfb,fd4f76d0-34a7-4816-b8ae-537ff8cdaf95,4550c693-37f5-465b-99d1-3f3497380dfb,"maintenance, bicycle",9529:Repair of other personal and household goods,4.64000,EUR2005,transport systems/road,CH,Current,Business-as-Usual,...,87290: Maintenance and repair services of othe...,,,a072b8b7-6d16-4e3a-9846-8c0204e6fbe6,0,2007-01-01,2018-12-31,"maintenance, bicycle",CH,


In [35]:
def new_identify_rest_of_world_regions(
    lca_activities_in: pd.DataFrame,
    dict_io_countries_per_lca_region: dict,
    list_io_countries: list
) -> pd.DataFrame:

    lca_activities = lca_activities_in.copy()

    lca_master_activities_rest_of_world: pd.DataFrame = lca_activities[lca_activities['io_geography'] == 'RoW']['activityNameId']
    lca_activities_rest_of_world: pd.DataFrame = lca_activities[lca_activities['activityNameId'].isin(lca_master_activities_rest_of_world)]

    # replace LCA regions (eg. 'Europe without Switzerland') with the associated IO countries (eg. '"AT", "BE", "BG", ...')
    lca_activities_rest_of_world['io_geography'] = lca_activities_rest_of_world['io_geography'].map(dict_io_countries_per_lca_region).fillna(lca_activities_rest_of_world['io_geography'])
    agg_lca_activities_rest_of_world = pd.DataFrame(lca_activities_rest_of_world.groupby('activityNameId')['io_geography'].apply(tuple)).reset_index()

    series_rest_of_world_regions = agg_lca_activities_rest_of_world.apply(
        lambda row: tuple(set(list_io_countries) - set(row['io_geography']) - set(['RoW'])),  # for performance, compare https://stackoverflow.com/a/53343046
        axis = 1
    ).unique()

    rest_of_world_regions = pd.DataFrame(
        data = series_rest_of_world_regions,
        columns = ['rest_of_world_region']
    )
    rest_of_world_regions['rest_of_world_region_index'] = rest_of_world_regions['rest_of_world_region'].factorize()[0]
    rest_of_world_regions['rest_of_world_region_index'] = 'RoW_' + rest_of_world_regions['rest_of_world_region_index'].astype(str)

    return rest_of_world_regions

In [23]:
test = identify_rest_of_world_regions(df_in=PRO_f, dict_io_countries_per_lca_region=dict_io_countries_per_lca_region, list_io_countries=list_io_countries)

In [36]:
test = new_identify_rest_of_world_regions(lca_activities_in=PRO_f, dict_io_countries_per_lca_region=dict_io_countries_per_lca_region, list_io_countries=list_io_countries)

In [37]:
test

Unnamed: 0,rest_of_world_region,rest_of_world_region_index
0,"(LU, IT, BE, CY, CN, KR, HU, NO, CA, NL, US, G...",RoW_0
1,"(LU, IT, LT, BE, CY, CN, TR, KR, HU, TW, NO, C...",RoW_1
2,"(LU, IT, LT, BE, CY, CN, TR, KR, HU, TW, MT, N...",RoW_2
3,"(LU, IT, LT, BE, CY, CN, TR, KR, HU, TW, NO, C...",RoW_3
4,"(LU, IT, LT, BE, CY, CN, TR, KR, HU, TW, CA, U...",RoW_4
...,...,...
139,"(NO, IE, BR, IN, EE, LV)",RoW_139
140,"(EE, NO)",RoW_140
141,"(LU, ZA, CH, SE, DK, LT, CY, EE, LV, PL, MT)",RoW_141
142,"(LU, PT, US, SI, CH, GR, DK, IN, LT, ID, CY, Z...",RoW_142


In [67]:
def test_identify_rest_of_world_regions(
    df_in: pd.DataFrame,
    list_io_countries: list,
    dict_io_countries_per_lca_region: dict
) -> pd.DataFrame:
    # Identify the rest of the world regions

    # create dataframe copy to ensure the input dataframe is not manipulated (else repeated exection will likely fail)
    df = df_in.copy()

    # get array of those master activities (key 'activityNameId') where one defined geography is 'RoW'
    master_activities_with_rest_of_world_geography = pd.DataFrame(df[df['io_geography'] == 'RoW']['activityNameId'])
    # copy dataframe index (='index') to column for merging later
    master_activities_with_rest_of_world_geography['index'] = master_activities_with_rest_of_world_geography.index
    
    # replace LCA regions (eg. 'Europe without Switzerland') with the associated IO countries (eg. '"AT", "BE", "BG", ...')
    df['io_geography'] = df['io_geography'].map(dict_io_countries_per_lca_region).fillna(df['io_geography'])

    # remove all activities associated with master activities for which no defined geography is 'RoW'
    df_rest_of_world = df[df['activityNameId'].isin(master_activities_with_rest_of_world_geography['activityNameId'])]
    # remove all activities where 'io_geography' == 'RoW' to ensure 'RoW' does not appear in aggregation
    df_rest_of_world = df_rest_of_world.drop(master_activities_with_rest_of_world_geography['index'])

    # aggregate by master activities (key 'activityNameId')
    df_rest_of_world_agg = pd.DataFrame(data = df_rest_of_world.groupby('activityNameId')['io_geography'].apply(tuple))
    # copy dataframe index (='activityNameId') to column for merging later
    df_rest_of_world_agg.reset_index(inplace = True)
    # add activity index column to aggregate dataframe for dataframe update later
    df_rest_of_world_agg = df_rest_of_world_agg.merge(
        right = master_activities_with_rest_of_world_geography,
        how = 'left',
        on = 'activityNameId'
    )
    # set activity index column as dataframe index for dataframe update later
    df_rest_of_world_agg.set_index(keys = 'index', inplace = True)

    # calculate 'rest of world' region per activity
    df_rest_of_world_agg['io_geography_rest_of_world'] = df_rest_of_world_agg.apply(
        lambda row: tuple(set(list_io_countries) - set(row['io_geography'])), #https://stackoverflow.com/a/53343046
        axis = 1
    )
    # remove activities where 'rest of world' region is null because all countries are described
    df_rest_of_world_agg = df_rest_of_world_agg.dropna(subset ='io_geography_rest_of_world')

    # enumerate unique rest of world regions
    df_rest_of_world_agg['io_geography'] = pd.factorize(df_rest_of_world_agg['io_geography_rest_of_world'])[0]
    df_rest_of_world_agg['io_geography'] = 'RoW_' + df_rest_of_world_agg['io_geography'].astype(str)
    
    df.update(
        other = df_rest_of_world_agg['io_geography'],
        overwrite = True
    )

    return df

In [60]:
def calculate_productions_per_lca_region(
    io_x_in: pd.DataFrame,
    dict_io_countries_list_per_lca_regions: dict
) -> pd.DataFrame:

    io_x = io_x_in.reset_index().copy()
    io_x = io_x.rename(columns={"region": "io_region"})

    df_io_countries_per_lca_region = pd.DataFrame(
        data = dict_io_countries_list_per_lca_regions.items(),
        columns = ['lca_region', 'io_countries_list']
    )

    df_io_countries_per_lca_region = df_io_countries_per_lca_region.assign(
        io_country = df_io_countries_per_lca_region['io_countries_list']
    ).explode('io_country').drop(columns = ['io_countries_list'])
    
    io_x = io_x.merge(
        right = df_io_countries_per_lca_region,
        left_on = 'io_region',
        right_on = 'io_country',
        how = 'left'
    )

    io_x_per_lca_regions = io_x.groupby(['lca_region', 'sector'])['indout'].sum()
    io_x_per_lca_regions = io_x_per_lca_regions.unstack('lca_region', fill_value=0)

    return io_x_per_lca_regions