### 0.0 follow setup instructions

ℹ️ Use [`pylcaio.yml`](https://github.com/michaelweinold/config_conda/blob/main/pylcaio.yml) to set up a working conda environment.

### 0.1. imports
#### 0.1.1. regular imports

In [1]:
# i/o
import sys
import os
from pathlib import Path
import gzip
import pickle
import git
import json
# os specific settings
import platform
# configuration
import yaml
# lca
import ecospold2matrix as e2m
import pymrio
#import brightway2 as bw
# type hints
from ecospold2matrix import ecospold2matrix
from pymrio import IOSystem
# data science
import pandas as pd
import numpy as np
# deep copy
import copy

#### 0.1.2. load configuration file

In [2]:
# parse the notebook configuration from the YAML file one directory above the notebook
with open('../config.yaml', mode = 'r') as config_stream:
    config = yaml.full_load(config_stream)

#### 0.1.3. local imports

In [3]:
sys.path.append(os.path.join(Path.home(), config['pylcaio'])) # required for local import of pylcaio
import pylcaio

In [4]:
from hybridize_preparation_functions import (
    identify_rest_of_world_regions,
    identify_rows
)

### 0.2. file paths
#### 0.2.1. directories

In [5]:
%%capture
# Bind all directory paths used by the notebook. Every path is assigned with the
# walrus operator inside print(); the cell magic on the first line captures the
# printed output, so the cell is run only for its assignments.
# home directory
print(path_dir_home := Path.home())
# root of the working tree of the git repository that contains the current directory
print(path_dir_repo := git.Repo('.', search_parent_directories=True).working_tree_dir)
# input directory
print(path_dir_databases := os.path.join(path_dir_home, config['path_dir_databases']))
# output directories
print(path_dir_data := os.path.join(path_dir_home, config['path_dir_data']))
# NOTE(review): path_dir_data already starts with path_dir_home; os.path.join discards
# everything before an absolute component, so the leading path_dir_home below is redundant
print(path_dir_pylcaio := os.path.join(path_dir_home, path_dir_data, config['path_dir_pylcaio']))
print(path_dir_pymrio := os.path.join(path_dir_home, path_dir_data, config['path_dir_pymrio']))
print(path_dir_e2m := os.path.join(path_dir_home, path_dir_data, config['path_dir_e2m']))

#### 0.2.2. files

In [6]:
%%capture
# Bind the paths of all input databases and output files; the file and folder
# names come from the configuration, the directories from the previous section.
# databases
# NOTE(review): path_dir_databases was itself built from path_dir_home, so the leading
# path_dir_home in the two joins below is redundant
print(path_exiobase := os.path.join(path_dir_home, path_dir_databases, config['exiobase']))
print(path_dir_ecoinvent := os.path.join(path_dir_home, path_dir_databases, config['ecoinvent']))
# pylcaio output
print(path_pylcaio_database_loader_class_instance := os.path.join(path_dir_pylcaio, config['pylcaio_database_loader_class_instance']))
print(path_pylcaio_class_instance_before_hybrid := os.path.join(path_dir_pylcaio, config['pylcaio_class_instance_before_hybrid']))
print(path_pylcaio_class_instance_after_hybrid := os.path.join(path_dir_pylcaio, config['pylcaio_class_instance_after_hybrid']))
# pymrio output
print(path_pymrio_class_instance := os.path.join(path_dir_pymrio, config['pymrio_class_instance']))
# e2m output
print(e2m_project_name := config['e2m_project_name'])
# the pickle file name is the project name with the configured file name appended (plain string concatenation)
print(path_file_e2m_pickle := os.path.join(path_dir_e2m, e2m_project_name + config['e2m_pickle_filename']))

In [7]:
%%capture
# Paths of the lookup files, resolved relative to the repository root (path_dir_repo);
# the three JSON files and the electricity price CSV are read in the next cell.
print(path_dict_io_countries_per_lca_region := os.path.join(path_dir_repo, config['path_dict_io_countries_per_lca_region']))
print(path_list_io_countries_and_regions := os.path.join(path_dir_repo, config['path_list_io_countries_and_regions']))
print(path_list_io_countries := os.path.join(path_dir_repo, config['path_list_io_countries']))
print(path_list_electricity_prices := os.path.join(path_dir_repo, config['path_list_electricity_prices']))

In [8]:
# JSON lookup tables: LCA region -> IO countries, and the two IO country/region lists
dict_io_countries_per_lca_region: dict = json.loads(
    Path(path_dict_io_countries_per_lca_region).read_text(encoding = 'utf-8')
)
list_io_countries_and_regions: list = json.loads(
    Path(path_list_io_countries_and_regions).read_text(encoding = 'utf-8')
)
list_io_countries: list = json.loads(
    Path(path_list_io_countries).read_text(encoding = 'utf-8')
)
# electricity prices: semicolon-separated table using '.' as the decimal mark
with open(file = path_list_electricity_prices, mode = 'r', encoding = 'utf-8') as csv_stream:
    electricity_prices: pd.DataFrame = pd.read_csv(
        csv_stream,
        sep = ';',
        decimal = '.',
        header = 'infer'
    )

In [9]:
# restore the pickled pylcaio object that was saved before hybridization
with open(path_pylcaio_class_instance_before_hybrid, mode = 'rb') as pickle_stream:
    pylcaio_object_before_hybrid: pylcaio.LCAIO = pd.read_pickle(pickle_stream)
# shortcut to the PRO_f dataframe of that object, used by the cells below
PRO_f = pylcaio_object_before_hybrid.PRO_f

#### 1.1. function implementation

In [10]:
%%capture
from pathlib import Path
%%capture
# home directory
print(path_dir_home := Path.home())
# input directory
print(path_dir_databases := os.path.join(path_dir_home, config['path_dir_databases']))
# output directories
print(path_dir_data := os.path.join(path_dir_home, config['path_dir_data']))
print(path_dir_pylcaio := os.path.join(path_dir_home, path_dir_data, config['path_dir_pylcaio']))
print(path_dir_pymrio := os.path.join(path_dir_home, path_dir_data, config['path_dir_pymrio']))
print(path_dir_e2m := os.path.join(path_dir_home, path_dir_data, config['path_dir_e2m']))
print(path_dir_home := Path.home())
print(path_exiobase := os.path.join(path_dir_home, path_dir_databases, config['exiobase']))
print(path_dir_ecoinvent := os.path.join(path_dir_home, path_dir_databases, config['ecoinvent']))
# pylcaio output
print(path_pylcaio_database_loader_class_instance := os.path.join(path_dir_pylcaio, config['pylcaio_database_loader_class_instance']))
print(path_pylcaio_class_instance_before_hybrid := os.path.join(path_dir_pylcaio, config['pylcaio_class_instance_before_hybrid']))
print(path_pylcaio_class_instance_after_hybrid := os.path.join(path_dir_pylcaio, config['pylcaio_class_instance_after_hybrid']))
# pymrio output
print(path_pymrio_class_instance := os.path.join(path_dir_pymrio, config['pymrio_class_instance']))
# e2m output
print(e2m_project_name := config['e2m_project_name'])
print(path_file_e2m_pickle := os.path.join(path_dir_e2m, e2m_project_name + config['e2m_pickle_filename']))
with open(path_pymrio_class_instance, 'rb') as file_in:
    exiobase: pymrio.core.mriosystem.IOSystem = pd.read_pickle(file_in)

In [11]:
# load the pickled pymrio IOSystem (Exiobase) from disk
with open(path_pymrio_class_instance, mode = 'rb') as pickle_stream:
    exiobase: pymrio.core.mriosystem.IOSystem = pd.read_pickle(pickle_stream)

In [12]:
df = exiobase.x

In [16]:
dict_io_countries_per_lca_region.items()

dict_items([('Europe without Switzerland', 'AT, BE, BG, CY, CZ, DE, DK, EE, ES, FI, FR, GR, HR, HU, IE, IT, LT, LU, LV, MT, NL, PL, PT, RO, SE, SI, SK, GB, NO'), ('Europe without Austria', 'BE, BG, CY, CZ, DE, DK, EE, ES, FI, FR, GR, HR, HU, IE, IT, LT, LU, LV, MT, NL, PL, PT, RO, SE, SI, SK, GB, CH, NO'), ('Europe without Switzerland and Austria', 'BE, BG, CY, CZ, DE, DK, EE, ES, FI, FR, GR, HR, HU, IE, IT, LT, LU, LV, MT, NL, PL, PT, RO, SE, SI, SK, GB, NO'), ('GLO', 'AT, BE, BG, CY, CZ, DE, DK, EE, ES, FI, FR, GR, HR, HU, IE, IT, LT, LU, LV, MT, NL, PL, PT, RO, SE, SI, SK, GB, US, JP, CN, CA, KR, BR, IN, MX, RU, AU, CH, TR, TW, NO, ID, ZA'), ('IAI Area, Asia, without China and GCC', 'JP, KR, ID, IN, TW'), ('IAI Area, EU27 & EFTA', 'AT, BE, BG, CY, CZ, DE, DK, EE, ES, FI, FR, GR, HU, IE, IT, LT, LU, LV, MT, NL, PL, PT, RO, SE, SI, SK, CH, NO'), ('IAI Area, Gulf Cooperation Council', 'WM'), ('IAI Area, Russia & RER w/o EU27 & EFTA', 'RU, TR, GB'), ('NORDEL', 'DK, FI, NO, SE'), ('RAF',

In [28]:
dict_io_countries_per_lca_region

{'Europe without Switzerland': 'AT, BE, BG, CY, CZ, DE, DK, EE, ES, FI, FR, GR, HR, HU, IE, IT, LT, LU, LV, MT, NL, PL, PT, RO, SE, SI, SK, GB, NO',
 'Europe without Austria': 'BE, BG, CY, CZ, DE, DK, EE, ES, FI, FR, GR, HR, HU, IE, IT, LT, LU, LV, MT, NL, PL, PT, RO, SE, SI, SK, GB, CH, NO',
 'Europe without Switzerland and Austria': 'BE, BG, CY, CZ, DE, DK, EE, ES, FI, FR, GR, HR, HU, IE, IT, LT, LU, LV, MT, NL, PL, PT, RO, SE, SI, SK, GB, NO',
 'GLO': 'AT, BE, BG, CY, CZ, DE, DK, EE, ES, FI, FR, GR, HR, HU, IE, IT, LT, LU, LV, MT, NL, PL, PT, RO, SE, SI, SK, GB, US, JP, CN, CA, KR, BR, IN, MX, RU, AU, CH, TR, TW, NO, ID, ZA',
 'IAI Area, Asia, without China and GCC': 'JP, KR, ID, IN, TW',
 'IAI Area, EU27 & EFTA': 'AT, BE, BG, CY, CZ, DE, DK, EE, ES, FI, FR, GR, HU, IE, IT, LT, LU, LV, MT, NL, PL, PT, RO, SE, SI, SK, CH, NO',
 'IAI Area, Gulf Cooperation Council': 'WM',
 'IAI Area, Russia & RER w/o EU27 & EFTA': 'RU, TR, GB',
 'NORDEL': 'DK, FI, NO, SE',
 'RAF': 'ZA, WF',
 'RAS': 'C

In [48]:
pd.DataFrame.from_records(
    data = list(dict_io_countries_per_lca_region.values()),
    #columns = ['region', 'country'],
    index = pd.RangeIndex(start = 0, stop = len(dict_io_countries_per_lca_region))
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0,Unnamed: 11_level_0,Unnamed: 12_level_0,Unnamed: 13_level_0,Unnamed: 14_level_0,Unnamed: 15_level_0,Unnamed: 16_level_0,Unnamed: 17_level_0,Unnamed: 18_level_0,Unnamed: 19_level_0,Unnamed: 20_level_0,21,22,23,24,25,26,27,28,29,30,...,164,165,166,167,168,169,170,171,172,173
0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1
A,T,",",,B,E,",",,B,G,",",,C,Y,",",,C,Z,",",,D,E,",",,D,K,",",,E,E,",",...,,,,,,,,,,
B,E,",",,B,G,",",,C,Y,",",,C,Z,",",,D,E,",",,D,K,",",,E,E,",",,E,S,",",...,,,,,,,,,,
B,E,",",,B,G,",",,C,Y,",",,C,Z,",",,D,E,",",,D,K,",",,E,E,",",,E,S,",",...,,,,,,,,,,
A,T,",",,B,E,",",,B,G,",",,C,Y,",",,C,Z,",",,D,E,",",,D,K,",",,E,E,",",...,N,O,",",,I,D,",",,Z,A
J,P,",",,K,R,",",,I,D,",",,I,N,",",,T,W,,,,,,,,,,,,,,...,,,,,,,,,,
A,T,",",,B,E,",",,B,G,",",,C,Y,",",,C,Z,",",,D,E,",",,D,K,",",,E,E,",",...,,,,,,,,,,
W,M,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,
R,U,",",,T,R,",",,G,B,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,
D,K,",",,F,I,",",,N,O,",",,S,E,,,,,,,,,,,,,,,,,,...,,,,,,,,,,
Z,A,",",,W,F,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,


In [43]:
pd.DataFrame(
    data = list(dict_io_countries_per_lca_region),
    columns = ['region', 'country'],
    index = pd.RangeIndex(start = 0, stop = len(dict_io_countries_per_lca_region))
)

ValueError: Shape of passed values is (21, 1), indices imply (21, 2)

In [49]:
df_io_countries_per_lca_region  = pd.DataFrame.from_dict(
    data = dict_io_countries_per_lca_region,
    orient = 'index',
    columns = ['countries']
).rename_axis('region').reset_index()

In [50]:
df_io_countries_per_lca_region

Unnamed: 0,region,countries
0,Europe without Switzerland,"AT, BE, BG, CY, CZ, DE, DK, EE, ES, FI, FR, GR..."
1,Europe without Austria,"BE, BG, CY, CZ, DE, DK, EE, ES, FI, FR, GR, HR..."
2,Europe without Switzerland and Austria,"BE, BG, CY, CZ, DE, DK, EE, ES, FI, FR, GR, HR..."
3,GLO,"AT, BE, BG, CY, CZ, DE, DK, EE, ES, FI, FR, GR..."
4,"IAI Area, Asia, without China and GCC","JP, KR, ID, IN, TW"
5,"IAI Area, EU27 & EFTA","AT, BE, BG, CY, CZ, DE, DK, EE, ES, FI, FR, GR..."
6,"IAI Area, Gulf Cooperation Council",WM
7,"IAI Area, Russia & RER w/o EU27 & EFTA","RU, TR, GB"
8,NORDEL,"DK, FI, NO, SE"
9,RAF,"ZA, WF"


In [13]:
%%timeit
df_test = identify_rest_of_world_regions(df_in = PRO_f, list_io_countries=list_io_countries, dict_io_countries_per_lca_region=dict_io_countries_per_lca_region)

69.5 ms ± 419 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [12]:
%%timeit
df_test = identify_rows(pylcaio_object_before_hybrid)

273 ms ± 4.41 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


PRO_f dataframe:

| index | activityNameId | io_geography |
| ----- | -------------- | ------------ |
| 10 | 1 | RoW |
| 11 | 1 | CH |
| 12 | 1 | AT |
| 13 | 2 | RoW |
| 14 | 2 | DE |
| 15 | 2 | CH |
| 16 | 3 | FR |
| 17 | 3 | BE |
| 18 | 4 | RoW |
| 19 | 4 | CH |
| 20 | 4 | AT |

should look like:

| activityNameId | io_geography_list | RoW_region |
| -------------- | ----------------- | ---------- |
| 1 | RoW, CH, AT | RoW(1) |
| 2 | RoW, DE, CH | RoW(2) |
| 4 | RoW, CH, AT | RoW(1) |

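where, for activityNameId == 1, the RoW region is the set difference list_io_countries − {CH, AT}. activityNameId == 3 has no RoW entry in the input table and therefore does not appear in the result.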


| country | sector | production |
| ------ | ------ | ---------- |
| US | automotive | 100 |
| US | aviation | 50 |
| CA | automotive | 30 |
| CA | aviation | 15 |
| JP | automotive | 95 |
| JP | aviation | 25 |

is converted to (assuming `region_1` = US + CA and `region_2` = US + JP)

|   | region_1 | region_2 |
| - | -------- | -------- |
| automotive | 130 | 195 |
| aviation | 65 | 75 |


# what is "PRO_f" and why is it called that?

In [64]:
df_io_countries_per_lca_region['countries'].str.split(', ').explode('countries')

0      AT
1      BE
2      BG
3      CY
4      CZ
       ..
337    FR
338    IE
339    NL
340    CH
341    GB
Name: countries, Length: 342, dtype: object

In [66]:
df_io_countries_per_lca_region = df_io_countries_per_lca_region.assign(
        country=df_io_countries_per_lca_region['countries'].str.split(', ')
).explode('country')

In [67]:
df_io_countries_per_lca_region

Unnamed: 0,region,countries,country
0,Europe without Switzerland,"AT, BE, BG, CY, CZ, DE, DK, EE, ES, FI, FR, GR...",AT
0,Europe without Switzerland,"AT, BE, BG, CY, CZ, DE, DK, EE, ES, FI, FR, GR...",BE
0,Europe without Switzerland,"AT, BE, BG, CY, CZ, DE, DK, EE, ES, FI, FR, GR...",BG
0,Europe without Switzerland,"AT, BE, BG, CY, CZ, DE, DK, EE, ES, FI, FR, GR...",CY
0,Europe without Switzerland,"AT, BE, BG, CY, CZ, DE, DK, EE, ES, FI, FR, GR...",CZ
...,...,...,...
20,WEU,"BE, FR, IE, NL, CH, GB",FR
20,WEU,"BE, FR, IE, NL, CH, GB",IE
20,WEU,"BE, FR, IE, NL, CH, GB",NL
20,WEU,"BE, FR, IE, NL, CH, GB",CH


In [78]:
def calculate_productions(
    io_x_in: pd.DataFrame,
    dict_io_countries_per_lca_region: dict
) -> pd.DataFrame:
    """Sum the IO production of every sector over the countries of each LCA region.

    Parameters
    ----------
    io_x_in
        Total output table with the fields 'region', 'sector' and 'indout'.
        'region' and 'sector' may be columns or index levels; the input is
        never modified.
    dict_io_countries_per_lca_region
        Maps an LCA region name to its IO countries, given either as one
        comma-separated string (e.g. 'DK, FI, NO, SE') or as a list of
        country codes.

    Returns
    -------
    pd.DataFrame
        Rows are IO sectors, columns are the LCA regions in the order of the
        dictionary, values are the summed 'indout' of the member countries.
        An LCA region without any matching country gets a column of zeros.
    """

    io_x = io_x_in.copy()
    # accept region/sector as index levels as well as plain columns
    if not {'region', 'sector'}.issubset(io_x.columns):
        io_x = io_x.reset_index()

    #@todo
    # what happens to the 5 io regions (W*)

    #@todo
    # calculate absent countries

    list_lca_regions: list = list(dict_io_countries_per_lca_region)

    # long-format mapping with one row per (LCA region, member country);
    # the column is called 'lca_region' so it cannot collide with the
    # 'region' (= country) column of the IO table during the merge
    df_io_countries_per_lca_region = pd.DataFrame(
        data = [
            (lca_region, country.strip())
            for lca_region, countries in dict_io_countries_per_lca_region.items()
            for country in (countries.split(',') if isinstance(countries, str) else countries)
        ],
        columns = ['lca_region', 'country']
    ).drop_duplicates() # a country listed twice must not be counted twice

    df_production_per_lca_region = (
        io_x.merge(
            right = df_io_countries_per_lca_region,
            left_on = 'region',
            right_on = 'country',
            how = 'inner'
        )
        .groupby(['lca_region', 'sector'])['indout']
        .sum()
        .unstack('lca_region', fill_value = 0)
        # keep the dictionary order and keep regions without any matching country
        .reindex(columns = list_lca_regions, fill_value = 0)
        .rename_axis(index = 'sector', columns = 'region')
    )

    return df_production_per_lca_region

In [80]:
exiobase.x

Unnamed: 0,region,sector,indout
0,AT,Paddy rice,0.000000
1,AT,Wheat,326.649338
2,AT,Cereal grains nec,801.574425
3,AT,"Vegetables, fruit, nuts",1436.618224
4,AT,Oil seeds,158.696632
...,...,...,...
9795,WM,Membership organisation services n.e.c. (91),13932.869192
9796,WM,"Recreational, cultural and sporting services (92)",36589.968894
9797,WM,Other services (93),23344.854386
9798,WM,Private households with employed persons (95),4361.212870


In [None]:
def _sum_production_per_sector(
    total_prod_country: pd.DataFrame,
    countries_to_drop: list,
    number_of_products: int,
    sector_labels: list,
    column_label: str
) -> pd.DataFrame:
    """ Sums the production of every sector over all countries that are not dropped

    The rows of total_prod_country are read as consecutive blocks of
    number_of_products rows (one block per region); row k of every remaining
    block is added up to the total of sector k.

    Returns:
    -------
        A one-column dataframe (column = column_label, index = sector_labels)

    """
    kept = total_prod_country.drop(countries_to_drop, axis=0, level=0)
    production = kept.iloc[:, 0].to_numpy()

    totals = []
    for offset in range(number_of_products):
        total = 0
        # plain sequential accumulation, one value per remaining region block
        for block_start in range(0, len(production), number_of_products):
            total += production[block_start + offset]
        totals.append(total)

    return pd.DataFrame(data=totals, index=sector_labels, columns=[column_label])


def calc_productions(self):
    """ Calculates the different total productions for either countries, regions or RoWs

    Reads self.countries_per_regions, self.listcountry, self.X_io, self.regions_of_IO,
    self.sectors_of_IO, self.number_of_products_IO and self.dictRoW.

    Returns:
    -------
        None. Updates self.total_prod_country, self.total_prod_region and
        self.total_prod_RoW in place. If self.dictRoW is empty, a message is printed and
        self.total_prod_RoW is left untouched (country and region totals are still updated).

    """

    # dict where
    # keys = lca region
    # values = countries not in the lca region
    #@audit - this could be moved to RoW calculation function
    absent_countries = {
        lca_region: [country for country in self.listcountry if country not in members]
        for lca_region, members in self.countries_per_regions.items()
    }

    # just take the exiobase.x dataframe here, no need to convert back from sparse matrix
    self.total_prod_country = pd.DataFrame(
        data=self.X_io.todense(), # equal to 'x' in Exiobase, compare https://pymrio.readthedocs.io/en/latest/terminology.html
        index=pd.MultiIndex.from_product(
            [self.regions_of_IO, self.sectors_of_IO],
            names=['region', 'sector']
        ),
        columns=['production']
    )

    # sector labels, taken from the first block of rows of the production table
    listact = [
        self.total_prod_country.index[i][1]
        for i in range(self.number_of_products_IO)
    ]

    # one column per lca region (eg. 'Europe without Switzerland', ...)
    for lca_region, countries_to_drop in absent_countries.items():
        self.total_prod_region = self.total_prod_region.join(
            _sum_production_per_sector(
                self.total_prod_country,
                countries_to_drop,
                self.number_of_products_IO,
                listact,
                lca_region
            ),
            how='outer'
        )

    # self.total_prod_region:
    # columns: LCA regions
    # rows: IO sectors
    # the dataframe summed up the country productions to the LCA regions

    # next step we will consider the rest-of-the-World geographies, so the user has to run 'identify_RoWs' first
    if not self.dictRoW:
        print('You need to run "identify_rows" before calculating the productions')
        return

    for row_region, members in self.dictRoW.items():
        # countries that DO NOT appear in this RoW region
        countries_to_drop = [country for country in self.listcountry if country not in members]
        self.total_prod_RoW = self.total_prod_RoW.join(
            _sum_production_per_sector(
                self.total_prod_country,
                countries_to_drop,
                self.number_of_products_IO,
                listact,
                row_region
            ),
            how='outer'
        )

    # total_prod_RoW:
    # columns: RoW regions
    # rows: io sectors (sum of production across all countries within IO region)

In [None]:
def remove_low_production_volume_processes(
    df_in: pd.DataFrame
) -> pd.DataFrame:
    """Placeholder for the dataframe-based removal of low production volume processes.

    The function had a signature but no body, which is a syntax error when
    the cell is run. Until the logic is written it fails loudly instead.

    Parameters
    ----------
    df_in
        Input dataframe (presumably the PRO_f process table; to be confirmed
        when the function is implemented).

    Raises
    ------
    NotImplementedError
        Always, because the function is not implemented yet.
    """

    #@todo
    # implement, presumably based on the original `low_production_volume_processes` method
    raise NotImplementedError('remove_low_production_volume_processes is not implemented yet')

In [None]:
def low_production_volume_processes(self):

    list_low_prod_sectors = self.total_prod_country[self.total_prod_country < 10].dropna().index.tolist()

    df = self.PRO_f.loc[[i for i in self.PRO_f.index if (self.PRO_f.io_geography[i], self.PRO_f.ProductTypeName[i])
                            in list_low_prod_sectors and i in self.list_to_hyb]]
    dict_ = ast.literal_eval(
        pkg_resources.resource_string(__name__, '/Data/Classing_countries.txt').decode('utf-8'))
    for process in self.PRO_f.index:
        if process in df.index:
            # dict_ does not work with aggregated countries (it contains custom region descrptions,
            # therefore if countries were aggregated, leave the aggregated io region in the list
            if self.aggregationFlag:
                pass
            else:
                if dict_[self.PRO_f.io_geography[process]] == 'RER':
                    self.PRO_f.io_geography[process] = dict_[self.PRO_f.io_geography[process]]