### 0.0 follow setup instructions

ℹ️ use [`pylcaio.yml`](https://github.com/michaelweinold/config_conda/blob/main/pylcaio.yml) to set up working conda environment.

### 0.1. imports
#### 0.1.1. regular imports

In [11]:
# i/o
import sys
import os
from pathlib import Path
import gzip
import pickle
import git
import json
# os specific settings
import platform
# configuration
import yaml
# lca
import ecospold2matrix as e2m
import pymrio
#import brightway2 as bw
# type hints
from ecospold2matrix import ecospold2matrix
from pymrio import IOSystem
# data science
import pandas as pd
import numpy as np
# deep copy
import copy

#### 0.1.2. load configuration file

In [12]:
with open('../config.yaml', 'r') as filestream:
    config = yaml.load(filestream, Loader = yaml.FullLoader)

#### 0.1.3. load `pylcaio`

In [13]:
sys.path.append(os.path.join(Path.home(), config['pylcaio'])) # required for local import of pylcaio
import pylcaio

### 0.2. file paths
#### 0.2.1. directories

In [14]:
%%capture
# home directory
print(path_dir_home := Path.home())
print(path_dir_repo := git.Repo('.', search_parent_directories=True).working_tree_dir)
# input directory
print(path_dir_databases := os.path.join(path_dir_home, config['path_dir_databases']))
# output directories
print(path_dir_data := os.path.join(path_dir_home, config['path_dir_data']))
print(path_dir_pylcaio := os.path.join(path_dir_home, path_dir_data, config['path_dir_pylcaio']))
print(path_dir_pymrio := os.path.join(path_dir_home, path_dir_data, config['path_dir_pymrio']))
print(path_dir_e2m := os.path.join(path_dir_home, path_dir_data, config['path_dir_e2m']))

#### 0.2.2. files

In [15]:
%%capture
# databases
print(path_exiobase := os.path.join(path_dir_home, path_dir_databases, config['exiobase']))
print(path_dir_ecoinvent := os.path.join(path_dir_home, path_dir_databases, config['ecoinvent']))
# pylcaio output
print(path_pylcaio_database_loader_class_instance := os.path.join(path_dir_pylcaio, config['pylcaio_database_loader_class_instance']))
print(path_pylcaio_class_instance_before_hybrid := os.path.join(path_dir_pylcaio, config['pylcaio_class_instance_before_hybrid']))
print(path_pylcaio_class_instance_after_hybrid := os.path.join(path_dir_pylcaio, config['pylcaio_class_instance_after_hybrid']))
# pymrio output
print(path_pymrio_class_instance := os.path.join(path_dir_pymrio, config['pymrio_class_instance']))
# e2m output
print(e2m_project_name := config['e2m_project_name'])
print(path_file_e2m_pickle := os.path.join(path_dir_e2m, e2m_project_name + config['e2m_pickle_filename']))

In [16]:
%%capture
print(path_dict_io_countries_per_lca_region := os.path.join(path_dir_repo, config['path_dict_io_countries_per_lca_region']))
print(path_list_io_countries_and_regions := os.path.join(path_dir_repo, config['path_list_io_countries_and_regions']))
print(path_list_io_countries := os.path.join(path_dir_repo, config['path_list_io_countries']))

In [17]:
with open(file = path_dict_io_countries_per_lca_region, mode = 'r', encoding = 'utf-8') as filestream:
    dict_io_countries_per_lca_region: dict = json.load(fp = filestream)
with open(file = path_list_io_countries_and_regions, mode = 'r', encoding = 'utf-8') as filestream:
    list_io_countries_and_regions: list = json.load(fp = filestream)
with open(file = path_list_io_countries, mode = 'r', encoding = 'utf-8') as filestream:
    list_io_countries: list = json.load(fp = filestream)

PRO_f dataframe:

| index | activityNameId | io_geography |
| ----- | -------------- | ------------ |
| 10 | 1 | RoW |
| 11 | 1 | CH |
| 12 | 1 | AT |
| 13 | 2 | RoW |
| 14 | 2 | DE |
| 15 | 2 | CH |
| 16 | 3 | FR |
| 17 | 3 | BE |
| 18 | 4 | RoW |
| 19 | 4 | CH |
| 20 | 4 | AT |

should look like:

| activityNameId | io_geography_list | RoW_region |
| -------------- | ----------------- | ---------- |
| 1 | RoW, CH, AT | RoW(1) |
| 2 | RoW, DE, CH | RoW(2) |
| 4 | RoW, CH, AT | RoW(1) |

where for activityNameId == 1, RoW region is list_io_countries - [CH, AT]


In [23]:
with open(path_pylcaio_class_instance_before_hybrid, 'rb') as file_in:
    pylcaio_object_before_hybrid: pylcaio.LCAIO = pd.read_pickle(file_in)
PRO_f = pylcaio_object_before_hybrid.PRO_f
df = PRO_f

#### 1.1. function implementation

check: https://stackoverflow.com/a/53343046

In [25]:
def identify_rest_of_world_regions(
    df: pd.DataFrame,
    list_io_countries: list,
    dict_io_countries_per_lca_region: dict
) -> pd.DataFrame:

    # get array of those master activities (key 'activityNameId') where one defined geography is 'RoW'
    master_activities_with_rest_of_world_geography: np.ndarray = df[df['io_geography'] == 'RoW']['activityNameId'].unique()
    # replace LCA regions (eg. 'Europe without Switzerland') with the associated IO countries (eg. '"AT", "BE", "BG", ...')
    df['io_geography'] = df['io_geography'].map(dict_io_countries_per_lca_region).fillna(df['io_geography'])
    # remove all activities whose associated master activities where no defined geography is 'RoW'
    df = df[df['activityNameId'].isin(master_activities_with_rest_of_world_geography)]

    df_with_geography_rest_of_world = df[df['io_geography'] == 'RoW']['activityNameId']
    
    # remove all activities where 'io_geography' == 'RoW'
    df_aggregated = df.drop(master_activities_with_rest_of_world_geography.index)
    # group by master activities (key 'activityNameId')
    df_aggregated = pd.DataFrame(data = df_aggregated.groupby('activityNameId')['io_geography'].apply(tuple))
    # XXXXX
    df_aggregated = df_aggregated.merge(
        right = df[df.index.isin(master_activities_with_rest_of_world_geography.index)][['activityNameId', ]],
        how = 'left',
        on = 'activityNameId'
    )

    # calculate 'rest of world' region per activity
    df_aggregated['io_geography_rest_of_world'] = df_aggregated.apply(
        lambda row: tuple(set(list_io_countries) - set(row['io_geography'])),
        axis = 1
    )
    # remove activities where 'rest of world' region is null because all countries are described
    df_aggregated = df_aggregated.dropna(subset ='io_geography_rest_of_world')

    # get unique rest of world regions and label them (eg. [AT, CH, DE]: 'RoW(1)')
    unique_rest_of_world_regions = pd.DataFrame(
        data = df_aggregated['io_geography_rest_of_world'].unique(),
        columns = ['io_geography_rest_of_world'])
    unique_rest_of_world_regions['io_geography_rest_of_world_index'] = ['RoW({})'.format(i) for i in unique_rest_of_world_regions.index]

    # add 'io_geography_rest_of_world_index' information to aggregated dataframe
    df_aggregated = df_aggregated.merge(
        right = unique_rest_of_world_regions,
        how = 'left',
        on = 'io_geography_rest_of_world'
    )

    # update original dataframe:
    df = df.update(
        other = df_aggregated,
        join = 'left',
        overwrite = True
    )

    # add all rest of world information to original dataframe
    df_activities_with_rest_of_world_geography = df_activities_with_rest_of_world_geography.merge(
        right = df_aggregated[['activityNameId', 'io_geography_rest_of_world_index']],
        how = 'left',
        on = 'activityNameId'
    )
    #df_activities_with_rest_of_world_geography.rename(columns={"io_geography_rest_of_world_index": "io_geography})

    return df

In [26]:
df_test = identify_rest_of_world_regions(df = df, list_io_countries=list_io_countries, dict_io_countries_per_lca_region=dict_io_countries_per_lca_region)

AttributeError: 'numpy.ndarray' object has no attribute 'index'

In [None]:
def identify_rows(self):
    """ Method to identify the various unique Rest of the World (RoW) regions of the LCA database

    Returns:
    --------
        The updated self.dictRoW in which unique RoW regions are identified in terms of countries they include

    """

    # contains a list of activityNameId's that are 'PRO_f.io_geography[i]=='RoW (=LCA RoW)'
    unique_activities_using_row = list(
        set( # removes duplicates
            self.PRO_f.activityNameId[ # nota bene: activityNameId is Master Data!
                [i for i in self.PRO_f.index if self.PRO_f.io_geography[i] == 'RoW']].tolist()
        )
    )

    RoW_activities = defaultdict(list)

    # how and WHERE are `geography` and `io_geography` matched?

    # list of tuples (activityNameId (=Master Data), geography)
    # where activityNameId where one of the geographis is 'RoW' and
    # where geography is a list of all regions that are associated with one activityNameId
    tupl = [  
        i for i in zip( # zip creates tuple
            self.PRO_f.activityNameId.loc[
                [
                    i for i in self.PRO_f.index
                    if self.PRO_f.activityNameId[i] in unique_activities_using_row
                ]
            ],
            self.PRO_f.io_geography.loc[
                [
                    i for i in self.PRO_f.index
                    if self.PRO_f.activityNameId[i] in unique_activities_using_row
                ]
            ]
        )
    ]

    # ok, dictionary is filled
    for activity, geography in tupl:
        RoW_activities[activity].append(geography)

    # remove 'RoW' from dict values
    RoW_activities = {activity: [geography1 for geography1 in geography if geography1 != 'RoW'] for activity, geography in RoW_activities.items()}

    # delete from RoW_activities processes which had only RoW as geography and are thus empty now
    for key in [i for i in list(RoW_activities.keys()) if RoW_activities[i] == []]:
        del RoW_activities[key]
        
    # put every element to the same level (elements that are lists are transformed to lists of lists)
    for values in list(RoW_activities.values()):
        for i in range(0, len(values)):
            if values[i] in self.countries_per_regions.keys():
                values[i] = self.countries_per_regions[values[i]]

    # for elements that are lists of lists stemming from the replacement of ['RER'] by [['AT','BE',...]],
    # add all of the together in a single list
    for keys in RoW_activities.keys():
        for item in RoW_activities[keys]:
            if isinstance(item, list):
                RoW_activities[keys] = sum_elements_list(RoW_activities[keys])
    # remove duplicates inside the elements
    for keys in list(RoW_activities.keys()):
        RoW_activities[keys] = list(set(RoW_activities[keys]))

    # why sort here? to ensure unique_RoWs does not contain duplicates that would just have a different order
    # need to sort to identify duplicates whose elements would be ordered differently and thus be treated as not duplicated
    for keys in RoW_activities.keys():
        RoW_activities[keys].sort()

    # identify the combination of countries that are NOT inside the residual of each process
    # dict where
    # key = activityNameId
    # value = list of IO countries that are NOT associated with activityNameId
    dictactrow = {}
    residual_geo_IO_to_remove = ['WA', 'WE', 'WF', 'WL', 'WM'] # exiobase 'rest of world' regions (all other are actual countries)
    for keys in RoW_activities.keys():
        dictactrow[keys] = list( # dictactrow[keys] is activityNameId
            set(self.listcountry) # countries + RoW regions IO
            - set(RoW_activities[keys]) # countries LCA associated with the activityNameId
            - set(residual_geo_IO_to_remove)) # RoW regions IO
    
    unique_RoWs = []
    for keys in dictactrow.keys():
        if dictactrow[keys] not in unique_RoWs:
            unique_RoWs.append(dictactrow[keys])

    # create name for the values of the different RoW
    listname = []
    for i in range(0, len(unique_RoWs)):
        listname.append('RoW' + '(' + str(i) + ')')
    
    # put all of that in dictRoW
    # dictRow is created empty earlier in the code
    for i in range(0, len(unique_RoWs)):
        self.dictRoW[listname[i]] = unique_RoWs[i]
    try:
        # if RoWs are empty because processes from ecoinvent are too described
        del [[k for k in self.dictRoW.keys() if len(self.dictRoW[k]) == 0][0]]
    except IndexError:
        pass

    # replace RoW list (eg. [AT, DE, ...] with RoW number (eg. RoW(1)))
    for activityNameId in dictactrow: # key = activityNameId
        for RoW_number in self.dictRoW: # key = eg. "RoW(12)"
            if dictactrow[activityNameId] == self.dictRoW[RoW_number]: # dictactrow[activityNameId] = RoW list, self.dictRoW[RoW_number]
                dictactrow[activityNameId] = RoW_number

    RoW_matrix = pd.DataFrame(
        data = list(dictactrow.values()),
        index=list(dictactrow.keys()),
        columns=['RoW_geography']
    )

    # adds RoW information to matrix
    self.PRO_f = self.PRO_f.merge(RoW_matrix, left_on='activityNameId', right_on=RoW_matrix.index, how='outer')

    self.PRO_f.index = self.PRO_f.activityId + '_' + self.PRO_f.productId
    self.PRO_f = self.PRO_f.reindex(self.processes_in_order)
    
    self.PRO_f.io_geography.update(self.PRO_f.RoW_geography[self.PRO_f.io_geography == 'RoW'])
    self.PRO_f = self.PRO_f.drop('RoW_geography', axis=1)

    # might be some RoW or empty lists left in PRO_f
    self.PRO_f.io_geography[self.PRO_f.io_geography == 'RoW'] = 'GLO'
    self.PRO_f.io_geography.loc[[i for i in self.PRO_f.index if type(self.PRO_f.io_geography[i]) == list]] = 'GLO'