## Create a manifest file for crowd-sourced annotation with Amazon SageMaker Ground Truth

Use the ML model to find the top-5 EIF matches for every activity. Generate a manifest file in JSON format with this information to get annotations with the Amazon SageMaker Ground Truth service.

In [20]:
# Use a GPU instance if possible, the code can take a few minutes to run otherwise.
from tqdm import tqdm
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, util

import activity_config

In [3]:
# Load the activity records to be matched; the file location comes from activity_config.
activity_df = pd.read_csv(filepath_or_buffer=activity_config.activity_file)
# Preview the first rows to sanity-check the parsed columns.
activity_df.head(n=5)

Unnamed: 0,activity_description,quantity,unit
0,carrot,1.19,kg
1,cilantro,0.2,kg
2,piano,1.0,unit
3,backpack,1.0,unit
4,keyboard,1.0,unit


In [4]:
# Extract the free-text descriptions that will be embedded by the model.
# Adapt this step if the activity data is laid out differently; extra cleaning
# (for example stripping special characters) can be added here when needed.
activity_list = activity_df['activity_description'].values
# Number of activities to match.
len(activity_list)

5

In [9]:
# Download the ecoinvent v3.9.1 database overview workbook and read its third sheet
# (sheet_name is zero-based), keeping every column as text.
eco_df = pd.read_excel(
    'https://ecoinvent.org/wp-content/uploads/2022/12/Database-Overview-for-ecoinvent-v3.9.1.xlsx',
    dtype='str',
    engine='openpyxl',
    sheet_name=2,
)
print(eco_df.shape)

# Collapse to one row per reference product, sorted by product name.
# NOTE: GroupBy.first() takes the first non-null value of each column within a group,
# so a collapsed row can combine values from different source rows.
eco_df = eco_df.groupby('Reference Product Name', as_index=False).first()
eco_df.head()

(21238, 17)


Unnamed: 0,Reference Product Name,Activity UUID & Product UUID,Activity UUID,EcoQuery URL,Activity Name,Geography,Time Period,Special Activity Type,Sector,ISIC Classification,ISIC Section,Product UUID,CPC Classification,Unit,Product Information,CAS Number,Cut-Off Classification
0,"1,1-difluoroethane, HFC-152a",e3ef8fe3-f463-5a64-b0a4-8898873e1ba0_807906d0-...,e3ef8fe3-f463-5a64-b0a4-8898873e1ba0,https://v391.ecoquery.ecoinvent.org/Details/UP...,"market for 1,1-difluoroethane, HFC-152a",GLO,2011 - 2022,market activity,Chemicals,2011:Manufacture of basic chemicals,C - Manufacturing,807906d0-f3cb-4a7c-a528-ae497a61bf12,341: Basic organic chemicals,kg,"'1,1-difluoroethane, HFC-152a' is an organic s...",000075-37-6,allocatable product
1,"1,1-dimethylcyclopentane",65dc71f0-59c9-5c9f-9447-802284f2dad1_41bae23f-...,65dc71f0-59c9-5c9f-9447-802284f2dad1,https://v391.ecoquery.ecoinvent.org/Details/UP...,molecular sieve separation of naphtha,RER,1998 - 2022,ordinary transforming activity,Fuels; Chemicals,1920:Manufacture of refined petroleum products,C - Manufacturing,41bae23f-237d-4ba6-9b1d-73d5f4baee55,341: Basic organic chemicals,kg,"1,1-dimethylcyclopentane' is an organic substa...",001638-26-2,allocatable product
2,1-butanol,1a092e74-9095-5393-89f6-6954fb3ed34b_5543d899-...,1a092e74-9095-5393-89f6-6954fb3ed34b,https://v391.ecoquery.ecoinvent.org/Details/UP...,"synthetic fuel production, from coal, high tem...",ZA,2016 - 2022,ordinary transforming activity,Fuels,1920:Manufacture of refined petroleum products,C - Manufacturing,5543d899-1cbd-4acf-a770-befd41102943,"34139: Other alcohols, phenols, phenol-alcohol...",kg,'1-butanol' is an organic substance with a CAS...,000071-36-3,allocatable product
3,1-methoxy-2-propanol,7407f869-5681-5b26-8d97-7cc9a2d5e274_95aedd09-...,7407f869-5681-5b26-8d97-7cc9a2d5e274,https://v391.ecoquery.ecoinvent.org/Details/UP...,1-methoxy-2-propanol production,GLO,2020 - 2022,ordinary transforming activity,Chemicals,2011:Manufacture of basic chemicals,C - Manufacturing,95aedd09-379e-5ee5-b2af-d220968c898b,"34170: Ethers, alcohol peroxides, ether peroxi...",kg,'1-methoxy-2-propanol' is an organic substanc...,000107-98-2,allocatable product
4,1-methylcyclopropene,82f9a985-cd94-5696-b40e-f66ecebdf8ce_e24e6570-...,82f9a985-cd94-5696-b40e-f66ecebdf8ce,https://v391.ecoquery.ecoinvent.org/Details/UP...,market for 1-methylcyclopropene,GLO,2020 - 2022,market activity,Chemicals,2011:Manufacture of basic chemicals,C - Manufacturing,e24e6570-ecf4-5e34-8d59-a9066687bf17,"34663: Herbicides, anti-sprouting products and...",kg,'1-methylcyclopropene' is an organic substanc...,003100-04-7,allocatable product


In [10]:
# Give the product-name column a snake_case name for easier access downstream.
eco_df = eco_df.rename({'Reference Product Name': 'reference_product'}, axis='columns')
eco_df.shape

(3550, 17)

In [11]:
# Download the UN Statistics Division HS code table and read its second sheet
# (sheet_name is zero-based). Every column is kept as text.
hs_df = pd.read_excel(
    "https://unstats.un.org/unsd/classifications/Econ/download/In%20Text/HSCodeandDescription.xlsx",
    sheet_name=1,
    engine='openpyxl',
    dtype='str',
)
print(hs_df.shape)
hs_df.head()

(6709, 6)


Unnamed: 0,Classification,Code,Description,Parent Code,Level,IsBasicLevel
0,H5,1,Animals; live,TOTAL,2,0
1,H5,101,"Horses, asses, mules and hinnies; live",01,4,0
2,H5,10121,"Horses; live, pure-bred breeding animals",0101,6,1
3,H5,10129,"Horses; live, other than pure-bred breeding an...",0101,6,1
4,H5,10130,Asses; live,0101,6,1


In [12]:
# Keep only the rows whose Level column equals '6'. The comparison is against a
# string because the sheet was read with dtype='str'.
is_level_six = hs_df['Level'] == '6'
hs_df = hs_df[is_level_six]
hs_df.shape

(5388, 6)

In [13]:
# Text inputs for the embedding model: the distinct ecoinvent reference products
# and the descriptions of the retained HS rows (same order as hs_df).
hs_list = hs_df['Description'].values
eco_list = eco_df['reference_product'].unique()
(len(eco_list), len(hs_list))

(3550, 5388)

In [14]:
# Load the sentence-embedding model and embed both reference vocabularies.
model = SentenceTransformer('all-mpnet-base-v2')
eco_emb, hs_emb = model.encode(eco_list), model.encode(hs_list)

# Pairwise cosine similarity between eco products (first argument) and HS
# descriptions (second argument), then sort along dim=1 in descending order so
# that position 0 of each row is the best-scoring HS description.
cosine_scores_eco = util.cos_sim(eco_emb, hs_emb)
sorted_cs_eco, indices_eco = cosine_scores_eco.sort(descending=True, dim=1)

In [15]:
# Embed the activity descriptions and score each one against every eco product.
activity_embedding = model.encode(activity_list)
cosine_scores = util.cos_sim(activity_embedding, eco_emb)
# Sort along dim=1, best score first; `indices` holds the matching positions in eco_emb.
sorted_cs, indices = cosine_scores.sort(descending=True, dim=1)

In [16]:
# Score each activity against every HS description and sort along dim=1, best
# score first; `indices_hs` holds the matching positions in hs_emb.
cosine_scores_hs = util.cos_sim(activity_embedding, hs_emb)
sorted_cs_hs, indices_hs = cosine_scores_hs.sort(descending=True, dim=1)

In [17]:
# For every activity, build:
#   * one summary row in `result_df` (best ecoinvent product, its cosine score, and
#     whether its best HS code agrees with the activity's best HS code on 4 characters);
#   * one DataFrame in `ranked_eifs_list` holding up to N_RANKED ecoinvent candidates.
#
# Candidates are taken from the N_CANDIDATES ecoinvent products closest to the activity.
# They are admitted in passes: first those whose best HS code shares a 6-character
# prefix with the activity's best HS code, then 4, then 2, and finally (prefix length 0,
# which matches everything) any remaining candidate. Within a pass, candidates keep
# their cosine-similarity order.

N_CANDIDATES = 10                  # closest ecoinvent products considered per activity
N_RANKED = 5                       # ranked candidates kept per activity
HS_PREFIX_LENGTHS = [6, 4, 2, 0]   # strictest HS prefix first; 0 accepts any candidate

RANKED_COLUMNS = [
    'cosine_score', 'eco_ref_prod', 'eco_hs_code', 'eco_hs_desc', 'hs_level',
    'eco_hs_score', 'activity_hs_code', 'activity_hs_desc', 'activity_hs_score',
]
SUMMARY_COLUMNS = ['activity', 'eco_ref_prod', 'cosine_score', 'does_eif_match']

ranked_eifs_list = []
summary_records = []

for activity_ix in tqdm(range(len(activity_df))):
    activity_text = activity_df.iloc[activity_ix].activity_description
    sorted_activity_cs = sorted_cs[activity_ix].cpu().numpy()
    eco_ix = indices[activity_ix].cpu().numpy()

    # The activity's best HS match does not depend on the candidate or on the prefix
    # length, so look it up once per activity instead of once per inner iteration.
    activity_hs_row = hs_df.iloc[indices_hs[activity_ix].cpu().numpy()[0]]
    activity_hs_code = activity_hs_row.Code
    activity_hs_desc = activity_hs_row.Description
    activity_hs_score = float(sorted_cs_hs[activity_ix][0])

    # Resolve each candidate once; the prefix passes below only re-compare HS codes.
    # min() guards against reference lists with fewer than N_CANDIDATES products.
    candidates = []
    for rank in range(min(N_CANDIDATES, len(eco_ix))):
        eco_pos = eco_ix[rank]
        eco_hs_row = hs_df.iloc[indices_eco[eco_pos].cpu().numpy()[0]]
        candidates.append({
            'cosine_score': float("{:.8f}".format(sorted_activity_cs[rank])),
            # eco_emb was encoded from eco_list, so a position in the similarity
            # matrix maps to eco_list directly, independent of eco_df's index labels.
            'eco_ref_prod': eco_list[eco_pos],
            'eco_hs_code': eco_hs_row.Code,
            'eco_hs_desc': eco_hs_row.Description,
            'eco_hs_score': float(sorted_cs_eco[eco_pos][0]),
        })

    # Summary row: the single closest ecoinvent product for this activity.
    best = candidates[0]
    summary_records.append({
        'activity': activity_text,
        'eco_ref_prod': best['eco_ref_prod'],
        'cosine_score': float("{:.3f}".format(sorted_activity_cs[0])),
        'does_eif_match': 'match' if best['eco_hs_code'][:4] == activity_hs_code[:4] else 'no match',
    })

    # Ranked list: admit candidates pass by pass, never the same product twice.
    ranked_rows = []
    ranked_index = []   # row labels = candidate rank by cosine similarity
    eco_match_set = set()
    for hs_digits in HS_PREFIX_LENGTHS:
        for rank, candidate in enumerate(candidates):
            if candidate['eco_hs_code'][:hs_digits] != activity_hs_code[:hs_digits]:
                continue
            if candidate['eco_ref_prod'] in eco_match_set:
                continue
            eco_match_set.add(candidate['eco_ref_prod'])
            ranked_rows.append({
                **candidate,
                # Stored as float to keep the column dtype of earlier versions of this cell.
                'hs_level': float(hs_digits),
                'activity_hs_code': activity_hs_code,
                'activity_hs_desc': activity_hs_desc,
                'activity_hs_score': activity_hs_score,
            })
            ranked_index.append(rank)
            if len(ranked_rows) >= N_RANKED:
                break
        if len(ranked_rows) >= N_RANKED:
            break

    # Build the table in one step rather than enlarging it cell by cell.
    similarity_scores = pd.DataFrame(ranked_rows, index=ranked_index, columns=RANKED_COLUMNS)
    similarity_scores['activity_text'] = activity_text
    ranked_eifs_list.append(similarity_scores)

result_df = pd.DataFrame(summary_records, columns=SUMMARY_COLUMNS)

100%|██████████| 5/5 [00:00<00:00,  8.50it/s]


In [18]:
# Summary of the EIF chosen for each activity: the closest ecoinvent reference
# product (eco_ref_prod), its cosine similarity to the activity text rounded to
# three decimals (cosine_score), and does_eif_match, which is 'match' when the
# first four characters of the product's and the activity's best HS codes agree.
# Note that the model can make mistakes.
# If all the results look wrong, either there is a bug in the code,
# or the activity descriptions are not clear enough.
result_df

Unnamed: 0,activity,eco_ref_prod,cosine_score,does_eif_match
0,carrot,carrot,1.0,match
1,cilantro,coriander,0.714,no match
2,piano,keyboard,0.595,no match
3,backpack,paper sack,0.625,no match
4,keyboard,keyboard,1.0,match


In [21]:
# Show the ranked EIF table for one activity picked uniformly at random.
# Passing an int n to np.random.choice samples as if from np.arange(n).
sample_ix = np.random.choice(len(activity_list))
ranked_eifs_list[sample_ix]

Unnamed: 0,cosine_score,eco_ref_prod,eco_hs_code,eco_hs_desc,hs_level,eco_hs_score,activity_hs_code,activity_hs_desc,activity_hs_score,activity_text
5,0.377926,mattress,940421,"Mattresses; of cellular rubber or plastics, wh...",4.0,0.645783,940430,Sleeping bags,0.648277,backpack
0,0.62528,paper sack,481940,"Paper and paperboard; sacks and bags, includin...",0.0,0.636699,940430,Sleeping bags,0.648277,backpack
1,0.453151,"stone wool, packed",510510,Wool; carded,0.0,0.686279,940430,Sleeping bags,0.648277,backpack
2,0.40126,shed,442110,Wood; clothes hangers,0.0,0.343096,940430,Sleeping bags,0.648277,backpack
3,0.388671,lithium,282520,Lithium oxide and hydroxide,0.0,0.601661,940430,Sleeping bags,0.648277,backpack
