# Walkthrough Notebook
This sample code is based on the following paper: [CaML: Carbon Footprinting of Household Products with Zero-Shot Semantic Text Similarity](https://www.amazon.science/publications/caml-carbon-footprinting-of-household-products-with-zero-shot-semantic-text-similarity) You can also view the repository [here](https://github.com/amazon-science/carbon-assessment-with-ml).
The datasets included in S3 were obtained from the [US Census Bureau North American Industry Classification System Reference Files](https://www.census.gov/naics/?48967) and from the [US EPA](https://edg.epa.gov/metadata/catalog/search/resource/details.page?uuid=https://doi.org/10.23719/1528686).

In [1]:
## Imports
# dataframe tools
import pandas as pd

# custom package
from caml.similarity import MLModel

# for interaction with AWS resources
import boto3
import io

In [2]:
## Path Definitions and Client Initialization
# set cloud_mode to False if you want to run this notebook locally
cloud_mode = True
if not cloud_mode:
    # input products file
    input_products_file = '../s3_files/input/products.csv'

    # emission factors file
    ef_file = '../s3_files/datasets/SupplyChainGHGEmissionFactors_v1.2_NAICS_CO2e_USD2021.csv'

    # NAICS index item descriptions file
    naics_desc_file = '../s3_files/datasets/2017_NAICS_Index_File.xlsx'

    # output file
    output_file = '../s3_files/outputs/output.csv'
else:
    # get SSM client
    ssm_client = boto3.client("ssm")

    # get S3 details from AWS Systems Manager Parameter Store
    input_output_bucket_name = ssm_client.get_parameter(Name="input-output-bucket-name")['Parameter']['Value']
    # https://docs.aws.amazon.com/AmazonS3/latest/userguide/privatelink-interface-endpoints.html#accessing-bucket-and-aps-from-interface-endpoints
    s3_interface_endpoint_url = ssm_client.get_parameter(Name="s3-interface-endpoint-url")['Parameter']['Value']

    # get S3 Client using interface endpoint url
    s3_client = boto3.client(service_name="s3", endpoint_url=s3_interface_endpoint_url)

    # input products file
    products_file_key = 'input/products.csv'

    # emission factors file
    ef_file_key = 'datasets/SupplyChainGHGEmissionFactors_v1.2_NAICS_CO2e_USD2021.csv'

    # NAICS index item descriptions file
    naics_desc_file_key = 'datasets/2017_NAICS_Index_File.xlsx'

    # output file
    output_file_key = 'outputs/output.csv'


In [3]:
## Prepare the NAICS codes
# read v1.2 Supply Chain GHG Emission Factors
if cloud_mode:
    ef_file = io.BytesIO(s3_client.get_object(Bucket=input_output_bucket_name, Key=ef_file_key).get('Body').read())
usepa_df = pd.read_csv(ef_file)

# read NAICS17 index item descriptions
if cloud_mode:
    naics_desc_file = io.BytesIO(s3_client.get_object(Bucket=input_output_bucket_name, Key=naics_desc_file_key).get('Body').read())
naics_desc_df = pd.read_excel(naics_desc_file, engine='openpyxl')

# join emission factors with NAICS17 index item descriptions based on the NAICS codes
naics_df = pd.merge(naics_desc_df, usepa_df, how='left', left_on='NAICS17', right_on='2017 NAICS Code').dropna()
naics_df = naics_df.reset_index()
naics_df = naics_df[['NAICS17', 'INDEX ITEM DESCRIPTION', 'Supply Chain Emission Factors with Margins', '2017 NAICS Title']]
naics_df = naics_df.rename(columns={
    'NAICS17': 'naics_code', 'INDEX ITEM DESCRIPTION': 'naics_desc', 'Supply Chain Emission Factors with Margins': 'eio_co2', '2017 NAICS Title': 'naics_title'
})

# read products list and costs
if cloud_mode:
    input_products_file = io.BytesIO(s3_client.get_object(Bucket=input_output_bucket_name, Key=products_file_key).get('Body').read())
products_df = pd.read_csv(input_products_file)
products_df.head(5)

naics_df.head(5)

Unnamed: 0,naics_code,naics_desc,eio_co2,naics_title
0,111110,"Soybean farming, field and seed production",1.326,Soybean Farming
1,111120,"Canola farming, field and seed production",1.326,Oilseed (except Soybean) Farming
2,111120,"Flaxseed farming, field and seed production",1.326,Oilseed (except Soybean) Farming
3,111120,"Mustard seed farming, field and seed production",1.326,Oilseed (except Soybean) Farming
4,111120,"Oilseed farming (except soybean), field and se...",1.326,Oilseed (except Soybean) Farming


In [4]:
## Run Model
# set up the model
model = MLModel()

# pull in the NAICS codes from above
naics_list = naics_df.naics_desc.values

# pull in the product descriptions from above
products_list = products_df.name.values

# compute scores for all NAICS codes
cosine_scores = model.compute_similarity_scores(products_list, naics_list)

# sort scores
sorted_cs, indices = cosine_scores.sort(dim=1, descending=True)


In [5]:
## Prepare Outputs
# create output dataframe
result_df = pd.DataFrame()

# write an output prediction for each product
for ix, product in enumerate(products_list):
    # get the cosine scores for this product
    sorted_product_cs = sorted_cs[ix].cpu().numpy()
    # get the corresponding NAICS indexes
    naics_ix = indices[ix].cpu().numpy()
    product_cost = products_df.loc[ix, 'cost']
    for indx in range(5):
        result_df.loc[ix, 'product'] = product
        # write the attributes of the NAICS row with the highest cosine score
        result_df.loc[ix, 'naics_code'] = naics_df.loc[naics_ix[0], 'naics_code']
        result_df.loc[ix, 'naics_title'] = naics_df.loc[naics_ix[0], 'naics_title']
        result_df.loc[ix, 'naics_desc'] = naics_df.loc[naics_ix[0], 'naics_desc']
        result_df.loc[ix, 'eio_co2'] = naics_df.loc[naics_ix[0], 'eio_co2']
        result_df.loc[ix, 'cosine_score'] = float("{:.3f}".format(sorted_product_cs[0]))
        result_df.loc[ix, 'cost'] = float("{:.3f}".format(product_cost))
        result_df.loc[ix, 'footprint'] = float("{:.3f}".format(sorted_product_cs[0] * product_cost))

if cloud_mode:    
    # write the output to an s3 file
    with io.StringIO() as csv_buffer:
        result_df.to_csv(csv_buffer)
        s3_client.put_object(Bucket=input_output_bucket_name, Key=output_file_key, Body=csv_buffer.getvalue())
else:
    # write the output to a local file
    result_df.to_csv(output_file)

# print out the predictions here
result_df.head(5)


Unnamed: 0,product,naics_code,naics_title,naics_desc,eio_co2,cosine_score,cost,footprint
0,chocolate chip cookie,311352.0,Confectionery Manufacturing from Purchased Cho...,"Fudge, chocolate, made from purchased chocolate",0.543,0.647,2.0,1.295
1,mint tea,311920.0,Coffee and Tea Manufacturing,"Tea, instant, manufacturing",0.549,0.556,1.0,0.556
2,bottled water,445299.0,All Other Specialty Food Stores,"Water stores, bottled",0.238,0.768,1.25,0.96
3,wet canned cat food,311111.0,Dog and Cat Food Manufacturing,Cat food manufacturing,0.478,0.629,1.25,0.786
4,apple juice,311421.0,Fruit and Vegetable Canning,"Juices, fruit or vegetable, fresh, manufacturing",0.518,0.546,1.5,0.819
