# Match missing by validated CAS numbers

Validate and then use CAS numbers for matching where possible.

In [1]:
import pandas as pd
from pathlib import Path
import numpy as np
from numbers import Number
from notebook_utils import *

In [2]:
# Input and output locations, both resolved relative to the parent of the
# current working directory.
unmatched_data_dir = Path.cwd().parent.joinpath("Mapping", "Output", "Unmatched").resolve()
output_dir = Path.cwd().parent.joinpath("Contribute").resolve()

Read input dataframes

In [3]:
# SimaPro flow list ('SimaProv9.4.csv') read from the unmatched-data directory.
sp = pd.read_csv(unmatched_data_dir.joinpath('SimaProv9.4.csv'))

In [4]:
# ecoinvent flow list ('ecoinventEFv3.7.csv') read from the unmatched-data directory.
ei = pd.read_csv(unmatched_data_dir.joinpath('ecoinventEFv3.7.csv'))

## Validating CAS numbers

Having CAS numbers doesn't guarantee that they are correct, so each one is checked against its check digit before it is used for matching.

Based on code from [happy_family](https://github.com/Depart-de-Sentier/happy_family/blob/main/Elementary%20flow%20lists/Generate%20elementary%20flow%20lists.ipynb).

In [5]:
def validate_cas(s):
    """Validate a CAS number via its check digit; return it without leading zeros.

    The last digit is the check digit. It must equal the weighted sum of the
    preceding digits modulo 10, where the digit directly left of the check
    digit has weight 1, the next one weight 2, and so on. Hyphens are ignored
    for the computation.

    Parameters
    ----------
    s : str, None or NaN
        Raw CAS value, e.g. a cell of a CSV column. Surrounding whitespace and
        leading zero padding are tolerated.

    Returns
    -------
    str or None
        The stripped input with leading zeros removed when the check digit
        matches. ``None`` for missing values (``None``, empty string, NaN) and
        for every value that fails validation. Rejected non-missing values are
        reported with ``print`` instead of raising, so a single malformed cell
        cannot abort a ``Series.apply`` over the whole column.
    """
    ERROR = "CAS Check Digit error: CAS '{}' has check digit of {}, but it should be {}"

    if isinstance(s, str):
        s = s.strip()
    if not s:
        return None
    if not isinstance(s, str):
        # NaN marks a missing cell and is skipped silently; any other
        # non-string value cannot be parsed as a CAS number.
        if not (isinstance(s, Number) and np.isnan(s)):
            print("CAS not valid: {} (not a string)".format(s))
        return None

    digits = s.replace("-", "")
    malformed = (
        not set(digits) <= set("0123456789")  # stray letters, slashes, ...
        or s.endswith("-")                    # no check digit in last position
        # All zeros (or no digits at all) would pass the checksum trivially
        # and then collapse to a degenerate key once the zeros are stripped.
        or not digits.strip("0")
    )
    if malformed:
        print("CAS not valid: {} (expected digits separated by hyphens)".format(s))
        return None

    # Every digit left of the check digit is weighted by its distance from it,
    # so extra leading digits are not silently ignored.
    total = sum(
        weight * int(digit)
        for weight, digit in enumerate(reversed(digits[:-1]), start=1)
    )
    expected = total % 10
    if expected != int(digits[-1]):
        print("CAS not valid: {} ({})".format(s, ERROR.format(s, s[-1], expected)))
        return None
    return no_padding_cas(s)
                

def check_cas(s):
    """Assert that a CAS string has two hyphens, then run ``check_digit`` on it.

    Returns ``None`` for falsy input and ``True`` once both checks have been
    executed without raising.
    """
    # NOTE(review): `check_digit` is not defined in the cells shown here;
    # presumably it comes from the `notebook_utils` star import -- confirm.
    if s:
        assert s.count("-") == 2
        check_digit(s)
        return True
    return None


def zero_pad_cas(s):
    """Left-pad a CAS string with zeros to a total length of 12 characters.

    Falsy input is returned unchanged; strings that are already 12 characters
    or longer are returned as they are.
    """
    if s:
        missing = 12 - len(s)
        # A negative count yields an empty prefix, so long values are untouched.
        return "0" * missing + s
    return s
    
    
def no_padding_cas(s):
    """Remove leading zeros from a CAS string; falsy input is returned unchanged."""
    return s.lstrip("0") if s else s

In [6]:
# Validate every ecoinvent CAS number and keep only the rows that passed.
# `assign` builds a new frame instead of writing into `ei`, so re-running this
# cell (when `ei` is already a filtered frame) does not assign into a slice.
ei = ei.assign(**{"Valid CAS": ei["CASNo"].apply(validate_cas)})
ei = ei[ei["Valid CAS"].notna()]

In [7]:
# Validate every SimaPro CAS number and keep only the rows that passed.
# `assign` builds a new frame instead of writing into `sp`, so re-running this
# cell (when `sp` is already a filtered frame) does not assign into a slice.
sp = sp.assign(**{"Valid CAS": sp["CAS No"].apply(validate_cas)})
sp = sp[sp["Valid CAS"].notna()]

The three main base contexts (air, water, and soil) need to be handled separately.

In [8]:
# Match SimaPro rows to ecoinvent rows on validated CAS number, one base
# context at a time; each context's result is passed to `finish_notebook`
# under its own filename.
for c in ("air", "water", "soil"):
    # NOTE(review): both helpers presumably come from the `notebook_utils`
    # star import; their exact semantics are not visible here -- confirm.
    sp_filtered = add_ecoinvent_context_column(
        expand_simapro_context(sp, c),
        label="EcoinventContext",
        kind=c,
    )
    # Rows pair up only when both the CAS number and the context agree.
    df = sp_filtered.merge(
        ei,
        how="inner",
        left_on=["Valid CAS", "EcoinventContext"],
        right_on=["Valid CAS", "Context"],
    )
    finish_notebook(
        df=df,
        author="Chris Mutel",
        notebook_name="Match missing by validated CAS numbers",
        filename=f'validated-cas-in-{c}',
    )