In [1]:
import pandas as pd
from pathlib import Path
import numpy as np
import os

In [2]:
contaminants_data = []

contaminants_list = ["2,3,7,8-Tetrachlorodibenzo-p-dioxin","Dieldrin","Hexachlorobiphenyl; 3,3',4,4',5,5'- (PCB 169)","Pentachlorobiphenyl; 3,3',4,4',5- (PCB 126)","Mercury","Lead","Cyanide"]
clean_data_dir = '../cleandata'

for filename in os.listdir(clean_data_dir):
    if filename.endswith(".csv"):
        csv_data = pd.read_csv(filename, parse_dates=True, infer_datetime_format=True)
        for item in contaminants_list:
            contaminants_filtered = csv_data[csv_data['CHEMICAL_NAME'] == item]
            contaminants_filtered = contaminants_filtered.iloc[: , 1:]
            contaminants_data.append(contaminants_filtered)

contaminants_df = pd.concat(contaminants_data)
contaminants_df

Unnamed: 0,SAMPLE_DATE,TASK_CODE,ANALYTIC_METHOD,CAS_RN,CHEMICAL_NAME,REPORT_RESULT_VALUE,REPORT_RESULT_UNIT,REPORT_RESULT_LIMIT,DETECT_FLAG,REPORTABLE_RESULT,LONGITUDE,LATITUDE,LOC_NAME
54,2017-10-10 08:00:00,2017-2019 OU2 PDI Sediment,E1613,1746-01-6,"2,3,7,8-Tetrachlorodibenzo-p-dioxin",39.3,pg/g,0.189,Y,Yes,-74.118448,40.708445,
130,2017-10-19 14:50:00,2017-2019 OU2 PDI Sediment,E1613,1746-01-6,"2,3,7,8-Tetrachlorodibenzo-p-dioxin",106.0,pg/g,0.654,Y,Yes,-74.120683,40.707897,
222,2017-10-19 14:50:00,2017-2019 OU2 PDI Sediment,E1613,1746-01-6,"2,3,7,8-Tetrachlorodibenzo-p-dioxin",199.0,pg/g,0.654,Y,Yes,-74.120683,40.707897,
298,2017-10-19 14:50:00,2017-2019 OU2 PDI Sediment,E1613,1746-01-6,"2,3,7,8-Tetrachlorodibenzo-p-dioxin",548.0,pg/g,1.090,Y,Yes,-74.120683,40.707897,
390,2017-10-19 14:50:00,2017-2019 OU2 PDI Sediment,E1613,1746-01-6,"2,3,7,8-Tetrachlorodibenzo-p-dioxin",98.9,pg/g,0.986,Y,Yes,-74.120683,40.707897,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
15876,2019-07-09 11:00:00,2019 OU2 PDI Porewater Passive Sampler,SW6010,7439-92-1,Lead,400.0,mg/kg,0.990,Y,Yes,-74.145593,40.735957,LPR-0403-01
15881,2019-07-10 10:45:00,2019 OU2 PDI Porewater Passive Sampler,SW6010,7439-92-1,Lead,370.0,mg/kg,1.100,Y,Yes,-74.149512,40.734412,LPR-0430-07
15884,2019-07-09 16:10:00,2019 OU2 PDI Porewater Passive Sampler,SW6010,7439-92-1,Lead,11.0,mg/kg,0.730,Y,Yes,-74.155944,40.770182,LPR-0752-01R
15889,2019-07-09 14:15:00,2019 OU2 PDI Porewater Passive Sampler,SW6010,7439-92-1,Lead,51.0,mg/kg,0.640,Y,Yes,-74.152242,40.774933,LPR-0790-03R


In [3]:

contaminants_df.to_csv('../../notebooks/chemical_data.csv')







In [5]:
molar_mass={
        "2,3,7,8-Tetrachlorodibenzo-p-dioxin":321.97,
        "Cyanide":26.02,
        "Dieldrin":380.91,
        "Hexachlorobiphenyl; 3,3',4,4',5,5'- (PCB 169)":360.878,
        "Lead":207.2,
        "Mercury":200.59,
        "Pentachlorobiphenyl; 3,3',4,4',5- (PCB 126)":326.433
}

conversion_factor={
        'pg/g':10**-6,
        'pg/sample':10**-6,
        'ppt':10**-6,
        'ppb':10**-3,
        'ppm':1,
        'ug/kg':10**-3,
        'UMOLES/G':1,
        'ng/g':10**-3,
        'umol/g':1,
        'mg/kg':1,
        'ng/kg':10**-6,
        'pg':10**-4,
        'pg/l':10**-9,
        'ng/l':10**-6,
        'ug/l':10**-3,
        'mg/l':1
}
contaminants_df['REPORT_RESULT_UNIT'].fillna('ug/kg',inplace=True)
contaminants_df.dropna(subset=['REPORT_RESULT_VALUE'],inplace=True)
contaminants_df.isnull().sum()

def unit_conversion(row):
    REPORT_RESULT_VALUE, REPORT_RESULT_UNIT = row.REPORT_RESULT_VALUE, row.REPORT_RESULT_UNIT
    conversion = conversion_factor[REPORT_RESULT_UNIT]
    
    if REPORT_RESULT_UNIT == 'UMOLES/G' or REPORT_RESULT_UNIT == 'umol/g':
        return REPORT_RESULT_VALUE**2
    else:
    
        return REPORT_RESULT_VALUE*conversion


contaminants_df['SCALED_VALUE'] = contaminants_df.apply(unit_conversion,axis=1)

def value_moles(row):
        CHEMICAL_NAME, SCALED_VALUE = row.CHEMICAL_NAME, row.SCALED_VALUE
        molar = molar_mass[CHEMICAL_NAME]

        return SCALED_VALUE / molar

contaminants_df['VALUE_MUMOL_PER_GRAM'] = contaminants_df.apply(value_moles,axis=1)

contaminants_df.sample(5)


Unnamed: 0,SAMPLE_DATE,TASK_CODE,ANALYTIC_METHOD,CAS_RN,CHEMICAL_NAME,REPORT_RESULT_VALUE,REPORT_RESULT_UNIT,REPORT_RESULT_LIMIT,DETECT_FLAG,REPORTABLE_RESULT,LONGITUDE,LATITUDE,LOC_NAME,SCALED_VALUE,VALUE_MUMOL_PER_GRAM
149594,2008-12-02 07:26:00,2008 CPG LPRS - Low Res Coring Samples,E1668A,32774-16-6,"Hexachlorobiphenyl; 3,3',4,4',5,5'- (PCB 169)",106.0,ng/kg,106.0,N,Yes,-74.119181,40.725141,,0.000106,2.937281e-07
16437,2019-04-16 08:32:00,2019 OU2 PDI Sediment (EPA Split),SW6010D,7439-92-1,Lead,510.0,mg/kg,3.64,Y,Yes,-74.156462,40.767034,LPR-0729-01,510.0,2.46139
70084,2019-03-30 11:00:00,2017-2019 OU2 PDI Sediment,E1699,60-57-1,Dieldrin,2170.0,pg/g,139.0,Y,Yes,-74.118424,40.711867,,0.00217,5.696884e-06
102714,2017-12-04 13:51:00,2017-2019 OU2 PDI Sediment,E1699,60-57-1,Dieldrin,23.5,pg/g,23.5,N,Yes,-74.117094,40.717003,,2.3e-05,6.169436e-08
15458,2017-12-01 08:38:00,2017-2019 OU2 PDI Sediment,SW6010,7439-92-1,Lead,18.4,mg/kg,5.1,Y,Yes,-74.122639,40.714993,,18.4,0.08880309


In [5]:

# create a funciton that takes chemical_df 'REPORT_RESULT_VALUE' and 
# multiplies it by value in unit_conversion key of chemical_conversion_dict 
# that corresponds to 'REPORT_RESULT_UNIT' for that row and finally divides by molar_mass key in 
# chemical_conversion_dict that corrsponds to 'CHEMICAL_NAME'

def chemical_conversion(row):
    CHEMICAL_NAME, REPORT_RESULT_VALUE, REPORT_RESULT_UNIT = row.CHEMICAL_NAME, row.REPORT_RESULT_VALUE, row.REPORT_RESULT_UNIT
    conversion = conversion_factor[REPORT_RESULT_UNIT]
    molar = molar_mass[contaminants_df[6]]
    amount = contaminants_df[REPORT_RESULT_VALUE]
    return (amount*conversion)/molar

contaminants_df['SAMPLE_AMOUNT_CONVERTED'] = contaminants_df.apply(chemical_conversion, axis=1)


KeyError: 6