In [1]:
import pandas as pd
from pathlib import Path
import numpy as np
import os

In [2]:
contaminants_data = []

contaminants_list = ["2,3,7,8-Tetrachlorodibenzo-p-dioxin","Dieldrin","Hexachlorobiphenyl; 3,3',4,4',5,5'- (PCB 169)","Pentachlorobiphenyl; 3,3',4,4',5- (PCB 126)","Mercury","Lead","Cyanide"]
clean_data_dir = '../cleandata'

for filename in os.listdir(clean_data_dir):
    if filename.endswith(".csv"):
        csv_data = pd.read_csv(filename, parse_dates=True, infer_datetime_format=True)
        for item in contaminants_list:
            contaminants_filtered = csv_data[csv_data['CHEMICAL_NAME'] == item]
            contaminants_filtered = contaminants_filtered.iloc[: , 1:]
            contaminants_data.append(contaminants_filtered)

contaminants_df = pd.concat(contaminants_data)
contaminants_df

Unnamed: 0,SAMPLE_DATE,TASK_CODE,ANALYTIC_METHOD,CAS_RN,CHEMICAL_NAME,REPORT_RESULT_VALUE,REPORT_RESULT_UNIT,REPORT_RESULT_LIMIT,DETECT_FLAG,REPORTABLE_RESULT,LONGITUDE,LATITUDE,LOC_NAME
54,2017-10-10 08:00:00,2017-2019 OU2 PDI Sediment,E1613,1746-01-6,"2,3,7,8-Tetrachlorodibenzo-p-dioxin",39.3,pg/g,0.189,Y,Yes,-74.118448,40.708445,
130,2017-10-19 14:50:00,2017-2019 OU2 PDI Sediment,E1613,1746-01-6,"2,3,7,8-Tetrachlorodibenzo-p-dioxin",106.0,pg/g,0.654,Y,Yes,-74.120683,40.707897,
222,2017-10-19 14:50:00,2017-2019 OU2 PDI Sediment,E1613,1746-01-6,"2,3,7,8-Tetrachlorodibenzo-p-dioxin",199.0,pg/g,0.654,Y,Yes,-74.120683,40.707897,
298,2017-10-19 14:50:00,2017-2019 OU2 PDI Sediment,E1613,1746-01-6,"2,3,7,8-Tetrachlorodibenzo-p-dioxin",548.0,pg/g,1.090,Y,Yes,-74.120683,40.707897,
390,2017-10-19 14:50:00,2017-2019 OU2 PDI Sediment,E1613,1746-01-6,"2,3,7,8-Tetrachlorodibenzo-p-dioxin",98.9,pg/g,0.986,Y,Yes,-74.120683,40.707897,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
15876,2019-07-09 11:00:00,2019 OU2 PDI Porewater Passive Sampler,SW6010,7439-92-1,Lead,400.0,mg/kg,0.990,Y,Yes,-74.145593,40.735957,LPR-0403-01
15881,2019-07-10 10:45:00,2019 OU2 PDI Porewater Passive Sampler,SW6010,7439-92-1,Lead,370.0,mg/kg,1.100,Y,Yes,-74.149512,40.734412,LPR-0430-07
15884,2019-07-09 16:10:00,2019 OU2 PDI Porewater Passive Sampler,SW6010,7439-92-1,Lead,11.0,mg/kg,0.730,Y,Yes,-74.155944,40.770182,LPR-0752-01R
15889,2019-07-09 14:15:00,2019 OU2 PDI Porewater Passive Sampler,SW6010,7439-92-1,Lead,51.0,mg/kg,0.640,Y,Yes,-74.152242,40.774933,LPR-0790-03R


In [3]:

contaminants_df.to_csv('../../notebooks/chemical_data.csv')







In [4]:
molar_mass={
        "2,3,7,8-Tetrachlorodibenzo-p-dioxin":321.97,
        "Cyanide":26.02,
        "Dieldrin":380.91,
        "Hexachlorobiphenyl; 3,3',4,4',5,5'- (PCB 169)":360.878,
        "Lead":207.2,
        "Mercury":200.59,
        "Pentachlorobiphenyl; 3,3',4,4',5- (PCB 126)":326.433
}

conversion_factor={
        'pg/g':10**-6,
        'pg/sample':10**-6,
        'ppt':10**-6,
        'ppb':10**-3,
        'ppm':1,
        'ug/kg':10**-3,
        'UMOLES/G':1,
        'ng/g':10**-3,
        'umol/g':1,
        'mg/kg':1,
        'ng/kg':10**-6,
        'pg':10**-4,
        'pg/l':10**-9,
        'ng/l':10**-6,
        'ug/l':10**-3,
        'mg/l':1
}
contaminants_df['REPORT_RESULT_UNIT'].fillna('ug/kg',inplace=True)
contaminants_df.dropna(subset=['REPORT_RESULT_VALUE'],inplace=True)
contaminants_df.isnull().sum()

def unit_conversion(row):
    REPORT_RESULT_VALUE, REPORT_RESULT_UNIT = row.REPORT_RESULT_VALUE, row.REPORT_RESULT_UNIT
    conversion = conversion_factor[REPORT_RESULT_UNIT]
    
    if REPORT_RESULT_UNIT == 'UMOLES/G' or REPORT_RESULT_UNIT == 'umol/g':
        return REPORT_RESULT_VALUE**2
    else:
    
        return REPORT_RESULT_VALUE*conversion


contaminants_df['SCALED_VALUE'] = contaminants_df.apply(unit_conversion,axis=1)

def value_moles(row):
        CHEMICAL_NAME, SCALED_VALUE = row.CHEMICAL_NAME, row.SCALED_VALUE
        molar = molar_mass[CHEMICAL_NAME]

        return SCALED_VALUE / molar

contaminants_df['VALUE_MUMOL_PER_GRAM'] = contaminants_df.apply(value_moles,axis=1)

contaminants_df.sample(20)


Unnamed: 0,SAMPLE_DATE,TASK_CODE,ANALYTIC_METHOD,CAS_RN,CHEMICAL_NAME,REPORT_RESULT_VALUE,REPORT_RESULT_UNIT,REPORT_RESULT_LIMIT,DETECT_FLAG,REPORTABLE_RESULT,LONGITUDE,LATITUDE,LOC_NAME,SCALED_VALUE,VALUE_MUMOL_PER_GRAM
88832,2017-10-27 10:18:00,2018-OCC-GSH-Sediment,E1668A,32774-16-6,"Hexachlorobiphenyl; 3,3',4,4',5,5'- (PCB 169)",185.0,pg/g,185.0,N,Yes,-74.117558,40.734839,,0.000185,5.126386e-07
151766,2017-10-20 10:30:00,2018-OCC-GSH-Sediment,E1613,1746-01-6,"2,3,7,8-Tetrachlorodibenzo-p-dioxin",93.3,pg/g,0.214,Y,Yes,-74.120973,40.707828,,9.33e-05,2.897786e-07
65247,2017-10-26 14:02:00,2017-2019 OU2 PDI Sediment,SW7471,7439-97-6,Mercury,8.0,mg/kg,0.44,Y,Yes,-74.128992,40.74144,,8.0,0.03988235
72145,2018-01-20 09:43:00,2017-2019 OU2 PDI Sediment,E1613,1746-01-6,"2,3,7,8-Tetrachlorodibenzo-p-dioxin",445.0,pg/g,0.316,Y,Yes,-74.149467,40.734176,,0.000445,1.382116e-06
150506,2017-12-13 09:13:00,2017 Passaic,SW6010,7439-92-1,Lead,68.7,mg/kg,4.5,Y,Yes,-74.156905,40.768054,,68.7,0.3315637
52886,2019-04-11 10:00:00,2017-2019 OU2 PDI Water Column,E1613,1746-01-6,"2,3,7,8-Tetrachlorodibenzo-p-dioxin",30.0,pg/l,13.9,Y,Yes,-74.152681,40.774853,,3e-08,9.317638e-11
36585,2017-11-14 11:45:00,2017 Passaic,E1613,1746-01-6,"2,3,7,8-Tetrachlorodibenzo-p-dioxin",511.0,pg/g,0.179,Y,Yes,-74.164858,40.757204,,0.000511,1.587104e-06
189318,2017-12-18 11:59:00,2017-2019 OU2 PDI Sediment,E1668A,57465-28-8,"Pentachlorobiphenyl; 3,3',4,4',5- (PCB 126)",237.0,pg/g,62.9,Y,Yes,-74.118961,40.72917,,0.000237,7.260295e-07
11466,1999-10-10 12:00:00,1999 Late Summer-Early Fall ESP Sampling,MET,7439-97-6,Mercury,2300.0,ug/kg,,Y,Yes,-74.136977,40.742111,PR9905SDL,2.3,0.01146617
3137,2018-01-27 08:58:00,2017-2019 OU2 PDI Sediment,E1699,60-57-1,Dieldrin,25500.0,pg/g,1110.0,Y,Yes,-74.163848,40.758575,,0.0255,6.694495e-05


In [5]:

# create a funciton that takes chemical_df 'REPORT_RESULT_VALUE' and 
# multiplies it by value in unit_conversion key of chemical_conversion_dict 
# that corresponds to 'REPORT_RESULT_UNIT' for that row and finally divides by molar_mass key in 
# chemical_conversion_dict that corrsponds to 'CHEMICAL_NAME'

def chemical_conversion(row):
    CHEMICAL_NAME, REPORT_RESULT_VALUE, REPORT_RESULT_UNIT = row.CHEMICAL_NAME, row.REPORT_RESULT_VALUE, row.REPORT_RESULT_UNIT
    conversion = conversion_factor[REPORT_RESULT_UNIT]
    molar = molar_mass[contaminants_df[6]]
    amount = contaminants_df[REPORT_RESULT_VALUE]
    return (amount*conversion)/molar

contaminants_df['SAMPLE_AMOUNT_CONVERTED'] = contaminants_df.apply(chemical_conversion, axis=1)


KeyError: 6