In [1]:
import pandas as pd
from pathlib import Path
import numpy as np
import os

######

# LISTS

# List of 8 Contaminants of Concern flagged by EPA as most dangerous
contaminant_list_8COCs = [
    # --- Metals ---
    "Copper",
    "Lead",
    "Mercury",
    # --- Organic classes; each class is represented by one compound ---
    "2,3,7,8-Tetrachlorodibenzo-p-dioxin",  # dioxin (example: worst in class)
    "Dieldrin",  # dieldrin
    "p,p'-DDT",  # DDT (example: most recognizable name in class)
    "Benzo(a)pyrene",  # PAHs (example: worst in class)
    "Pentachlorobiphenyl; 3,3',4,4',5- (PCB 126)",  # PCBs (example: among the worst)
]

# List of the nicknames for the 8 COCs, as used in variables (not inside dataframes)
coc_variable_list = [
    # Metals
    "copper", "lead", "mercury",
    # Organic compound classes
    "dioxin", "dieldrin", "DDT", "pah", "pcb",
]

# List of top contaminants, including the 8 COCs
# (NOTE: despite the "30" in the name, the list currently holds 32 entries)
# Element order is kept exactly as originally written, in case downstream
# code relies on positions.
contaminant_list_30 = [
    "2,3,7,8-Tetrachlorodibenzo-p-dioxin",
    "Cyanide",
    "Dieldrin",
    "Hexachlorobiphenyl; 3,3',4,4',5,5'- (PCB 169)",
    "Lead",
    "Mercury",
    "Copper",
    "Benzo(a)pyrene",
    "Pentachlorobiphenyl; 3,3',4,4',5- (PCB 126)",
    "1,2-Dichlorobenzene",
    "1,4-Dichlorobenzene",
    "2-Chlorophenol",
    "Chlorobenzene",
    "p,p'-DDD",
    "Benzene",
    "Chloroform",
    "Pentachlorobiphenyl; 2',3,4,4',5- (PCB 123)",
    "p,p'-DDT",
    "p,p'-DDE",
    "Aldrin",
    # Aroclor mixtures
    "Aroclor 1016",
    "Aroclor 1221",
    "Aroclor 1232",
    "Aroclor 1242",
    "Aroclor 1248",
    "Aroclor 1254",
    "Aroclor 1260",
    # Further pentachlorobiphenyl congeners
    "Pentachlorobiphenyl; 2,3,3',4,4'- (PCB 105)",
    "Pentachlorobiphenyl; 2,3,3',4',6- (PCB 110)",
    "Pentachlorobiphenyl; 2,3,4,4',5- (PCB 114)",
    "Pentachlorobiphenyl; 2,3',4,4',5- (PCB 118)",
    "Chromium",
]

######

# DICTIONARIES

# 8 COC Dictionary matches the coc name used in all variables 
# to the official chemical name used inside all sampling dataframes.
eight_coc_dict = {
    # nickname: official chemical name as it appears in CHEMICAL_NAME
    # --- Metals ---
    "copper": "Copper",
    "lead": "Lead",
    "mercury": "Mercury",
    # --- Organic classes; each class is represented by one compound ---
    "dioxin": "2,3,7,8-Tetrachlorodibenzo-p-dioxin",  # worst in dioxin class
    "dieldrin": "Dieldrin",
    "DDT": "p,p'-DDT",  # most recognizable name in DDT class
    "pah": "Benzo(a)pyrene",  # worst in PAH class
    "pcb": "Pentachlorobiphenyl; 3,3',4,4',5- (PCB 126)",  # among the worst PCBs
}

# Molar Mass Dictionary matches each of the top compounds in contaminant_list_30
# (including the 8 COCs) to its molar mass in g/mol.
molar_mass_dict = {
    # chemical name (as used in CHEMICAL_NAME): molar mass
    "2,3,7,8-Tetrachlorodibenzo-p-dioxin": 321.97,
    "Cyanide": 26.02,
    "Dieldrin": 380.91,
    "Hexachlorobiphenyl; 3,3',4,4',5,5'- (PCB 169)": 360.878,
    "Lead": 207.20,
    "Mercury": 200.59,
    "Pentachlorobiphenyl; 3,3',4,4',5- (PCB 126)": 326.433,
    # NOTE(review): the two dichlorobenzene isomers carry slightly different
    # values (147.01 vs 147.00) - confirm which one is intended.
    "1,2-Dichlorobenzene": 147.01,
    "1,4-Dichlorobenzene": 147.00,
    "2-Chlorophenol": 128.56,
    "Chlorobenzene": 112.56,
    "p,p'-DDD": 320.00,
    "Benzene": 78.11,
    "Chloroform": 119.38,
    "Pentachlorobiphenyl; 2',3,4,4',5- (PCB 123)": 326.40,
    "p,p'-DDT": 354.50,
    "p,p'-DDE": 318.02,
    "Aldrin": 364.90,
    # Aroclor mixtures
    # NOTE(review): Aroclor 1221 and 1232 share the same value - confirm.
    "Aroclor 1016": 257.543,
    "Aroclor 1221": 188.653,
    "Aroclor 1232": 188.653,
    "Aroclor 1242": 260.57,
    "Aroclor 1248": 291.988,
    "Aroclor 1254": 326.40,
    "Aroclor 1260": 376,
    # Further pentachlorobiphenyl congeners
    # NOTE(review): PCB 118 (323.883) differs from the other
    # pentachlorobiphenyl entries (326.40 / 326.433) - verify against source.
    "Pentachlorobiphenyl; 2,3,3',4,4'- (PCB 105)": 326.40,
    "Pentachlorobiphenyl; 2,3,3',4',6- (PCB 110)": 326.40,
    "Pentachlorobiphenyl; 2,3,4,4',5- (PCB 114)": 326.40,
    "Pentachlorobiphenyl; 2,3',4,4',5- (PCB 118)": 323.883,
    "Chromium": 51.996,
    "Benzo(a)pyrene": 252.31,
    "Copper": 63.546,
}

# Conversion Factor Dictionary matches each type of sampling unit to its amount in microgram/gram (ug/g)
conversion_factor_dict = {
    # unit string (as found in REPORT_RESULT_UNIT): multiplier applied to the
    # reported value by the conversion functions.
    'pg/g': 10**-6,
    'pg/sample': 10**-6,
    'ppt': 10**-6,
    'ppb': 10**-3,
    'ppm': 1,
    'ug/kg': 10**-3,
    # Molar units: the conversion functions special-case these two keys
    # instead of using the factor stored here.
    'UMOLES/G': 1,
    'ng/g': 10**-3,
    'umol/g': 1,
    'mg/kg': 1,
    'ng/kg': 10**-6,
    # NOTE(review): 'pg' uses 10**-4 while the other pg-based units use
    # 10**-6 - confirm this factor.
    'pg': 10**-4,
    # NOTE(review): per-volume units below are given per-mass style factors;
    # presumably a density of 1 g/ml is assumed - confirm.
    'pg/l': 10**-9,
    'ng/l': 10**-6,
    'ug/l': 10**-3,
    'mg/l': 1,
    'ng/ml': 10**-3,
}

######

# FUNCTIONS

# All Clean Data Function - returns a dataframe.
# This function spins through the filepath containing all clean data 
# csvs and creates a dataframe of all information. 
# WARNING: resulting dataframe is almost 4 million rows.
def clean_data_df(file_path):
    """Load every ``.csv`` file in *file_path* into one combined dataframe.

    Files are read in sorted filename order so the row order of the result
    is reproducible (``os.listdir`` itself returns names in arbitrary order).

    Parameters
    ----------
    file_path : str or os.PathLike
        Directory containing the clean-data csv files.

    Returns
    -------
    pandas.DataFrame
        Concatenation of all csv files. Each file keeps its own row index,
        so index labels repeat across files.

    Raises
    ------
    ValueError
        If the directory contains no ``.csv`` files.
    """
    csv_names = sorted(
        name for name in os.listdir(file_path) if name.endswith(".csv")
    )
    if not csv_names:
        # pd.concat([]) would raise a far less helpful ValueError.
        raise ValueError("No .csv files found in {!r}".format(file_path))

    # `infer_datetime_format` is deprecated in pandas 2.x and is therefore no
    # longer passed to read_csv.
    frames = [
        pd.read_csv(os.path.join(file_path, name), parse_dates=True)
        for name in csv_names
    ]
    return pd.concat(frames)

# Chemical Filter Function - returns a dataframe.
# This function spins through the filepath containing all clean data 
# csvs and creates a dataframe of only the specified chemicals.
def chemical_filter(file_path, chemical_list):
    """Load all csv files in *file_path*, keeping only the listed chemicals.

    For every csv file (in sorted filename order) and every name in
    *chemical_list* (in list order), the rows whose ``CHEMICAL_NAME`` equals
    that name are collected. The first column of each file is dropped from
    the result.

    Parameters
    ----------
    file_path : str or os.PathLike
        Directory containing the clean-data csv files.
    chemical_list : iterable of str
        Chemical names to keep, matched exactly against ``CHEMICAL_NAME``.

    Returns
    -------
    pandas.DataFrame
        Matching rows from all files. Each file keeps its own row index, so
        index labels repeat across files.

    Raises
    ------
    ValueError
        If the directory contains no ``.csv`` files or *chemical_list* is
        empty (there would be nothing to concatenate).
    KeyError
        If a csv file has no ``CHEMICAL_NAME`` column.
    """
    csv_names = sorted(
        name for name in os.listdir(file_path) if name.endswith(".csv")
    )

    matches = []
    for name in csv_names:
        # `infer_datetime_format` is deprecated in pandas 2.x and is
        # therefore no longer passed to read_csv.
        csv_data = pd.read_csv(os.path.join(file_path, name), parse_dates=True)
        # Read the names before trimming, so the filter still works
        # regardless of where CHEMICAL_NAME sits in the file.
        chemical_names = csv_data['CHEMICAL_NAME']
        # Drop the leading column (presumably a saved row index - confirm).
        trimmed = csv_data.iloc[:, 1:]
        for chemical in chemical_list:
            matches.append(trimmed[chemical_names == chemical])

    if not matches:
        # pd.concat([]) would raise a far less helpful ValueError.
        raise ValueError(
            "Nothing to filter: no .csv files in {!r} or empty chemical list"
            .format(file_path)
        )
    return pd.concat(matches)


# Convert Sampling Units to Moles function - returns a dataframe.
# This function ingests a dataframe, reads the sample values and adds columns
# with the value converted to ug/g and umol/g.
def chemical_to_moles(df):
    """Return a copy of *df* with values standardised to ug/g and umol/g.

    Rows with a missing ``REPORT_RESULT_VALUE`` or ``REPORT_RESULT_UNIT``,
    or with a unit that is not a key of ``conversion_factor_dict``, are
    removed. Two columns are added:

    * ``VALUE_MUGRAM_PER_GRAM`` - the reported value expressed in ug/g.
    * ``VALUE_MUMOL_PER_GRAM``  - the reported value expressed in umol/g.

    The caller's dataframe is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``REPORT_RESULT_VALUE``, ``REPORT_RESULT_UNIT`` and
        ``CHEMICAL_NAME`` columns.

    Returns
    -------
    pandas.DataFrame
        Filtered copy of *df* with the two extra columns.

    Raises
    ------
    KeyError
        If a remaining row names a chemical that has no entry in
        ``molar_mass_dict``.
    """
    molar_units = ('UMOLES/G', 'umol/g')

    # dropna/boolean indexing return new objects; the explicit copy makes the
    # column assignments below safe (no write onto a slice of the input).
    df = df.dropna(subset=['REPORT_RESULT_VALUE', 'REPORT_RESULT_UNIT'])
    df = df[df['REPORT_RESULT_UNIT'].isin(list(conversion_factor_dict))].copy()

    molar_mass = df['CHEMICAL_NAME'].map(molar_mass_dict)
    missing = molar_mass.isna().values
    if missing.any():
        unknown = sorted(set(map(str, df.loc[missing, 'CHEMICAL_NAME'])))
        raise KeyError(
            "No molar mass known for: {}".format(", ".join(unknown))
        )

    # Work on plain arrays: the frame may carry repeated index labels (it is
    # usually a concatenation of several files), so label alignment is
    # deliberately avoided.
    values = df['REPORT_RESULT_VALUE'].values
    factors = df['REPORT_RESULT_UNIT'].map(conversion_factor_dict).astype(float).values
    molar_mass = molar_mass.astype(float).values
    is_molar = df['REPORT_RESULT_UNIT'].isin(molar_units).values

    # Mass-based units: scale to ug/g, then divide by molar mass for umol/g.
    ugram = values * factors
    umol = ugram / molar_mass

    # Molar units are already umol/g; ug/g = umol/g * molar mass (g/mol).
    # (Previously the value was squared here, which is not a unit conversion.)
    ugram[is_molar] = values[is_molar] * molar_mass[is_molar]
    umol[is_molar] = values[is_molar]

    df['VALUE_MUGRAM_PER_GRAM'] = ugram
    df['VALUE_MUMOL_PER_GRAM'] = umol

    return df


# Convert Sampling Units to ug/g function - returns a dataframe.
# This function ingests a dataframe, reads the sample values and adds
# one column with the value converted to ug/g.
def chemical_to_ugrams(df):
    """Return a copy of *df* with values standardised to ug/g.

    Rows with a missing ``REPORT_RESULT_VALUE`` or ``REPORT_RESULT_UNIT``,
    or with a unit that is not a key of ``conversion_factor_dict``, are
    removed. A ``VALUE_MUGRAM_PER_GRAM`` column is added and the
    ``Unnamed: 0`` column is dropped when present.

    The caller's dataframe is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``REPORT_RESULT_VALUE`` and ``REPORT_RESULT_UNIT``
        columns. ``CHEMICAL_NAME`` is only needed when some rows use a molar
        unit (``UMOLES/G`` / ``umol/g``).

    Returns
    -------
    pandas.DataFrame
        Filtered copy of *df* with the extra column. Rows in a molar unit
        whose chemical has no entry in ``molar_mass_dict`` get NaN, because
        they cannot be converted to a mass.
    """
    molar_units = ('UMOLES/G', 'umol/g')

    # dropna/boolean indexing return new objects; the explicit copy makes the
    # column assignment below safe (no write onto a slice of the input).
    df = df.dropna(subset=['REPORT_RESULT_VALUE', 'REPORT_RESULT_UNIT'])
    df = df[df['REPORT_RESULT_UNIT'].isin(list(conversion_factor_dict))].copy()

    # Work on plain arrays: the frame may carry repeated index labels (it is
    # usually a concatenation of several files), so label alignment is
    # deliberately avoided.
    values = df['REPORT_RESULT_VALUE'].values
    factors = df['REPORT_RESULT_UNIT'].map(conversion_factor_dict).astype(float).values
    ugram = values * factors

    is_molar = df['REPORT_RESULT_UNIT'].isin(molar_units).values
    if is_molar.any():
        # Molar units: ug/g = umol/g * molar mass (g/mol).
        # (Previously the value was squared here, which is not a unit
        # conversion.)
        molar_mass = df['CHEMICAL_NAME'].map(molar_mass_dict).astype(float).values
        ugram[is_molar] = values[is_molar] * molar_mass[is_molar]

    df['VALUE_MUGRAM_PER_GRAM'] = ugram

    # The saved-index column is not always present (e.g. chemical_filter
    # already strips the first column), so a missing column is not an error.
    df = df.drop(columns='Unnamed: 0', errors='ignore')

    return df

# Call Dataframe Filtered to List of Contaminants
def call_filtered_df(filepath, chem_filter_list):
    """Build a standardised dataframe for the chemicals in *chem_filter_list*.

    Loads the matching rows with ``chemical_filter``, converts the values
    with ``chemical_to_moles``, fills remaining missing cells and adds the
    ``SAMPLE_YEAR`` and ``SAMPLE_YEAR_MONTH`` columns derived from
    ``SAMPLE_DATE``.

    Parameters
    ----------
    filepath : str
        Directory containing the clean-data csv files.
    chem_filter_list : list of str
        Chemical names to keep.

    Returns
    -------
    pandas.DataFrame
    """
    df = chemical_filter(filepath, chem_filter_list)
    # Convert all values to standardized measurements
    df = chemical_to_moles(df)
    # NOTE(review): missing cells are filled with the *string* '0', which
    # also lands in numeric columns - confirm this is intended.
    df = df.fillna('0')

    # SAMPLE_DATE is not guaranteed to be a datetime column here, so parse it
    # once and derive both columns from the parsed values. (Using `.dt` on
    # the raw column raises AttributeError when it holds strings.)
    sample_dates = pd.to_datetime(df['SAMPLE_DATE'])
    # Add sampling year
    df['SAMPLE_YEAR'] = sample_dates.dt.year
    # Add sampling year/month
    df['SAMPLE_YEAR_MONTH'] = sample_dates.dt.strftime('%Y-%m')
    return df

# Create 8 COC Dataframe function - returns a dataframe
def call_8coc_df(filepath):
    """Build the standardised dataframe restricted to the 8 COCs.

    Rows for the chemicals in ``contaminant_list_8COCs`` are loaded from the
    csv files in *filepath*, converted with ``chemical_to_moles``, missing
    cells are filled with the string '0', and a ``SAMPLE_YEAR`` column is
    derived from ``SAMPLE_DATE``.
    """
    # Load the 8 COC rows and convert them to standardized measurements.
    coc_rows = chemical_to_moles(
        chemical_filter(filepath, contaminant_list_8COCs)
    )
    coc_rows = coc_rows.fillna('0')

    # Add sampling year
    parsed_dates = pd.to_datetime(coc_rows['SAMPLE_DATE'])
    coc_rows['SAMPLE_YEAR'] = parsed_dates.dt.year
    return coc_rows

# Call Average Sample Value at Location Individual Dataframe for each of 8 COCs
def create_coc_density_df(subset8_df, coc):
    """Average the sample values of one COC per sampling location.

    Parameters
    ----------
    subset8_df : pandas.DataFrame
        Dataframe with ``CHEMICAL_NAME``, ``LATITUDE``, ``LONGITUDE`` and
        ``SAMPLE_YEAR`` columns.
    coc : str
        Nickname of the contaminant; must be a key of ``eight_coc_dict``.

    Returns
    -------
    pandas.DataFrame
        One row per (LATITUDE, LONGITUDE) pair holding the mean of every
        numeric column, plus ``CHEMICAL_NAME``. ``SAMPLE_YEAR`` is the mean
        year cast to an integer.

    Raises
    ------
    KeyError
        If *coc* is not a key of ``eight_coc_dict``.
    """
    chemical_name = eight_coc_dict[coc]
    coc_df = subset8_df[subset8_df['CHEMICAL_NAME'] == chemical_name]
    # numeric_only=True: text columns cannot be averaged, and pandas >= 2.0
    # raises a TypeError on them instead of silently dropping them.
    coc_df = coc_df.groupby(["LATITUDE", "LONGITUDE"]).mean(numeric_only=True)
    # CHEMICAL_NAME is non-numeric and therefore gone after the mean.
    coc_df['CHEMICAL_NAME'] = chemical_name
    coc_df['SAMPLE_YEAR'] = coc_df['SAMPLE_YEAR'].astype('int')
    coc_df.reset_index(inplace=True)
    return coc_df


# STOCK VARIABLES
filepath = '../data/cleandata'

# Full 8 COC dataframe, loaded once and sliced per contaminant below.
subset8_df = call_8coc_df(filepath)

# One dataframe per COC, selected by official chemical name.
copper_df = subset8_df.loc[subset8_df['CHEMICAL_NAME'].eq("Copper")]
lead_df = subset8_df.loc[subset8_df['CHEMICAL_NAME'].eq("Lead")]
mercury_df = subset8_df.loc[subset8_df['CHEMICAL_NAME'].eq("Mercury")]
dioxin_df = subset8_df.loc[
    subset8_df['CHEMICAL_NAME'].eq("2,3,7,8-Tetrachlorodibenzo-p-dioxin")
]
dieldrin_df = subset8_df.loc[subset8_df['CHEMICAL_NAME'].eq("Dieldrin")]
DDT_df = subset8_df.loc[subset8_df['CHEMICAL_NAME'].eq("p,p'-DDT")]
pah_df = subset8_df.loc[subset8_df['CHEMICAL_NAME'].eq("Benzo(a)pyrene")]
pcb_df = subset8_df.loc[
    subset8_df['CHEMICAL_NAME'].eq("Pentachlorobiphenyl; 3,3',4,4',5- (PCB 126)")
]

# Per-location average sample values for each COC, used for the density plot.
copper_density_df = create_coc_density_df(subset8_df, coc="copper")
lead_density_df = create_coc_density_df(subset8_df, coc="lead")
mercury_density_df = create_coc_density_df(subset8_df, coc="mercury")
dioxin_density_df = create_coc_density_df(subset8_df, coc="dioxin")
dieldrin_density_df = create_coc_density_df(subset8_df, coc="dieldrin")
DDT_density_df = create_coc_density_df(subset8_df, coc="DDT")
pah_density_df = create_coc_density_df(subset8_df, coc="pah")
pcb_density_df = create_coc_density_df(subset8_df, coc="pcb")


In [2]:
copper_density_df

Unnamed: 0,LATITUDE,LONGITUDE,REPORT_RESULT_VALUE,VALUE_MUGRAM_PER_GRAM,VALUE_MUMOL_PER_GRAM,SAMPLE_YEAR,CHEMICAL_NAME
0,38.820562,-76.227336,155000.00000,155.000000,2.439178,1997,Copper
1,38.820562,-76.227336,0.00640,0.006400,0.000101,1997,Copper
2,39.552112,-74.480792,3800.00000,3.800000,0.059799,1999,Copper
3,39.552249,-74.407547,350.00000,0.350000,0.005508,1999,Copper
4,39.552472,-74.480759,15650.01210,15.650293,0.246283,1999,Copper
...,...,...,...,...,...,...,...
1703,40.923329,-74.135673,13.86180,13.850278,0.217957,2012,Copper
1704,40.927726,-74.141881,7.12855,7.062340,0.111137,2012,Copper
1705,40.928433,-74.142206,7.28260,7.252126,0.114124,2012,Copper
1706,40.941150,-74.161069,70.00000,70.000000,1.101564,1990,Copper
