In [None]:
import os
from pathlib import Path
import pandas as pd

# Base path: the current working directory. The data folders and the
# output workbooks below are all resolved relative to it.
bp = Path.cwd()
print(bp)

In [None]:
# Contaminant code lists used to decide which samples are kept.

# BTEX contaminants only.
btex_list = ['BZ', 'BZME', 'EBZ', 'XYLENES']

# GAMA 'top 10 contaminants', followed by the BTEX codes.
contaminants_1 = [
    'TCPR123', 'DBCP', 'AS', 'CR6',
    'PCATE', 'PCE', 'TCE', 'U',
    *btex_list,
    ]

# Contaminants from the CES Drinking Water Quality index plus BTEX and MTBE.
contaminants_2 = [
    'AS', 'BZ', 'BZME', 'CD', 'CR6', 'DBCP',
    'EBZ', 'EDB', 'NO3N', 'PB', 'PCATE', 'PCE',
    'TCE', 'TCPR123', 'THM', 'XYLENES', 'HAA5', 'MTBE',
    ]

# Short list of contaminants for test runs.
contaminants_3 = [
    'AS',
    'BZ',
    'BZME',
    'CD',
    ]

# Ask for the county to gather data for.
# The answer is interpolated into the file-name glob patterns used when the
# result files are loaded, so leading/trailing whitespace (e.g. a space typed
# before pressing Enter) is stripped; otherwise the patterns would silently
# match no files.
county = input('Enter county: ').strip()

In [None]:
def open_table(p, dtypes, date_cols, cols):
    """Open a tab-separated file and return its contents as a DataFrame.

    The file is first read with pandas' default encoding. If that fails with
    a decoding error, it is read again with the 'unicode_escape' encoding.
    Only decoding errors trigger the retry; any other failure (missing file,
    missing columns, a keyboard interrupt, ...) propagates to the caller
    immediately instead of being hidden behind a second attempt.

    Args:
        p: path to file
        dtypes: dictionary of data types, keyed by column name
        date_cols: list of columns to parse as dates
        cols: list of columns to use

    Returns:
        DataFrame holding the requested columns of the file.
    """
    # Options shared by both read attempts, so the two calls cannot drift apart.
    read_options = dict(sep='\t', dtype=dtypes, parse_dates=date_cols, usecols=cols)

    try:
        return pd.read_csv(p, **read_options)

    except UnicodeDecodeError:
        # Default encoding could not decode the file; fall back as documented.
        return pd.read_csv(p, encoding='unicode_escape', **read_options)

In [None]:
# Set path of geotracker edf_results.
edf_path = bp / 'geotracker_edf_results'

# Dictionary of data types for geotracker edf_results for open_table().
geotracker_dtypes = {
    'GLOBAL_ID' : 'string',
    'FIELD_PT_NAME' : 'string',
    'PARLABEL' : 'string',
    'PARVAL' : 'Float64',
    'PARVQ' : 'string',
    'REPDL' : 'Float64',
    'UNITS' : 'string',
    }

# Date column of geotracker edf_results for open_table().
geotracker_date = ['LOGDATE']

# Columns of geotracker edf_results for open_table().
geotracker_cols = list(geotracker_dtypes.keys()) + geotracker_date

print('Loading Geotracker EDF results \n')

# Create list of files to open. The glob is materialised (and sorted, so the
# row order is reproducible) so that an empty match can be detected up front.
# NOTE(review): the county is matched exactly as typed here, while the GAMA
# cell lower-cases it -- confirm this matches the EDF file naming.
edf_files = sorted(edf_path.glob('**/*{}*.zip'.format(county)))

# Fail with a clear message instead of an IndexError on an empty list.
if not edf_files:
    raise FileNotFoundError(
        'No Geotracker EDF zip files matching county {!r} found under {}'.format(county, edf_path)
        )

# Use list comprehension to create a list of dataframes from the files list. Uses open_table() to open the files.
edf_results_list = [open_table(i, geotracker_dtypes, geotracker_date, geotracker_cols) for i in edf_files]

# Concatenate the list of dataframes into one dataframe. pd.concat handles a
# single-element list as well, so no special case is needed; ignore_index
# gives the combined frame a clean 0..n-1 row index.
edf_results = pd.concat(edf_results_list, ignore_index=True)

# Create WID column.
edf_results['WID'] = edf_results['GLOBAL_ID'] + '-' + edf_results['FIELD_PT_NAME']

# Drop unnecessary columns.
edf_results = edf_results.drop(columns=['GLOBAL_ID', 'FIELD_PT_NAME'])

In [None]:
# Set path of gama_results.
gama_path = bp / 'gama_results'

# Dictionary of data types for gama_results for open_table().
gama_dtypes = {
    'GM_WELL_ID' : 'string',
    'GM_CHEMICAL_VVL' : 'string',
    'GM_RESULT_MODIFIER' : 'string',
    'GM_RESULT' : 'Float64',
    'GM_RESULT_UNITS' : 'string',
    'GM_REPORTING_LIMIT' : 'Float64',
    }

# Date column of gama_results for open_table().
gama_date = ['GM_SAMP_COLLECTION_DATE']

# Columns of gama_results for open_table().
gama_cols = list(gama_dtypes.keys()) + gama_date

print('Loading GAMA results \n')

# Create list of files to open. The glob is materialised (and sorted, so the
# row order is reproducible) so that an empty match can be detected up front.
# NOTE(review): the county is lower-cased for this pattern -- presumably the
# GAMA file names are lower case; confirm against the downloaded files.
gama_files = sorted(gama_path.glob('**/*{}*.zip'.format(county.lower())))

# Fail with a clear message instead of pandas' "No objects to concatenate".
if not gama_files:
    raise FileNotFoundError(
        'No GAMA zip files matching county {!r} found under {}'.format(county.lower(), gama_path)
        )

# Use list comprehension to create a list of dataframes from the files list. Uses open_table() to open the files.
gama_results_list = [open_table(i, gama_dtypes, gama_date, gama_cols) for i in gama_files]

# Concatenate the list of dataframes into one dataframe with a clean row index.
gama_results = pd.concat(gama_results_list, ignore_index=True)

# Dictionary to rename gama columns to match edf_results.
gama_to_edf_dict = {
    'GM_WELL_ID' : 'WID',
    'GM_CHEMICAL_VVL' : 'PARLABEL',
    'GM_RESULT_MODIFIER' : 'PARVQ',
    'GM_RESULT' : 'PARVAL',
    'GM_RESULT_UNITS' : 'UNITS',
    'GM_REPORTING_LIMIT' : 'REPDL',
    'GM_SAMP_COLLECTION_DATE' : 'LOGDATE',
}

# Rename gama columns to match edf_results.
gama_results = gama_results.rename(columns=gama_to_edf_dict)

In [None]:
# Columns that must hold a value for a row to be kept.
samples_req_cols = ['LOGDATE', 'PARLABEL', 'PARVAL']

# Stack the Geotracker and GAMA results, discard rows missing any required
# value, then index the combined table by well (WID) and sample date (LOGDATE).
samples = (
    pd.concat([edf_results, gama_results])
    .dropna(subset=samples_req_cols)
    .set_index(['WID', 'LOGDATE'])
)

In [None]:
# For every (WID, LOGDATE) pair, collect the PARLABEL values of its rows into a list.
parlabels_by_sample = samples.groupby(['WID', 'LOGDATE'])['PARLABEL']
sample_groups = parlabels_by_sample.apply(list)

In [None]:
# Collect the (WID, LOGDATE) keys of the samples in which every contaminant
# of the contaminant list was reported.
index_list = [
    sample_key
    for sample_key, labels in sample_groups.items()
    if all(contaminant in labels for contaminant in contaminants_3)
]

# Pull the rows belonging to those samples out of the full table.
select_samples = samples.loc[index_list]

# Of those rows, keep only the ones for contaminants in the contaminant list.
is_listed_contaminant = select_samples['PARLABEL'].isin(contaminants_3)
select_samples = select_samples[is_listed_contaminant]

In [None]:
# One row per (WID, LOGDATE) sample, one PARVAL column per contaminant (PARLABEL).
pivot_table = select_samples.pivot_table(
    index=['WID', 'LOGDATE'],
    columns=['PARLABEL'],
    values=['PARVAL'],
)

In [None]:
# Show the pivoted results; a bare last expression is rendered as the cell output.
pivot_table

In [None]:
# Save the pivot table as an Excel workbook in the base directory.
pivot_table_path = bp / 'pivot_table.xlsx'
pivot_table.to_excel(pivot_table_path)

In [None]:
# Save the wells table as an Excel workbook in the base directory.
# pandas chooses the Excel writer engine from the file extension, and
# '.excel' is not an extension it recognises, so the file is named '.xlsx'.
# NOTE(review): `wells` is not defined in any cell visible here -- confirm it
# is created earlier, otherwise this cell raises NameError on a fresh
# Restart & Run All.
wells.to_excel(bp / 'wells.xlsx')
