## Import Packages and Set Starting Parameters

In [49]:
import os, pandas as pd, slr_pkg.clean_load_data as cld, slr_pkg.para as para
from slr_pkg.clean_load_data import open_table
from pathlib import Path


# Get current working directory
bp = Path(os.getcwd())

# Set sample data directory.
edf_path = bp / 'geotracker_edf_results'
gama_path = bp / 'gama_results'

# Set location data directory.
geo_xy_path = bp / 'geotracker_xy'
gama_xy_path = bp / "gama_xy"

# Set results directory
results_path = bp / "results"

# Ask for county to gather data for.
# area = input('Enter county: ')
area = 'SanDiego'

# List of contaminants.
chems = para.conts11

## Open and Concat Data

### Load Sample Data

In [50]:
edf_files = edf_path.glob('**/*{}*.zip'.format(area))
gama_files = gama_path.glob('**/*{}*.zip'.format(area.lower()))

samples = cld.Sample_Data.full_dataset(edf_files, gama_files)

print(samples)

Loading Geotracker file. e:\work\projects\coast_slr\scripts\slr_ground_water_quality_\geotracker_edf_results\SanDiegoEDF.zip
Loading GAMA file. e:\work\projects\coast_slr\scripts\slr_ground_water_quality_\gama_results\gama_ddw_sandiego_v2.zip
Loading GAMA file. e:\work\projects\coast_slr\scripts\slr_ground_water_quality_\gama_results\gama_dpr_sandiego_v2.zip
Loading GAMA file. e:\work\projects\coast_slr\scripts\slr_ground_water_quality_\gama_results\gama_dwr_sandiego_v2.zip
Loading GAMA file. e:\work\projects\coast_slr\scripts\slr_ground_water_quality_\gama_results\gama_gama_dom_sandiego_v2.zip
Loading GAMA file. e:\work\projects\coast_slr\scripts\slr_ground_water_quality_\gama_results\gama_gama_sp-study_sandiego_v2.zip
Loading GAMA file. e:\work\projects\coast_slr\scripts\slr_ground_water_quality_\gama_results\gama_gama_usgs_sandiego_v2.zip
Loading GAMA file. e:\work\projects\coast_slr\scripts\slr_ground_water_quality_\gama_results\gama_usgs_nwis_sandiego_v2.zip
Loading GAMA file. e:\

In [51]:
print(len(samples))
print(len(samples['SID'].unique()))

7546078
7546078


### Load Location Data

In [52]:
geo_xy_files = geo_xy_path.glob('**/*{}*.zip'.format(area))
gama_xy_files = gama_xy_path.glob('**/*.zip')

locations = cld.Location_Data.full_dataset(geo_xy_files, gama_xy_files)

print(locations)

Loading Geotracker file. e:\work\projects\coast_slr\scripts\slr_ground_water_quality_\geotracker_xy\SanDiegoGeoXY.zip
Loading Geotracker file. e:\work\projects\coast_slr\scripts\slr_ground_water_quality_\gama_xy\gama_location_construction_v2.zip
Concatenating GAMA and Geotracker dataframes. 

Checking for missing values. 

                 FIELD_PT_CLASS   LATITUDE   LONGITUDE                  WID
0                            MW  33.121798 -117.319569    T0607300319-VEW-1
1                            MW  33.121793 -117.319447    T0607300319-VEW-2
2                            MW  33.183242 -117.369407    T0607301407-OMW-1
3                            MW  33.183227 -117.369231    T0607301407-OMW-2
4                            MW  33.183421 -117.369302    T0607301407-OMW-4
...                         ...        ...         ...                  ...
298597               MONITORING  38.372833 -122.912156     T0609700197-MW-2
298598               MONITORING   38.44332 -122.674776      T060970

### Join Sample and Location Data

In [53]:
# Join well location data to sample results.
samples = samples.merge(locations, left_on='WID', right_on='WID', how='inner')
print(len(samples))
print(len(samples['SID'].unique()))

7546078
7546078


### Join MCL Table to Samples

In [54]:
print('Loading MCL table \n')

# Create path to mcl table.
mcl_path = bp / 'MCLs.xlsx'

# Open mcl table.
mcl = pd.read_excel(mcl_path,sheet_name='MCL', engine='openpyxl')

# join MCL values to sample results
print('Joining MCL values to samples \n')
samples = samples.merge(mcl, left_on='PARLABEL', right_on='chem_abrv', how='inner')
print(len(samples))
print(len(samples['SID'].unique()))

Loading MCL table 

Joining MCL values to samples 

3200356
3200356


### Join Unit Conversion Data to Samples

In [55]:
# Load conversion tables.
metric_conversion = pd.read_excel(bp / 'unit_conversion.xlsx', sheet_name='metric')

# join coversion factors to samples based on sample unit.
samples = samples.merge(metric_conversion, how='inner', left_on='UNITS', right_on='start_unit')
print(len(samples))
print(len(samples['SID'].unique()))

3188665
3188665


### Convert Units

In [56]:
# Create mask for samples with MCL units in UG/L and converts sample result units to UG/L.
mask = samples['UNITS'] != samples['units']

# Multiply sample results by conversion factor.
samples.loc[mask, 'PARVAL'] = samples['PARVAL'] * samples['coef']
samples['UNITS'] = 'UG/L'

# Drop columns that are not needed.
samples.drop(columns=['REPDL','chem_abrv', 'units','comp_conc_type','start_unit', 'coef'], inplace=True)
print(len(samples))
print(len(samples['SID'].unique()))

3188665
3188665


### Add Exceedence and Magnitude Attributes

In [57]:
# Create exceedence attribute, true if sample result exceeds reporting limit.
samples['exceedence'] = samples['PARVAL'] > samples['comp_conc_val']

# Create magnitude attribute. Sample result value divided by the comparison concentration value (MCL or Action level) minus 1.
samples['magnitude'] = (samples['PARVAL'] / samples['comp_conc_val']) - 1

In [58]:
print(len(samples))
print(len(samples['SID'].unique()))

3188665
3188665


### **All Samples**

In [59]:
# samples.to_csv(results_path / '{}_all_sample_results.csv'.format(area.lower()))

### Select Specific Wells

In [60]:
# subset of specific samples meeting parameters.
spec_samples = samples
print(len(spec_samples))
print(len(spec_samples['SID'].unique()))
# Select spec_samples taken since 2010.
spec_samples = spec_samples.loc[spec_samples['LOGDATE'] >= '2012-01-01']

# Select samples with contaminants of interest.
spec_samples = spec_samples.loc[spec_samples['PARLABEL'].isin(chems)]

# Create groups of spec_samples based on WID and PARLABEL(contaminant label).
sample_groups = spec_samples.groupby(['WID'])['PARLABEL'].apply(list).reset_index()

3188665
3188665


In [61]:
print(len(spec_samples))
print(len(spec_samples['SID'].unique()))

503035
503035


In [62]:
from collections import Counter


def select_wells(row):
    wid = row['WID']
    counter = Counter(row['PARLABEL'])
    if len(counter) == len(chems):
        print(counter.values())
        if all(i >= 4 for i in counter.values()):
            return  wid


# Create mask of sample groups meeting parameter requirements.
res = sample_groups.apply(select_wells, axis=1)

dict_values([18, 18, 18, 18, 18, 18, 18, 18, 18, 13, 1])
dict_values([1, 1, 1, 1, 1, 2, 1, 1, 3, 4, 1])
dict_values([2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 1])
dict_values([1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1])
dict_values([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])
dict_values([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
dict_values([3, 3, 3, 3, 3, 6, 3, 3, 5, 6, 2])
dict_values([3, 3, 3, 3, 3, 5, 3, 3, 5, 6, 2])
dict_values([1, 1, 1, 1, 1, 2, 1, 1, 2, 3, 1])
dict_values([1, 1, 1, 1, 1, 1, 1, 1, 3, 13, 1])
dict_values([1, 1, 1, 1, 1, 1, 1, 2, 3, 13, 1])
dict_values([3, 3, 3, 3, 3, 1, 3, 3, 3, 3, 1])
dict_values([1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1])
dict_values([4, 4, 4, 4, 4, 1, 4, 4, 2, 4, 1])
dict_values([3, 3, 3, 3, 3, 2, 3, 3, 3, 5, 2])
dict_values([3, 3, 3, 3, 3, 3, 3, 4, 3, 6, 2])
dict_values([3, 3, 3, 3, 3, 3, 3, 4, 3, 6, 2])
dict_values([2, 2, 2, 2, 2, 3, 2, 3, 3, 6, 1])
dict_values([1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1])
dict_values([1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1])
dict_values([1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1])
d

In [63]:
# Use mask to select sample results from wells that meet parameter requirements.
spec_samples = spec_samples[spec_samples['WID'].isin(res)]

In [64]:
print(len(samples))
print(len(samples['SID'].unique()))

3188665
3188665


In [65]:
# number of unique wells
nwells = len(spec_samples['WID'].unique())
print('Number of wells: ' + str(nwells))

Number of wells: 40


In [66]:
# Save sample results to csv.
#spec_samples.to_csv(results_path / '{}_spec_sample_results_11.csv'.format(area.lower()))

### Join Groundwater Elevations to Sample Results

In [67]:
### Depth to Water Data
# Load GAMA dtw data
# Create elev_path.
elev_path = bp / 'elevation'
print(elev_path, '\n')

# Dictionary of data types for gama_elev gama_elev for open_table().
gama_elev_dtypes = {
    'WELL NUMBER' : 'string',
    'DEPTH TO WATER' : 'float64',
    }

# Date column of gama_elev gama_elev for open_table().
gama_elev_date = ['MEASUREMENT DATE']

# Columns of gama_elev gama_elev for open_table().
gama_elev_cols = list(gama_elev_dtypes.keys()) + gama_elev_date


print('Loading GAMA groundwater elevations. \n')

# create list of files to open
gama_elev_files = elev_path.glob('**/*gama*.zip')
gama_elev_files = list(gama_elev_files)

# Use list comprehension to create a list of dataframes from the files list. Uses open_table() to open the files.
gama_elev_list = [open_table(i,dtypes = gama_elev_dtypes,date_cols = gama_elev_date, cols =gama_elev_cols) for i in gama_elev_files]
print(gama_elev_list)

# Concatenate the list of dataframes into one dataframe if there are more than one.
if len(gama_elev_list) > 1:
    gama_elev = pd.concat(gama_elev_list)

else:
    gama_elev = gama_elev_list[0]

# Dict of attributes to rename.
gama_geo_dict = {
    'WELL NUMBER' : 'WID',
    'DEPTH TO WATER' : 'DTW',
    'MEASUREMENT DATE' : 'LOGDATE',
}
# Rename columns.
gama_elev = gama_elev.rename(columns=gama_geo_dict)

# Fix column formatting.
gama_elev['LOGDATE'] = gama_elev['LOGDATE'].astype(str)
gama_elev['WID'] = gama_elev['WID'].str.replace(' ', '')

# Create GID (group id) column. GID is the WID and LOGDATE concatenated.
gama_elev['GID'] = list(zip(gama_elev['WID'], gama_elev['LOGDATE']))
# Load Geotracker DTW data.
# Dictionary of data types for geo_elev geo_elev for open_table().
geo_elev_dtypes = {
    'GLOBAL_ID' : 'string',
    'FIELD_POINT_NAME' : 'string',
    'DTW' : 'float64',
    }

# Date column of geo_elev geo_elev for open_table().
geo_elev_date = ['GW_MEAS_DATE']

# Columns of geo_elev geo_elev for open_table().
geo_elev_cols = list(geo_elev_dtypes.keys()) + geo_elev_date

print('Loading Geotracker groundwater elevations. \n')

# create list of files to open
geo_elev_files = elev_path.glob('**/*Geo*.zip')
geo_elev_files = list(geo_elev_files)


# Use list comprehension to create a list of dataframes from the files list. Uses open_table() to open the files.
geo_elev_list = [open_table(i,geo_elev_dtypes,date_cols= geo_elev_date,cols =geo_elev_cols) for i in geo_elev_files]

# Concatenate the list of dataframes into one dataframe if there are more than one.
if len(geo_elev_list) > 1:
    geo_elev = pd.concat(geo_elev_list)

else:
    geo_elev = geo_elev_list[0]

# Create WID column.
geo_elev['WID'] = geo_elev['GLOBAL_ID'] + '-' + geo_elev['FIELD_POINT_NAME']

# Drop unnecessary columns.
geo_elev = geo_elev.drop(columns=['GLOBAL_ID', 'FIELD_POINT_NAME'])

# fix column formatting.
geo_elev['WID'] = geo_elev['WID'].str.replace(' ', '')

# Rename columns.
geo_elev = geo_elev.rename(columns={'GW_MEAS_DATE' : 'LOGDATE'})

# Fix column formatting.
geo_elev['LOGDATE'] = geo_elev['LOGDATE'].astype(str)

# Create GID (group id) column. GID is the WID and LOGDATE concatenated.
geo_elev['GID'] = list(zip(geo_elev['WID'], geo_elev['LOGDATE']))

e:\work\projects\coast_slr\scripts\slr_ground_water_quality_\elevation 

Loading GAMA groundwater elevations. 

[                  WELL NUMBER MEASUREMENT DATE  DEPTH TO WATER
0               17S11E22E002S       1993-10-19          100.52
1               17S11E22E002S       1994-03-14          100.41
2               17S11E22E002S       1994-10-17          100.43
3               17S11E22E002S       1995-03-29          100.36
4               17S11E22E002S       1995-10-16          100.17
...                       ...              ...             ...
4229858  SL204131495 - REW-12       2021-11-16           72.20
4229859  SL204131495 - REW-12       2021-12-07           72.33
4229860  SL204131495 - REW-14       2017-04-04             NaN
4229861  SL204131495 - REW-15       2017-04-04             NaN
4229862   SL204131495 - REW-3       2017-04-04             NaN

[4229863 rows x 3 columns]]
Loading Geotracker groundwater elevations. 



In [68]:
# Concatenate gama_results and edf_results.
dtw = pd.concat([geo_elev, gama_elev])
dtw['dtw_units'] = 'ft'

# List of columns that require a value.
dtw_req_cols = ['WID','DTW','LOGDATE']

# Drops rows with missing values in required columns.
dtw = dtw.dropna(subset=dtw_req_cols)

# Drop duplicate GID rows.
dtw = dtw.drop_duplicates(subset=['GID'])
samples_dtw = spec_samples.merge(dtw, left_on=['GID'], right_on=['GID'], how='inner')

In [69]:
spec_samples

Unnamed: 0,LOGDATE,PARLABEL,PARVAL,PARVQ,UNITS,WID,GID,SID,FIELD_PT_CLASS,LATITUDE,LONGITUDE,chem_name,comp_conc_val,exceedence,magnitude
29248,2019-10-15,XYLENES,50.0,=,UG/L,L10009614226-OTAY-LCRS,"(L10009614226-OTAY-LCRS, 2019-10-15)","((L10009614226-OTAY-LCRS, 2019-10-15), XYLENES)",LSP,32.598853,-117.008522,Xylenes (total),1750.0,False,-0.971429
30682,2015-10-28,XYLENES,37.0,=,UG/L,L10009614226-OTAY-LCRS,"(L10009614226-OTAY-LCRS, 2015-10-28)","((L10009614226-OTAY-LCRS, 2015-10-28), XYLENES)",LSP,32.598853,-117.008522,Xylenes (total),1750.0,False,-0.978857
31440,2016-10-25,XYLENES,31.0,=,UG/L,L10009614226-OTAY-LCRS,"(L10009614226-OTAY-LCRS, 2016-10-25)","((L10009614226-OTAY-LCRS, 2016-10-25), XYLENES)",LSP,32.598853,-117.008522,Xylenes (total),1750.0,False,-0.982286
32138,2020-10-06,XYLENES,27.0,=,UG/L,L10009614226-OTAY-LCRS,"(L10009614226-OTAY-LCRS, 2020-10-06)","((L10009614226-OTAY-LCRS, 2020-10-06), XYLENES)",LSP,32.598853,-117.008522,Xylenes (total),1750.0,False,-0.984571
32769,2017-10-16,XYLENES,24.0,=,UG/L,L10009614226-OTAY-LCRS,"(L10009614226-OTAY-LCRS, 2017-10-16)","((L10009614226-OTAY-LCRS, 2017-10-16), XYLENES)",LSP,32.598853,-117.008522,Xylenes (total),1750.0,False,-0.986286
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3102604,2017-10-16,CD,0.0,ND,UG/L,L10002513368-RAM-LCRS,"(L10002513368-RAM-LCRS, 2017-10-16)","((L10002513368-RAM-LCRS, 2017-10-16), CD)",,,,Cadmium,5.0,False,-1.0
3102623,2012-08-21,CD,0.0,ND,UG/L,SL209294204-QCEB,"(SL209294204-QCEB, 2012-08-21)","((SL209294204-QCEB, 2012-08-21), CD)",,,,Cadmium,5.0,False,-1.0
3102626,2013-07-23,CD,0.0,ND,UG/L,SL209294204-QCEB,"(SL209294204-QCEB, 2013-07-23)","((SL209294204-QCEB, 2013-07-23), CD)",,,,Cadmium,5.0,False,-1.0
3102656,2016-10-26,CD,0.0,ND,UG/L,L10009474974-SYC-LCRS,"(L10009474974-SYC-LCRS, 2016-10-26)","((L10009474974-SYC-LCRS, 2016-10-26), CD)",,,,Cadmium,5.0,False,-1.0


In [70]:
samples_dtw.columns

dtw_req_cols = ['LOGDATE_x', 'UNITS','PARLABEL', 'PARVAL', 'WID_x', 'DTW', 'LATITUDE', 'LONGITUDE', 'dtw_units']

for i in samples_dtw.columns:
    if i not in dtw_req_cols:
        samples_dtw = samples_dtw.drop(columns=i)

print(samples_dtw.columns)

Index(['LOGDATE_x', 'PARLABEL', 'PARVAL', 'UNITS', 'WID_x', 'LATITUDE',
       'LONGITUDE', 'DTW', 'dtw_units'],
      dtype='object')


In [71]:
chem_num = str(len(chems))

a = (len(spec_samples))
b = (len(samples_dtw))
c =((len(samples_dtw) / len(spec_samples)*100))
c = "{:.2f}".format(c)

print(area, chem_num,': \n')
print('There are ' + str(b) + ' samples with depth to water values.')
print("Out of " + str(a) + " samples in the original dataframe.")
print(str(c) + "% of samples. \n")

a = (len(spec_samples['WID'].unique()))
b = (len(samples_dtw['WID_x'].unique()))
c = (b/a)*100
c = "{:.2f}".format(c)

print('There are ' + str(b) + ' wells with depth to water values.')
print("Out of " + str(a) + " wells in the original dataframe.")
print(str(c) + "% of  wells. \n")

SanDiego 11 : 

There are 830 samples with depth to water values.
Out of 6035 samples in the original dataframe.
13.75% of samples. 

There are 7 wells with depth to water values.
Out of 40 wells in the original dataframe.
17.50% of  wells. 



In [73]:
# Save sample results to csv.
samples_dtw.to_csv(results_path / '{}_dd_dtw_sample_results_{}.csv'.format(area.lower(), chem_num))

In [None]:
# Run from here

### Pivot table for CCME Water Quality Index

In [None]:
sample_results.rename(columns={'WID' : 'Station', 'LOGDATE' : 'Date'}, inplace=True)

sample_results['PARLABEL'] = sample_results['PARLABEL'] + '_' + sample_results['units']

pivot_table = pd.pivot_table(sample_results, index=['Station', 'Date'], columns=['PARLABEL'], values=['PARVAL'])
ccme_wqi_data = pivot_table.reset_index()

ccme_wqi_data.columns = ['Station', 'Date', 'AS_UG/L', 'BZME_UG/L', 'BZ_UG/L', 'CD_UG/L', 'DBCP_UG/L',
       'EBZ_UG/L', 'EDB_UG/L', 'MTBE_UG/L', 'NO3N_MG/L', 'PB_UG/L', 'PCE_UG/L',
       'TCE_UG/L', 'TCPR123_UG/L', 'THM_UG/L', 'XYLENES_UG/L']

ccme_wqi_data.dropna(inplace=True)

In [None]:
ccme_wqi_data.to_csv(results_path / '{}_ccme_wqi_conc_samples.csv'.format(county.lower()))

### Normalize Sample Result Values at Wells

In [None]:
# Calculates the mean of magnitudes for each WID in the exceedences dataframe.
print('Calculating magnitudes for each WID \n')
print(samples_mcl.head())

means = samples_mcl.groupby(['WID'])['magnitude'].mean()

In [None]:
# Join mean magnitudes to well locations.
print('Merging geometric mean magnitudes to wells \n')
wells = wells.merge(means, how='inner', left_on='WID', right_index=True)
wells = wells.set_index('WID').sort_index()

# Save well mean magnitudes to csv.
wells.to_csv(bp / 'wells.csv')

In [None]:
# Convert well mean magnitudes to shapefile
import geopandas as gpd

# Create geodataframe from well mean magnitudes, uses long and lat columns as xy coordinates, NAD83 projection.
gdf = gpd.GeoDataFrame(wells, geometry=gpd.points_from_xy(x=wells.LONGITUDE, y=wells.LATITUDE), crs='EPSG:4326')

# Reproject to UTM 11N.
gdf = gdf.to_crs('EPSG:26911')


gdf.to_file(results_path / 'wells.shp'.format(county))

## **Sample Groups**

### Create Sample Groups

In [None]:
# Group samples by WID and LOGDATE apply list function to get list of PARLABELS for each group.
sample_groups = samples_mcl.groupby(['WID', 'LOGDATE'])['PARLABEL'].apply(list)

### Single Contaminant List

In [None]:
# Use list comprehension to create a list of sample indexes where all contaminants in the contaminant list are present.
index_list = [i for i in sample_groups.index if all(item in sample_groups.loc[i] for item in contaminants_3)]

# Uses index_list to create a dataframe of samples that meet the criteria.
sample_group_results = samples_mcl[samples_mcl['GID'].isin(index_list)]

In [None]:
# Print groups of samples that meet the criteria.
print('Groups: ',len(index_list))
print('Samples: ',len(sample_results))

In [None]:
# Join location data to sample results.
sample_group_results = sample_results.merge(wells, left_on='WID', right_on='WID', how='inner')

# Save sample group results to csv.
sample_group_results.to_csv(bp / '{}_sample_results.csv'.format(county.lower()))

In [None]:
sample_results.to_csv(bp / '{}_sample_results.csv'.format(county.lower()))

### Contaminant Combinations

In [None]:
from itertools import combinations

# Create list of all combinations of all contaminants
combinations_list = list(combinations(contaminants_3, 10))
len(combinations_list)

In [None]:
# Function to select sample groups based on combinations of contaminants.
def get_select_samples(row, contaminants):

    # checks list of contaminants in row against list of contaminants in function call.
    # if all contaminants in row are in contaminants, return True.
    if all(item in row for item in contaminants):
        return True

    else:
        return False

In [None]:
def get_select_samples(row, contaminants):

    count = 0

    for values in row:
        print(values)
        if all(item in values for item in contaminants):
            count += 1
    return count

In [None]:
ser_dict = {}

total =  len(combinations_list)
count = 0

for contaminants in combinations_list:

    count += 1
    percent = int(((count/total)*100))

    ser = sample_groups.apply(get_select_samples, contaminants=contaminants_3)

    ser_dict[contaminants] = ser

    #print('{}%'.format(percent))
    
combo_stats = pd.DataFrame.from_dict(ser_dict, orient='index')

print(combo_stats.max())
print(list(combo_stats.idxmax()))

In [None]:
ser_dict

## **Modin Combo Stats**

In [None]:
def get_select_samples_modin(row, contaminants):
    print(row)

    if all(element in row for element in contaminants) ==  True:
        print('contains all elements')
        return True

    else:
        print('does not contain all elements')
        return False

In [None]:
combinations_list

In [None]:
samples_modin = mpd.DataFrame(samples)

In [None]:
import modin.pandas as mpd
from distributed import Client
client = Client()

sample_groups_modin = mpd.DataFrame(sample_groups)


ser_dict = {}

for contaminants in combinations_list:

    ser = sample_groups_modin.apply(get_select_samples_modin, contaminants=contaminants)
    ser = ser[ser == True]

    ser_dict[contaminants] = len(ser)



print(max(ser_dict.values()))

In [None]:
print(combo_stats.max())
print(list(combo_stats.idxmax()))

In [None]:
df[df == True]

In [None]:
df[df == True]

In [None]:
print('Loading MCL table \n')

# Create path to mcl table.
mcl_path = bp / 'MCL_list_1.xlsx'

# Open mcl table.
mcl = pd.read_excel(mcl_path, engine='openpyxl')

# join MCL values to sample results
print('Joining MCL values to samples \n')
samples_mcl = select_samples.merge(mcl, left_on='PARLABEL', right_on='chem_abrv', how='left').set_index(select_samples.index)

In [None]:
# Save samples_mcl to csv.
alt = input("Input filename ending for 'county'_select_samples_'input'.csv: ")
name = '{}_select_samples_{}.csv'.format(county.lower(), alt)
sp = bp / name
samples_mcl.to_csv(sp)

In [None]:
# Get counts of samples for each contaminant.
parlabel_stats = samples['PARLABEL'].value_counts()

# Create a dataframe with the counts of samples for each contaminant.
parlabel_stats = parlabel_stats.to_frame(name='COUNTS').reset_index().rename(columns={'index':'PARLABEL'})

# Create PERCENT column for each contaminant. Showing percent of samples for each contaminant compared to total samples.
parlabel_stats['PERCENT'] = (parlabel_stats['COUNTS'] / len(samples) * 100).round(4)

In [None]:
# Save samples_mcl to csv.
name = '{}_parlabel_stats.csv'.format(county.lower())
sp = bp / name
parlabel_stats.to_csv(sp)