## Import Packages and Set Starting Parameters

In [70]:
import os, pandas as pd, slr_pkg.clean_load_data as cld, slr_pkg.para as para
from slr_pkg.clean_load_data import open_table
from pathlib import Path


# get current working directory
bp = Path(os.getcwd())

# set results directory
results_path = bp / "results"

# Ask for county to gather data for.
area = input('Enter county: ')
# area = 'Ventura'
# Set paths of sample data.
edf_path = bp / 'geotracker_edf_results'
gama_path = bp / 'gama_results'

# List of contaminants.
chems = para.conts10

## Open and Concat Data

### Load Sample Data

In [None]:
# create list of geotracker files to open
edf_files = edf_path.glob('**/*{}*.zip'.format(area))


# Use list comprehension to create a list of dataframes from the files list. Uses open_table() to open the files.
edf_results_list = [cld.Sample_Data.geotracker_df(i) for i in edf_files]
print(edf_results_list)

# Concatenate the list of dataframes into one dataframe if there are more than one.
print('\nConcatenating Geotracker EDF results: \n')
try:
    edf_results = pd.concat(edf_results_list)
except:
    edf_results = edf_results_list[0]

print('Geotracker EDF results: \n')
print(edf_results.head())

In [97]:
edf_results

Unnamed: 0,LOGDATE,PARLABEL,PARVAL,PARVQ,REPDL,UNITS,WID
0,2001-09-13,BZME,1.0,<,1.0,UG/L,T0607302931-MW-16
1,2001-09-12,TAME,20.0,<,20.0,UG/L,T0607301374-MW-1
2,2001-09-12,XYLENES1314,1.0,<,1.0,UG/L,T0607302389-MW-3
3,2001-09-10,XYLO,1.0,<,1.0,UG/L,T0607300928-MW-7
4,2001-09-12,EBZ,1.0,<,1.0,UG/L,T0607302389-MW-3
...,...,...,...,...,...,...,...
8378030,2022-01-19,DIPE,0.0,ND,1.0,UG/L,T0607301771-1327-MW-09
8378031,2022-01-20,TBA,2200.0,=,100.0,UG/L,T0607301771-1327-MW-39
8378032,2022-01-21,BZME,340.0,=,100.0,UG/L,T0607301771-1327-RW-05
8378033,2022-01-21,PHCG,13000.0,=,1000.0,UG/L,T0607301771-1327-VX11B


In [72]:
# Create list of GAMA files to open.
gama_files = gama_path.glob('**/*{}*.zip'.format(area.lower()))

# Use list comprehension to create a list of dataframes from the files list. Uses open_table() to open the files.
gama_results_list = [cld.Sample_Data.gama_df(i) for i in gama_files]

# Concatenate the list of dataframes into one dataframe.
print('\nConcatenating gama results: \n')
gama_results = pd.concat(gama_results_list)

print("GAMA results: \n")
print(gama_results.head())

Loading GAMA file. e:\work\projects\coast_slr\scripts\slr_ground_water_quality_\gama_results\gama_ddw_sandiego_v2.zip
Loading GAMA file. e:\work\projects\coast_slr\scripts\slr_ground_water_quality_\gama_results\gama_dpr_sandiego_v2.zip
Loading GAMA file. e:\work\projects\coast_slr\scripts\slr_ground_water_quality_\gama_results\gama_dwr_sandiego_v2.zip
Loading GAMA file. e:\work\projects\coast_slr\scripts\slr_ground_water_quality_\gama_results\gama_gama_dom_sandiego_v2.zip
Loading GAMA file. e:\work\projects\coast_slr\scripts\slr_ground_water_quality_\gama_results\gama_gama_sp-study_sandiego_v2.zip
Loading GAMA file. e:\work\projects\coast_slr\scripts\slr_ground_water_quality_\gama_results\gama_gama_usgs_sandiego_v2.zip
Loading GAMA file. e:\work\projects\coast_slr\scripts\slr_ground_water_quality_\gama_results\gama_usgs_nwis_sandiego_v2.zip
Loading GAMA file. e:\work\projects\coast_slr\scripts\slr_ground_water_quality_\gama_results\gama_wb_cleanup_sandiego_v2.zip

Concatenating gama re

### Concat Sample Data

In [73]:
samples = cld.Sample_Data.concat_samples(edf_results, gama_results)
print("Samples: \n")
print(samples.head())

Concatenating GAMA and Geotracker dataframes. 

Checking for missing values. 

Creating group ID (GID). 

Samples: 

      LOGDATE     PARLABEL  PARVAL PARVQ  REPDL UNITS                WID  \
0  2001-09-13         BZME     1.0     <    1.0  UG/L  T0607302931-MW-16   
1  2001-09-12         TAME    20.0     <   20.0  UG/L   T0607301374-MW-1   
2  2001-09-12  XYLENES1314     1.0     <    1.0  UG/L   T0607302389-MW-3   
3  2001-09-10         XYLO     1.0     <    1.0  UG/L   T0607300928-MW-7   
4  2001-09-12          EBZ     1.0     <    1.0  UG/L   T0607302389-MW-3   

                               GID  
0  (T0607302931-MW-16, 2001-09-13)  
1   (T0607301374-MW-1, 2001-09-12)  
2   (T0607302389-MW-3, 2001-09-12)  
3   (T0607300928-MW-7, 2001-09-10)  
4   (T0607302389-MW-3, 2001-09-12)  


### Location Data

In [74]:
# LOAD GEO XY PATH
geo_xy_path = bp / 'geotracker_xy'


def create_geo_xy(p):  # simple function for loading gama tables

    try:

        df = pd.read_csv(p, sep='\t', lineterminator='\n', encoding='unicode_escape',
                            quotechar='"',  quoting=3,  on_bad_lines='warn')

        df['WID'] = df['GLOBAL_ID'] + '-' + df['FIELD_PT_NAME']
        columns = ['WID', 'LATITUDE', 'LONGITUDE']
        df = df[columns]

        return df

    except:
        print('Exception, no such file.')


def concat_geo_xy(files):  # function to concat gama result datasets

    df_list = []

    for i in files:
        j = create_geo_xy(i)
        if j is not None:
            df_list.append(j)

    concatDF = pd.concat(df_list, axis=0)

    for df in df_list:
        del df

    return concatDF


geo_xy_files = geo_xy_path.glob('**/*.zip')
print('Loading Geotracker XY \n')
geo_xy = concat_geo_xy(geo_xy_files)

# geo_xy_gpd = gpd.GeoDataFrame(geo_xy, geometry=gpd.points_from_xy(geo_xy.LONGITUDE, geo_xy.LATITUDE), crs='EPSG:4326')

# load GAMA XY
print('Loading GAMA XY \n')
gama_xy_path = bp / "gama_xy\gama_location_construction_v2.zip"
gama_xy = pd.read_table(gama_xy_path, sep='\t', encoding='unicode_escape')

gama_xy.rename(columns={'GM_WELL_ID': 'WID', 'GM_LATITUDE': 'LATITUDE',
                'GM_LONGITUDE': 'LONGITUDE'}, inplace=True)
gama_xy_columns = ['WID', 'LATITUDE', 'LONGITUDE']
gama_xy = gama_xy[gama_xy_columns]

# combine well location data into singular dataset
print('Combining GAMA and Geotracker XY \n')
wells = pd.concat([gama_xy, geo_xy], ignore_index=True)
wells = wells.drop_duplicates(subset='WID').dropna(subset=['LATITUDE', 'LONGITUDE'])

Loading Geotracker XY 



b'Skipping line 18923: expected 13 fields, saw 14\nSkipping line 19016: expected 13 fields, saw 14\nSkipping line 27652: expected 13 fields, saw 14\nSkipping line 27653: expected 13 fields, saw 14\nSkipping line 27654: expected 13 fields, saw 14\nSkipping line 27823: expected 13 fields, saw 14\nSkipping line 27824: expected 13 fields, saw 14\nSkipping line 27825: expected 13 fields, saw 14\nSkipping line 29300: expected 13 fields, saw 14\nSkipping line 29301: expected 13 fields, saw 14\nSkipping line 29452: expected 13 fields, saw 14\nSkipping line 29453: expected 13 fields, saw 14\nSkipping line 29454: expected 13 fields, saw 14\nSkipping line 29455: expected 13 fields, saw 14\nSkipping line 29775: expected 13 fields, saw 14\nSkipping line 30039: expected 13 fields, saw 14\nSkipping line 30280: expected 13 fields, saw 14\nSkipping line 30281: expected 13 fields, saw 14\nSkipping line 32422: expected 13 fields, saw 14\nSkipping line 32423: expected 13 fields, saw 14\nSkipping line 3242

Loading GAMA XY 



  exec(code_obj, self.user_global_ns, self.user_ns)


Combining GAMA and Geotracker XY 



### Depth to Water Data

In [75]:
# Create elev_path.
elev_path = bp / 'elevation'
print(elev_path, '\n')

# Dictionary of data types for gama_elev gama_elev for open_table().
gama_elev_dtypes = {
    'WELL NUMBER' : 'string',
    'DEPTH TO WATER' : 'float64',
    }

# Date column of gama_elev gama_elev for open_table().
gama_elev_date = ['MEASUREMENT DATE']

# Columns of gama_elev gama_elev for open_table().
gama_elev_cols = list(gama_elev_dtypes.keys()) + gama_elev_date


print('Loading GAMA groundwater elevations. \n')

# create list of files to open
gama_elev_files = elev_path.glob('**/*gama*.zip')
gama_elev_files = list(gama_elev_files)

# Use list comprehension to create a list of dataframes from the files list. Uses open_table() to open the files.
gama_elev_list = [open_table(i,dtypes = gama_elev_dtypes,date_cols = gama_elev_date, cols =gama_elev_cols) for i in gama_elev_files]
print(gama_elev_list)

# Concatenate the list of dataframes into one dataframe if there are more than one.
if len(gama_elev_list) > 1:
    gama_elev = pd.concat(gama_elev_list)

else:
    gama_elev = gama_elev_list[0]

# Dict of attributes to rename.
gama_geo_dict = {
    'WELL NUMBER' : 'WID',
    'DEPTH TO WATER' : 'DTW',
    'MEASUREMENT DATE' : 'LOGDATE',
}
# Rename columns.
gama_elev = gama_elev.rename(columns=gama_geo_dict)

# Fix column formatting.
gama_elev['LOGDATE'] = gama_elev['LOGDATE'].astype(str)
gama_elev['WID'] = gama_elev['WID'].str.replace(' ', '')

# Create GID (group id) column. GID is the WID and LOGDATE concatenated.
gama_elev['GID'] = list(zip(gama_elev['WID'], gama_elev['LOGDATE']))

e:\work\projects\coast_slr\scripts\slr_ground_water_quality_\elevation 

Loading GAMA groundwater elevations. 

[                  WELL NUMBER MEASUREMENT DATE  DEPTH TO WATER
0               17S11E22E002S       1993-10-19          100.52
1               17S11E22E002S       1994-03-14          100.41
2               17S11E22E002S       1994-10-17          100.43
3               17S11E22E002S       1995-03-29          100.36
4               17S11E22E002S       1995-10-16          100.17
...                       ...              ...             ...
4229858  SL204131495 - REW-12       2021-11-16           72.20
4229859  SL204131495 - REW-12       2021-12-07           72.33
4229860  SL204131495 - REW-14       2017-04-04             NaN
4229861  SL204131495 - REW-15       2017-04-04             NaN
4229862   SL204131495 - REW-3       2017-04-04             NaN

[4229863 rows x 3 columns]]


In [76]:
# Dictionary of data types for geo_elev geo_elev for open_table().
geo_elev_dtypes = {
    'GLOBAL_ID' : 'string',
    'FIELD_POINT_NAME' : 'string',
    'DTW' : 'float64',
    }

# Date column of geo_elev geo_elev for open_table().
geo_elev_date = ['GW_MEAS_DATE']

# Columns of geo_elev geo_elev for open_table().
geo_elev_cols = list(geo_elev_dtypes.keys()) + geo_elev_date

print('Loading Geotracker groundwater elevations. \n')

# create list of files to open
geo_elev_files = elev_path.glob('**/*Geo*.zip')
geo_elev_files = list(geo_elev_files)


# Use list comprehension to create a list of dataframes from the files list. Uses open_table() to open the files.
geo_elev_list = [open_table(i,geo_elev_dtypes,date_cols= geo_elev_date,cols =geo_elev_cols) for i in geo_elev_files]

# Concatenate the list of dataframes into one dataframe if there are more than one.
if len(geo_elev_list) > 1:
    geo_elev = pd.concat(geo_elev_list)

else:
    geo_elev = geo_elev_list[0]

# Create WID column.
geo_elev['WID'] = geo_elev['GLOBAL_ID'] + '-' + geo_elev['FIELD_POINT_NAME']

# Drop unnecessary columns.
geo_elev = geo_elev.drop(columns=['GLOBAL_ID', 'FIELD_POINT_NAME'])

# fix column formatting.
geo_elev['WID'] = geo_elev['WID'].str.replace(' ', '')

# Rename columns.
geo_elev = geo_elev.rename(columns={'GW_MEAS_DATE' : 'LOGDATE'})

# Fix column formatting.
geo_elev['LOGDATE'] = geo_elev['LOGDATE'].astype(str)

# Create GID (group id) column. GID is the WID and LOGDATE concatenated.
geo_elev['GID'] = list(zip(geo_elev['WID'], geo_elev['LOGDATE']))

Loading Geotracker groundwater elevations. 



### Unit Conversion Data

In [77]:
# Load conversion tables.
metric_conversion = pd.read_excel(bp / 'unit_conversion.xlsx', sheet_name='metric')
molar_conversion = pd.read_excel(bp / 'unit_conversion.xlsx', sheet_name='molar')

# join coversion factors to samples based on sample unit.
samples = samples.merge(metric_conversion, how='inner', left_on='UNITS', right_on='start_unit')

## **All Samples**

### Join MCL Table to Sample Results

In [78]:
# Select samples of selected contaminants 
#samples = samples[samples['PARLABEL'].isin(contaminants_3)]

print('Loading MCL table \n')

# Create path to mcl table.
mcl_path = bp / 'MCLs.xlsx'

# Open mcl table.
mcl = pd.read_excel(mcl_path,sheet_name='MCL', engine='openpyxl')

# join MCL values to sample results
print('Joining MCL values to samples \n')
samples = samples.merge(mcl, left_on='PARLABEL', right_on='chem_abrv', how='left')

Loading MCL table 

Joining MCL values to samples 



### Convert Units

In [79]:
# Create mask for samples with MCL units in UG/L and converts sample result units to UG/L.
mask = samples['units'] == 'UG/L'

# Multiply sample results by conversion factor.
samples.loc[mask, 'PARVAL'] = samples['PARVAL'] * samples['coef']

In [80]:
# Join well location data to sample results.
samples = samples.merge(wells, left_on='WID', right_on='WID', how='inner')

# Drop columns that are not needed.
samples.drop(columns=['start_unit', 'coef', 'UNITS'], inplace=True)

In [81]:
# Create exceedence attribute, true if sample result exceeds reporting limit.
samples['exceedence'] = samples['PARVAL'] > samples['comp_conc_val']

# Create magnitude attribute. Sample result value divided by the comparison concentration value (MCL or Action level) minus 1.
samples['magnitude'] = (samples['PARVAL'] / samples['comp_conc_val']) - 1

In [82]:
# samples.to_csv(results_path / '{}_all_sample_results.csv'.format(area.lower()))

### Select Specific Wells

In [83]:
# Select samples taken since 2010.
samples = samples.loc[samples['LOGDATE'] >= '2010-01-01']

samples = samples.loc[samples['PARLABEL'].isin(chems)]

# Create groups of samples based on WID and PARLABEL(contaminant label).
sample_groups = samples.groupby(['WID'])['PARLABEL'].apply(list).reset_index()

In [84]:
from collections import Counter


def select_wells(row):
    vals = row.values[1]
    counter = Counter(vals)
    print(counter)
    if len(counter) == len(chems):
        if all(values >= 4 for values in counter.values()) == True:
            print('True')
            print(row.values[0])
            print(counter.values())
            return True
        else:
            return False
    else:
        return False

# Create mask of sample groups meeting parameter requirements.
res = sample_groups.apply(select_wells, axis=1)

Counter({'BZ': 22, 'TCE': 22, 'XYLENES': 22, 'EBZ': 22, 'BZME': 22, 'PCE': 22, 'CD': 22, 'PB': 22, 'TCPR123': 21, 'EDB': 1})
Counter({'TCPR123': 2, 'CD': 1, 'XYLENES': 1, 'EDB': 1, 'BZME': 1, 'BZ': 1, 'EBZ': 1, 'PCE': 1, 'TCE': 1})
Counter({'CD': 4, 'PB': 4, 'TCPR123': 4, 'XYLENES': 1, 'TCE': 1, 'EBZ': 1, 'BZ': 1, 'BZME': 1, 'PCE': 1})
Counter({'TCPR123': 4, 'CD': 3, 'PB': 2, 'EBZ': 1, 'BZME': 1, 'BZ': 1, 'PCE': 1, 'TCE': 1, 'XYLENES': 1, 'EDB': 1})
Counter({'CD': 3, 'TCPR123': 3, 'PB': 2, 'BZ': 2, 'BZME': 2, 'TCE': 2, 'PCE': 2, 'XYLENES': 2, 'EBZ': 2, 'EDB': 1})
Counter({'TCPR123': 4, 'PB': 1, 'CD': 1, 'PCE': 1, 'BZ': 1, 'BZME': 1, 'XYLENES': 1, 'EBZ': 1, 'TCE': 1, 'EDB': 1})
Counter({'TCPR123': 4, 'CD': 1, 'EDB': 1, 'PCE': 1, 'BZ': 1, 'BZME': 1, 'XYLENES': 1, 'EBZ': 1, 'TCE': 1})
Counter({'XYLENES': 2, 'EDB': 2, 'TCPR123': 2, 'PB': 2, 'CD': 2, 'EBZ': 2, 'BZME': 2, 'BZ': 2, 'PCE': 2, 'TCE': 2})
Counter({'CD': 1, 'PB': 1, 'TCPR123': 1, 'EDB': 1, 'BZ': 1, 'BZME': 1, 'XYLENES': 1, 'EBZ':

In [85]:
# Use mask to select sample results from wells that meet parameter requirements.
samples = samples[samples['WID'].isin(sample_groups.loc[res, 'WID'])]

In [86]:
# number of unique wells
nwells = len(samples['WID'].unique())
print('Number of wells: ' + str(nwells))

Number of wells: 169


In [87]:
# Save sample results to csv.
# samples.to_csv(results_path / '{}_spec_sample_results_10.csv'.format(county.lower()))

### Join Groundwater Elevations to Sample Results

In [88]:
# Concatenate gama_results and edf_results.
dtw = pd.concat([geo_elev, gama_elev])

# List of columns that require a value.
dtw_req_cols = ['WID','DTW','LOGDATE']

# Drops rows with missing values in required columns.
dtw = dtw.dropna(subset=dtw_req_cols)

# Drop duplicate GID rows.
dtw = dtw.drop_duplicates(subset=['GID'])
samples_dtw = samples.merge(dtw, left_on=['GID'], right_on=['GID'], how='inner')

In [89]:
samples_dtw.columns

dtw_req_cols = ['LOGDATE_x', 'PARLABEL', 'PARVAL', 'WID_x', 'DTW', 'LATITUDE', 'LONGITUDE']

for i in samples_dtw.columns:
    if i not in dtw_req_cols:
        samples_dtw = samples_dtw.drop(columns=i)

print(samples_dtw.columns)

Index(['LOGDATE_x', 'PARLABEL', 'PARVAL', 'WID_x', 'LATITUDE', 'LONGITUDE',
       'DTW'],
      dtype='object')


In [90]:
chem_num = str(len(chems))

a = (len(samples))
b = (len(samples_dtw))
c =((len(samples_dtw) / len(samples)*100))
c = "{:.2f}".format(c)

print(area, chem_num,': \n')
print('There are ' + str(b) + ' samples with depth to water values.')
print("Out of " + str(a) + " samples in the original dataframe.")
print(str(c) + "% of samples. \n")

a = (len(samples['WID'].unique()))
b = (len(samples_dtw['WID_x'].unique()))
c = (b/a)*100
c = "{:.2f}".format(c)

print('There are ' + str(b) + ' wells with depth to water values.')
print("Out of " + str(a) + " wells in the original dataframe.")
print(str(c) + "% of  wells. \n")

SanDiego 10 : 

There are 26879 samples with depth to water values.
Out of 47766 samples in the original dataframe.
56.27% of samples. 

There are 124 wells with depth to water values.
Out of 169 wells in the original dataframe.
73.37% of  wells. 



In [91]:
# Save sample results to csv.
samples_dtw.to_csv(results_path / '{}_sample_results_{}.csv'.format(area.lower(), chem_num))

In [None]:
# Run from here

### Pivot table for CCME Water Quality Index

In [None]:
sample_results.rename(columns={'WID' : 'Station', 'LOGDATE' : 'Date'}, inplace=True)

sample_results['PARLABEL'] = sample_results['PARLABEL'] + '_' + sample_results['units']

pivot_table = pd.pivot_table(sample_results, index=['Station', 'Date'], columns=['PARLABEL'], values=['PARVAL'])
ccme_wqi_data = pivot_table.reset_index()

ccme_wqi_data.columns = ['Station', 'Date', 'AS_UG/L', 'BZME_UG/L', 'BZ_UG/L', 'CD_UG/L', 'DBCP_UG/L',
       'EBZ_UG/L', 'EDB_UG/L', 'MTBE_UG/L', 'NO3N_MG/L', 'PB_UG/L', 'PCE_UG/L',
       'TCE_UG/L', 'TCPR123_UG/L', 'THM_UG/L', 'XYLENES_UG/L']

ccme_wqi_data.dropna(inplace=True)

In [None]:
ccme_wqi_data.to_csv(results_path / '{}_ccme_wqi_conc_samples.csv'.format(county.lower()))

### Normalize Sample Result Values at Wells

In [None]:
# Calculates the mean of magnitudes for each WID in the exceedences dataframe.
print('Calculating magnitudes for each WID \n')
print(samples_mcl.head())

means = samples_mcl.groupby(['WID'])['magnitude'].mean()

In [None]:
# Join mean magnitudes to well locations.
print('Merging geometric mean magnitudes to wells \n')
wells = wells.merge(means, how='inner', left_on='WID', right_index=True)
wells = wells.set_index('WID').sort_index()

# Save well mean magnitudes to csv.
wells.to_csv(bp / 'wells.csv')

In [None]:
# Convert well mean magnitudes to shapefile
import geopandas as gpd

# Create geodataframe from well mean magnitudes, uses long and lat columns as xy coordinates, NAD83 projection.
gdf = gpd.GeoDataFrame(wells, geometry=gpd.points_from_xy(x=wells.LONGITUDE, y=wells.LATITUDE), crs='EPSG:4326')

# Reproject to UTM 11N.
gdf = gdf.to_crs('EPSG:26911')


gdf.to_file(results_path / 'wells.shp'.format(county))

## **Sample Groups**

### Create Sample Groups

In [None]:
# Group samples by WID and LOGDATE apply list function to get list of PARLABELS for each group.
sample_groups = samples_mcl.groupby(['WID', 'LOGDATE'])['PARLABEL'].apply(list)

### Single Contaminant List

In [None]:
# Use list comprehension to create a list of sample indexes where all contaminants in the contaminant list are present.
index_list = [i for i in sample_groups.index if all(item in sample_groups.loc[i] for item in contaminants_3)]

# Uses index_list to create a dataframe of samples that meet the criteria.
sample_group_results = samples_mcl[samples_mcl['GID'].isin(index_list)]

In [None]:
# Print groups of samples that meet the criteria.
print('Groups: ',len(index_list))
print('Samples: ',len(sample_results))

In [None]:
# Join location data to sample results.
sample_group_results = sample_results.merge(wells, left_on='WID', right_on='WID', how='inner')

# Save sample group results to csv.
sample_group_results.to_csv(bp / '{}_sample_results.csv'.format(county.lower()))

In [None]:
sample_results.to_csv(bp / '{}_sample_results.csv'.format(county.lower()))

### Contaminant Combinations

In [None]:
from itertools import combinations

# Create list of all combinations of all contaminants
combinations_list = list(combinations(contaminants_3, 10))
len(combinations_list)

In [None]:
# Function to select sample groups based on combinations of contaminants.
def get_select_samples(row, contaminants):

    # checks list of contaminants in row against list of contaminants in function call.
    # if all contaminants in row are in contaminants, return True.
    if all(item in row for item in contaminants):
        return True

    else:
        return False

In [None]:
def get_select_samples(row, contaminants):

    count = 0

    for values in row:
        print(values)
        if all(item in values for item in contaminants):
            count += 1
    return count

In [None]:
ser_dict = {}

total =  len(combinations_list)
count = 0

for contaminants in combinations_list:

    count += 1
    percent = int(((count/total)*100))

    ser = sample_groups.apply(get_select_samples, contaminants=contaminants_3)

    ser_dict[contaminants] = ser

    #print('{}%'.format(percent))
    
combo_stats = pd.DataFrame.from_dict(ser_dict, orient='index')

print(combo_stats.max())
print(list(combo_stats.idxmax()))

In [None]:
ser_dict

## **Modin Combo Stats**

In [None]:
def get_select_samples_modin(row, contaminants):
    print(row)

    if all(element in row for element in contaminants) ==  True:
        print('contains all elements')
        return True

    else:
        print('does not contain all elements')
        return False

In [None]:
combinations_list

In [None]:
samples_modin = mpd.DataFrame(samples)

In [None]:
import modin.pandas as mpd
from distributed import Client
client = Client()

sample_groups_modin = mpd.DataFrame(sample_groups)


ser_dict = {}

for contaminants in combinations_list:

    ser = sample_groups_modin.apply(get_select_samples_modin, contaminants=contaminants)
    ser = ser[ser == True]

    ser_dict[contaminants] = len(ser)



print(max(ser_dict.values()))

In [None]:
print(combo_stats.max())
print(list(combo_stats.idxmax()))

In [None]:
df[df == True]

In [None]:
df[df == True]

In [None]:
print('Loading MCL table \n')

# Create path to mcl table.
mcl_path = bp / 'MCL_list_1.xlsx'

# Open mcl table.
mcl = pd.read_excel(mcl_path, engine='openpyxl')

# join MCL values to sample results
print('Joining MCL values to samples \n')
samples_mcl = select_samples.merge(mcl, left_on='PARLABEL', right_on='chem_abrv', how='left').set_index(select_samples.index)

In [None]:
# Save samples_mcl to csv.
alt = input("Input filename ending for 'county'_select_samples_'input'.csv: ")
name = '{}_select_samples_{}.csv'.format(county.lower(), alt)
sp = bp / name
samples_mcl.to_csv(sp)

In [None]:
# Get counts of samples for each contaminant.
parlabel_stats = samples['PARLABEL'].value_counts()

# Create a dataframe with the counts of samples for each contaminant.
parlabel_stats = parlabel_stats.to_frame(name='COUNTS').reset_index().rename(columns={'index':'PARLABEL'})

# Create PERCENT column for each contaminant. Showing percent of samples for each contaminant compared to total samples.
parlabel_stats['PERCENT'] = (parlabel_stats['COUNTS'] / len(samples) * 100).round(4)

In [None]:
# Save samples_mcl to csv.
name = '{}_parlabel_stats.csv'.format(county.lower())
sp = bp / name
parlabel_stats.to_csv(sp)