In [None]:
import os, pandas as pd, slr_pkg.clean_load_data as cld, slr_pkg.para as para
from itertools import combinations
from collections import Counter
from pathlib import Path


# Base directory: the directory the notebook is run from.
bp = Path.cwd()

# Results directory. Created up front so the CSV/PNG exports further down
# do not fail when the folder does not exist yet (e.g. on a fresh checkout).
results_path = bp / "results"
results_path.mkdir(parents=True, exist_ok=True)

# Base data directory.
dp = bp / 'data'

# Sample data directories.
edf_path = dp / 'geotracker_edf_results'
gama_path = dp / 'gama_results'

# Location data directories.
geo_xy_path = dp / 'geotracker_xy'
gama_xy_path = dp / "gama_xy"

# County to gather data for. The value is used as the prefix of the input file
# ('<area>_clean_samples.csv') and of every result file, so change it here to
# analyse another county (e.g. 'LosAngeles').
area = 'Ventura'
# areas = ['LosAngeles']

In [None]:
# Contaminants of interest, as listed in slr_pkg.para.
chems = para.conts11

# Cleaned sample records for the selected county, read from the data directory.
samples = pd.read_csv(dp.joinpath('{}_clean_samples.csv'.format(area)))

In [None]:
# Preview the first rows only instead of rendering the whole table.
samples.head()

In [None]:
# Narrow the cleaned samples down to the records this analysis uses. A record
# is kept only when all three conditions hold:
#   * LOGDATE is on or after '2012-01-01'
#     (NOTE(review): this is a plain comparison against a string literal, so it
#     presumably relies on LOGDATE holding ISO 'YYYY-MM-DD' text -- confirm),
#   * the well class is 'MW' or 'MONITORING' (monitoring wells),
#   * the measured parameter is one of the contaminants in `chems`.
since_2012 = samples['LOGDATE'] >= '2012-01-01'
monitoring_well = samples['FIELD_PT_CLASS'].isin(['MW', 'MONITORING'])
target_chem = samples['PARLABEL'].isin(chems)
spec_samples = samples.loc[since_2012 & monitoring_well & target_chem].copy()

# Report the five contaminants with the most remaining samples.
counter = Counter(spec_samples['PARLABEL'])
print(counter.most_common(5))

In [None]:
def select_wells(row, c, min_samples=4):
    """Return the well ID when the well has enough samples of every contaminant.

    Parameters:
        row: mapping-like record with a 'WID' entry (the well ID) and a
            'PARLABEL' entry (an iterable of contaminant labels, one per sample).
        c: the contaminant combination being tested. Only ``len(c)`` is used:
            the labels in ``row['PARLABEL']`` are expected to be restricted to
            ``c`` already, so "as many distinct labels as ``len(c)``" means
            every contaminant of the combination is present.
        min_samples: minimum number of samples required for each distinct
            label (defaults to 4, the previously hard-coded threshold).

    Returns:
        ``row['WID']`` when the number of distinct labels equals ``len(c)`` and
        each label occurs at least ``min_samples`` times; otherwise ``None``.
    """
    label_counts = Counter(row['PARLABEL'])

    # Some contaminant of the combination was never sampled at this well.
    if len(label_counts) != len(c):
        return None

    # At least one contaminant has too few samples.
    if any(n < min_samples for n in label_counts.values()):
        return None

    return row['WID']

In [None]:
from collections import namedtuple

# Per-combination outcome: number of qualifying wells and number of their samples.
result = namedtuple('result', ['wells', 'samples'])

In [None]:
# For every combination of 4 to 11 contaminants, record how many wells have at
# least `min_samples` samples of *each* contaminant in the combination, and how
# many rows of `spec_samples` belong to those wells.
#
# The per-well, per-contaminant sample counts do not depend on the combination,
# so they are computed once here instead of re-filtering and re-grouping the
# whole frame for each of the ~1800 combinations. A well qualifies under the
# same rule `select_wells` applies: every contaminant present, each >= 4 times.
min_samples = 4

# Rows = wells, columns = contaminants, values = number of samples (0 if none).
label_counts = (
    spec_samples.groupby(['WID', 'PARLABEL']).size()
    .unstack(fill_value=0)
    .reindex(columns=list(chems), fill_value=0)
)

# Total number of rows per well (all contaminants in `spec_samples`, not only
# those of the combination -- this matches how the sample count was defined).
rows_per_well = spec_samples.groupby('WID').size()

c_dict = {}

for i in range(4, 12):
    for c in combinations(chems, i):
        enough = (label_counts[list(c)] >= min_samples).all(axis=1)
        good_wells = enough.index[enough.to_numpy()]
        c_dict[str(sorted(c))] = result(
            wells=len(good_wells),
            samples=int(rows_per_well.loc[good_wells].sum()),
        )

In [None]:
# One row per contaminant combination: the key text goes to 'contaminants',
# the tuple fields become the 'wells' and 'samples' columns.
cdf = (
    pd.DataFrame.from_dict(c_dict, orient='index')
    .reset_index()
    .rename(columns={'index': 'contaminants'})
)

# Combination size = number of comma-separated pieces in the key text.
cdf['c_num'] = cdf['contaminants'].map(lambda label: label.count(',') + 1)

# Combinations covering the most wells first, with a fresh 0..n-1 index.
cdf = cdf.sort_values(by='wells', ascending=False).reset_index(drop=True)

In [None]:
# Well-class tag shown in the printed header and used in result file names.
spec = 'MW'

# Header line "<area>, <spec>: " followed by the full combination table.
print(', '.join([area, spec]) + ': ')
print(cdf)

In [None]:
# Save the full combination table. The well-class tag comes from `spec`, like
# the other result files, instead of being hard-coded in the file name.
cdf.to_csv(results_path / '{}_{}_contaminant_combos.csv'.format(area, spec))

In [None]:
# Per combination size: the largest well count and the largest sample count.
# The two maxima are taken independently of each other; 'c_num' is turned back
# into an ordinary column.
gbo = cdf.groupby(['c_num'])[['wells', 'samples']].max().reset_index()

In [None]:
# For every combination size keep the combination(s) that cover the most wells,
# breaking ties by the largest sample count.
#
# Notes on the implementation:
#   * The loop values are read by column name from a named tuple, and are not
#     bound to module-level names, so the `samples` DataFrame loaded earlier is
#     no longer overwritten by an integer.
#   * `gbo` holds the per-size maxima of wells and samples independently; the
#     largest sample count may belong to a combination with fewer wells.
#     Filtering on both maxima at once would then select nothing for that size,
#     so samples are ranked within the max-wells combinations instead. When one
#     combination holds both maxima the selected rows are the same as before.
#   * Pieces are collected in a list and concatenated once (DataFrame.append
#     is not available in pandas >= 2.0).
best_per_size = []
for size_max in gbo.itertuples(index=False):
    print(size_max.c_num, size_max.wells, size_max.samples)
    most_wells = cdf[(cdf['c_num'] == size_max.c_num) & (cdf['wells'] == size_max.wells)]
    best_per_size.append(most_wells[most_wells['samples'] == most_wells['samples'].max()])

maxdf = pd.concat(best_per_size) if best_per_size else pd.DataFrame()


In [None]:
# Save the best combination(s) per size, e.g. results/Ventura_MW_contaminant_combos_max.csv.
maxdf.to_csv(
    results_path.joinpath('{}_{}_contaminant_combos_max.csv'.format(area, spec))
)

In [None]:
# Run all of the cells above before continuing; the cells below use their results.

# Visualization

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Static overview figure: every contaminant combination is drawn twice against
# its well count -- once by sample count (left axis, red) and once by the
# number of contaminants in the combination (right axis, blue, inverted).
x = cdf['wells']
y = cdf['samples']
z = cdf['c_num']

fig, ax1 = plt.subplots(figsize=(10,10), dpi=200)
p1 = ax1.scatter(x, y, color='red')
ax1.set_xlabel('Wells', fontsize=15)
ax1.set_ylabel('Samples', fontsize=15)
ax1.set_title('{}: Wells vs. Samples and Contaminants'.format(area), fontsize=20)
ax1.grid()

# Second y-axis sharing the same x-axis, flipped so small combinations sit on top.
ax2 = ax1.twinx()
p2 = ax2.scatter(x, z, marker='.', color='blue')
ax2.set_ylabel('Contaminants', fontsize=15)
ax2.invert_yaxis()

# One legend for both point sets (they live on different axes).
ax1.legend(handles=[p1, p2], labels=['Samples', 'Contaminants'], loc='lower right')
fig.savefig(results_path / '{}_{}_contaminant_combos.png'.format(area, spec))

In [None]:
# Write the per-size maxima of every column to '<area>_c_groups.csv'.
# Each column's maximum is taken on its own, so the values in one output row
# need not come from the same combination (for the text column 'contaminants'
# the maximum is simply the greatest string).
(
    cdf
    .groupby('c_num')
    .max()
    .to_csv(results_path.joinpath('{}_c_groups.csv'.format(area)))
)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# Data for the interactive figure: wells on x, samples on the left y-axis and
# combination size on the right y-axis.
x, y, y2 = cdf['wells'], cdf['samples'], cdf['c_num']

# Left axis: samples per combination.
fig, ax1 = plt.subplots(figsize=(10, 10))
line1 = ax1.scatter(x, y)
ax1.grid(True)

# Right axis (shared x): number of contaminants per combination, in green.
ax2 = ax1.twinx()
line2 = ax2.scatter(x, y2, color='green')
ax2.tick_params(axis='y', labelcolor='green')

# Lookup tables used by the hover handler: axes -> plotted points, and
# axes -> its (initially hidden) annotation box.
line_dic = {ax1: line1, ax2: line2}
annot_dic = {}
for ax in (ax1, ax2):
    annot = ax.annotate(
        "",
        xy=(0, 0),
        xytext=(-20, 20),
        textcoords="offset points",
        bbox=dict(boxstyle="round", fc="w", alpha=0.4),
        arrowprops=dict(arrowstyle="->"),
    )
    annot.set_visible(False)
    annot_dic[ax] = annot
annots = list(annot_dic.values())

def update_annot(line, annot, ind):
    """Point ``annot`` at the hovered data point of ``line`` and label it.

    Parameters:
        line: the hovered artist. Either a scatter collection (exposes
            ``get_offsets()``, a sequence of (x, y) pairs) or a line artist
            (exposes ``get_data()``, returning the x and y sequences).
        annot: the annotation to update; its ``xy`` and text are overwritten.
        ind: the details dict returned by ``artist.contains(event)``;
            ``ind["ind"][0]`` is the index of the first point under the cursor.
    """
    first_hit = ind["ind"][0]

    if hasattr(line, "get_offsets"):
        # scatter() returns a PathCollection, which has no get_data(); its
        # point coordinates are available through get_offsets() instead.
        px, py = line.get_offsets()[first_hit]
    else:
        xs, ys = line.get_data()
        px, py = xs[first_hit], ys[first_hit]

    annot.xy = (px, py)
    annot.set_text("x = {}\ny= {}".format(px, py))

def hover(event):
    """Show or hide the point annotations as the pointer moves over the figure.

    Does nothing unless the event happened inside ``ax1`` or ``ax2``. Otherwise,
    for each of the two axes: when the pointer is over one of that axes'
    plotted points, its annotation is updated and shown; when it is not and the
    annotation is currently visible, the annotation is hidden. The canvas is
    redrawn only when something changed.
    """
    if event.inaxes not in (ax1, ax2):
        return

    for axes in (ax1, ax2):
        points = line_dic[axes]
        label = annot_dic[axes]
        hit, details = points.contains(event)
        if hit:
            update_annot(points, label, details)
            label.set_visible(True)
            fig.canvas.draw_idle()
        elif label.get_visible():
            label.set_visible(False)
            fig.canvas.draw_idle()

# Call `hover` on every pointer movement over the figure, then render it.
# NOTE(review): motion events are only delivered by an interactive backend;
# with a static one such as `%matplotlib inline` the figure is presumably shown
# as an image and the handler never fires -- confirm which backend is intended.
hover_cid = fig.canvas.mpl_connect("motion_notify_event", hover)

plt.show()