# EJP Common template - Data Query Module (v0.6.5)
Upload the database and then use tabs to filter the data, make queries and generate different export files.

In [1]:
# needed import
import pandas as pd
import numpy as np
import time
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
from matplotlib.collections import PolyCollection
import matplotlib.dates as mdates
import tempfile
import requests
import os
#from zipfile import ZipFile
from urllib.parse import parse_qs
import base64
#from upsetplot import UpSet  # graph with interactions
import warnings
warnings.filterwarnings('ignore')

from ipyleaflet import Map, Marker, Icon

from ipywidgets import (FileUpload, Button, Output, Dropdown, RadioButtons,
                        SelectMultiple, VBox, HBox, Layout, Checkbox, Label, Text,
                       Tab, FloatRangeSlider, Tab, IntText, FloatText)
from IPython.display import FileLink, HTML, Markdown

# Sheet names of the template workbook.
# NOTE(review): presumably the expected sheet order of the Excel template; this list is not
# referenced in the visible part of the notebook — confirm against its users.
sheetNames = [
    'experiment', 'reference', 'treatment', 'soil-type', 'tillage', 'crops',
    'amendment', 'irrigation', 'pest-weed', 'grazing',
    'soil-crop-measurement', 'data-crop', 'data-soil', 'dropDownList']

# pandas dtype to use for the identifier columns.
# NOTE(review): presumably intended as the `dtype=` argument when reading sheets so that
# numeric-looking IDs stay text; readExcel() as visible here does not pass it — confirm.
dtypes = {
    'Experiment ID': 'string',
    'Treatment ID': 'string',
    'Reference treatment': 'string',
}

def tofloat(x):
    """Convert `x` to float, returning NaN when the conversion is not possible.

    Parameters
    ----------
    x : any
        Value to convert (number, numeric string, None, ...).

    Returns
    -------
    float
        `float(x)` or `np.nan` if `x` cannot be converted.
    """
    try:
        return float(x)
    except (TypeError, ValueError, OverflowError):
        # only conversion failures are mapped to NaN; a bare `except` would also
        # swallow KeyboardInterrupt/SystemExit and make the kernel hard to interrupt
        return np.nan

def camelCase(s):
    """Convert a whitespace-separated label to camelCase.

    The first word is lower-cased and every following word is capitalized with
    `str.capitalize` (first letter upper-case, the rest lower-case), e.g.
    'Experiment ID' -> 'experimentId'.

    Parameters
    ----------
    s : str
        Label to convert.

    Returns
    -------
    str
        The camelCase version of `s`; an empty string if `s` contains no word.
    """
    words = s.split()
    if not words:
        # empty or whitespace-only label: nothing to convert (avoids IndexError)
        return ''
    return words[0].lower() + ''.join(w.capitalize() for w in words[1:])

def underCase(s):
    """Convert a whitespace-separated label to lower-case words joined by '_'.

    Example: 'Tillage system' -> 'tillage_system'.
    """
    return '_'.join(map(str.lower, s.split()))

def dicCopy(dic):
    """Return a new dictionary holding a `.copy()` of every value of `dic`.

    Keys are shared; each value is duplicated through its own `copy()` method,
    so the values must provide one.
    """
    return {name: value.copy() for name, value in dic.items()}

In [2]:
# df1 = pd.DataFrame({
#     'Sampling year': [np.nan, np.nan, np.nan],
#     'SOC': [0.4, 0.2, 0.1],
# })
# df2 = pd.DataFrame({
#     'Sampling year': [np.nan, np.nan, np.nan],
#     'Yield': [1245, 564, 2345]
# })
# display(df1)
# display(df2)
# pd.merge(df1, df2, on='Sampling year', how='outer')

In [3]:
# df = pd.DataFrame(np.random.rand(1000, 100))
# fname = 'df.csv'
# df.to_csv(fname)
# FileLink(fname)

In [4]:
# Kind of data stored in each column of interest, per sheet:
#   {sheet name: {column name: 'number' | 'choice' | 'string'}}
# The keys of this dictionary are also used to populate the sheet dropdowns of the UI
# (`list(dbdic.keys())`, sometimes sliced by position), so the ORDER of the sheets below
# matters: do not reorder or insert sheets without checking those slices.
dbdic = {
    'experiment': {
        'Latitude': 'number',
        'Longitude': 'number',
        'Country': 'choice',
        'Land use prior experiment': 'choice',

    },
    'reference': {
        'Publication type': 'choice',
        'Publication first author': 'string',
        'Publication year': 'number',
        'Publication title': 'string',
        'Publication journal': 'string'
    },
    'soil-type': {
        'Top depth of layer': 'number',
        'Bottom depth of layer': 'number',
        'Clay (< 0.002 mm)': 'number',
        'Silt (0.002 - 0.05 mm)': 'number',
        'Sand (0.05 - 2 mm)': 'number',
        'Gravel (> 2 mm)': 'number',
        'Soil texture USDA': 'choice',
        'Soil group WRB': 'choice',
        'Soil group WRB qualifier': 'choice',
        'Soil group WRB specifier': 'choice',
        'Soil type USDA': 'choice',
        'Soil type USDA qualifier': 'choice',
    },
    'treatment': {
        'Land use': 'choice',
        'Year started': 'number',
        'Year ended': 'number',
        'Crop rotation': 'choice'
    },
    'tillage': {
        'Rotation': 'choice',
        'Tillage system': 'choice',
        'Tillage method': 'choice',
        'Tillage depth': 'number',
        'Permanent soil area covered by residues or crops': 'choice',
        'Tillage period': 'choice',
    },
    'crops': {
        'Rotation': 'choice',
        'Crop type': 'choice',
        'Cropping system': 'choice',
        'Crop': 'choice',
        'Harvesting/Termination method': 'choice',
        'Harvesting frequency': 'number',
        'Sowing period': 'choice',
        'Harvesting/Termination period': 'choice',
        'Residues removal': 'choice',
        'Residues incorporation': 'choice',
        'Residues burning': 'choice'
    },
    'amendment': {
        'Rotation': 'choice',
        'Type of fertilizer/amendment': 'choice',
        'Fertilizer/Amendment application rate': 'number',
        'Fertilizer/Amendment application rate units': 'choice',
        'Fertilizer/Amendment application method': 'choice',
        'Amendment water content': 'number',
        'Amendment C': 'number',
        'Amendment N': 'number',
        'Amendment P': 'number',
        'Amendment K': 'number',
    },
    'irrigation': {
        'Rotation': 'choice',
        'Irrigation method': 'choice',
        'Amount of water': 'number',
        'Irrigation frequency': 'number',
        'Irrigation water': 'choice',
        'Drainage system': 'choice',
        'Drainage spacing': 'number',
        'Drainage depth': 'number',
    },
    'data-crop': {
        'Rotation': 'choice',
        'Sampling year': 'number',
        'Harvested yield': 'number',
        'Harvested yield water content': 'choice',
        'Harvested yield water content amount': 'number',
        'Residue above-ground': 'number',
        'Residue stubble': 'number',
        'Residue roots': 'number',
        'Residue sampling method': 'string',
        'Below-ground sampling depth': 'number',
    },
    'data-soil': {
        'Rotation': 'choice',
        'Sampling year': 'number',
        'Depth from': 'number',
        'Depth to': 'number',
        'Time-serie available': 'choice',
        'SOC conc': 'number',
        'SOC conc SD': 'number',
        'SOC conc SE': 'number',
        'SOC conc nb samples': 'number',
        'Analysis method': 'choice',
        'Bulk density': 'number',
        'Bulk density method': 'choice',
        'Bulk density SD': 'number',
        'Bulk density SE': 'number',
        'Bulk density nb samples': 'number',
        'SOC stock': 'number',
        'SOC stock SD': 'number',
        'SOC stock SE': 'number',
        'SOC stock nb samples': 'number',
        'pH': 'number',
        'pH method': 'choice'
    }
}

In [5]:
def readExcel(fname):
    """Read the template workbook and return its sheets and its version.

    Parameters
    ----------
    fname : str, bytes or file-like
        - a string starting with 'http' is treated as a Google Sheets URL: its last
          path segment is replaced by 'export?format=xlsx' and the workbook is downloaded;
        - any other string is the path of a local Excel file;
        - anything else (raw bytes, an already opened binary buffer) is used as the
          workbook content itself.

    Returns
    -------
    dfdic : dict of pandas.DataFrame
        One dataframe per sheet (rows 0, 1 and 3 of every sheet are skipped).
    version : object
        Content of the cell at row 0, column 2 of the 'README' sheet.

    Raises
    ------
    requests.HTTPError
        If the download of the workbook fails.
    """
    import io  # local import: only needed to wrap raw bytes for pandas

    t0 = time.time()
    print('Reading in Excel file...', end='')
    # check the type first: slicing a non-string (e.g. a file-like object) would fail
    if isinstance(fname, str) and fname[:4] == 'http':  # it's a google sheet url
        url = '/'.join(fname.split('/')[:-1] + ['export?format=xlsx'])
        response = requests.get(url, timeout=60)
        # fail here with a clear HTTP error instead of feeding an error page to pandas
        response.raise_for_status()
        data = response.content
    elif isinstance(fname, str):
        with open(fname, 'rb') as f:
            data = f.read()
    else:
        data = fname

    # pandas expects a path or a buffer; passing raw bytes is deprecated
    if isinstance(data, (bytes, bytearray)):
        data = io.BytesIO(data)

    # open the workbook once and reuse it for both reads
    with pd.ExcelFile(data) as workbook:
        dfdic = pd.read_excel(workbook, sheet_name=None, skiprows=[0, 1, 3])
        version = pd.read_excel(workbook, sheet_name='README', header=None).iloc[0, 2]

    # convert potential bool columns to string (for tasks for WP2 for eg)
    # the first two sheets and the last one are left untouched
    for sheet in list(dfdic.keys())[2:-1]:
        sheetDtypes = dfdic[sheet].dtypes  # not named `dtypes` to avoid shadowing the global
        bcols = dfdic[sheet].columns[sheetDtypes == 'bool']
        if len(bcols) > 0:
            # replace the columns (instead of writing into them with .loc) so that
            # their dtype really becomes object/str
            dfdic[sheet][bcols] = dfdic[sheet][bcols].astype(str)

    print('done ({:.2f}s)'.format(time.time() - t0))
    return dfdic, version

#dfdic, version = readExcel('https://docs.google.com/spreadsheets/d/1JA0EttKnzyEUDM0FLxwLMTQA0k0zBdGd/edit')
#dfdic, version = readExcel('../../carboseq-wp2-db-levels.xlsx')

In [6]:
def getOptions(dfdic, sheet, onlyObject=True, rot=True):
    """Return the names of the columns of `sheet` that can be proposed as options.

    Columns whose name ends with ' ID' or ' (comment)' and the column
    'Treatment definition' are never returned.

    Parameters
    ----------
    dfdic : dict of pandas.DataFrame
        Database, one dataframe per sheet.
    sheet : str
        Name of the sheet to inspect.
    onlyObject : bool, optional
        If True (default), only columns with dtype 'object' are returned.
    rot : bool, optional
        If False, the 'Rotation' column is excluded as well.

    Returns
    -------
    list
        Column names, in the order of the dataframe.
    """
    table = dfdic[sheet]
    names = table.columns

    # columns which are never an option
    isIdCol = names.str.slice(-3, None) == ' ID'
    isCommentCol = names.str.slice(-10, None) == ' (comment)'
    isDefinitionCol = names == 'Treatment definition'
    keep = ~(isIdCol | isCommentCol | isDefinitionCol)

    if onlyObject:
        keep = keep & (table.dtypes == 'object').to_numpy()
    if rot is False:
        keep = keep & (names != 'Rotation')
    return names[keep].tolist()

#getOptions(dfdic, 'crops')

In [7]:
def showMap(dfdic):
    """Display an interactive map with one marker per row of the 'experiment' sheet.

    Parameters
    ----------
    dfdic : dict of pandas.DataFrame
        Database; the 'experiment' dataframe must contain the columns
        'Latitude', 'Longitude' and 'Experiment ID'.

    Notes
    -----
    Rows without a latitude or a longitude are skipped as they cannot be placed on
    the map. Rows are iterated by position, so the dataframe does not need a default
    0..n-1 index.
    """
    # NOTE(review): longitudes (map center and markers) are shifted by 360 degrees;
    # the shift is applied consistently here — confirm why it is needed.
    m = Map(
        center=(52.204793, 360.121558),
        zoom=4
    )
    df = dfdic['experiment']
    for lat, lon, expid in zip(df['Latitude'], df['Longitude'], df['Experiment ID']):
        if pd.isna(lat) or pd.isna(lon):
            continue  # no usable coordinates for this experiment
        marker = Marker(location=(lat, 360+lon),
                        title=str(expid),
                        draggable=False)
        m.add_layer(marker)
    display(m)
#showMap(dfdic)

In [8]:
# filter experiments
class RowFilter(object):
    """One row of the filter panel: sheet dropdown, column dropdown, value widget, remove button.

    The row inserts itself in `filters` just before its last child (the row holding
    the buttons and the log). The value widget depends on the dtype of the selected column:
    - numeric column: two FloatText (min and max), initialised to the column range
    - object/bool column: a 'Select all' checkbox followed by one checkbox per unique value

    Parameters
    ----------
    filters : ipywidgets.Box
        Container of the filter rows; its last child is kept last.
    dfdic : dict of pandas.DataFrame
        Database, one dataframe per sheet.
    """
    def __init__(self, filters, dfdic):
        self.sheetDropdown = Dropdown(options=list(dbdic.keys()), layout=Layout(width='15%'))
        self.sheetDropdown.observe(self.sheetDropdownFunc, names='value')
        self.colDropdown = Dropdown(options=getOptions(dfdic, 'experiment', False), layout=Layout(width='15%'))
        self.colDropdown.observe(self.colDropdownFunc, names='value')
        self.rmBtn = Button(description='Remove')
        self.rmBtn.on_click(self.rmBtnFunc)
        self.hbox = HBox([self.sheetDropdown, self.colDropdown, self.rmBtn], layout=Layout(display='flex'))
        self.filters = filters
        # position of the row when it was created; it becomes stale as soon as another
        # row is removed, so it is kept for information only and not used for removal
        self.index = len(self.filters.children) - 1
        self.filters.children = self.filters.children[:-1] + (self.hbox, self.filters.children[-1])
        self.dfdic = dfdic
        self.opts = None
        self.buildOptions('experiment', 'Latitude')
        self.colDropdown.value = 'Latitude'

    def sheetDropdownFunc(self, a):
        """Update the list of columns when another sheet is selected."""
        sheet = a['new']
        # list all columns (numeric ones too), consistently with __init__
        self.colDropdown.options = getOptions(self.dfdic, sheet, False)
        # the column observer only fires when the selected column name changes, so
        # rebuild explicitly in case the same name is selected in the new sheet
        col = self.colDropdown.value
        if col is not None:
            self.buildOptions(sheet, col)

    def colDropdownFunc(self, a):
        """Rebuild the value widget when another column is selected."""
        sheet = self.sheetDropdown.value
        col = a['new']
        self.buildOptions(sheet, col)

    def buildOptions(self, sheet, col):
        """Create the value widget for `col` of `sheet` and place it before the remove button."""
        typ = self.dfdic[sheet][col].dtype
        if typ != 'object' and typ != 'bool':
            # numeric column: range initialised with the extent of the data
            vmin = self.dfdic[sheet][col].min()
            vmax = self.dfdic[sheet][col].max()
            self.opts = HBox([Label('min:'), FloatText(value=vmin, layout=Layout(width='30%')),
                              Label('max:'), FloatText(value=vmax, layout=Layout(width='30%'))])
        else:
            # categorical column: one checkbox per unique value, all ticked
            choices = self.dfdic[sheet][col].dropna().unique()
            checkboxes = [Checkbox(value=True, description=a, indent=False)
                                  for a in choices]
            def checkallFunc(a):
                for c in checkboxes:
                    c.value = a['new']
            checkall = Checkbox(value=True, description='Select all', indent=False)
            checkall.observe(checkallFunc, names='value')
            self.opts = VBox([checkall] + checkboxes, layout=Layout(width='40%'))

        if len(self.hbox.children) > 3:
            # a value widget is already there: replace it
            self.hbox.children = self.hbox.children[:-2] + (self.opts, self.hbox.children[-1])
        else:
            # first build: insert the value widget before the remove button
            self.hbox.children = self.hbox.children[:-1] + (self.opts, self.hbox.children[-1])

    def rmBtnFunc(self, a):
        """Remove this row from the filter panel."""
        # remove by identity: the position of the row changes when other rows are
        # removed, so an index stored at creation could delete the wrong child
        self.filters.children = tuple(
            child for child in self.filters.children if child is not self.hbox)

def buildFilters():
    """Build the filtering panel and return it.

    The panel is a VBox whose last child holds the 'Add filter', 'Apply filters' and
    'Reset filters' buttons and a log output; the filter rows (RowFilter) are inserted
    before it. Applying the filters replaces the global `dfdic` by a copy of the
    original database `dfdico` restricted to the experiments satisfying every row.

    Returns
    -------
    ipywidgets.VBox
        The filtering panel.
    """
    global dfdic, dfdico, out

    filterLog = Output()

    def addFilterBtnFunc(b):
        # the new row registers itself in `filters` on creation
        RowFilter(filters, dfdic)

    def filterDataBtnFunc(a):
        global dfdic, dfdico, expOut
        expOut.clear_output()  # exports of a previous selection are cleared too
        dfdic = dicCopy(dfdico)  # the filters always apply to the original database
        expids = pd.Series(dfdic['experiment']['Experiment ID'].unique())
        nexp = len(expids)

        # every row but the last child (buttons + log) is a filter
        for filterRow in filters.children[:-1]:
            sheet = filterRow.children[0].value
            col = filterRow.children[1].value
            values = dfdic[sheet][col]
            valueWidget = filterRow.children[2]
            if values.dtype != 'object':
                # numeric column: keep the rows inside [min, max]
                lower = valueWidget.children[1].value
                upper = valueWidget.children[3].value
                selected = (values >= lower) & (values <= upper)
            else:
                # categorical column: keep the ticked values
                # (the first checkbox is 'Select all', hence skipped)
                ticked = [box.description for box in valueWidget.children[1:]
                          if box.value is True]
                selected = values.isin(ticked)
            matching = dfdic[sheet][selected]['Experiment ID'].unique()
            expids = expids[expids.isin(matching)]

        # restrict every sheet with an 'Experiment ID' column to the retained experiments
        expids = expids.tolist()
        for sheet, table in dfdic.items():
            if 'Experiment ID' in table.keys():
                retained = table['Experiment ID'].isin(expids)
                dfdic[sheet] = table[retained].reset_index(drop=True)

        filterLog.clear_output()
        with filterLog:
            print('Experiment ID retained: {:d}/{:d}'.format(len(expids), nexp))
            # the map of tab1 cannot be updated as it is not visible (tiles would not
            # all load), but a new map can be drawn in the current tab
            showMap(dfdic)

    def resetFilterBtnFunc(a):
        # drop every filter row, then re-apply (no filter) to restore the global dfdic
        filters.children = (filters.children[-1],)
        filterDataBtnFunc(42)

    addFilterBtn = Button(description='Add filter')
    addFilterBtn.on_click(addFilterBtnFunc)
    filterDataBtn = Button(description='Apply filters')
    filterDataBtn.on_click(filterDataBtnFunc)
    resetFilterBtn = Button(description='Reset filters')
    resetFilterBtn.on_click(resetFilterBtnFunc)

    buttonBar = HBox([addFilterBtn, filterDataBtn, resetFilterBtn])
    filters = VBox([VBox([buttonBar, filterLog])])
    return filters

# filters = buildFilters()
# filters

In [9]:
# interactive histogram view
# class ExploratoryHist(object):
#     def __init__(self, dfdic):
#         self.sheetDropdown = Dropdown(options=list(dbdic.keys()), layout=Layout(width='25%'))
#         self.sheetDropdown.observe(self.sheetDropdownFunc, names='value')
#         self.colDropdown = Dropdown(options=list(dbdic['experiment'].keys()), layout=Layout(width='35%'))
#         self.colDropdown.observe(self.colDropdownFunc, names='value')
#         self.vminText = FloatText(layout=Layout(width='15%'))
#         self.vmaxText = FloatText(layout=Layout(width='15%'))
#         self.nbins = IntText(layout=Layout(width='10%'))
#         self.dfdic = dfdic
#         self.showLog = Checkbox(value=True, description='Show values head/tail', indent=False)
#         self.out = Output()
#         self.log = Output()
#         self.header = HBox([self.sheetDropdown, self.colDropdown])
#         self.numberOption = HBox([Label('Vmin:'), self.vminText,
#                                   Label('Vmax:'), self.vmaxText,
#                                   Label('Nb. bins:'), self.nbins])
#         self.hbox = HBox([VBox([self.header, self.out], layout=Layout(width='60%')),
#                           VBox([self.showLog, self.log])])
#         self.bins = None
#         self.sheet = 'experiment'
#         self.col = 'Latitude'
#         self.colDropdownFunc({'new': 'Latitude'})
#         #self.buildFigure('experiment', 'Latitude')

#     def sheetDropdownFunc(self, a):
#         self.sheet = a['new']
#         self.colDropdown.options = list(dbdic[a['new']].keys())

#     def colDropdownFunc(self, a):
#         self.col = a['new']
#         typ = dbdic[self.sheet][self.col]
#         if typ == 'number':
#             self.vminText.unobserve_all()
#             self.vmaxText.unobserve_all()
#             self.nbins.unobserve_all()
#             _, bins = np.histogram(self.dfdic[self.sheet][self.col].dropna())
#             self.vminText.value = np.round(bins[0], 2)
#             self.vmaxText.value = np.round(bins[-1], 2)
#             self.nbins.value = len(bins)
#             self.vminText.observe(self.updateFunc, names='value')
#             self.vmaxText.observe(self.updateFunc, names='value')
#             self.nbins.observe(self.updateFunc, names='value')
#             self.bins = bins
#             self.hbox.children[0].children = (self.header, self.numberOption, self.out)
#             self.buildFigure()
#         else:
#             self.hbox.children[0].children = (self.header, self.out)
#             self.buildFigure()

#     def updateFunc(self, a):
#         vmin = self.vminText.value
#         vmax = self.vmaxText.value
#         nbins = self.nbins.value
#         self.bins = np.linspace(vmin, vmax, nbins)
#         self.buildFigure()

#     def buildFigure(self):
#         self.out.clear_output()
#         self.log.clear_output()
#         with self.log:
#             if self.showLog.value is True:
#                 if self.dfdic[self.sheet].shape[0] > 10:
#                     print(self.dfdic[self.sheet].sort_values(self.col).reset_index().iloc[np.r_[0:5, -5:0]][self.col])
#                 else:
#                     print(self.dfdic[self.sheet][self.col].sort_values(self.col).reset_index())
#         with self.out:
#             typ = dbdic[self.sheet][self.col]
#             if typ == 'number':
#                 self.dfdic[self.sheet][self.col].apply(tofloat).plot.hist(bins=self.bins)
#                 plt.xlabel(self.col)
#             else:
#                 self.dfdic[self.sheet][self.col].value_counts().plot(kind='bar', xlabel=self.col)
#                 plt.ylabel('Count')
#             plt.show() # needed, otherwise, graph won't change

#expHist = ExploratoryHist(dfdic)
#display(expHist.hbox)

In [10]:
# extract treatments (advanced)
# Pairwise strategy: each query describes how to pair a control treatment with
# other treatments of the same experiment. Keys of a query:
# - 'sheet', 'column': where the practice of interest is stored
# - 'value': the value of 'column' which defines the control
# - 'occurence' (of the value):
#     - -1: neglect this parameter and compare treatments with the value (control) vs
#           treatments with another value
#     - 0: compare 0 occurrence (control) vs > 0 occurrence (at least one occurrence)
#     - 1: compare 1 occurrence (control) vs > 1 occurrence (hence neglecting the 0 occurrence);
#          this comparison is done inside a rotation year
# - 'conditions': list of (sheet, column, value):
#     - column of the query sheet with a value: only the rows where this column has this
#       value are considered in the comparison
#     - column with None: two rows are only compared if the values in this column are
#       identical for both of them (the column may belong to another sheet)
# - inclusion of 'Rotation' in the conditions or not:
#     - included: the comparison is done if all the years in the rotation meet the conditions
#     - not included: the comparison is done if at least one year in the rotation meets the conditions

queries = {
    'different tillage': {
        'sheet': 'tillage',
        'column': 'Tillage system',
        'value': 'Zero tillage',
        'occurence': -1,
        'conditions': [('crops', 'Cropping system', None)],
    },
    'cover crop vs no cover crops': {
        'sheet': 'crops',
        'column': 'Crop type',
        'value': 'Cover crop',
        'occurence': 0,
        'conditions': [('crops', 'Cropping system', None),
                       ('tillage', 'Tillage system', None)],
    },
    'one main crop vs double cropping': {
        'sheet': 'crops',
        'column': 'Crop type',
        'value': 'Main crop',
        'occurence': 1,
        'conditions': [('crops', 'Cropping system', 'Monoculture'),
                       ('crops', 'Rotation', None)]
    },
    # 'cover crop X vs cover crops Y': {
    #     'sheet': 'crops',
    #     'column': 'Crop',
    #     'value': '>> Clover/Trifolium sp.',
    #     'occurence': -1,
    #     'conditions': [('crops', 'Crop type', 'Cover crop'),
    #                    ('crops', 'Rotation', None)]
    # },
    # 'different termination method': {
    #     'sheet': 'crops',
    #     'column': 'Harvesting/Termination method',
    #     'value': '> Frost-killed',
    #     'occurence': -1,
    #     'conditions': [('crops', 'Crop type', 'Cover crop')]
    # },
    # 'different growing period': {
    #     'sheet': 'crops',
    #     'column': 'Sowing period',
    #     'value': 'March',
    #     'occurence': -1,
    #     'conditions': [('crops', 'Crop type', 'Cover crop')]
    # },
    'residues incorporated vs residue left on surface': {
        'sheet': 'crops',
        'column': 'Residues removal',
        'value': 'Full residue removal',
        'occurence': -1,
        'conditions': [('crops', 'Crop', None),
                       ('crops', 'Crop type', 'Main crop')]
    }
}

def extractQueries(queries, dfdic, debug=False):
    """Find, for each query, the (control, treatment) pairs inside each experiment.

    Parameters
    ----------
    queries : dict
        {query name: query}; each query is a dict with the keys 'sheet', 'column',
        'value', 'occurence' and 'conditions' (list of (sheet, column, value)).
        - conditions on the query sheet with a value restrict the rows considered;
        - conditions on the query sheet with None must have identical values in the
          control row and the treatment row;
        - conditions on another sheet must have identical values for the control and
          the treatment in that sheet (their third element is not used).
        'occurence' selects the comparison: < 0 value vs other values, 0 absence
        (control) vs presence, >= 1 this number of occurrences (control) vs more,
        within a rotation year.
    dfdic : dict of pandas.DataFrame
        Database, one dataframe per sheet, with 'Experiment ID' and 'Treatment ID' columns.
    debug : bool, optional
        If True, print details on the controls and pairs examined.

    Returns
    -------
    dict of pandas.DataFrame
        {query name: dataframe with the columns 'expid', 'treatid_C' (control)
        and 'treatid_T' (treatment)}.
    """
    # pairwise comparisons found, per query
    compdict = {}

    def dump(*args):
        # print only when debugging
        if debug:
            print(*args)

    for key in tqdm(queries):
        # define query
        query = queries[key]
        compdict[key] = []

        # extract the conditions on values and build filtered df for tab
        df = dfdic[query['sheet']]
        i2keep = np.ones(df.shape[0], dtype=bool)
        sameCols = []  # list of column that must be similar between pairwise
        crossCols = []  # list of column similar but in other sheets
        for row in query['conditions']:
            if row[0] == query['sheet']:
                val = row[2]
                if val is None:
                    sameCols.append(row[1])
                else:
                    i2keep = i2keep & df[row[1]].eq(val)
            else:
                crossCols.append((row[0], row[1]))
        # after this reset, the labels of df are equal to the positions of its rows
        df = df[i2keep].reset_index(drop=True)
        dfcross = pd.DataFrame(crossCols, columns=['sheet', 'column'])

        # add Rotation year if not in df (to make the code more flexible)
        if 'Rotation' not in df.columns:
            df['Rotation'] = 'year0'

        # columns to use for pairwise comparison
        col = query['column']
        cols = [col] + sameCols

        # cannot compare number 1 vs multiple accross rotation but only inside
        if (query['occurence'] >= 1) and ('Rotation' not in cols):
            cols.append('Rotation')
            print('WARNING: cannot compare number 1 vs multiple accross rotation but only inside rotation')

        # identify rows which are equal to query value
        iref = df[col] == query['value']

        # repeat for each experiment in the tab
        expids = df['Experiment ID'].unique()
        for expid in tqdm(expids):
            ie = df['Experiment ID'] == expid
            treatids = df[ie]['Treatment ID'].unique()
            dump('||| expid: ' + str(expid) + ' with {:d} treatids'.format(len(treatids)))

            # only investigate expid where the query value is present
            if (ie & iref).sum() > 0:

                # first loop looks for the reference/control treatment
                for i, treatid1 in enumerate(treatids):
                    itreat1 = ie & df['Treatment ID'].eq(treatid1)

                    # get the potential rows which contain the control values
                    icontrol = itreat1 & iref

                    controlFound = False
                    # value vs other values: the control has at least one row with the value
                    if query['occurence'] < 0:
                        if np.sum(icontrol) > 0:
                            controlFound = True

                    # if there is presence or absence
                    elif query['occurence'] == 0:
                        # for this to be a control, none of the rows should contain the value
                        if np.sum(icontrol) == 0:
                            controlFound = True

                    # if there is the exact number of occurence specified (within a rotation)
                    else:
                        # compute number of occurence of the control within a rotation year
                        icounts = df[icontrol].groupby('Rotation').count()
                        if query['occurence'] in icounts.loc[:, icounts.columns[0]].to_list():
                            controlFound = True

                    if controlFound:
                        dump('\tcontrol found ({:s}) with value:'.format(str(treatid1)),
                            df[itreat1][cols].values.tolist())

                        # once reference is found,look for the corresponding treatments
                        for j, treatid2 in enumerate(treatids):
                            itreat2 = ie & df['Treatment ID'].eq(treatid2)

                            # don't look into the same treatment already selected as control (j != i)
                            # don't look into treatment if it contains NaN for the column of interest
                            # don't look into treatment if it does not contain the value (occurence = 0)
                            if ((j != i)
                                & (df[itreat2][col].isna().sum() == 0)
                                & (
                                    ( # if we look for the presence, the treatid2 must have at least one of this value
                                        (query['occurence'] == 0)
                                         & (df[itreat2][col].eq(query['value']).sum() > 0))
                                    | ( # if we look for multiple occurence of value, check treatid2 has them
                                        (query['occurence'] > 0)
                                        & (df[itreat2][col].eq(query['value']).sum() > query['occurence']))
                                    # if we look for this value vs other, then we take them all as their place in the
                                    # rotation will matter
                                    | (query['occurence'] < 0))
                               ):

                                # check if treat2 satisfy the crossCols conditions (same columns in different sheets)
                                condCross = True
                                for csheet in dfcross['sheet'].unique():
                                    ic = dfcross['sheet'] == csheet
                                    scols = dfcross[ic]['column'].tolist()
                                    cdf = dfdic[csheet]
                                    df1 = cdf[cdf['Experiment ID'].eq(expid) &
                                            cdf['Treatment ID'].eq(treatid1)].reset_index(drop=True)
                                    df2 = cdf[cdf['Experiment ID'].eq(expid) &
                                            cdf['Treatment ID'].eq(treatid2)].reset_index(drop=True)
                                    if condCross is True:
                                        # both ctrl and trt have rows in this sheet: every ctrl row
                                        # must have at least one identical row in trt
                                        if (df1.shape[0] > 0) & (df2.shape[0] > 0):
                                            for l in range(df1.shape[0]):
                                                # axis given by keyword: it is keyword-only for
                                                # `any` in recent pandas versions
                                                if not df2[scols].eq(df1.loc[l, scols]).all(axis=1).any():
                                                    condCross = False
                                                    break
                                        # both empty: nothing to compare, condition satisfied
                                        elif (df1.shape[0] == 0) & (df2.shape[0] == 0):
                                            pass
                                        # only one of them has rows: they differ
                                        else:
                                            condCross = False
                                            break

                                if condCross is True:
                                    # ids are converted as they are not necessarily strings
                                    dump('\t\tpassed conditions: ' + str(treatid1) + ' >> << ' + str(treatid2))

                                    # one value VS all other values
                                    if query['occurence'] == -1:
                                        subdf2 = df[itreat2][cols]
                                        # icontrol is aligned with df, whose labels are positions
                                        for irow in np.where(icontrol)[0]:
                                            match = subdf2.eq(df.loc[irow, cols])

                                            # only row different from the control values are wanted
                                            match[col] = ~match[col]
                                            # all rows with difference value than the control but same columns ok
                                            if match.all(axis=1).any():
                                                compdict[key].append([expid, treatid1, treatid2])
                                                break  # no need to test the other rows

                                    # absence VS presence of the value (absence is the control)
                                    elif query['occurence'] == 0:
                                        subdf2 = df[itreat2][cols]
                                        # rows of the control in THIS experiment only (treatment
                                        # ids are not necessarily unique across experiments)
                                        for irow in df[itreat1].index.to_list():
                                            match = subdf2.eq(df.loc[irow, cols])

                                            # among the rows with sameCols ok, check if at least a different row with
                                            # the query value compared to control value
                                            isub = match[sameCols].all(axis=1) & subdf2[col].eq(query['value'])
                                            if (isub.sum() > 0):
                                                dump('=== presence vs absence', expid, treatid1, treatid2)
                                                compdict[key].append([expid, treatid1, treatid2])
                                                break

                                    # 1 occurences VS more than 1 occurence of the value
                                    # ISSUE cannot compare 2 VS more than 2 because colSames comparison will be difficult
                                    # as it's an edge case, we keep comparing 1 occurence vs more than 1
                                    else:
                                        subdf2 = df[itreat2][cols]
                                        for rotid in subdf2['Rotation'].unique():
                                            irowCtrl = df[itreat1 & df['Rotation'].eq(rotid)][col].eq(query['value'])
                                            irot = subdf2['Rotation'].eq(rotid)
                                            irowTrt = subdf2[col].eq(query['value']) & irot
                                            if irowCtrl.sum() == query['occurence']:
                                                # trick, here we only compare the first line so we assume the occurence == 1 for control
                                                # irowCtrl is indexed by the labels of a subset of df: use the
                                                # label of its first True row (its position in the subset
                                                # would point to an unrelated row of df)
                                                ctrlLabel = irowCtrl[irowCtrl].index[0]
                                                match = subdf2[irowTrt].eq(df.loc[ctrlLabel, cols])

                                                # check number of occurence in itreat2
                                                if match.all(axis=1).sum() > query['occurence']:
                                                    compdict[key].append([expid, treatid1, treatid2])
                                                    break

    # convert list of list to pandas.DataFrame
    for key in compdict:
        compdict[key] = pd.DataFrame(compdict[key], columns=['expid', 'treatid_C', 'treatid_T'])

    return compdict



# ISSUE: you can have "at least one occurence" inside the rotation OR inside the treatment

#compdict = extractQueries(queries, dfdic)
# for key in compdict:
#     print(key, ':')
#     display(compdict[key])

In [11]:
# interactive query building (UI)
# this module adds a row for each query where you can define the control values, occurrence
# and add additional conditions from the same tab or not
# getQueries extracts the information from the UI into a dictionary of queries

# class to add condition (within the same table)
class RowCondition(object):
    """One removable condition row (sheet / column / value) attached to a query.

    On construction the row builds its widgets and appends its own HBox to
    ``rows.children``. The HBox children are ordered
    ``[Label, sheetDropdown, Label, colDropdown, Label, valDropdown, rmBtn]``;
    ``getQueries`` reads the dropdowns by these positions (1, 3 and 5).

    Parameters
    ----------
    rows : container widget
        Container whose ``children`` tuple holds the condition rows.
    sheet : str
        Sheet of the parent reference; it is pre-selected in the sheet dropdown.
    dfdic : dict
        Mapping sheet name -> DataFrame, forwarded to ``getOptions``.
    """

    def __init__(self, rows, sheet, dfdic):
        options = getOptions(dfdic, sheet)
        self.sheetDropdown = Dropdown(options=list(dbdic.keys())[3:-2], layout=Layout(width='25%'))
        self.sheetDropdown.value = sheet
        self.sheetDropdown.observe(self.sheetDropdownFunc, names='value')
        self.colDropdown = Dropdown(options=options, layout=Layout(width='25%'))
        self.colDropdown.observe(self.colDropdownFunc, names='value')
        self.valDropdown = Dropdown(layout=Layout(width='30%'))
        self.rmBtn = Button(description='Remove')
        self.rmBtn.on_click(self.rmBtnFunc)
        self.hbox = HBox([Label('Sheet:', layout=Layout(width='50px')), self.sheetDropdown,
                          Label('Col:', layout=Layout(width='30px')), self.colDropdown,
                          Label('Val:', layout=Layout(width='30px')), self.valDropdown, self.rmBtn])
        self.rows = rows
        self.sheet = sheet
        self.dfdic = dfdic
        self.rows.children = self.rows.children + (self.hbox, )
        # position of this row at creation time (not refreshed when other rows are removed)
        self.index = len(self.rows.children) - 1
        # fill the value dropdown for the initially selected column
        self.colDropdownFunc({'new': options[0]})

    def sheetDropdownFunc(self, a):
        """Refresh the column choices when another sheet is selected."""
        self.colDropdown.options = getOptions(self.dfdic, a['new'])

    def colDropdownFunc(self, a):
        """Refresh the value choices for the newly selected column.

        Explicit values are only offered when the selected sheet is the sheet
        of the parent reference; for any other sheet the only choice is
        '*Ctrl same as Trt*'.
        """
        col = a['new']
        if self.sheetDropdown.value == self.sheet:
            # NOTE(review): this reads the module-level `dfdic`, not `self.dfdic` -- confirm intended
            vals = dfdic[self.sheet][col].dropna().unique()
            self.valDropdown.options = ['*Ctrl same as Trt*'] + list(vals)
        else:
            self.valDropdown.options = ['*Ctrl same as Trt*']

    def rmBtnFunc(self, a):
        """Remove this row (and only this row) from the parent container.

        Rows are matched by identity: matching on the label text and the sheet
        value would remove the first row that uses the same sheet rather than
        the row whose button was clicked.
        """
        self.rows.children = tuple(
            child for child in self.rows.children if child is not self.hbox)


# class to add reference/control and define occurence and value
class RowReference(object):
    """One query row: reference (control) definition plus optional conditions.

    On construction the row builds its widgets and inserts its VBox into
    ``rows.children`` just before the last child. The VBox children are ordered
    ``[Label(name), hbox, HBox([VBox([Label, addBtn]), conditions])]`` and the
    hbox children ``[Label, sheetDropdown, Label, colDropdown, Label,
    refDropdown, Label, occurenceDropdown, rmBtn]``; ``getQueries`` reads the
    widgets by these positions.

    Parameters
    ----------
    rows : container widget
        Container holding the query rows; its last child is kept last.
    dfdic : dict
        Mapping sheet name -> DataFrame, forwarded to ``getOptions`` and to
        the ``RowCondition`` rows created from this reference.
    name : str
        Text of the label shown above the row (e.g. 'query0:').
    """

    def __init__(self, rows, dfdic, name=''):
        self.sheetDropdown = Dropdown(options=list(dbdic.keys())[3:-2], layout=Layout(width='15%'))
        self.sheetDropdown.observe(self.sheetDropdownFunc, names='value')
        self.colDropdown = Dropdown(options=[], layout=Layout(width='15%'))
        self.colDropdown.observe(self.colDropdownFunc, names='value')
        self.refDropdown = Dropdown(options=[], layout=Layout(width='15%'))
        self.occurenceDropdown = Dropdown(options=['this value VS other',
                                           'absence vs presence of value',
                                           '1 occurence vs more'], layout=Layout(width='15%'))
        self.rmBtn = Button(description='Remove')
        self.rmBtn.on_click(self.rmBtnFunc)
        self.hbox = HBox([Label('Sheet:'), self.sheetDropdown,
                         Label('Column:'), self.colDropdown,
                         Label('Control:'), self.refDropdown,
                         Label('Occurence:'), self.occurenceDropdown, self.rmBtn],
                         layout=Layout(display='flex'))
        self.rows = rows
        # slot the new row takes below (just before the last child); not refreshed afterwards
        self.index = len(self.rows.children) - 1
        self.conditions = VBox([])
        self.addBtn = Button(description='Add condition')
        self.addBtn.on_click(self.addBtnFunc)
        self.vbox = VBox([Label(name), self.hbox, HBox([
            VBox([Label('Conditions:'), self.addBtn]),
            self.conditions], layout=Layout(width='90%'))])
        # insert before the last child so that it stays at the bottom
        self.rows.children = self.rows.children[:-1] + (self.vbox, self.rows.children[-1])
        self.dfdic = dfdic
        self.opts = None

        # initiate initial configuration
        self.sheetDropdownFunc({'new': 'treatment'})
        self.buildOptions('treatment', 'Land use')
        self.colDropdown.value = 'Land use'

    def addBtnFunc(self, a):
        """Append a new condition row bound to the currently selected sheet."""
        RowCondition(self.conditions, self.sheetDropdown.value, self.dfdic)

    def sheetDropdownFunc(self, a):
        """Refresh the column choices and drop all conditions when the sheet changes."""
        self.colDropdown.options = getOptions(self.dfdic, a['new'], rot=False)
        self.conditions.children = []

    def colDropdownFunc(self, a):
        """Refresh the control value choices for the newly selected column."""
        sheet = self.sheetDropdown.value
        col = a['new']
        self.buildOptions(sheet, col)

    def buildOptions(self, sheet, col):
        """Offer the distinct non-null values of ``sheet``/``col`` as control values."""
        # NOTE(review): this reads the module-level `dfdic`, not `self.dfdic` -- confirm intended
        choices = dfdic[sheet][col].dropna().unique()
        self.refDropdown.options = choices

    def rmBtnFunc(self, _):
        """Remove this query row (and only this row) from the parent container.

        Rows are matched by identity: matching on the label text removes the
        wrong row when two rows carry the same name.
        """
        self.rows.children = tuple(
            child for child in self.rows.children if child is not self.vbox)


nameCounter = 0  # running number used to label the queries ('query0:', 'query1:', ...)

# container of query rows; each click on the button adds one reference row
# (which can itself receive condition rows)
def buildMetaRef(dfdic):
    """Return a VBox holding an 'Add reference' button.

    Each click creates a ``RowReference`` inside the returned VBox, labelled
    with the next value of the module-level ``nameCounter``.
    """
    addRefBtn = Button(description='Add reference')
    refs = VBox([HBox([addRefBtn])])

    def _onAddReference(_):
        global nameCounter
        label = 'query{}:'.format(nameCounter)
        nameCounter += 1
        RowReference(refs, dfdic, name=label)

    addRefBtn.on_click(_onAddReference)
    return refs


# read every query row of the UI and build the dictionary of queries
# expected by the extractQueries function
def getQueries(refs):
    """Translate the query widgets into a dictionary of queries.

    The last child of ``refs`` is skipped. For every other row, the widgets
    are read by position: ``row.children[0]`` carries the name (its last
    character is stripped), ``row.children[1]`` the reference widgets and
    ``row.children[2].children[1]`` the condition rows.

    Returns a dict ``name -> {'sheet', 'column', 'value', 'occurence',
    'conditions'}`` where ``conditions`` is a list of ``(sheet, column, value)``
    tuples; the value is ``None`` when '*Ctrl same as Trt*' is selected.
    """
    occurrenceCodes = {'this value VS other': -1,
                       'absence vs presence of value': 0,
                       '1 occurence vs more': 1}
    sameAsTreatment = '*Ctrl same as Trt*'

    found = {}
    for row in refs.children[:-1]:
        reference = row.children[1].children
        conditionRows = row.children[2].children[1].children

        conditions = []
        for conditionRow in conditionRows:
            widgets = conditionRow.children
            chosen = widgets[5].value
            conditions.append((
                widgets[1].value,
                widgets[3].value,
                None if chosen == sameAsTreatment else chosen,
            ))

        found[row.children[0].value[:-1]] = {
            'sheet': reference[1].value,
            'column': reference[3].value,
            'value': reference[5].value,
            'occurence': occurrenceCodes[reference[7].value],
            'conditions': conditions
        }
    return found

# refs = buildMetaRef(dfdic)
# display(refs)

In [12]:
# test
# queries = getQueries(refs)
# print(queries)
# extractQueries(queries, dfdic)

In [13]:
# extract treatments according to query
def extractTreatments(trtOut):
    """Run the queries defined in the UI and display a per-query summary.

    Reads the module-level ``refs`` and ``dfdic``; overwrites the module-level
    ``queries`` (from ``getQueries``) and ``compdict`` (from ``extractQueries``).
    Both ``trtOut`` and the module-level ``expOut`` are cleared first.

    Parameters
    ----------
    trtOut : output widget
        Used as a context manager to capture the progress messages and the
        summary table (name, number of experiments, number of pairs).
    """
    global compdict, dfdic, queries
    t0 = time.time()
    trtOut.clear_output()
    expOut.clear_output() # also clear exports
    with trtOut:
        print('processing...', end='')
        queries = getQueries(refs)
        compdict = extractQueries(queries, dfdic)

    # collect one record per query and build the frame once
    # (DataFrame.append was removed in pandas 2.0 and is quadratic in a loop)
    records = []
    for key, query in queries.items():
        pairs = compdict[key]
        records.append({
            'name': key,
            'sheet': query['sheet'],
            'column': query['column'],
            'value': query['value'],
            'occurence': query['occurence'],
            'conditions': ' & '.join([':'.join([str(a) for a in b])
                                      for b in query['conditions']]),
            'number of experiments': pairs['expid'].unique().shape[0],
            'number of pairs': pairs.shape[0]
        })
    # explicit columns so that the selection below also works without any query
    dfsum = pd.DataFrame(records, columns=[
        'name', 'sheet', 'column', 'value', 'occurence', 'conditions',
        'number of experiments', 'number of pairs'])
    with trtOut:
        print('done ({:.2f}s)'.format(time.time() - t0))
        display(dfsum[['name', 'number of experiments', 'number of pairs']])


# compdict = {}
# queries = {}
# trtOut = Output()
# extractTreatmentBtn = Button(description='Get treatments', style= {'button_color':'orange'})
# def func(_):
#     extractTreatments(trtOut)
# extractTreatmentBtn.on_click(func)
# VBox([extractTreatmentBtn, trtOut])

In [14]:
# select columns that must be the same in pairwise comparison
# select type of effect sizes (diff or ratio) and if log or not
# numChoices = []
# for sheet in dbdic:
#     for key in dbdic[sheet]:
#         if dbdic[sheet][key] == 'number' and 'data-' in sheet:
#             numChoices.append(sheet + ' | ' + key)
# numRadio = Dropdown(options=numChoices, layout=Layout(width='70%'))
# esType = RadioButtons(options=['difference', 'ratio'])
# esLog = RadioButtons(options=['yes', 'no'], value='no')
# metaOut = Output()
# def runMetaBtnFunc(a):
#     global dfmeta
#     metaOut.clear_output()
#     with metaOut:
#         plotES()
# runMetaBtn = Button(description='Plot Effect sizes',
#                     style= {'button_color':'orange'})
# runMetaBtn.on_click(runMetaBtnFunc)
# metaBox = VBox([HBox([VBox([Label('Numerical column for effect sizes:'), numRadio, runMetaBtn],
#                      layout=Layout(width='33%')),
#                 VBox([Label('How to compute ES?'), esType,
#                       Label('Apply log on numeric value?'), esLog], layout=Layout(width='40%')),
#                ], layout=Layout(height='110%')), metaOut])

#display(metaBox)

In [15]:
# build a flat dataframe with _C columns vs _T columns

def buildMetaDF(dfdic, compdicto, supcols=None):
    """Build horizontal dataframe for meta-analysis.

    Every row pairs a control with a treatment: columns describing the control
    get the suffix ``_C`` and columns describing the treatment the suffix
    ``_T``. When ``compdicto`` is empty or has an 'all' key, no pairing is done:
    all treatments of ``dfdic['treatment']`` are exported and the ``_C`` marker
    is removed from the column names.

    Parameters
    ----------
    dfdic : dict
        Mapping sheet name -> DataFrame. The sheets 'treatment', 'data-soil'
        and 'data-crop' are always read; the metadata sheets are read when
        ``supcols`` selects columns from them.
    compdicto : dict
        Mapping query name -> DataFrame with the columns 'expid', 'treatid_C'
        and 'treatid_T'. It is copied with ``dicCopy`` and left untouched.
    supcols : list of str, optional
        Additional metadata columns, each written as 'sheet | column'. Three
        'reference' columns (data entry person, its email, publication ID) are
        always added. The list passed by the caller is not modified.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the per-query frames, restricted to rows having at
        least one of 'Harvested yield', 'SOC conc', 'SOC stock' (for control
        AND treatment when pairing), with a fresh integer index.
    """
    # copy so that the mandatory columns appended below never leak into the caller's list
    supcols = [] if supcols is None else list(supcols)

    # collect other supplemental informative columns
    mcols = [  # mandatory columns that should always be there
        'reference | Data entry person',
        'reference | Data entry person email',
        'reference | Publication ID',
    ]
    for mcol in mcols:
        if mcol not in supcols:
            supcols.append(mcol)
    # supcols holds at least the mandatory columns here, so it is never empty
    dfsc = pd.DataFrame([a.split(' | ') for a in supcols],
                        columns=['sheet', 'column'])

    # if compdict empty just take all expids
    isMA = True
    compdict = dicCopy(compdicto)
    if len(compdict.keys()) == 0 or 'all' in compdict.keys():
        compdict = {}  # clear out all keys
        compdict['all'] = dfdic['treatment'][['Experiment ID', 'Treatment ID']].rename(
            columns={'Experiment ID': 'expid', 'Treatment ID': 'treatid_C'})
        isMA = False

    # concatenation function for metadata
    def concat(x):
        xs = [str(a) for a in x]
        if all([a == xs[0] for a in xs]):  # if all values are the same, just put it once
            return xs[0]
        else:
            return ' | '.join(xs)

    # after suffixing, the merge keys shared by control and treatment are renamed
    # back to a common name so that both sides are aligned on the same columns
    rdic = {
        'Sampling year_C': 'Sampling year',
        'Sampling year_T': 'Sampling year',
        'Depth from_C': 'Depth from',
        'Depth from_T': 'Depth from',
        'Depth to_C': 'Depth to',
        'Depth to_T': 'Depth to',
        'Publication ID_C': 'Publication ID',
        'Publication ID_T': 'Publication ID',
    }

    dfs = []
    for query in compdict:
        df = compdict[query].rename(columns={'expid': 'Experiment ID'})
        df['pairedComparison'] = np.arange(df.shape[0]) + 1
        df['query'] = query

        # experiment specific metadata
        for sheet in ['experiment', 'soil-type', 'reference']:
            ldf = dfdic[sheet]
            cols = dfsc[dfsc['sheet'].eq(sheet)]['column'].tolist()
            if len(cols) > 0:
                cols = cols + ['Experiment ID'] if 'Experiment ID' not in cols else cols
                df = pd.merge(df, ldf[cols], on='Experiment ID', how='left')

        # treatment specific metadata
        for sheet in ['treatment', 'tillage', 'crops', 'amendment', 'irrigation']:
            ldf = dfdic[sheet]
            cols = dfsc[dfsc['sheet'].eq(sheet)]['column'].tolist()
            if len(cols) > 0:
                cols = cols + ['Experiment ID'] if 'Experiment ID' not in cols else cols
                cols = cols + ['Treatment ID'] if 'Treatment ID' not in cols else cols
                # one row per treatment: multiple rows are joined with ' | '
                ldf = ldf.groupby(['Experiment ID', 'Treatment ID']).agg(concat).reset_index()

                # for control
                df = pd.merge(df, ldf[cols].add_suffix('_C'), how='left',
                              left_on=['Experiment ID', 'treatid_C'],
                              right_on=['Experiment ID_C', 'Treatment ID_C'])
                df = df.drop(['Experiment ID_C', 'Treatment ID_C'], axis=1)

                # for treatment
                if isMA:
                    df = pd.merge(df, ldf[cols].add_suffix('_T'), how='left',
                                  left_on=['Experiment ID', 'treatid_T'],
                                  right_on=['Experiment ID_T', 'Treatment ID_T'])
                    df = df.drop(['Experiment ID_T', 'Treatment ID_T'], axis=1)

        # add data-soil
        # we start by merging the data-soil so that if data is missing from data-crop, SOC can still be in the flat format
        ldf = dfdic['data-soil']

        # for control
        df = pd.merge(df, ldf.add_suffix('_C').rename(columns=rdic), how='left',
                      left_on=['Experiment ID', 'treatid_C', 'Publication ID'],
                      right_on=['Experiment ID_C', 'Treatment ID_C', 'Publication ID'])
        df = df.drop(['Experiment ID_C', 'Treatment ID_C'], axis=1)

        # for treatment
        if isMA:
            df = pd.merge(df, ldf.add_suffix('_T').rename(columns=rdic), how='left',
                          left_on=['Experiment ID', 'treatid_T', 'Publication ID',
                                   'Sampling year', 'Depth from', 'Depth to'],
                          right_on=['Experiment ID_T', 'Treatment ID_T', 'Publication ID',
                                    'Sampling year', 'Depth from', 'Depth to'])
            df = df.drop(['Experiment ID_T', 'Treatment ID_T'], axis=1)

        # add data-crop
        ldf = dfdic['data-crop']

        # for control
        df = pd.merge(df, ldf.add_suffix('_C').rename(columns=rdic), how='left',
                      left_on=['Experiment ID', 'treatid_C', 'Publication ID', 'Sampling year'],
                      right_on=['Experiment ID_C', 'Treatment ID_C', 'Publication ID', 'Sampling year'])
        df = df.drop(['Experiment ID_C', 'Treatment ID_C'], axis=1)

        # for treatment
        if isMA:
            df = pd.merge(df, ldf.add_suffix('_T').rename(columns=rdic), how='left',
                          left_on=['Experiment ID', 'treatid_T', 'Publication ID', 'Sampling year'],
                          right_on=['Experiment ID_T', 'Treatment ID_T', 'Publication ID', 'Sampling year'])
            df = df.drop(['Experiment ID_T', 'Treatment ID_T'], axis=1)

        dfs.append(df)
    dfm = pd.concat(dfs)

    # drop nan rows
    # the metadata aggregation above stringifies values, so NaN shows up as the text 'nan'
    dfm = dfm.replace(to_replace='nan', value=np.nan, regex=False)
    dcols = ['Harvested yield', 'SOC conc', 'SOC stock']
    colsC = [a + '_C' for a in dcols]
    colsT = [a + '_T' for a in dcols]
    if isMA:
        # axis passed by keyword: the positional form was removed in pandas 2.0
        i2keep = dfm[colsC].notnull().any(axis=1) & dfm[colsT].notnull().any(axis=1)
    else:
        dfm = dfm.rename(columns=dict(zip(dfm.columns, [a.replace('_C', '') for a in dfm.columns])))
        i2keep = dfm[dcols].notnull().any(axis=1)
    dfm = dfm[i2keep].reset_index(drop=True)

    return dfm

#compdict = {}
#df = buildMetaDF(dfdic, compdict)#['experiment | Latitude', 'soil-type | Soil group WRB','irrigation | Irrigation method'])
#df.shape

In [16]:
# def buildMetaDf(supcols=[]):
#     """Build dataframe with control vs treatment.
#     """
#     global dfdic, compdict

#     # collect other supplemental informative columns
#     dfsc = pd.DataFrame([a.split(' | ') for a in supcols], 
#                         columns=['sheet', 'column'])

#     # get column with the target numeric on which compute ES
#     numsheet, numcol = numRadio.value.split(' | ')
#     dfsc = dfsc.append({'sheet': numsheet, 'column': numcol}, ignore_index=True)

#     dfmeta = pd.DataFrame(columns=['query', 'expid', 'treatid_C', 'treatid_T',
#                                   numcol + '_C', numcol + '_T', 'ES'])
#     dfstack = pd.DataFrame()
    
#     pairCounter = 0
#     for key in compdict:
#         # add columns involved in the query
#         sheet = queries[key]['sheet']
#         col = queries[key]['column']
#         dfsc = dfsc.append({'sheet': sheet, 'column': col}, ignore_index=True)
        
#         # add additional columns which are in the same sheet
#         ccols = [b for a, b, c in queries[key]['conditions'] if a == sheet]
#         for a in ccols:
#             dfsc = dfsc.append({'sheet': sheet, 'column': a}, ignore_index=True)

#         # ISSUE: with rotation or multiple rows per treatments (e.g. amendments),
#         # it's not possible to be sure that treatment and control will be on same rows
#         # an option would be to have a stacked design as we had with pc columns or so

#         # OPTION 1: stacked
#         for l in range(compdict[key].shape[0]):
#             expid = compdict[key]['expid'][l]
#             pairCounter += 1

#             # add metadata for the control
#             treatidC = compdict[key]['treatid_C'][l]
#             subdf = pd.DataFrame()
#             for j, sheet in enumerate(dfsc['sheet'].unique()):
#                 cols = dfsc[dfsc['sheet'] == sheet]['column'].to_list()
#                 if 'Treatment ID' in dfdic[sheet].columns:
#                     cols += ['Experiment ID', 'Treatment ID']
#                     df = dfdic[sheet][cols]
#                     ie = (df['Experiment ID'] == expid) & (df['Treatment ID'] == treatidC)
#                 else:
#                     cols += ['Experiment ID']
#                     df = dfdic[sheet][cols]
#                     ie = (df['Experiment ID'] == expid)
#                 if j == 0:
#                     subdf = df[ie]
#                 else:
#                     subdf = pd.merge(subdf, df[ie], how='outer')
#             subdf['query'] = key
#             subdf['pc'] = -pairCounter
#             dfstack = dfstack.append(subdf)

#             # add metadata for the treatment
#             treatidT = compdict[key]['treatid_T'][l]
#             subdf = pd.DataFrame()
#             for j, sheet in enumerate(dfsc['sheet'].unique()):
#                 cols = dfsc[dfsc['sheet'] == sheet]['column'].to_list()
#                 if 'Treatment ID' in dfdic[sheet].columns:
#                     cols += ['Experiment ID', 'Treatment ID']
#                     df = dfdic[sheet][cols]
#                     ie = (df['Experiment ID'] == expid) & (df['Treatment ID'] == treatidT)
#                 else:
#                     cols += ['Experiment ID']
#                     df = dfdic[sheet][cols]
#                     ie = (df['Experiment ID'] == expid)
#                 if j == 0:
#                     subdf = df[ie]
#                 else:
#                     subdf = pd.merge(subdf, df[ie], how='outer')
#             subdf['query'] = key
#             subdf['pc'] = pairCounter
#             dfstack = dfstack.append(subdf)

#         # OPTION 2: ctrls vs trt but not addition metadata
#         # NOTE: we take all values (even if multiple years)
#         subdf = compdict[key]
#         dfnum = dfdic[numsheet]
#         colindex = ['Experiment ID', 'Treatment ID']

#         # add value for control
#         subdf = pd.merge(subdf, dfnum[colindex + [numcol]],
#                          left_on=['expid', 'treatid_C'],
#                          right_on=colindex).rename(
#             columns={numcol: numcol + '_C'}).drop(colindex, axis=1)

#         # add value for treatment
#         subdf = pd.merge(subdf, dfnum[colindex + [numcol]],
#                          left_on=['expid', 'treatid_T'],
#                          right_on=colindex).rename(
#             columns={numcol: numcol + '_T'}).drop(colindex, axis=1)

#         # compute effect size
#         valC = subdf[numcol + '_C']
#         valT = subdf[numcol + '_T']
#         inan = ~np.isnan(valC) & ~np.isnan(valT)
#         if esType == 'difference':
#             if esLog == 'yes':
#                 subdf.loc[inan, 'ES'] = np.log10(valC[inan]) - np.log10(valT[inan])
#             else:
#                 subdf.loc[inan, 'ES'] = valC[inan] - valT[inan]
#         else:
#             if esLog == 'yes':
#                 subdf.loc[inan, 'ES'] = np.log10(valC[inan]) / np.log10(valT[inan])
#             else:
#                 subdf.loc[inan, 'ES'] = valC[inan] / valT[inan]

#         # add query and append to dfmeta
#         subdf['query'] = key
#         dfmeta = dfmeta.append(subdf)

#     # remove NaN
#     dfmeta = dfmeta[dfmeta['ES'].notnull()].reset_index(drop=True)

#     return dfmeta, dfstack

#dfmeta, dfstack = buildMetaDf(['experiment | Latitude', 'experiment | Longitude'])
#display(dfmeta)
#display(dfstack)

In [17]:
# ISSUE: what if multiple per year value? how to compute ES?
# mean of ES is done for now but we could do the mean of value and then compute ES
# def plotES():
#     global dfdic, comdict, dfmeta, dfstack
#     if 'query' not in dfmeta.columns:
#         print('Please first specify control and treatments in the Query tab')
#         return
#     # compute back the dfmeta if we have a different numcol
#     print('running...', end='')
#     dfmeta, dfstack = buildMetaDf()
#     print('done')
#     fig, ax = plt.subplots()
#     ax.set_title('Effect size on ' + numRadio.value.split(' | ')[1])
#     ylabs = []
#     if esType.value == 'difference':
#         ax.axvline(0, linestyle='--', color='k')
#     else:
#         ax.axvline(1, linestyle='--', color='k')
#     gmean = dfmeta.groupby(['query', 'expid', 'treatid_C', 'treatid_T']).mean().reset_index()
#     for i, query in enumerate(dfmeta['query'].unique()):
#         ie = gmean['query'] == query
#         if np.sum(ie) == 0:
#             print('No values found for ' + query)
#         ax.errorbar(gmean[ie]['ES'].mean(), i, xerr=gmean[ie]['ES'].sem(), marker='o', label=query)
#         ylabs.append('{:s} ({:d})'.format(query, ie.sum()))
#     ax.set_xlabel('Effect size') # units if difference, nothing if ratio
#     ax.set_yticks(np.arange(len(ylabs)))
#     ax.set_yticklabels(ylabs);
#     #ax.legend()
#     plt.show()

#plotES(dfmeta)

In [18]:
# provide a selection of additional meta-data columns to be added to the merged or C vs T export format

ids = ['Experiment ID', 'Treatment ID', 'Crop ID', 'Rotation', 'Publication ID']

def buildSelection(dfdic):
    """Return a VBox of checkboxes, one per selectable metadata column.

    The sheets ``list(dfdic.keys())[2:-3]`` are scanned; the identifier
    columns listed in ``ids`` and the two data-entry-person columns are not
    offered. Each checkbox is described as 'sheet | column' and starts
    checked. The first checkbox ('Select all') copies its state to all others.
    """
    hidden = ['Data entry person', 'Data entry person email']

    labels = []
    # all data-crop and data-soil are included by default
    for sheet in list(dfdic.keys())[2:-3]:
        columns = dfdic[sheet].columns
        skipped = columns.isin(ids) | columns.isin(hidden)
        labels.extend((sheet + ' | ' + columns[~skipped]).tolist())

    boxes = [Checkbox(value=True, description=text, indent=False) for text in labels]

    def _toggleAll(change):
        for box in boxes:
            box.value = change['new']

    master = Checkbox(value=True, description='Select all', indent=False)
    master.observe(_toggleAll, names='value')
    return VBox([master] + boxes)

# def getMergedDf(dfdic, vbox):
#     selection = [child.description.split(' | ') for child in vbox.children[1:] if child.value is True]
#     dfsel = pd.DataFrame(selection, columns=['sheet', 'column'])
#     sheets = dfsel['sheet'].unique()
#     dfm = pd.DataFrame()
#     for i, sheet in enumerate(sheets):
#         cols = dfsel[dfsel['sheet'] == sheet]['column'].tolist()
#         cols += ids
#         cols = pd.Series(cols)
#         df = dfdic[sheet]
#         cols = cols[cols.isin(df.columns.tolist())]
#         if i == 0:
#             dfm = df[cols]
#         else:
#             dfm = pd.merge(dfm, df[cols], how='outer')
#     return dfm


#vbox = buildSelection(dfdic)
#vbox

In [19]:
# generate the filtered excel file (same format as the template)
def buildFilteredDB(dfdic, compdicto, name='Filtered DB', fname='filtered-db.xlsx'):
    """Export the rows involved in the queries as an Excel download button.

    The workbook has one sheet per template tab found in ``dfdic``. Sheets with
    a 'Treatment ID' column keep the rows of the control/treatment pairs of
    ``compdicto``; sheets with only an 'Experiment ID' column keep the rows of
    the experiments involved; sheets without 'Experiment ID' are written
    entirely. When ``compdicto`` is empty or has an 'all' key, every treatment
    of ``dfdic['treatment']`` is considered selected.

    Parameters
    ----------
    dfdic : dict
        Mapping sheet name -> DataFrame.
    compdicto : dict
        Mapping query name -> DataFrame with the columns 'expid', 'treatid_C'
        and optionally 'treatid_T'. It is copied with ``dicCopy``.
    name : str
        Text of the download button.
    fname : str
        File name proposed to the browser.

    Returns
    -------
    IPython.display.HTML
        A link embedding the workbook as base64 data.
    """
    # if compdict empty just take all expids
    compdict = dicCopy(compdicto)
    if len(compdict.keys()) == 0 or 'all' in compdict.keys():
        compdict = {}  # clear out all keys
        compdict['all'] = dfdic['treatment'][['Experiment ID', 'Treatment ID']].rename(
            columns={'Experiment ID': 'expid', 'Treatment ID': 'treatid_C'})

    exportedTabs = ['experiment', 'reference', 'soil-type', 'treatment', 'tillage', 'crops',
                    'amendment', 'irrigation', 'grazing', 'pest-weed',
                    'soil-crop-measurement', 'data-crop', 'data-soil']

    with tempfile.TemporaryDirectory() as td:
        # join with a separator: `td + fname` creates the file NEXT TO the temporary
        # directory, where it is never cleaned up
        fpath = os.path.join(td, fname)
        # the context manager saves and closes the workbook
        # (ExcelWriter.save() was removed in pandas 2.0)
        with pd.ExcelWriter(fpath, engine='xlsxwriter') as writer:
            for tab, df in dfdic.items():
                if tab not in exportedTabs:
                    continue

                # filter to only keep the row relevant to queries
                if 'Experiment ID' not in df.columns:
                    ie = np.ones(df.shape[0], dtype=bool)
                else:
                    ie = np.zeros(df.shape[0], dtype=bool)
                    for key in compdict:
                        dfcol = compdict[key]
                        if 'Treatment ID' in df.columns:
                            for expid in dfcol['expid'].unique():
                                if 'treatid_T' in dfcol.columns:
                                    treatids = dfcol[dfcol['expid'] == expid][['treatid_C', 'treatid_T']].values.flatten()
                                else:
                                    treatids = dfcol[dfcol['expid'] == expid]['treatid_C'].values
                                ie = ie | ((df['Experiment ID'] == expid) & (df['Treatment ID'].isin(treatids)))
                        else:
                            ie = ie | (df['Experiment ID'].isin(dfcol['expid'].tolist()))
                df[ie].to_excel(writer, sheet_name=tab, index=False)
        with open(fpath,  'rb') as f:
            data = f.read()
    b64 = base64.b64encode(data)
    payload = b64.decode()

    # the payload is an xlsx workbook, so it is announced with the xlsx MIME type
    html_button = '''
    <a download="{fname}" href="data:application/vnd.openxmlformats-officedocument.spreadsheetml.sheet;base64,{payload}">
    <button class="p-Widget jupyter-widgets jupyter-button widget-button mod-info">{name}</button>
    </a>
    '''.format(payload=payload, fname=fname, name=name)

    return HTML(html_button)

#buildFilteredDB(dfdic, compdict)

In [20]:
def prepareDownload(df, name='Download File', fname='df.xlsx'):
    """Return a download button embedding ``df`` as an Excel file.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to export (written without its index).
    name : str
        Text of the download button.
    fname : str
        File name proposed to the browser.

    Returns
    -------
    IPython.display.HTML
        A link embedding the workbook as base64 data.
    """
    with tempfile.TemporaryDirectory() as td:
        # join with a separator: `td + fname` creates the file NEXT TO the temporary
        # directory, where it is never cleaned up
        fpath = os.path.join(td, fname)
        df.to_excel(fpath, index=False)  # takes some time but csv takes more time to render in base64
        with open(fpath,  'rb') as f:
            data = f.read()
    b64 = base64.b64encode(data)
    payload = b64.decode()

    # the payload is an xlsx workbook, so it is announced with the xlsx MIME type
    html_button = '''
    <a download="{fname}" href="data:application/vnd.openxmlformats-officedocument.spreadsheetml.sheet;base64,{payload}">
    <button class="p-Widget jupyter-widgets jupyter-button widget-button mod-info">{name}</button>
    </a>
    '''.format(payload=payload, fname=fname, name=name)

    return HTML(html_button)

#DownloadDownload(df)

In [21]:
# module-level state shared by the callbacks of the UI

# database: one DataFrame per sheet, and the copy used to reset it
dfdic = {}
dfdico = {}
version = ''

# query definitions and the pairs they select
queries = {}
compdict = {}

# working dataframes
df = pd.DataFrame()
dfmeta = pd.DataFrame()   # for meta-analysis
dfstack = pd.DataFrame()  # stacked version of paired dataframe

# widgets that are created later
refs = None
expOut = None

# upload button
def loadBtnFunc(btn):
    """Load the database and rebuild the tabs of `mainLayout`.

    Registered both as click handler of `loadBtn` and as observer of
    `uploadBtn.value`, hence `btn` is either the clicked button or the
    change dictionary sent by the observer.

    Steps performed inside the `out` output widget:
    read the database with `readExcel` (Google Sheet URL first, uploaded file
    otherwise), drop the 'Unnamed' columns, force to float the columns flagged
    as 'number' in `dbdic`, back up the cleaned sheets in `dfdico`, call
    `showMap`, then rebuild the 'Filtering', 'Query', 'Export' and 'Selection'
    tabs.

    Parameters
    ----------
    btn : Button or dict
        Argument supplied by ipywidgets. Only inspected to recognise the
        observer notification caused by the reset of the upload widget.
    """
    global dfdic, dfdico, dfmeta, dfstack, mainLayout, refs, version, expOut
    from io import BytesIO  # local import: used to read the selection file

    # Resetting `uploadBtn.value` below notifies this observer a second time
    # with an empty value. Ignore that call (before touching `out`), otherwise
    # the whole loading is run twice in a nested call.
    if isinstance(btn, dict) and len(btn.get('new') or ()) == 0:
        return

    out.clear_output()
    with out:
        if gsurl.value != '':
            dfdic, version = readExcel(gsurl.value)
        elif len(uploadBtn.value) > 0:
            dfdic, version = readExcel(uploadBtn.value[0]['content'].tobytes())
            #with open('t.xlsx', 'wb') as f: # faster but less robust
            #    f.write(upload.data[0])
            #dfdic = readExcel2('t.xlsx')
        elif len(dfdic) == 0:
            # nothing to load and nothing loaded before: stop with a message
            print('Please upload an .xlsx file or provide a Google Sheet URL first.')
            return
        print('database version: ', version)

        # reset upload button (this re-notifies the observer, see the guard above)
        uploadBtn.value = []

        # remove the 'unnamed columns'
        for sheet in dfdic:
            cols = dfdic[sheet].columns
            dfdic[sheet] = dfdic[sheet].drop(cols[cols.str.contains('Unnamed')], axis=1)

        # force float type (the converted column is stored back, not just checked)
        for sheet in dbdic.keys():
            for col in dbdic[sheet].keys():
                if (dbdic[sheet][col] == 'number'):
                    try:
                        dfdic[sheet][col] = dfdic[sheet][col].astype(float)
                    except Exception as e:
                        # element-wise conversion, unparsable values become NaN
                        dfdic[sheet][col] = dfdic[sheet][col].apply(tofloat)
                        print('had to force ', sheet, '>', col, 'to be float:', str(e))

        # make a backup to be used if we reset the filters; dicCopy copies each
        # sheet so that the backup does not share its DataFrames with dfdic
        dfdico = dicCopy(dfdic)

        # build descriptive stats
        #print('\nThe plot below shows the number of treatments that have investigated a specific'
        #      ' factor as well as the ones that have investigated many.')
        #plotFactor(dfdic)
        showMap(dfdic)

        # put back mainLayout to one single tab
        mainLayout.children = mainLayout.children[:1]

        # build filtering
        filters = buildFilters()
        mainLayout.children += (VBox([
            Label('Add data filters. The filters are always applied on the initial database uploaded. '
                  'You can then further explored the filtered data by creating queries (Query tab).'), filters]), )

        # creating queries
        refs = buildMetaRef(dfdic)
        trtOut = Output()
        extractTreatmentBtn = Button(description='Get treatments', style= {'button_color':'orange'})
        def func(_):
            """Extract the treatments of the queries and rebuild dfmeta/dfstack."""
            global dfmeta, dfstack
            extractTreatments(trtOut)
            dfmeta, dfstack = buildMetaDf()
        extractTreatmentBtn.on_click(func)
        trtInstructions = Output()
        with trtInstructions:
            display(Markdown('''
This tab helps to query the database by defining what you want to compare in a query. 
A query is formed by a reference/control (button 'Add reference') where you defined the sheet, column and value you want. 
You can then specified the 'occurence' of the value (this value against other, presence or absence, 1 vs more occurence of the value). 
The code will extract the reference/control rows and look for their corresponding 'treatment' rows, thus forming pairs of 'control vs treatment'.
To further restrain the comparison, conditions can be added to force both 'control' and 'treatment' rows to share the same or a specific value for given column.

Examples:
- tillage vs no-tillage:
    - sheet: tillage
    - column: Tillage type
    - value: Zero tillage
    - occurence: this value vs others
    - conditions: None
- cover crops vs no cover crop (inside same rotation):
    - sheet: crops
    - column: Crop type
    - value: Cover crop
    - occurence: presence or absence
    - conditions: Rotation: *Ctrl same as Trt*
'''))
        mainLayout.children += (VBox([trtInstructions, refs, extractTreatmentBtn, trtOut]), )

        # build meta-analysis part
        # mainLayout.children += (
        #         VBox([Label('Select a numeric column on from which to compute the effect sizes.'),
        #               metaBox]),)

        # build export part
        vbox = buildSelection(dfdic)

        def expBtnFunc(a):
            """Build the flat DB with the ticked columns and display the download buttons."""
            global dfmeta, dfstack
            expOut.clear_output()

            with expOut:
                # generate the merged dataframe
                # NOTE: the merged export is disabled, only the message and timing remain
                print('creating merged file...', end='')
                t0 = time.time()
                #dfm = getMergedDf(dfdic, vbox)
                print('done ({:.2f}s)'.format(time.time() - t0))

                # generate the dfmeta and dfstack with the supplementary columns
                print('creating flat DB...', end='')
                t0 = time.time()
                # the first child of vbox is skipped, the others are read as checkboxes
                supcols = [child.description for child in vbox.children[1:] if child.value is True]
                dfmeta = buildMetaDF(dfdic, compdict, supcols=supcols)
                #dfmeta, dfstack = buildMetaDf(supcols=supcols)
                print('done ({:.2f}s)'.format(time.time() - t0))

                # display the button to download the files
                print('creating download buttons...', end='')
                t0 = time.time()
                display(buildFilteredDB(dfdic, compdict, 'Filtered DB', 'filtered-db-' + version + '.xlsx'))
                #display(prepareDownload(dfm, 'Merged file', 'merged-db.xlsx'))
                display(prepareDownload(dfmeta, 'Flat DB', 'flat-db-' + version + '.xlsx'))
                #fname = 'meta-c-vs-t.xlsx'
                #dfmeta.to_excel(fname, index=False)
                #display(FileLink(fname))
                print('done ({:.2f}s)'.format(time.time() - t0))

        expBtn = Button(description='Generate exports')
        expBtn.on_click(expBtnFunc)
        expOut = Output()
        exports = HBox([vbox,
                        VBox([expBtn, expOut])])

        # instructions of the export tab
        expInstructions = Output()
        with expInstructions:
            display(Markdown('''
Only the rows relevant to the queries (Query tab) will be exported. If no queries were done, the entire database (after filtering) is exported.
Two export format are available: a "Filtered DB" format which is an .xlsx file with similar tab structure than the database or
a "Flat DB" format which is an .xlsx file with one single tab in which the information is stacked. If queries are made,
the "Flat DB" format will contains columns with _C (for control) and _T for treatment so that this file can be used for meta-analysis more easily.
By default, only the columns in "data-soil" and "data-crop" tabs and also columns relevant to the "Data entry person" are included in the "Flat DB". You can select additional column to be added
with the checkboxes below. Do not forget to regenerate the exports if you change the selection of columns to be exported.
'''))

        mainLayout.children += (
            VBox([expInstructions, exports]),
        )

        # filter and export tab
        fInstructions = Output()
        with fInstructions:
            display(Markdown('''
Import an excel file (.xlsx) containing two ('Experiment ID' and 'Treatment ID') or three columns ('Experiment ID', 'Treatment ID_C', 'Treatment ID_T') to operate a manual selection. 
The **initially uploaded database** will be filtered to only keep the selected 'Experiment ID' and 'Treatment ID'.
Export can then be done as 'Filtered DB' format (excel with multiple tabs) 
or as 'Flat DB' format (one sheet, includes all metadata columns specified in tab 'export').
'''))

        fexports = Output()

        def fbtnFunc(a):
            """Filter the backup database with the uploaded selection file and display the exports."""
            global compdict, dfdico, dfdic
            # `fbtn.value` is reset below, which notifies this observer again with
            # an empty value: there is nothing to read in that case
            if len(fbtn.value) == 0:
                return
            fexports.clear_output()
            with fexports:
                # read in data (wrapped in BytesIO: passing raw bytes to read_excel is deprecated)
                print('reading...', end='')
                dff = pd.read_excel(BytesIO(fbtn.value[0]['content'].tobytes()))
                fbtn.value = []
                print('done')

                # filtered data, always starting from the backup copy
                dfdic2 = dicCopy(dfdico)
                for sheet in list(dbdic.keys())[3:-2]:
                    df = dfdic2[sheet].copy()
                    if 'Treatment ID' in df.columns:
                        if 'Treatment ID' in dff.columns:  # only two columns specified
                            ie = (df['Experiment ID'].isin(dff['Experiment ID'].unique().tolist())
                                  & df['Treatment ID'].isin(dff['Treatment ID'].tolist()))
                        elif 'Treatment ID_C' in dff.columns:  # three columns format
                            ie = (df['Experiment ID'].isin(dff['Experiment ID'].unique().tolist())
                                  & (df['Treatment ID'].isin(dff['Treatment ID_C'].tolist())
                                    | df['Treatment ID'].isin(dff['Treatment ID_T'].tolist())))
                        else:  # select all treatments from the given experiment id list
                            ie = df['Experiment ID'].isin(dff['Experiment ID'].unique().tolist())
                    else:
                        ie = df['Experiment ID'].isin(dff['Experiment ID'].unique().tolist())
                    dfdic2[sheet] = df[ie].reset_index(drop=True)

                # generate the flat DB format with the supplementary columns
                print('creating flat DB...', end='')
                t0 = time.time()
                supcols = [child.description for child in vbox.children[1:] if child.value is True]
                if 'Treatment ID_C' in dff.columns:
                    # the selection file is passed as a single query named 'selection'
                    compdict2 = {'selection': dff.rename(columns={
                        'Experiment ID': 'expid',
                        'Treatment ID': 'treatid_C',
                        'Treatment ID_C': 'treatid_C',
                        'Treatment ID_T': 'treatid_T'
                    })}
                else:
                    compdict2 = {}
                dfmeta = buildMetaDF(dfdic2, compdict2, supcols=supcols)
                print('done ({:.2f}s)'.format(time.time() - t0))

                # display the button to download the files
                print('creating download buttons...', end='')
                t0 = time.time()
                display(buildFilteredDB(dfdic2, {}, 'Filtered DB', 'filtered-db-' + version + '.xlsx'))
                display(prepareDownload(dfmeta, 'Flat DB', 'flat-db-' + version + '.xlsx'))
                #fname = 'meta-c-vs-t.xlsx'
                #dfmeta.to_excel(fname, index=False)
                #display(FileLink(fname))
                #display(prepareDownload(dfstack, 'Meta stacked', 'meta-stacked.xlsx'))
                print('done ({:.2f}s)'.format(time.time() - t0))

        fbtn = FileUpload(description='Selection file', accept='.xlsx',
                          multiple=False, style= {'button_color':'orange'})
        fbtn.observe(fbtnFunc, names='value')

        mainLayout.children += (
            VBox([fInstructions, fbtn, fexports]),
        )

        # titles of the tabs added above (tab 0 is 'Load Data')
        mainLayout.set_title(1, 'Filtering')
        mainLayout.set_title(2, 'Query')
        # mainLayout.set_title(3, 'Meta-analysis')
        mainLayout.set_title(3, 'Export')
        mainLayout.set_title(4, 'Selection')

# output for displaying processing
out = Output()

# where to put the url of the Google Sheet
gsurl = Text()

# file picker: a change of its value calls loadBtnFunc
uploadBtn = FileUpload(multiple=False, accept='.xlsx')
uploadBtn.observe(loadBtnFunc, names='value')

# button calling loadBtnFunc on click
loadBtn = Button(style=dict(button_color='orange'), description='Load File/URL')
loadBtn.on_click(loadBtnFunc)

# case of headers TO KEEP?
headerRadio = RadioButtons(
    description="Headers:",
    options=['Default (with space)', 'camelCase', 'under_case'],
)

# the layout starts with the loading tab only
mainLayout = Tab(children=[
    VBox(children=[
        Label('Please load the database as .xlsx file.'),
        HBox(children=[uploadBtn, Label('OR Google Sheet URL:'), gsurl, loadBtn]),
        out,
    ]),
])
mainLayout.set_title(0, 'Load Data')

display(mainLayout)

Tab(children=(VBox(children=(Label(value='Please load the database as .xlsx file.'), HBox(children=(FileUpload…

In [22]:
# start the loading right away when a 'gsurl' url parameter is found in the
# QUERY_STRING environment variable
query_string = os.getenv('QUERY_STRING', '')
parameters = parse_qs(query_string)
if parameters.get('gsurl'):
    gsurl.value = parameters['gsurl'][0]
    loadBtn.click()

In [23]:
# questions:
# - shall we include 'Rotation' when merging? some people considered rotation as year so I would go for no