# EJP Common template - Data Query Module (v0.2.0)
Upload the database and then use tabs to explore the data, perform some preliminary meta-analysis and export it in different formats.

In [1]:
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
from matplotlib.collections import PolyCollection
import matplotlib.dates as mdates
import tempfile
import os
import base64
from upsetplot import UpSet  # graph with interactions
from ipyleaflet import Map, Marker

from ipywidgets import (FileUpload, Button, Output, Dropdown, RadioButtons,
                        SelectMultiple, VBox, HBox, Layout, Checkbox, Label, Text,
                       FloatText, FloatRangeSlider, Tab, IntText)
from IPython.display import FileLink, HTML

# Worksheet names, presumably those of the EJP common template workbook
# (the keys of `dbdic` are a subset of these) — TODO confirm against the template.
sheetNames = ['experiment', 'reference', 'treatment', 'soil-type', 'tillage', 'crops',
             'amendment', 'irrigation', 'pest-weed', 'grazing',
             'soil-crop-measurement', 'data-crop', 'data-soil', 'dropDownList']

# Column name -> pandas dtype for the identifier columns.
# NOTE(review): not passed to pd.read_excel in readExcel() in the visible part of
# the notebook — confirm where (or whether) it is used.
dtypes = {
    'Experiment ID': 'string',
    'Treatment ID': 'string',
    'Reference treatment': 'string',
         }

def tofloat(x):
    """Convert ``x`` to a float, returning ``np.nan`` when it cannot be converted.

    Parameters
    ----------
    x : object
        Value to convert (number, numeric string, ...).

    Returns
    -------
    float
        ``float(x)``, or ``np.nan`` if the conversion raises ``TypeError``,
        ``ValueError`` or ``OverflowError``.
    """
    try:
        return float(x)
    # only swallow conversion failures; a bare ``except`` would also hide
    # KeyboardInterrupt / SystemExit
    except (TypeError, ValueError, OverflowError):
        return np.nan
    
def camelCase(s):
    """Return the whitespace-separated words of ``s`` joined in camelCase.

    The first word is lower-cased and every following word is passed through
    ``str.capitalize`` (first letter upper-case, the rest lower-case).
    An empty or whitespace-only string returns ``''``.
    """
    words = s.split()
    if not words:
        # nothing to join; avoids IndexError on words[0]
        return ''
    return words[0].lower() + ''.join(w.capitalize() for w in words[1:])
def underCase(s):
    """Return the whitespace-separated words of ``s`` lower-cased and joined by '_'."""
    lowered = map(str.lower, s.split())
    return '_'.join(lowered)
def dicCopy(dic):
    """Return a new dict whose values are ``.copy()`` of the values of ``dic``."""
    return {name: value.copy() for name, value in dic.items()}

In [2]:
# Type of the columns that can be queried, as {sheet name: {column name: type}}.
# The type is one of 'number', 'choice' or 'string':
#   - 'number' columns get a min/max pair of inputs in RowFilter.buildOptions
#   - 'choice' and 'string' columns get one checkbox per distinct value
# The sheet names (keys) also populate the sheet dropdowns of the filter widgets.
dbdic = {
    'experiment': {
        'Latitude': 'number',
        'Longitude': 'number',
        'Country': 'choice',
        'Land use prior experiment': 'choice'
    },
    'reference': {
        'Publication type': 'choice',
        'Publication first author': 'string',
        'Publication year': 'number',
        'Publication title': 'string',
        'Publication journal': 'string'
    },
    'soil-type': {
        'Top depth of layer': 'number',
        'Bottom depth of layer': 'number',
        'Clay (< 0.002 mm)': 'number',
        'Silt (0.002 - 0.05 mm)': 'number',
        'Sand (0.05 - 2 mm)': 'number',
        'Gravel (> 2 mm)': 'number',
        'Soil texture USDA': 'choice',
        'Soil group WRB': 'choice',
        'Soil group WRB qualifier': 'choice',
        'Soil group WRB specifier': 'choice',
        'Soil type USDA': 'choice',
        'Soil type USDA qualifier': 'choice',
    },
    'treatment': {
        'Land use': 'choice',
        'Year started': 'number',
        'Year ended': 'number',
        'Crop rotation': 'choice'
    },
    'tillage': {
        'Tillage system': 'choice',
        'Tillage method': 'choice',
        'Tillage depth': 'number',
        'Permanent soil area covered by residues or crops': 'choice',
        'Tillage period': 'choice',
    },
    'crops': {
        'Crop type': 'choice',
        'Cropping system': 'choice',
        'Crop': 'choice',
        'Harvesting/Termination method': 'choice',
        'Harvesting frequency': 'number',
        'Sowing period': 'choice',
        'Harvesting/Termination period': 'choice',
        'Residues removal': 'choice',
        'Residues incorporation': 'choice', 
        'Residues burning': 'choice'
    },
    'amendment': {
        'Type of fertilizer/amendment': 'choice',
        'Fertilizer/Amendment application rate': 'number',
        'Fertilizer/Amendment application rate units': 'choice',
        'Fertilizer/Amendment application method': 'choice',
        'Amendment water content': 'number',
        'Amendment C': 'number',
        'Amendment N': 'number',
        'Amendment P': 'number',
        'Amendment K': 'number',
    },
    'irrigation': {
        'Irrigation method': 'choice',
        'Amount of water': 'number',
        'Irrigation frequency': 'number',
        'Irrigation water': 'choice',
        'Drainage system': 'choice',
        'Drainage spacing': 'number',
        'Drainage depth': 'number',
    },
    'data-crop': {
        'Sampling year': 'number',
        'Harvested yield': 'number',
        'Harvested yield water content': 'choice',
        'Harvested yield water content amount': 'number',
        'Residue above-ground': 'number',
        'Residue stubble': 'number',
        'Residue roots': 'number',
        'Residue sampling method': 'string',
        'Below-ground sampling depth': 'number',
    },
    'data-soil': {
        'Sampling year': 'number',
        'Depth from': 'number',
        'Depth to': 'number',
        'Time-serie available': 'choice',
        'SOC conc': 'number',
        'SOC conc SD': 'number',
        'SOC conc SE': 'number',
        'SOC conc nb samples': 'number',
        'Analysis method': 'choice',
        'Bulk density': 'number',
        'Bulk density method': 'choice',
        'Bulk density SD': 'number',
        'Bulk density SE': 'number',
        'Bulk density nb samples': 'number',
        'SOC stock': 'number',
        'SOC stock SD': 'number',
        'SOC stock SE': 'number',
        'SOC stock nb samples': 'number',
        'pH': 'number',
        'pH method': 'choice'
    }
}

In [3]:
def readExcel(fname):
    """Read every sheet of an Excel workbook into a dict of DataFrames.

    Parameters
    ----------
    fname : str
        Path of the workbook, or a URL starting with 'http'. For a URL, the
        last path segment is replaced by 'export?format=xlsx' (Google Sheets
        export endpoint) before reading.

    Returns
    -------
    dict
        Sheet name -> DataFrame, as returned by ``pd.read_excel`` with
        ``sheet_name=None``. Rows 0, 1 and 3 of each sheet are skipped.
    """
    start = time.time()
    print('Reading in Excel file...', end='')
    prefix = fname[:4]
    if prefix == 'http':
        # it's a google sheet url: swap the trailing segment for the export one
        parts = fname.split('/')
        parts[-1] = 'export?format=xlsx'
        fname = '/'.join(parts)
    sheets = pd.read_excel(fname, sheet_name=None, skiprows=[0, 1, 3])
    elapsed = time.time() - start
    print('done ({:.2f}s)'.format(elapsed))
    return sheets

#dfdic = readExcel('../../../ejp-wp7/ejp-common-template2.xlsx')

In [4]:
# identify which management practice is treatment specific
# TODO for crops, we need to take cropsID into account
# def extractTreatment(dfdic):
#     mgnt = ['crops','tillage','amendment','irrigation','pest-weed','grazing']
#     experimentIDs = dfdic['experiment']['Experiment ID'].unique()
#     x = np.zeros((len(experimentIDs), len(mgnt)), dtype=bool)
#     dft = pd.DataFrame(x, columns=mgnt) # True if the practice is part of treatment
#     dft.insert(0, 'Experiment ID', experimentIDs)
#     for expid in experimentIDs:
#         treatList = []
#         for sheet in mgnt:
#             if sheet in dfdic.keys():
#                 df = dfdic[sheet]
#                 ie = df['Experiment ID'] == expid
#                 sdf = df[ie].copy().drop(['Experiment ID', 'Treatment ID'], axis=1).reset_index(drop=True)
#                 sdf = sdf.dropna(axis=1)
#                 isTreatment = False
#                 if sdf.shape[0] > 0: # sheet might be empty for expid
#                     s0 = sdf.loc[0,:]
#                     for i in range(1, sdf.shape[0]):
#                         if (sdf.loc[i, :] != s0).any():
#                             # print('++++++is different:', sdf.loc[i,:], '--', s0, '///', sdf.loc[i,:] != s0)
#                             isTreatment = True
#                             break
#                 if isTreatment is True:
#                     dft.loc[dft['Experiment ID'] == expid, sheet] = True
#     return dft

In [5]:
# plot interactions between treatments investigated
# def plotFactor(dfdic):
#     dft = extractTreatment(dfdic)
#     dfcount = dft.set_index(dft.columns[1:].tolist())
#     dfcount
#     fig = plt.figure()
#     _ = UpSet(dfcount).plot(fig=fig)
#     plt.show()
#plotFactor(dfdic)

In [6]:
def showMap(dfdic):
    """Display an interactive map with one marker per row of the 'experiment' sheet.

    Parameters
    ----------
    dfdic : dict
        Sheet name -> DataFrame. The 'experiment' table must provide the
        'Latitude', 'Longitude' and 'Experiment ID' columns.

    Rows with a missing latitude or longitude are skipped, as no marker
    position can be built for them.
    """
    # interactive map; the centre longitude carries the same +360 offset
    # that is applied to every marker below
    m = Map(
        center=(52.204793, 360.121558),
        zoom=4
    )
    df = dfdic['experiment']
    # iterate over the values rather than df.loc[i, ...] so that the function
    # does not depend on the table having a 0..n-1 index
    rows = zip(df['Latitude'], df['Longitude'], df['Experiment ID'])
    for lat, lon, expid in rows:
        if pd.isna(lat) or pd.isna(lon):
            continue
        # the title is text: cast, as the ID may have been read as a number
        marker = Marker(location=(lat, 360 + lon),
                        title=str(expid))
        m.add_layer(marker)
    display(m)
#showMap(dfdic)

In [7]:
# filter experiments
class RowFilter(object):
    """One removable filter row: sheet dropdown, column dropdown, value options.

    On creation the row inserts its ``hbox`` into ``filters`` just before the
    last child of that container, and builds the value options for the
    'experiment' / 'Latitude' column.

    Parameters
    ----------
    filters : ipywidgets container (e.g. VBox)
        Container holding the filter rows; its last child is kept last.
    dfdic : dict
        Sheet name -> DataFrame used to compute the ranges and choices offered.
    """
    def __init__(self, filters, dfdic):
        self.sheetDropdown = Dropdown(options=list(dbdic.keys()), layout=Layout(width='15%'))
        self.sheetDropdown.observe(self.sheetDropdownFunc, names='value')
        self.colDropdown = Dropdown(options=list(dbdic['experiment'].keys()), layout=Layout(width='15%'))
        self.colDropdown.observe(self.colDropdownFunc, names='value')
        self.rmBtn = Button(description='Remove')
        self.rmBtn.on_click(self.rmBtnFunc)
        self.hbox = HBox([self.sheetDropdown, self.colDropdown, self.rmBtn], layout=Layout(display='flex'))
        self.filters = filters
        # position of the row at insertion time (kept for compatibility; it is
        # not used for removal because it goes stale once another row is removed)
        self.index = len(self.filters.children) - 1
        self.filters.children = self.filters.children[:-1] + (self.hbox, self.filters.children[-1])
        self.dfdic = dfdic
        self.opts = None
        self.buildOptions('experiment', 'Latitude')

    def sheetDropdownFunc(self, a):
        """Refresh the column dropdown when another sheet is selected."""
        # NOTE(review): the options are rebuilt through the column dropdown's
        # 'value' observer; presumably that does not fire when the new sheet's
        # first column has the same name as the current one — confirm.
        self.colDropdown.options = list(dbdic[a['new']].keys())

    def colDropdownFunc(self, a):
        """Rebuild the value options when another column is selected."""
        sheet = self.sheetDropdown.value
        col = a['new']
        self.buildOptions(sheet, col)

    def buildOptions(self, sheet, col):
        """Build the value widget for ``sheet``/``col`` and place it before the Remove button.

        'number' columns get a min/max pair of FloatText initialised with the
        column range; 'choice' and 'string' columns get one ticked Checkbox per
        distinct non-null value.
        """
        typ = dbdic[sheet][col]
        # use the tables handed to the constructor rather than a global
        column = self.dfdic[sheet][col]
        if typ == 'number':
            vmin = column.min()
            vmax = column.max()
            self.opts = HBox([Label('min:'), FloatText(value=vmin, layout=Layout(width='30%')),
                              Label('max:'), FloatText(value=vmax, layout=Layout(width='30%'))])
        if typ == 'choice' or typ == 'string':
            choices = column.dropna().unique()
            self.opts = VBox([Checkbox(value=True, description=a, indent=False)
                              for a in choices], layout=Layout(width='40%'))
        if len(self.hbox.children) > 3:
            # an options widget is already there: replace it
            self.hbox.children = self.hbox.children[:-2] + (self.opts, self.hbox.children[-1])
        else:
            # first build: insert the options before the Remove button
            self.hbox.children = self.hbox.children[:-1] + (self.opts, self.hbox.children[-1])

    def rmBtnFunc(self, a):
        """Remove this row from the container."""
        # remove by identity: a stored position would point to another child
        # as soon as a row created earlier has been removed
        self.filters.children = tuple(
            child for child in self.filters.children if child is not self.hbox)

def buildFilters():
    """Build the filter panel and return it.

    The panel is a VBox whose last child is the row holding the 'Add filter',
    'Filter data' and 'Reset filters' buttons and the log output; RowFilter
    rows are inserted before that last child.

    Reads and modifies the module-level ``dfdic``; ``dfdico`` is read by the
    reset button to restore it.
    """
    global dfdic, dfdico

    def addFilterBtnFunc(b):
        # the new RowFilter inserts its own row into `filters`
        _ = RowFilter(filters, dfdic)
    addFilterBtn = Button(description='Add filter')
    addFilterBtn.on_click(addFilterBtnFunc)

    def filterDataBtnFunc(a):
        # start from all experiments and keep, filter after filter, only those
        # having at least one row satisfying the filter
        expids = pd.Series(dfdic['experiment']['Experiment ID'].unique())
        for child in filters.children[:-1]:  # last child is the button row
            # child layout: [sheet dropdown, column dropdown, options, remove button]
            sheet = child.children[0].value
            col = child.children[1].value
            typ = dbdic[sheet][col]
            if typ == 'number':
                # options layout: [Label, FloatText(min), Label, FloatText(max)]
                vmin = child.children[2].children[1].value
                vmax = child.children[2].children[3].value
                ie = (dfdic[sheet][col] >= vmin) & (dfdic[sheet][col] <= vmax)
            else:
                # options are checkboxes: keep the ticked descriptions
                choices = [b.description for b in child.children[2].children if b.value is True]
                ie = dfdic[sheet][col].isin(choices)
            expids = expids[expids.isin(dfdic[sheet][ie]['Experiment ID'].unique())]
        log.clear_output()
        with log:
            print('Experiment ID retained: {:d}/{:d}'.format(len(expids), 
                  dfdic['experiment']['Experiment ID'].unique().shape[0]))
        # do the filtering
        # every sheet with an 'Experiment ID' column is reduced to the retained
        # experiments; the tables of the global dfdic are replaced in place
        expids = expids.tolist()
        for sheet in dfdic.keys():
            if 'Experiment ID' in dfdic[sheet].keys():
                ie = dfdic[sheet]['Experiment ID'].isin(expids)
                dfdic[sheet] = dfdic[sheet][ie].reset_index(drop=True)
        #callback() # run the callback
    filterDataBtn = Button(description='Filter data')
    filterDataBtn.on_click(filterDataBtnFunc)

    def resetFilterBtnFunc(a):
        # we want to restore the global dfdic variable
        global dfdic
        dfdic = dicCopy(dfdico)
        # drop all filter rows, keep only the button row
        filters.children = (filters.children[-1],)
        log.clear_output()
        with log:
            print('{:d} experiments restored'.format(
                dfdic['experiment']['Experiment ID'].unique().shape[0]))
        # NOTE(review): `callback` is not defined in the visible part of the
        # file — presumably a module-level function defined elsewhere; confirm,
        # otherwise this raises NameError.
        callback()
    resetFilterBtn = Button(description='Reset filters')
    resetFilterBtn.on_click(resetFilterBtnFunc)
    
    log = Output()

    filters = VBox([HBox([addFilterBtn, filterDataBtn, resetFilterBtn, log])])
    return filters

#display(buildFilters())

In [8]:
# interactive histogram view
# class ExploratoryHist(object):
#     def __init__(self, dfdic):
#         self.sheetDropdown = Dropdown(options=list(dbdic.keys()), layout=Layout(width='25%'))
#         self.sheetDropdown.observe(self.sheetDropdownFunc, names='value')
#         self.colDropdown = Dropdown(options=list(dbdic['experiment'].keys()), layout=Layout(width='35%'))
#         self.colDropdown.observe(self.colDropdownFunc, names='value')
#         self.vminText = FloatText(layout=Layout(width='15%'))
#         self.vmaxText = FloatText(layout=Layout(width='15%'))
#         self.nbins = IntText(layout=Layout(width='10%'))
#         self.dfdic = dfdic
#         self.showLog = Checkbox(value=True, description='Show values head/tail', indent=False)
#         self.out = Output()
#         self.log = Output()
#         self.header = HBox([self.sheetDropdown, self.colDropdown])
#         self.numberOption = HBox([Label('Vmin:'), self.vminText,
#                                   Label('Vmax:'), self.vmaxText,
#                                   Label('Nb. bins:'), self.nbins])
#         self.hbox = HBox([VBox([self.header, self.out], layout=Layout(width='60%')),
#                           VBox([self.showLog, self.log])])
#         self.bins = None
#         self.sheet = 'experiment'
#         self.col = 'Latitude'
#         self.colDropdownFunc({'new': 'Latitude'})
#         #self.buildFigure('experiment', 'Latitude')

#     def sheetDropdownFunc(self, a):
#         self.sheet = a['new']
#         self.colDropdown.options = list(dbdic[a['new']].keys())

#     def colDropdownFunc(self, a):
#         self.col = a['new']
#         typ = dbdic[self.sheet][self.col]
#         if typ == 'number':
#             self.vminText.unobserve_all()
#             self.vmaxText.unobserve_all()
#             self.nbins.unobserve_all()
#             _, bins = np.histogram(self.dfdic[self.sheet][self.col].dropna())
#             self.vminText.value = np.round(bins[0], 2)
#             self.vmaxText.value = np.round(bins[-1], 2)
#             self.nbins.value = len(bins)
#             self.vminText.observe(self.updateFunc, names='value')
#             self.vmaxText.observe(self.updateFunc, names='value')
#             self.nbins.observe(self.updateFunc, names='value')
#             self.bins = bins
#             self.hbox.children[0].children = (self.header, self.numberOption, self.out)
#             self.buildFigure()
#         else:
#             self.hbox.children[0].children = (self.header, self.out)
#             self.buildFigure()

#     def updateFunc(self, a):
#         vmin = self.vminText.value
#         vmax = self.vmaxText.value
#         nbins = self.nbins.value
#         self.bins = np.linspace(vmin, vmax, nbins)
#         self.buildFigure()

#     def buildFigure(self):
#         self.out.clear_output()
#         self.log.clear_output()
#         with self.log:
#             if self.showLog.value is True:
#                 if self.dfdic[self.sheet].shape[0] > 10:
#                     print(self.dfdic[self.sheet].sort_values(self.col).reset_index().iloc[np.r_[0:5, -5:0]][self.col])
#                 else:
#                     print(self.dfdic[self.sheet][self.col].sort_values(self.col).reset_index())
#         with self.out:
#             typ = dbdic[self.sheet][self.col]
#             if typ == 'number':
#                 self.dfdic[self.sheet][self.col].apply(tofloat).plot.hist(bins=self.bins)
#                 plt.xlabel(self.col)
#             else:
#                 self.dfdic[self.sheet][self.col].value_counts().plot(kind='bar', xlabel=self.col)
#                 plt.ylabel('Count')
#             plt.show() # needed, otherwise, graph won't change

#expHist = ExploratoryHist(dfdic)
#display(expHist.hbox)

In [9]:
# columns with reference/control
# def buildMetaDf(refs, dfdic):
#     controls = []
#     controlSheets = []
#     for child in refs.children[:-1]:
#         sheet = child.children[0].value
#         col = child.children[1].value
#         ref = child.children[3].value
#         h = camelCase(col)
#         controls.append((col, ref, 'pc' + h[0].upper() + h[1:]))
#         controlSheets.append(sheet)

#     # columns that should be similar
#     cols = []
#     sheets = []
#     for child in metaBox.children[0].children[1:]:
#         if child.value is True:
#             sheet, col = child.description.split(' | ')
#             cols.append(col)
#             sheets.append(sheet)

#     # column with numeric type for effect-size
#     numsheet, numcol = numRadio.value.split(' | ')

#     # create merged df
#     ccols = [a[0] for a in controls]
#     ucols = np.unique(cols + ccols + 
#                       ['Experiment ID', 'Treatment ID'] + [numcol]) # TODO include Rotation?
#     usheets = np.unique(sheets + controlSheets + [numsheet])
#     for i, sheet in enumerate(usheets):
#         ie = dfdic[sheet].columns.isin(ucols)
#         scols = dfdic[sheet].columns[ie].tolist()
#         df = dfdic[sheet][scols].copy()
#         # we clean the table by ensuring that each Treatment ID has only
#         # one row. For this we identify the row that contains reference/control
#         # values, to be sure to keep them. If a treatment does not contain any,
#         # the first row of it is kept
#         df['iref'] = 0
#         for ccol, ccval, pcColumn in controls:
#             if ccol in df.columns:
#                 # check which rows contains the reference value
#                 df.loc[:, 'iref'] = df['iref'].values + df[ccol].isin([ccval]).astype(int)
#         df = df.loc[df.groupby(['Experiment ID', 'Treatment ID'])['iref'].idxmax(),:]
#         df = df.drop(['iref'], axis=1).reset_index(drop=True)
#         if i == 0:
#             dfmeta = df
#         else:
#             dfmeta = pd.merge(dfmeta, df, how='outer')
#     dfmeta = dfmeta[ucols]
#     print('shrunk:', dfmeta.shape[0], '-> ', end='')
#     dfmeta = dfmeta.dropna(subset=ccols + [numcol], axis=0).reset_index(drop=True)
#     # INFO we enforce not nan in the control columns too
#     print(dfmeta.shape[0], 'after NaN drop')

#     # two issues:
#     # 1) the selected 'same columns' are not enough to differentiate the treatments,
#     # hence, only the last detected control is compared against all other treatments
#     # -> the user need to add more 'same columns' to differentiate the treatments
#     # 2) because of the outer merge, and many to many relationships, multiple amendment
#     # can be entered even if they refer to the same treatment -> for each treatment
#     # and for each control column, identify if the treatment contains the control
#     # value, if yes, keep the row and discard other rows within this treatment
#     # if no, just keep the first row of it (implemented above)

#     # build pair-wise comparison
#     for a, b, c in controls:
#         dfmeta[c] = 0

#     return dfmeta, numcol, cols, controls

#dfmeta, numcol, cols, controls = buildMetaDf(refs, dfdic)
#dfmeta

In [10]:
# for test
#dfdic = readExcel('https://docs.google.com/spreadsheets/d/1ptNtqtblPi-F8l_A1CbyECaw0EecBK3y-GHrPm7XWKE/edit?usp=sharing')

Pairwise strategy:
- filters: conditions to be applied (list of choices to keep or range of numerical values to keep)
- inclusion of 'Rotation' in conditions or not:
    - inclusion: the comparison is done only if all the years in the rotation meet the conditions
    - not included: the comparison is done if at least one year in the rotation meets the conditions
- value: choice or boolean rules if numeric values
- occurence (of a value):
    - -1: ignore the number of occurrences and compare the value vs. the other values
    - 0: compare 0 occurrences vs. > 0 occurrences (at least one occurrence)
    - 1: compare 1 occurrence vs. > 1 occurrences (hence ignoring the case of 0 occurrences)
- conditions:
    - column name and value: only the rows where this column has this value are considered in the comparison
    - column name and 'identical': two rows are compared only if the values in this column are the same in both rows

In [11]:
# extract treatments (advanced)
# Pairwise queries, as {query name: definition}, consumed by extractQueries().
# Each definition holds:
#   'sheet'      : name of the table (key of dfdic) the query runs on
#   'column'     : column holding the value that defines the control
#   'value'      : value of 'column' identifying the control rows
#   'occurence'  : -1 -> value vs other values, 0 -> presence vs absence of the
#                  value, >= 1 -> this number of occurences vs more
#   'conditions' : {column: value} keeps only the rows where column == value;
#                  {column: None} requires the column to be equal between the
#                  control row and the treatment row
queries = {
    'cover crop vs no cover crops': {
        'sheet': 'crops',
        'column': 'Crop type',
        'value': 'Cover crop',
        'occurence': 0,
        'conditions': {'Cropping system': None},
    },
    'one main crop vs double cropping': {
        'sheet': 'crops',
        'column': 'Crop type',
        'value': 'Main crop',
        'occurence': 1,
        'conditions': {'Cropping system': 'Monoculture', 'Rotation': None}
    },
    'cover crop X vs cover crops Y': {
        'sheet': 'crops',
        'column': 'Crop',
        'value': '>> Clover/Trifolium sp.',
        'occurence': -1,
        'conditions': {'Crop type': 'Cover crop', 'Rotation': None}
    },
    'different termination method': {
        'sheet': 'crops',
        'column': 'Harvesting/Termination method',
        'value': '> Frost-killed',
        'occurence': -1,
        'conditions': {'Crop type': 'Cover crop'}
    },
    'different growing period': {
        'sheet': 'crops',
        'column': 'Sowing period',
        'value': 'March',
        'occurence': -1,
        'conditions': {'Crop type': 'Cover crop'}
    },
    'residues incorporated vs residue left on surface': {
        'sheet': 'crops',
        'column': 'Residues removal',
        'value': 'Full residue removal',
        'occurence': -1,
        'conditions': {'Crop': None, 'Crop type': 'Main crop'}
    }
}

def extractQueries(queries, dfdic):
    """Find, for each query, the (control, treatment) pairs inside each experiment.

    Parameters
    ----------
    queries : dict
        Query name -> dict with the keys 'sheet', 'column', 'value',
        'occurence' and 'conditions'. A condition ``{column: value}`` keeps
        only the rows where the column equals the value; ``{column: None}``
        requires the column to be equal between the control and treatment rows.
        'occurence' selects the comparison: -1 value vs other values,
        0 presence vs absence of the value, >= 1 that number of occurences
        (within a rotation) vs more.
    dfdic : dict
        Sheet name -> DataFrame. Must contain 'treatment' and every queried
        sheet; these tables need 'Experiment ID' and 'Treatment ID' columns.

    Returns
    -------
    dict
        Query name -> DataFrame with the columns 'expid', 'treatid_C'
        (control) and 'treatid_T' (treatment), one row per pair found.
    """
    # create dataframe to store the pairwise comparison
    # NOTE: dft is filled in below but is not part of the returned value
    dft = dfdic['treatment'][['Experiment ID', 'Treatment ID']].copy()
    compdict = {}

    for key in queries:
        # define query
        query = queries[key]
        compdict[key] = []

        # create empty columns with 0 for pairwise
        pcColumn = 'pc-' + key.replace(' ', '_')
        dft[pcColumn] = 0

        # extract the conditions on values and build filtered df for tab
        df = dfdic[query['sheet']]
        i2keep = np.ones(df.shape[0], dtype=bool)
        sameCols = []  # list of column that must be similar between pairwise
        for cond in query['conditions']:
            val = query['conditions'][cond]
            if val is None:
                sameCols.append(cond)
            else:
                i2keep = i2keep & df[cond].eq(val)
        df = df[i2keep].reset_index(drop=True)

        # add Rotation year if not in df (to make the code more flexible)
        if 'Rotation' not in df.columns:
            df['Rotation'] = 'year0'

        # columns to use for pairwise comparison
        col = query['column']
        cols = [col] + sameCols

        # cannot compare number 1 vs multiple accross rotation but only inside
        if (query['occurence'] >= 1) & ('Rotation' not in cols):
            cols.append('Rotation')
            print('WARNING: canont compare number 1 vs multiple accross rotation but only inside rotation')

        # repeat for each experiment in the tab
        expids = df['Experiment ID'].unique()
        for expid in expids:
            ie = df['Experiment ID'] == expid
            treatids = df[ie]['Treatment ID'].unique()

            # first loop looks for the reference/control treatment
            for i, treatid1 in enumerate(treatids):
                itreat1 = ie & df['Treatment ID'].eq(treatid1)

                # rows of this treatment holding the control value; built as a
                # full-length mask so that no index alignment is needed below
                iref = itreat1 & df[col].eq(query['value'])
                irows = df[iref].index.to_list()

                # compute number of occurence of the control within a rotation year
                icounts = df[iref].groupby('Rotation').count()

                controlFound = False
                # if there is at least one occurence
                if query['occurence'] <= 0:
                    if len(irows) > 0:
                        controlFound = True

                # if there is the exact number of occurence specified (within a rotation)
                else:
                    if query['occurence'] in icounts.loc[:, icounts.columns[0]].to_list():
                        controlFound = True

                if not controlFound:
                    continue

                # once reference is found, look for the corresponding treatments
                for j, treatid2 in enumerate(treatids):
                    itreat2 = ie & df['Treatment ID'].eq(treatid2)

                    # don't look into the same treatment already selected as control (j != i)
                    # don't look into treatment if it contains NaN for the column of interest
                    if (j == i) or (df[itreat2][col].isna().sum() != 0):
                        continue

                    # wether a pair was found for this treatment
                    ftreat = False
                    subdf2 = df[itreat2][cols]

                    # one value VS all other values
                    if query['occurence'] == -1:
                        for irow in irows:
                            match = subdf2.eq(df.loc[irow, cols])

                            # only row different from the control values are wanted
                            match[col] = ~match[col]
                            # all rows with difference value than the control but same columns ok
                            # (axis passed by keyword: Series.any() does not
                            # accept it positionally in pandas >= 2.0)
                            if match.all(axis=1).any():
                                ftreat = True
                                compdict[key].append([expid, treatid1, treatid2])
                                break  # no need to test the other rows

                    # absence VS presence of the value
                    elif query['occurence'] == 0:
                        for irow in irows:
                            match = subdf2.eq(df.loc[irow, cols])

                            # only row with a different control values
                            match[col] = ~match[col]

                            # among the rows with sameCols ok, check if different than control value
                            isub = match[sameCols].all(axis=1)
                            # NOTE(review): '> 1' requires at least two rows with
                            # matching sameCols; '> 0' may have been intended — confirm
                            if (isub.sum() > 1) and match[isub][col].all():
                                ftreat = True
                                compdict[key].append([expid, treatid1, treatid2])
                                break

                    # 1 occurences VS more than 1 occurence of the value
                    # ISSUE cannot compare 2 VS more than 2 because colSames comparison will be difficult
                    # as it's an edge case, we keep comparing 1 occurence vs more than 1
                    else:
                        for rotid in subdf2['Rotation'].unique():
                            irowCtrl = df[itreat1 & df['Rotation'].eq(rotid)][col].eq(query['value'])
                            irot = subdf2['Rotation'].eq(rotid)
                            irowTrt = subdf2[col].eq(query['value']) & irot
                            if irowCtrl.sum() == query['occurence']:
                                # trick, here we only compare the first line so we assume the occurence == 1 for control
                                # take the *label* of the first control row: irowCtrl is
                                # indexed on a subset of df, so a position inside it is
                                # not a valid label for df.loc
                                ctrlLabel = irowCtrl.index[irowCtrl.to_numpy()][0]
                                match = subdf2[irowTrt].eq(df.loc[ctrlLabel, cols])

                                # check number of occurence in itreat2
                                if match.all(axis=1).sum() > query['occurence']:
                                    ftreat = True
                                    compdict[key].append([expid, treatid1, treatid2])
                                    break

                    if ftreat:
                        dft.loc[dft['Experiment ID'].eq(expid) & dft['Treatment ID'].eq(treatid1), pcColumn] = -(i+1)
                        dft.loc[dft['Experiment ID'].eq(expid) & dft['Treatment ID'].eq(treatid2), pcColumn] = i+1

    # convert list of list to pandas.DataFrame
    for key in compdict:
        compdict[key] = pd.DataFrame(compdict[key], columns=['expid', 'treatid_C', 'treatid_T'])

    return compdict


# ISSUE: you can have "at least one occurence" inside the rotation OR inside the treatment


# ISSUE: the approach with the pc columns is limited in cases like 'one or no cover crops'
# there there might be more control than treatments, so better log them all individually
#dft[dft[dft.columns[2:]].ne(0).any(1)]


# compdict = extractQueries(queries, dfdic)
# for key in compdict:
#     print(key, ':')
#     display(compdict[key])

In [12]:
# interactive query building

# class to add condition (within the same table)
class RowCondition(object):
    """One removable condition row (column dropdown + value dropdown).

    On creation the row appends its ``hbox`` to ``rows`` and fills the value
    dropdown for the first column offered.

    Parameters
    ----------
    rows : ipywidgets container (e.g. VBox)
        Container the condition row is appended to.
    sheet : str
        Name of the sheet the condition applies to (key of dbdic and dfdic).
    dfdic : dict
        Sheet name -> DataFrame, used to list the values of a column.
    """
    def __init__(self, rows, sheet, dfdic):
        # only 'choice' columns can be used, plus 'Rotation' when the table has it
        options = [b for b in dbdic[sheet] if dbdic[sheet][b] == 'choice']
        options = ['Rotation'] + options if 'Rotation' in dfdic[sheet].columns else options
        self.colDropdown = Dropdown(options=options, layout=Layout(width='30%'))
        self.colDropdown.observe(self.colDropdownFunc, names='value')
        self.valDropdown = Dropdown(layout=Layout(width='35%'))
        self.rmBtn = Button(description='Remove condition')
        self.rmBtn.on_click(self.rmBtnFunc)
        self.hbox = HBox([self.colDropdown, self.valDropdown, self.rmBtn])
        self.rows = rows
        self.sheet = sheet
        self.dfdic = dfdic
        self.rows.children = self.rows.children + (self.hbox, )
        # position at insertion time (kept for compatibility; not used for removal)
        self.index = len(self.rows.children) - 1
        if options:  # nothing to initialise when no column can be selected
            self.colDropdownFunc({'new': options[0]})

    def colDropdownFunc(self, a):
        """Fill the value dropdown with the distinct non-null values of the selected column."""
        col = a['new']
        # use the tables handed to the constructor rather than a global
        vals = self.dfdic[self.sheet][col].dropna().unique()
        self.valDropdown.options = ['*Ctrl same as Trt*'] + list(vals)

    def rmBtnFunc(self, a):
        """Remove this row from the container."""
        # remove by identity: the children are the HBox widgets, not the
        # RowCondition objects, so positions stored on the objects cannot be
        # kept up to date from here
        self.rows.children = tuple(
            child for child in self.rows.children if child is not self.hbox)


# class to add reference/control and define occurence and value
class RowReference(object):
    """One removable reference (control) definition for a query.

    The row shows a sheet dropdown, a column dropdown, the control value,
    the occurrence mode and a 'Remove' button, followed by a list of
    optional conditions (``RowCondition``). It inserts itself into ``rows``
    just before the last child.

    Parameters
    ----------
    rows : container widget
        Widget whose ``children`` tuple receives this row's VBox.
    dfdic : dict
        Mapping sheet name -> DataFrame, forwarded to ``RowCondition``.
    """

    def __init__(self, rows, dfdic):
        self.dfdic = dfdic
        self.opts = None
        self.sheetDropdown = Dropdown(options=list(dbdic.keys())[3:-2], layout=Layout(width='15%'))
        self.sheetDropdown.observe(self.sheetDropdownFunc, names='value')
        self.colDropdown = Dropdown(options=[], layout=Layout(width='15%'))
        self.colDropdown.observe(self.colDropdownFunc, names='value')
        self.refDropdown = Dropdown(options=[], layout=Layout(width='20%'))
        self.occurenceDropdown = Dropdown(options=['this value VS other',
                                           'presence vs absence of value',
                                           '1 occurence vs more'], layout=Layout(width='15%'))
        self.rmBtn = Button(description='Remove')
        self.rmBtn.on_click(self.rmBtnFunc)
        # NOTE: getQueries() reads these children by position
        self.hbox = HBox([self.sheetDropdown, self.colDropdown, Label('Control:'),
                          self.refDropdown, self.occurenceDropdown, self.rmBtn],
                         layout=Layout(display='flex'))
        self.rows = rows
        # position at insertion time (informational only; removal is by identity)
        self.index = len(self.rows.children) - 1
        self.conditions = VBox([])
        self.addBtn = Button(description='Add condition')
        self.addBtn.on_click(self.addBtnFunc)
        self.vbox = VBox([self.hbox, HBox([
            VBox([Label('Conditions:'), self.addBtn]),
            self.conditions])])
        # insert before the last child of the container
        siblings = tuple(self.rows.children)
        self.rows.children = siblings[:-1] + (self.vbox, siblings[-1])

        # initiate initial configuration
        # NOTE(review): hard-coded to 'treatment' / 'Land use'; presumably
        # 'treatment' is the first sheet offered by sheetDropdown - confirm
        self.sheetDropdownFunc({'new': 'treatment'})
        self.buildOptions('treatment', 'Land use')

    def addBtnFunc(self, a):
        """Append a new condition row for the currently selected sheet."""
        RowCondition(self.conditions, self.sheetDropdown.value, self.dfdic)

    def sheetDropdownFunc(self, a):
        """List the 'choice' columns of the new sheet and drop old conditions."""
        sheet = a['new']
        vals = [b for b in dbdic[sheet] if dbdic[sheet][b] == 'choice']
        self.colDropdown.options = vals
        self.conditions.children = []

    def colDropdownFunc(self, a):
        """Rebuild the control values for the newly selected column."""
        sheet = self.sheetDropdown.value
        col = a['new']
        self.buildOptions(sheet, col)

    def buildOptions(self, sheet, col):
        """Offer the distinct non-null values of ``sheet``/``col`` as control.

        Values come from the module-level ``dfdic``.
        """
        choices = dfdic[sheet][col].dropna().unique()
        # plain list, as done for the condition value dropdown
        self.refDropdown.options = list(choices)

    def rmBtnFunc(self, _):
        """Detach this reference from its container.

        The row is found by identity. The previous version printed and
        rewrote an ``index`` attribute on the sibling widgets, which do not
        have one, so the removal never happened.
        """
        self.rows.children = tuple(
            child for child in self.rows.children if child is not self.vbox)

def buildMetaRef(dfdic):
    """Build the container holding the reference rows.

    The returned VBox initially contains only an 'Add reference' button
    (wrapped in an HBox); each click creates a ``RowReference`` that
    inserts itself into that VBox.
    """
    addRefBtn = Button(description='Add reference')
    refs = VBox([HBox([addRefBtn])])

    def addRefBtnFunc(b):
        # the new row attaches itself to `refs`
        RowReference(refs, dfdic)

    addRefBtn.on_click(addRefBtnFunc)
    return refs


def getQueries(refs):
    """Read the reference rows and return them as a dict of queries.

    Every child of ``refs`` except the last one is read as a reference row.
    The result maps 'query<i>' to a dict with the keys 'sheet', 'column',
    'value', 'occurence' (-1, 0 or 1) and 'conditions' (column -> value,
    with None when the '*Ctrl same as Trt*' entry is selected).
    """
    occurenceCodes = {
        'this value VS other': -1,
        'presence vs absence of value': 0,
        '1 occurence vs more': 1,
    }
    found = {}
    for number, refRow in enumerate(refs.children[:-1]):
        # header widgets are read by position: sheet, column, (label), value, occurence
        header = refRow.children[0].children
        entry = {
            'sheet': header[0].value,
            'column': header[1].value,
            'value': header[3].value,
            'occurence': occurenceCodes[header[4].value],
        }
        constraints = {}
        for condRow in refRow.children[1].children[1].children:
            picked = condRow.children[1].value
            constraints[condRow.children[0].value] = (
                None if picked == '*Ctrl same as Trt*' else picked)
        entry['conditions'] = constraints
        found['query{}'.format(number)] = entry
    return found

# refs = buildMetaRef(dfdic)
# display(refs)

In [13]:
# extract treatments according to query
def extractTreatments(trtOut):
    """Run the queries defined in the UI and report the pairs found.

    Reads the module-level ``refs`` and ``dfdic``, and stores the results in
    the module-level ``queries`` and ``compdict``. A summary (number of
    experiments and of control/treatment pairs per query) is displayed in
    ``trtOut``.

    Parameters
    ----------
    trtOut : Output
        Output widget receiving the progress message and the summary table.
    """
    global compdict, dfdic, queries
    t0 = time.time()
    trtOut.clear_output()
    with trtOut:
        print('processing...', end='')
    queries = getQueries(refs)
    compdict = extractQueries(queries, dfdic)

    # collect the rows first: DataFrame.append was removed in pandas 2.0
    summary = []
    for key, query in queries.items():
        pairs = compdict[key]
        summary.append({
            'name': key,
            'sheet': query['sheet'],
            'column': query['column'],
            'value': query['value'],
            'occurence': query['occurence'],
            'conditions': ' & '.join([a + ': ' + str(query['conditions'][a])
                                      for a in query['conditions']]),
            'number of experiments': len(pairs['expid'].unique()),
            'number of pairs': pairs.shape[0]
        })
    dfsum = pd.DataFrame(summary, columns=[
        'name', 'sheet', 'value', 'occurence', 'conditions',
        'number of experiments', 'number of pairs', 'column'])
    with trtOut:
        print('done ({:.2f}s)'.format(time.time() - t0))
        display(dfsum[['name', 'number of experiments', 'number of pairs']])


# compdict = {}
# queries = {}
# trtOut = Output()
# extractTreatmentBtn = Button(description='Get treatments', style= {'button_color':'orange'})
# def func(_):
#     extractTreatments(trtOut)
# extractTreatmentBtn.on_click(func)
# VBox([extractTreatmentBtn, trtOut])

In [14]:
# select columns that must be the same in pairwise comparision
# select type of effect sizes (diff or ratio) and if log or not
# candidate columns for the effect size, labelled 'sheet | column':
# every column typed 'number' in a sheet whose name contains 'data-'
numChoices = []
for sheet, _coltypes in dbdic.items():
    _isData = 'data-' in sheet
    for key, _coltype in _coltypes.items():
        if _isData and _coltype == 'number':
            numChoices.append(' | '.join([sheet, key]))

# widgets of the meta-analysis tab
numRadio = Dropdown(options=numChoices, layout=Layout(width='70%'))
esType = RadioButtons(options=['difference', 'ratio'])
esLog = RadioButtons(options=['yes', 'no'], value='no')
metaOut = Output()
def runMetaBtnFunc(a):
    """Clear ``metaOut`` and plot the effect sizes of ``dfmeta`` in it.

    NOTE(review): this only plots the module-level ``dfmeta``; it does not
    recompute it, so option changes made after ``dfmeta`` was built are
    presumably not reflected here - confirm.
    """
    metaOut.clear_output()
    with metaOut:
        plotES(dfmeta)
runMetaBtn = Button(description='Plot Effect sizes',
                    style= {'button_color':'orange'})
runMetaBtn.on_click(runMetaBtnFunc)

# left column: choice of the numeric column + run button
_numColumnBox = VBox(
    [Label('Numerical column for effect sizes:'), numRadio, runMetaBtn],
    layout=Layout(width='33%'))
# right column: how the effect size is computed
_esOptionBox = VBox(
    [Label('How to compute ES?'), esType,
     Label('Apply log on numeric value?'), esLog],
    layout=Layout(width='40%'))
metaBox = VBox([
    HBox([_numColumnBox, _esOptionBox], layout=Layout(height='110%')),
    metaOut,
])

#display(metaBox)

In [15]:
def _addColumn(sheetCols, sheet, column):
    """Register ``column`` under ``sheet`` once, keeping first-seen order."""
    known = sheetCols.setdefault(sheet, [])
    if column not in known:
        known.append(column)


def _stackedRows(sheetCols, expid, treatid):
    """Return the merged rows describing one treatment of one experiment.

    For each sheet in ``sheetCols`` the requested columns are taken from the
    module-level ``dfdic`` and restricted to ``expid`` (and to ``treatid``
    when the sheet has a 'Treatment ID' column). The per-sheet results are
    then outer-merged on their common columns.
    """
    subdf = pd.DataFrame()
    for j, (sheet, columns) in enumerate(sheetCols.items()):
        full = dfdic[sheet]
        hasTreatment = 'Treatment ID' in full.columns
        idcols = ['Experiment ID', 'Treatment ID'] if hasTreatment else ['Experiment ID']
        # dict.fromkeys drops duplicated names while keeping the order
        part = full[list(dict.fromkeys(columns + idcols))]
        ie = part['Experiment ID'] == expid
        if hasTreatment:
            ie = ie & (part['Treatment ID'] == treatid)
        part = part[ie]
        subdf = part if j == 0 else pd.merge(subdf, part, how='outer')
    # copy so that columns can be added without touching dfdic
    return subdf.copy()


def buildMetaDf(supcols=None):
    """Build dataframe with control vs treatment.

    Uses the module-level ``dfdic``, ``compdict`` and ``queries`` and the
    widgets ``numRadio``, ``esType`` and ``esLog``.

    Parameters
    ----------
    supcols : list of str, optional
        Supplemental columns to include in the stacked table, each written
        as 'sheet | column'.

    Returns
    -------
    dfmeta : pandas.DataFrame
        One row per control/treatment value pair with columns 'query',
        'expid', 'treatid_C', 'treatid_T', '<numcol>_C', '<numcol>_T' and
        'ES'. Rows without an effect size are removed.
    dfstack : pandas.DataFrame
        Stacked table: for each pair, the rows of the control (column 'pc'
        negative) followed by the rows of the treatment ('pc' positive).
    """
    global dfdic, compdict

    # columns to report in the stacked table, grouped by sheet
    sheetCols = {}
    for a in (supcols or []):
        sheet, column = a.split(' | ')
        _addColumn(sheetCols, sheet, column)

    # get column with the target numeric on which compute ES
    numsheet, numcol = numRadio.value.split(' | ')
    _addColumn(sheetCols, numsheet, numcol)

    metaCols = ['query', 'expid', 'treatid_C', 'treatid_T',
                numcol + '_C', numcol + '_T', 'ES']
    colindex = ['Experiment ID', 'Treatment ID']
    useLog = esLog.value == 'yes'
    useDifference = esType.value == 'difference'

    # frames are collected and concatenated once at the end
    # (DataFrame.append was removed in pandas 2.0)
    metaFrames = []
    stackFrames = []

    pairCounter = 0
    for key in compdict:
        pairs = compdict[key]

        # add columns involved in the query (target column and conditions)
        qsheet = queries[key]['sheet']
        _addColumn(sheetCols, qsheet, queries[key]['column'])
        for a in queries[key]['conditions']:
            _addColumn(sheetCols, qsheet, a)

        # ISSUE: with rotation or multiple rows per treatment (e.g. amendments),
        # it's not possible to be sure that treatment and control will be on the
        # same rows, hence the stacked design below

        # OPTION 1: stacked
        for expid, treatidC, treatidT in zip(pairs['expid'],
                                             pairs['treatid_C'],
                                             pairs['treatid_T']):
            pairCounter += 1
            # control gets -pairCounter, treatment +pairCounter
            for treatid, sign in ((treatidC, -1), (treatidT, 1)):
                subdf = _stackedRows(sheetCols, expid, treatid)
                subdf['query'] = key
                subdf['pc'] = sign * pairCounter
                stackFrames.append(subdf)

        # OPTION 2: ctrl vs trt but no additional metadata
        # NOTE: we take all values (even if multiple years)
        dfnum = dfdic[numsheet][colindex + [numcol]]
        subdf = pairs
        for suffix in ('_C', '_T'):
            subdf = pd.merge(subdf, dfnum,
                             left_on=['expid', 'treatid' + suffix],
                             right_on=colindex).rename(
                columns={numcol: numcol + suffix}).drop(colindex, axis=1)

        # compute effect size (only where both values are available)
        valC = pd.to_numeric(subdf[numcol + '_C'], errors='coerce')
        valT = pd.to_numeric(subdf[numcol + '_T'], errors='coerce')
        both = valC.notna() & valT.notna()
        if useLog:
            valC, valT = np.log10(valC), np.log10(valT)
        # NOTE(review): the effect size is control minus (or over) treatment;
        # confirm that this direction is the intended one
        es = valC - valT if useDifference else valC / valT
        subdf['ES'] = es.where(both)

        # add query and keep for dfmeta
        subdf['query'] = key
        if not subdf.empty:
            metaFrames.append(subdf)

    if metaFrames:
        dfmeta = pd.concat(metaFrames, ignore_index=True)
        extra = [c for c in dfmeta.columns if c not in metaCols]
        dfmeta = dfmeta[metaCols + extra]
    else:
        dfmeta = pd.DataFrame(columns=metaCols)

    # remove NaN
    dfmeta = dfmeta[dfmeta['ES'].notnull()].reset_index(drop=True)

    dfstack = pd.concat(stackFrames) if stackFrames else pd.DataFrame()

    return dfmeta, dfstack

#dfmeta, dfstack = buildMetaDf(['experiment | Latitude', 'experiment | Longitude'])
#display(dfmeta)
#display(dfstack)

In [16]:
def plotES(dfmeta):
    """Plot, for each query of ``dfmeta``, the mean effect size.

    One point per distinct value of ``dfmeta['query']``: the mean of 'ES'
    with its standard error of the mean as horizontal error bar. The y tick
    labels give the query name and its number of rows. A dashed vertical
    line is drawn at 0 when ``esType`` is 'difference' and at 1 otherwise.
    """
    fig, ax = plt.subplots()
    ax.set_title('Effect size on ' + numRadio.value.split(' | ')[1])

    reference = 0 if esType.value == 'difference' else 1
    ax.axvline(reference, linestyle='--', color='k')

    ylabs = []
    for position, query in enumerate(dfmeta['query'].unique()):
        ie = dfmeta['query'] == query
        effects = dfmeta[ie]['ES']
        ax.errorbar(effects.mean(), position, xerr=effects.sem(),
                    marker='o', label=query)
        ylabs.append('{:s} ({:d})'.format(query, ie.sum()))

    ax.set_xlabel('Effect size') # units if difference, nothing if ratio
    ax.set_yticks(np.arange(len(ylabs)))
    ax.set_yticklabels(ylabs)
    #ax.legend()
    plt.show()

#plotES(dfmeta)

In [17]:
# build pairwise inside each publication
# TODO decide whether to include rotation or not
# def buildPairwise(dfmeta, numcol, cols, controls):
#     scols = np.array(['Experiment ID'] + cols)
#     for expid in dfmeta['Experiment ID'].unique():
#         ie = dfmeta['Experiment ID'] == expid
#         irows = np.where(ie)[0]
#         for ccol, cval, pcColumn in controls:
#             c = 0
#             colSame = scols[scols != ccol].copy()
#             for i in irows:
#                 row1 = dfmeta.loc[i, colSame]
#                 inan1 = row1.isna()
#                 # if the columns is a controlled value
#                 if dfmeta.loc[i, ccol] == cval:
#                     flag = False
#                     c = c + 1
#                     for j in irows:
#                         if ((j != i) & (dfmeta.loc[j, ccol] != cval) &
#                            (pd.isna(dfmeta.loc[j, ccol]) is False) &
#                            (dfmeta.loc[j, ccol] != 'unknown')):
#                             row2 = dfmeta.loc[j, colSame]
#                             inan2 = row2.isna()
#                             if row1.eq(row2)[~inan1 & ~inan2].all():
#                                 dfmeta.loc[i, pcColumn] = -c
#                                 dfmeta.loc[j, pcColumn] = c

#     for a, b, c in controls:
#         print(c, np.sum(dfmeta[c] > 0))

#     return dfmeta

#dfmeta = buildPairwise(dfmeta, numcol, cols, controls)
#dfmeta

In [18]:
# compute effect sizes
# def buildES(dfmeta, numcol, cols, controls):
#     if esLog.value is True:
#         dfmeta[numcol] = dfmeta[numcol].apply(np.log10)
#         print('applying log on numeric column')
#     for ccol, cval, pcColumn in controls:
#         dfmeta['es' + pcColumn[2:]] = np.nan
#     for expid in dfmeta['Experiment ID'].unique():
#         ie = dfmeta['Experiment ID'] == expid
#         for ccol, cval, pcColumn in controls:
#             esColumn = 'es' + pcColumn[2:]
#             for i in range(np.sum(ie)):
#                 # identify the treatments (positive number)
#                 ieTrt = dfmeta[pcColumn] == (i+1)
#                 itrts = np.where(ie & ieTrt)[0]
#                 if len(itrts) > 0:
#                     # and look for its control (negative number)
#                     ieCtl = dfmeta[pcColumn] == -(i+1)
#                     ictl = np.where(ie & ieCtl)[0][0]
#                     # get the control value
#                     controlValue = dfmeta.loc[ictl, numcol]
#                     #print(pcColumn, expid, ictl, itrts, controlValue)
#                     for itrt in itrts:
#                         # get the treatment value
#                         treatmentValue = dfmeta.loc[itrt, numcol]
#                         # compute effect size
#                         if esType.value == 'difference':
#                             dfmeta.loc[itrt, esColumn] = treatmentValue - controlValue
#                         else:
#                             dfmeta.loc[itrt, esColumn] = treatmentValue / controlValue
#     print('Number of unassigned rows:', np.sum((dfmeta[[a[2] for a in controls]] != 0).any(1)))
#     return dfmeta

#dfmeta = buildES(dfmeta, numcol, cols, controls)
#dfmeta

In [19]:
# plot effect sizes
# def plotES(dfmeta, numcol, cols, controls):
#     esColumns = ['es' + a[2][2:] for a in controls]
#     fig, ax = plt.subplots()
#     ax.set_title('Effect size on ' + numcol)
#     ylabs = []
#     if esType.value == 'difference':
#         ax.axvline(0, linestyle='--', color='k')
#     else:
#         ax.axvline(1, linestyle='--', color='k')
#     for i, esColumn in enumerate(esColumns):
#         ax.errorbar(dfmeta[esColumn].mean(), i, xerr=dfmeta[esColumn].sem(), marker='o', label=esColumn[2:])
#         ylabs.append('{:s} ({:d})'.format(esColumn[2:], dfmeta[esColumn].notnull().sum()))
#     ax.set_xlabel('Effect size') # units if difference, nothing if ratio
#     ax.set_yticks(np.arange(len(ylabs)))
#     ax.set_yticklabels(ylabs);
#     #ax.legend()
#     plt.show()

#plotES(dfmeta, controls)

In [20]:
# identifier columns: never offered for selection, used for merging
ids = ['Experiment ID', 'Treatment ID', 'Crop ID', 'Rotation', 'Publication ID']


def buildSelection(dfdic):
    """Build the column-selection panel of the export tab.

    One checkbox per non-identifier column, labelled 'sheet | column', for
    the sheets ``list(dfdic.keys())[2:-1]``, preceded by a 'Check all' box
    that sets all the others to its own value. Returns the VBox holding them.
    """
    labels = []
    for sheet in list(dfdic.keys())[2:-1]:
        columns = dfdic[sheet].columns
        selectable = columns[~columns.isin(ids)]
        labels.extend((sheet + ' | ' + selectable).tolist())

    checkboxes = [Checkbox(value=False, description=label, indent=False)
                  for label in labels]

    def checkall(a):
        # propagate the new state of 'Check all' to every column box
        state = a['new']
        for box in checkboxes:
            box.value = state

    allcheck = Checkbox(value=False, description='Check all', indent=False)
    allcheck.observe(checkall, names='value')
    return VBox([allcheck] + checkboxes)

def getMergedDf(dfdic, vbox):
    """Merge the columns ticked in ``vbox`` into a single DataFrame.

    Parameters
    ----------
    dfdic : dict
        Mapping sheet name -> DataFrame.
    vbox : container widget
        Its children are checkboxes described as 'sheet | column'. Children
        that are not ticked, or whose description does not name a sheet of
        ``dfdic`` (e.g. the 'Check all' box), are ignored.

    Returns
    -------
    pandas.DataFrame
        For each selected sheet, the selected columns plus the identifier
        columns of ``ids`` present in that sheet, outer-merged sheet after
        sheet on their common columns. Empty when nothing is selected.
    """
    # selected columns grouped by sheet, in order of first appearance
    selected = {}
    for child in vbox.children:
        if child.value is not True:
            continue
        sheet, sep, column = child.description.partition(' | ')
        if not sep or sheet not in dfdic:
            # not a 'sheet | column' entry: the 'Check all' box ends up here
            continue
        selected.setdefault(sheet, []).append(column)

    dfm = pd.DataFrame()
    for i, (sheet, columns) in enumerate(selected.items()):
        df = dfdic[sheet]
        cols = [c for c in dict.fromkeys(columns + ids) if c in df.columns]
        dfm = df[cols] if i == 0 else pd.merge(dfm, df[cols], how='outer')
    return dfm


#vbox = buildSelection(dfdic)
#vbox

In [21]:
def prepareDownload(df, name='Download File'):
    """Return an HTML download button embedding ``df`` as an Excel file.

    The DataFrame is written to a temporary .xlsx file, read back and
    embedded base64-encoded in a data URI, so no file is left on disk.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to export (written without its index).
    name : str
        Label of the button.

    Returns
    -------
    IPython.display.HTML
        Link wrapping a button; the downloaded file is named 'df.xlsx'.
    """
    fname = 'df.xlsx'
    with tempfile.TemporaryDirectory() as td:
        # join directory and file name as two components, so the file is
        # created inside the temporary directory and removed with it
        fpath = os.path.join(td, fname)
        df.to_excel(fpath, index=False)
        with open(fpath,  'rb') as f:
            data = f.read()
    payload = base64.b64encode(data).decode()

    # the payload is an xlsx workbook, so use the matching MIME type
    html_button = '''
    <a download="{fname}" href="data:application/vnd.openxmlformats-officedocument.spreadsheetml.sheet;base64,{payload}">
    <button class="p-Widget jupyter-widgets jupyter-button widget-button mod-info">{name}</button>
    </a>
    '''.format(payload=payload, fname=fname, name=name)

    return HTML(html_button)

#prepareDownload(dfdic['experiment'])

In [22]:
# global variables
# query state
refs = None      # container of the reference rows, set when data is loaded
queries = {}
compdict = {}

# tables derived from the queries
dfstack = pd.DataFrame()  # stacked version of paired dataframe
dfmeta = pd.DataFrame()  # for meta-analysis
df = pd.DataFrame()

# loaded database and its backup
dfdico = pd.DataFrame()  # original only for resetting dfdic
dfdic = pd.DataFrame()

# upload button
def loadBtnFunc(btn):
    """Load the database and (re)build the tabs of ``mainLayout``.

    The database is read with ``readExcel`` from the Google Sheet URL typed
    in ``gsurl`` if any, otherwise from the file uploaded via ``uploadBtn``.
    Numeric columns (per ``dbdic``) are converted to float, a backup is kept
    in ``dfdico``, then the 'Filtering', 'Query', 'Meta-analysis' and
    'Export' tabs are rebuilt. Messages and the map are shown in ``out``.

    Parameters
    ----------
    btn : object
        Ignored; the function is used both as a click callback and as an
        observer of the upload widget.
    """
    global dfdic, dfdico, dfmeta, dfstack, mainLayout, refs
    out.clear_output()
    with out:
        if gsurl.value != '':
            dfdic = readExcel(gsurl.value)
        elif len(uploadBtn.data) > 0:
            dfdic = readExcel(uploadBtn.data[0])
            #with open('t.xlsx', 'wb') as f: # faster but less robust
            #    f.write(upload.data[0])
            #dfdic = readExcel2('t.xlsx')
        else:
            # nothing to load: do not rebuild the tabs on stale/empty data
            print('No Google Sheet URL or uploaded file to load.')
            return

        # remove the 'unnamed columns'
        for sheet in dfdic:
            cols = dfdic[sheet].columns
            dfdic[sheet] = dfdic[sheet].drop(cols[cols.str.contains('Unnamed')], axis=1)

        # force float type
        for sheet, coltypes in dbdic.items():
            if sheet not in dfdic:
                continue
            for col, coltype in coltypes.items():
                if coltype != 'number' or col not in dfdic[sheet].columns:
                    continue
                try:
                    # keep the converted column (the result used to be discarded)
                    dfdic[sheet][col] = dfdic[sheet][col].astype(float)
                except (ValueError, TypeError) as e:
                    # fall back to a per-value conversion, NaN where it fails
                    dfdic[sheet][col] = dfdic[sheet][col].apply(tofloat)
                    print('had to force ', sheet, '>', col, 'to be float:', str(e))

        # make a backup to be used if we reset the filters
        # (dfdico is declared global above, and every sheet is copied)
        dfdico = dicCopy(dfdic)

        # build descriptive stats
        #print('\nThe plot below shows the number of treatments that have investigated a specific'
        #      ' factor as well as the ones that have investigated many.')
        #plotFactor(dfdic)
        showMap(dfdic)

        # put back mainLayout to one single tab
        mainLayout.children = mainLayout.children[:1]

        # build filtering
        filters = buildFilters()
        mainLayout.children += (VBox([
            Label('Add data filters. You can then explored the filtered data (Query tab).'), filters]), )

        # creating queries
        refs = buildMetaRef(dfdic)
        trtOut = Output()
        extractTreatmentBtn = Button(description='Get treatments', style= {'button_color':'orange'})
        def func(_):
            global dfmeta, dfstack
            extractTreatments(trtOut)
            dfmeta, dfstack = buildMetaDf()  # TODO add suppcols
        extractTreatmentBtn.on_click(func)
        mainLayout.children += (VBox([Label('Define treatments'), refs, extractTreatmentBtn, trtOut]), )

        # build meta-analysis part
        mainLayout.children += (
                VBox([Label('1. Add a reference (also called "control"). E.g, for tillage, you can choose "No tillage" as control.'),
                      Label('2. Then select which columns should be similar for pairwise comparison (if any).'),
                      Label('3. After having selected a numeical value and relevant options, run the Meta-analysis.'),
                      metaBox, metaOut]),)

        # build export part
        vbox = buildSelection(dfdic)

        def expBtnFunc(a):
            global dfmeta
            expOut.clear_output()
            dfm = getMergedDf(dfdic, vbox)
            with expOut:
                display(prepareDownload(dfm, 'Merged file'))
                display(prepareDownload(dfmeta, 'Meta-analysis C vs T file'))
                display(prepareDownload(dfstack, 'Meta-analysis stacked file'))
        expBtn = Button(description='Generate exports')
        expBtn.on_click(expBtnFunc)
        expOut = Output()
        exports = HBox([vbox, VBox([expBtn, expOut])])

        # append elements to layout
        mainLayout.children += (
            VBox([Label('Export either excel for meta-analysis or with the selected columns:'), exports]),
        )

        mainLayout.set_title(1, 'Filtering')
        mainLayout.set_title(2, 'Query')
        mainLayout.set_title(3, 'Meta-analysis')
        mainLayout.set_title(4, 'Export')

# inputs of the 'Load Data' tab: file upload, or URL of the Google Sheet
uploadBtn = FileUpload(accept='.xlsx', multiple=False)
gsurl = Text()
loadBtn = Button(description='Load File/URL', style= {'button_color':'orange'})

# both a new upload and a click on the button trigger the loading
uploadBtn.observe(loadBtnFunc, names='value')
loadBtn.on_click(loadBtnFunc)

# output for displaying processing
out = Output()

# case of headers
headerRadio = RadioButtons(
    description="Headers:",
    options=['Default (with space)', 'camelCase', 'under_case'])

# list of sheets to include in the extracted table
sheetChecks = VBox(
    [Checkbox(description='Select all', indent=False)]
    + [Checkbox(description=sheet, indent=False) for sheet in sheetNames[1:-3]])


def sheetChecksFunc(a):
    """Give every sheet checkbox the value of the 'Select all' box."""
    selectAll, *sheetBoxes = sheetChecks.children
    for box in sheetBoxes:
        box.value = selectAll.value


sheetChecks.children[0].observe(sheetChecksFunc, names='value')

# list of variable to include in the extracted table
measChecks = VBox([Checkbox(description='Select all', indent=False)])


def measChecksFunc(a):
    """Give every measurement checkbox the value of the 'Select all' box."""
    selectAll, *measBoxes = measChecks.children
    for box in measBoxes:
        box.value = selectAll.value


measChecks.children[0].observe(measChecksFunc, names='value')

# display extracted links here
extracted = Output()

# choice between stacked or 'vs Control' layout for extracted table
treatmentRadio = RadioButtons(description='Treatments:',
                              options=['Stacked','vs Control'])


# extract button (perform database extraction and merging of tables)
# def extractBtnFunc(a):
#     global dfdic, df
#     stacked = treatmentRadio.value == 'Stacked'
#     sheetCheck = [a.description for a in sheetChecks.children[1:] if a.value is True]
#     measCheck = [a.description for a in measChecks.children[1:] if a.value is True]
#     df = dfdic['experiment'] # always include 'experiment' tab
#     for sheet in sheetCheck:
#         # by default merge is performed on columns of same name (so perfect for our case)
#         df = pd.merge(df, dfdic[sheet], how='outer')
#     # add data to this
#     if 'data' in dfdic.keys():
#         dfdata = dfdic['data']
#         ie = np.in1d(dfdata['Measurement'], measCheck)
#         df = pd.merge(df, dfdata[ie], how='outer')

#     if stacked is False:
#         iref = df['Reference treatment'].isna()
#         dft = df[~iref].copy() # treatment
#         dfc = df[iref].copy() # control
#         cols = np.array(['Rotation', 'Crops ID']) # experiment id included later
#         ie = np.in1d(cols, df.columns)
#         on = cols[ie].tolist() + ['Reference treatment']
#         on = on + dfdic['experiment'].columns.tolist()
#         dfc = dfc.drop('Reference treatment', axis=1)
#         dfc = dfc.rename(columns={'Treatment ID': 'Reference treatment'})
#         if 'Measurement' not in on:
#             on += ['Measurement']
#         df = pd.merge(dft, dfc, on=on, how='outer', suffixes=('_T', '_C'))
#     if headerRadio.value == 'camelCase':
#         df = df.rename(columns=dict(zip(df.columns, list(map(camelCase, df.columns)))))
#     elif headerRadio.value == 'under_case':
#         df = df.rename(columns=dict(zip(df.columns, list(map(underCase, df.columns)))))
#     extracted.clear_output()
#     with extracted:
#         fname = 'extracted.xlsx'
#         df.to_excel(fname, index=False)
#         with open(fname,  'rb') as f:
#             data = f.read()
#         b64 = base64.b64encode(data)
#         payload = b64.decode()
#         html = '<a download="{filename}" href="data:text/csv;base64,{payload}" target="_blank">{title}</a>'
#         html = html.format(payload=payload, title=fname, filename=fname)
#         display(HTML(html))


# extractBtn = Button(description = 'Extract data')
# extractBtn.on_click(extractBtnFunc)


# layout
optionLayout = HBox([
    VBox([Label('Options:'), treatmentRadio, headerRadio]),
    VBox([Label('Sheets:'), sheetChecks]),
    VBox([Label('Measurements:'), measChecks], layout=Layout(width='40%')),
])

#accordion = Accordion(children=[], titles=())

# first (and initially only) tab: where the database gets loaded
_loadRow = HBox([uploadBtn, Label('OR Google Sheet URL:'), gsurl, loadBtn])
_loadTab = VBox([Label('Please load the database as .xlsx file.'), _loadRow, out])
mainLayout = Tab([_loadTab])
mainLayout.set_title(0, 'Load Data')

display(mainLayout)

Tab(children=(VBox(children=(Label(value='Please load the database as .xlsx file.'), HBox(children=(FileUpload…