# EJP Common template - Data Query Module
Upload the database and then use the drop-down to select the information you want to retrieve. A link to a temporary Excel/CSV file will be provided for you to download.

In [1]:
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
from matplotlib.collections import PolyCollection
import matplotlib.dates as mdates
from xlsx2csv import Xlsx2csv
import tempfile
import os
import base64
from upsetplot import UpSet  # graph with interactions

from ipywidgets import (FileUpload, Button, Output, Dropdown, RadioButtons,
                        SelectMultiple, VBox, HBox, Layout, Checkbox, Label, Text,
                       FloatText, FloatRangeSlider, Accordion)
from IPython.display import FileLink, HTML

sheetNames = ['experiment', 'reference', 'treatment', 'soil-type', 'tillage', 'crops',
             'amendment', 'irrigation', 'pest-weed', 'grazing',
             'soil-crop-measurement', 'data-crop', 'data-soil', 'dropDownList']

dtypes = {
    'Experiment ID': 'string',
    'Treatment ID': 'string',
    'Reference treatment': 'string',
         }

def camelCase(s):
    """Convert a whitespace-separated label to camelCase.

    The first word is lower-cased and every following word is capitalized
    (first letter upper-case, remaining letters lower-case), e.g.
    ``'Experiment ID'`` -> ``'experimentId'``.

    Parameters
    ----------
    s : str
        Label to convert (typically a column header).

    Returns
    -------
    str
        The camelCase label, or an empty string when `s` contains no word.
    """
    words = s.split()
    if not words:
        # empty or whitespace-only header: indexing words[0] would raise IndexError
        return ''
    return words[0].lower() + ''.join(w.capitalize() for w in words[1:])
def underCase(s):
    """Convert a whitespace-separated label to lower-case words joined by '_'.

    e.g. ``'Experiment ID'`` -> ``'experiment_id'``.
    """
    lowered = [word.lower() for word in s.split()]
    return '_'.join(lowered)
def dicCopy(dic):
    """Return a new dict in which every value is replaced by ``value.copy()``.

    The keys are kept as they are; each value must provide a ``copy()`` method.
    """
    return {name: dic[name].copy() for name in dic}

C:\Users\gblanchy\WPy64-3890\python-3.8.9.amd64\lib\site-packages\numpy\.libs\libopenblas.WCDJNK7YVMPZQ2ME2ZZHJJRJ3JIKNDB7.gfortran-win_amd64.dll
C:\Users\gblanchy\WPy64-3890\python-3.8.9.amd64\lib\site-packages\numpy\.libs\libopenblas.xwydx2ikjw2nmtwsfyngfuwkqu3lytcz.gfortran-win_amd64.dll


In [2]:
# for test
#dfdic = readExcel('../../data/carboseq-wp2-db.xlsx')

In [3]:
# Type of the columns offered in the interactive widgets.
# Structure: {sheet name: {column header: kind}} where kind is one of
#   'number' -> handled as a numeric range (min/max boxes, histogram),
#   'choice' -> handled as a set of categories (checkboxes, radio buttons, bar chart),
#   'string' -> handled like 'choice' by the filter rows.
# The order of the sheets matters: RowReference slices the list of keys by position.
dbdic = {
    'experiment': {
        'Latitude': 'number',
        'Longitude': 'number',
        'Country': 'choice',
        'Land use prior experiment': 'choice'
    },
    'reference': {
        'Publication type': 'choice',
        'Publication first author': 'string',
        'Publication year': 'number',
        'Publication title': 'string',
        'Publication journal': 'string'
    },
    'soil-type': {
        'Top depth of layer': 'number',
        'Bottom depth of layer': 'number',
        'Clay (< 0.002 mm)': 'number',
        'Silt (0.002 - 0.05 mm)': 'number',
        'Sand (0.05 - 2 mm)': 'number',
        'Gravel (> 2 mm)': 'number',
        'Soil texture USDA': 'choice',
        'Soil group WRB': 'choice',
        'Soil group WRB qualifier': 'choice',
        'Soil group WRB specifier': 'choice',
        'Soil type USDA': 'choice',
        'Soil type USDA qualifier': 'choice',
    },
    'treatment': {
        'Land use': 'choice',
        'Year started': 'number',
        'Year ended': 'number',
        'Crop rotation': 'choice'
    },
    'tillage': {
        'Tillage system': 'choice',
        'Tillage method': 'choice',
        'Tillage depth': 'number',
        'Permanent soil area covered by residues or crops': 'choice',
        'Tillage period': 'choice',
    },
    'crops': {
        'Crop type': 'choice',
        'Cropping system': 'choice',
        'Crop': 'choice',
        'Harvesting/Termination method': 'choice',
        'Harvesting frequency': 'number',
        'Sowing period': 'choice',
        'Harvesting/Termination period': 'choice',
        'Residues removal': 'choice',
        'Residues incorporation': 'choice', 
        'Residues burning': 'choice'
    },
    'amendment': {
        'Type of fertilizer/amendment': 'choice',
        'Fertilizer/Amendment application rate': 'number',
        'Fertilizer/Amendment application rate units': 'choice',
        'Fertilizer/Amendment application method': 'choice',
        'Amendment water content': 'number',
        'Amendment C': 'number',
        'Amendment N': 'number',
        'Amendment P': 'number',
        'Amendment K': 'number',
    },
    'irrigation': {
        'Irrigation method': 'choice',
        'Amount of water': 'number',
        'Irrigation frequency': 'number',
        'Irrigation water': 'number',
        'Drainage system': 'choice',
        'Drainage spacing': 'number',
        'Drainage depth': 'number',
    },
    'data-crop': {
        'Sampling year': 'number',
        'Harvested yield': 'number',
        'Harvested yield water content': 'number',
        'Harvested yield water content amount': 'number',
        'Residue above-ground': 'number',
        'Residue stubble': 'number',
        'Residue roots': 'number',
        'Residue sampling method': 'string',
        'Below-ground sampling depth': 'number',
    },
    'data-soil': {
        'Sampling year': 'number',
        'Depth from': 'number',
        'Depth to': 'number',
        'Time-serie available': 'choice',
        'SOC conc': 'number',
        'SOC conc SD': 'number',
        'SOC conc SE': 'number',
        'SOC conc nb samples': 'number',
        'Analysis method': 'choice',
        'Bulk density': 'number',
        'Bulk density method': 'choice',
        'Bulk density SD': 'number',
        'Bulk density SE': 'number',
        'Bulk density nb samples': 'number',
        'SOC stock': 'number',
        'SOC stock SD': 'number',
        'SOC stock SE': 'number',
        'SOC stock nb samples': 'number',
        'pH': 'number',
        'pH method': 'choice'
    }
}

In [4]:
def readExcel2(data):
    """Read the template workbook by first converting every sheet to CSV.

    The workbook is converted with Xlsx2csv into a temporary directory, then
    each sheet listed in `sheetNames` is read with pandas (header rows 0, 1
    and 3 are skipped) and rows that are entirely empty are dropped.

    Parameters
    ----------
    data : str or file-like
        Workbook handed to ``Xlsx2csv``.

    Returns
    -------
    dict
        ``{sheet name: DataFrame}`` for every sheet in `sheetNames`.
    """
    t0 = time.time()
    print('Reading in Excel file...', end='')
    a = Xlsx2csv(data, outputencoding="utf-8")
    dfdic = {}
    with tempfile.TemporaryDirectory() as td:
        a.convert(td, sheetid=0)
        for sheet in sheetNames:
            fname = os.path.join(td, sheet + '.csv')
            df = pd.read_csv(fname, skiprows=[0,1,3], dtype=dtypes)
            dfdic[sheet] = df.dropna(how='all')
    # (sheet, column) pairs holding dates; several of these sheets belong to
    # other versions of the template and are simply skipped when absent
    datetimeList = [('crops', 'Sowing date'),
                    ('crops', 'Harvesting date'),
                    ('tillage', 'Tillage date'),
                    ('amendment', 'Amendment date'),
                    ('fertilization', 'Fertilizer application date'),
                    ('irrigation', 'Irrigation date'),
                    ('pest', 'Pesticide application date'),
                    ('measurement', 'Sampling date'),
                    ('data', 'Date')
                   ]
    for sheet, col in datetimeList:
        # only convert when both the sheet and the date column exist
        # (the previous code indexed missing sheets and tested the sheet
        # name against the column labels, so it always failed)
        if sheet in dfdic and col in dfdic[sheet].columns:
            df = dfdic[sheet].copy()
            df[col] = pd.to_datetime(df[col])
            dfdic[sheet] = df
    if 'data' in dfdic:
        # drop the columns of the 'data' sheet that contain missing values
        dfdic['data'] = dfdic['data'].dropna(axis=1)
    print('done ({:.2f}s)'.format(time.time() - t0))
    return dfdic
#dfdic = readExcel2('../../ejp-common-template2.xlsx')

In [5]:
def readExcel(fname):
    """Read every sheet of the template workbook into a dict of DataFrames.

    Parameters
    ----------
    fname : str, bytes-like, path-like or file-like
        * a string starting with ``'http'`` is treated as a Google Sheet URL:
          its last path segment is replaced by ``export?format=xlsx``;
        * raw bytes (e.g. the content of an upload widget) are wrapped in a
          ``BytesIO`` because pandas expects a path or a file-like object;
        * anything else is passed to ``pandas.read_excel`` unchanged.

    Returns
    -------
    dict
        ``{sheet name: DataFrame}``; header rows 0, 1 and 3 are skipped.
    """
    import io  # local import: only needed for in-memory uploads

    t0 = time.time()
    print('Reading in Excel file...', end='')
    if isinstance(fname, (bytes, bytearray, memoryview)):
        # passing raw bytes to read_excel is deprecated in pandas
        fname = io.BytesIO(bytes(fname))
    elif isinstance(fname, str) and fname[:4] == 'http': # it's a google sheet url
        fname = '/'.join(fname.split('/')[:-1] + ['export?format=xlsx'])
    dfdic = pd.read_excel(fname, sheet_name=None, skiprows=[0,1,3])
    print('done ({:.2f}s)'.format(time.time() - t0))
    return dfdic

#dfdic = readExcel('../../../ejp-wp7/ejp-common-template2.xlsx')

In [6]:
# identify which management practice is treatment specific
# TODO for crops, we need to take cropsID into account
def extractTreatment(dfdic):
    """Flag, per experiment, which management practices vary between rows.

    For each experiment and each management sheet present in `dfdic`, the rows
    of that experiment are compared (ignoring the 'Experiment ID' and
    'Treatment ID' columns and any column containing a missing value). The
    practice is flagged True when at least one row differs from the first one.

    Parameters
    ----------
    dfdic : dict
        ``{sheet name: DataFrame}``; must contain the 'experiment' sheet.

    Returns
    -------
    pandas.DataFrame
        One row per experiment: 'Experiment ID' followed by one boolean column
        per management practice.
    """
    practices = ['crops','tillage','amendment','irrigation','pest-weed','grazing']
    experimentIDs = dfdic['experiment']['Experiment ID'].unique()
    flags = np.zeros((len(experimentIDs), len(practices)), dtype=bool)
    dft = pd.DataFrame(flags, columns=practices) # True if the practice is part of treatment
    dft.insert(0, 'Experiment ID', experimentIDs)
    for expid in experimentIDs:
        for sheet in practices:
            if sheet not in dfdic:
                continue
            table = dfdic[sheet]
            rows = table[table['Experiment ID'] == expid]
            rows = rows.drop(['Experiment ID', 'Treatment ID'], axis=1).reset_index(drop=True)
            rows = rows.dropna(axis=1)
            if rows.shape[0] < 2:
                # no row or a single row: nothing to compare against
                continue
            # element-wise comparison of every row with the first one
            differs = rows.ne(rows.iloc[0], axis=1)
            if differs.to_numpy().any():
                dft.loc[dft['Experiment ID'] == expid, sheet] = True
    return dft

In [7]:
# plot interactions between treatments investigated
def plotFactor(dfdic):
    """Draw an UpSet plot of the management practices investigated together.

    The boolean table returned by `extractTreatment` is indexed by its
    practice columns and handed to ``UpSet``; the figure is shown immediately.
    """
    treatments = extractTreatment(dfdic)
    practiceColumns = list(treatments.columns[1:])  # every column but 'Experiment ID'
    memberships = treatments.set_index(practiceColumns)
    figure = plt.figure()
    UpSet(memberships).plot(fig=figure)
    plt.show()
#plotFactor(dfdic)

In [8]:
# filter experiments
class RowFilter(object):
    """One filter row: sheet dropdown, column dropdown, value selector, remove button.

    The row inserts itself in the `filters` container just before its last
    child. The value selector depends on the column type declared in `dbdic`:
    two number boxes (min/max) for 'number' columns, one checkbox per distinct
    value for 'choice' and 'string' columns.

    Parameters
    ----------
    filters : ipywidgets.VBox
        Container holding the filter rows; the row is added before its last child.
    dfdic : dict
        ``{sheet name: DataFrame}`` used to compute the proposed values.
    """
    def __init__(self, filters, dfdic):
        self.sheetDropdown = Dropdown(options=list(dbdic.keys()), layout=Layout(width='15%'))
        self.sheetDropdown.observe(self.sheetDropdownFunc, names='value')
        self.colDropdown = Dropdown(options=list(dbdic['experiment'].keys()), layout=Layout(width='15%'))
        self.colDropdown.observe(self.colDropdownFunc, names='value')
        self.rmBtn = Button(description='Remove')
        self.rmBtn.on_click(self.rmBtnFunc)
        self.hbox = HBox([self.sheetDropdown, self.colDropdown, self.rmBtn], layout=Layout(display='flex'))
        self.filters = filters
        # position at creation time; kept for reference only, removal is done
        # by identity because positions shift when other rows are removed
        self.index = len(self.filters.children) - 1
        self.filters.children = self.filters.children[:-1] + (self.hbox, self.filters.children[-1])
        self.dfdic = dfdic
        self.opts = None
        self.buildOptions('experiment', 'Latitude')

    def sheetDropdownFunc(self, a):
        """Refresh the column dropdown when another sheet is selected."""
        self.colDropdown.options = list(dbdic[a['new']].keys())

    def colDropdownFunc(self, a):
        """Rebuild the value selector when another column is selected."""
        sheet = self.sheetDropdown.value
        col = a['new']
        self.buildOptions(sheet, col)

    def buildOptions(self, sheet, col):
        """Build the value selector for `col` of `sheet` and place it in the row."""
        typ = dbdic[sheet][col]
        # use the data given to this row, not whatever the global `dfdic`
        # happens to be bound to
        values = self.dfdic[sheet][col]
        if typ == 'number':
            vmin = values.min()
            vmax = values.max()
            self.opts = HBox([Label('min:'), FloatText(value=vmin, layout=Layout(width='30%')),
                              Label('max:'), FloatText(value=vmax, layout=Layout(width='30%'))])
        if typ == 'choice' or typ == 'string':
            choices = values.dropna().unique()
            self.opts = VBox([Checkbox(value=True, description=a, indent=False)
                              for a in choices], layout=Layout(width='40%'))
        if len(self.hbox.children) > 3:
            # a selector is already displayed: replace it
            self.hbox.children = self.hbox.children[:-2] + (self.opts, self.hbox.children[-1])
        else:
            # first selector: insert it before the remove button
            self.hbox.children = self.hbox.children[:-1] + (self.opts, self.hbox.children[-1])

    def rmBtnFunc(self, a):
        """Remove this row from the container."""
        # remove by identity: a stored position becomes stale as soon as
        # another row is removed
        self.filters.children = tuple(
            child for child in self.filters.children if child is not self.hbox)

def buildFilters(dfdic, dfdico, callback):
    """Build the filter panel (add / apply / reset buttons plus the filter rows).

    Parameters
    ----------
    dfdic : dict
        ``{sheet name: DataFrame}``; filtered IN PLACE (its values are
        replaced) when 'Filter data' is clicked and restored in place when
        'Reset filters' is clicked, so every holder of this dict sees the change.
    dfdico : dict
        Backup of the unfiltered data used by 'Reset filters'.
    callback : callable
        Called without argument after the data has been filtered or restored.

    Returns
    -------
    ipywidgets.VBox
        The filter rows followed by the button bar (always the last child).
    """

    def addFilterBtnFunc(b):
        # the row registers itself in `filters`
        _ = RowFilter(filters, dfdic)
    addFilterBtn = Button(description='Add filter')
    addFilterBtn.on_click(addFilterBtnFunc)

    def filterDataBtnFunc(a):
        expids = pd.Series(dfdic['experiment']['Experiment ID'].unique())
        # an experiment is retained only if it satisfies every filter row
        for child in filters.children[:-1]:
            sheet = child.children[0].value
            col = child.children[1].value
            typ = dbdic[sheet][col]
            if typ == 'number':
                vmin = child.children[2].children[1].value
                vmax = child.children[2].children[3].value
                ie = (dfdic[sheet][col] >= vmin) & (dfdic[sheet][col] <= vmax)
            else:
                choices = [b.description for b in child.children[2].children if b.value is True]
                ie = dfdic[sheet][col].isin(choices)
            expids = expids[expids.isin(dfdic[sheet][ie]['Experiment ID'].unique())]
        log.clear_output()
        with log:
            print('Experiment ID retained: {:d}/{:d}'.format(len(expids),
                  dfdic['experiment']['Experiment ID'].unique().shape[0]))
        # do the filtering
        expids = expids.tolist()
        for sheet in dfdic.keys():
            if 'Experiment ID' in dfdic[sheet].keys():
                ie = dfdic[sheet]['Experiment ID'].isin(expids)
                dfdic[sheet] = dfdic[sheet][ie].reset_index(drop=True)
        callback() # run the callback
    filterDataBtn = Button(description='Filter data')
    filterDataBtn.on_click(filterDataBtnFunc)

    def resetFilterBtnFunc(a):
        # Restore the data IN PLACE. Rebinding the global name would leave the
        # other closures of this panel (and the rows created afterwards)
        # working on the old, filtered dict.
        restored = dicCopy(dfdico)
        dfdic.clear()
        dfdic.update(restored)
        filters.children = (filters.children[-1],)
        log.clear_output()
        with log:
            print('{:d} experiments restored'.format(
                dfdic['experiment']['Experiment ID'].unique().shape[0]))
        callback()
    resetFilterBtn = Button(description='Reset filters')
    resetFilterBtn.on_click(resetFilterBtnFunc)

    log = Output()

    filters = VBox([HBox([addFilterBtn, filterDataBtn, resetFilterBtn, log])])
    return filters

#display(buildFilters(dfdic, dicCopy(dfdic)))

In [9]:
# interactive histogram view
class ExploratoryHist(object):
    """Interactive view of one column: histogram or bar chart plus a text preview.

    The assembled widget is available as the ``hbox`` attribute. 'number'
    columns (as declared in `dbdic`) are drawn as a histogram, every other
    column as a bar chart of the value counts.
    """
    def __init__(self, dfdic):
        self.sheetDropdown = Dropdown(options=list(dbdic.keys()), layout=Layout(width='25%'))
        self.sheetDropdown.observe(self.sheetDropdownFunc, names='value')
        self.colDropdown = Dropdown(options=list(dbdic['experiment'].keys()), layout=Layout(width='45%'))
        self.colDropdown.observe(self.colDropdownFunc, names='value')
        self.dfdic = dfdic
        self.showLog = Checkbox(value=True, description='Show values head/tail')
        self.out = Output()
        self.log = Output()
        selectors = HBox([self.sheetDropdown, self.colDropdown])
        leftPanel = VBox([selectors, self.out])
        rightPanel = VBox([self.showLog, self.log])
        self.hbox = HBox([leftPanel, rightPanel])
        self.buildFigure('experiment', 'Latitude')

    def sheetDropdownFunc(self, a):
        """List the columns of the newly selected sheet."""
        self.colDropdown.options = list(dbdic[a['new']].keys())

    def colDropdownFunc(self, a):
        """Redraw for the newly selected column of the current sheet."""
        self.buildFigure(self.sheetDropdown.value, a['new'])

    def buildFigure(self, sheet, col):
        """Refresh the text preview and the graph for `col` of `sheet`."""
        self.out.clear_output()
        self.log.clear_output()
        with self.log:
            if self.showLog.value is True:
                table = self.dfdic[sheet]
                if table.shape[0] > 10:
                    # first five and last five rows only
                    print(table.iloc[np.r_[0:5, -5:0]][col])
                else:
                    print(table[col])
        with self.out:
            fig, ax = plt.subplots()
            isNumeric = dbdic[sheet][col] == 'number'
            series = self.dfdic[sheet][col]
            if isNumeric:
                series.plot.hist(ax=ax)
                ax.set_xlabel(col)
            else:
                series.value_counts().plot(kind='bar', xlabel=col, ax=ax)
                ax.set_ylabel('Count')
            plt.show() # needed, otherwise, graph won't change

#expHist = ExploratoryHist(dfdic)
#display(expHist.hbox)

In [10]:
# plot a numeric as dots with colors as categorical


In [11]:
# interactive meta-analysis
# TODO a condition might be found from several of these
# select columns for interesting factor and reference within
class RowReference(object):
    """One reference row: sheet, 'choice' column and the value used as control.

    The row inserts itself in the `rows` container just before its last child.
    Its widgets are, in order: sheet dropdown, column dropdown, a label, the
    radio buttons listing the distinct values of the column, the remove button.

    Parameters
    ----------
    rows : ipywidgets.VBox
        Container holding the reference rows.
    dfdic : dict
        ``{sheet name: DataFrame}`` used to list the values of a column.
    """
    def __init__(self, rows, dfdic):
        self.sheetDropdown = Dropdown(options=list(dbdic.keys())[3:-2], layout=Layout(width='15%'))
        self.sheetDropdown.observe(self.sheetDropdownFunc, names='value')
        self.colDropdown = Dropdown(options=[], layout=Layout(width='15%'))
        self.colDropdown.observe(self.colDropdownFunc, names='value')
        self.rmBtn = Button(description='Remove')
        self.rmBtn.on_click(self.rmBtnFunc)
        self.hbox = HBox([self.sheetDropdown, self.colDropdown, Label('Reference:'), self.rmBtn],
                         layout=Layout(display='flex'))
        self.rows = rows
        # position at creation time; kept for reference only, removal is done
        # by identity because positions shift when other rows are removed
        self.index = len(self.rows.children) - 1
        self.rows.children = self.rows.children[:-1] + (self.hbox, self.rows.children[-1])
        self.dfdic = dfdic
        self.opts = None
        self.sheetDropdownFunc({'new':'treatment'})
        # Default to 'Crop rotation' and make the dropdown show it too:
        # previously the dropdown displayed the first column of the sheet
        # while the radio buttons listed the 'Crop rotation' values.
        if 'Crop rotation' in self.colDropdown.options:
            self.colDropdown.value = 'Crop rotation'
        self.buildOptions('treatment', self.colDropdown.value)

    def sheetDropdownFunc(self, a):
        """List the 'choice' columns of the newly selected sheet."""
        sheet = a['new']
        vals = [b for b in dbdic[sheet] if dbdic[sheet][b] == 'choice']
        self.colDropdown.options = vals

    def colDropdownFunc(self, a):
        """Rebuild the radio buttons when another column is selected."""
        sheet = self.sheetDropdown.value
        col = a['new']
        self.buildOptions(sheet, col)

    def buildOptions(self, sheet, col):
        """List the distinct values of `col` as radio buttons in the row."""
        # use the data given to this row, not whatever the global `dfdic`
        # happens to be bound to
        choices = self.dfdic[sheet][col].dropna().unique()
        self.opts = RadioButtons(options=choices, layout=Layout(width='30%'))
        if len(self.hbox.children) > 4:
            # radio buttons already displayed: replace them
            self.hbox.children = self.hbox.children[:-2] + (self.opts, self.hbox.children[-1])
        else:
            # first build: insert them before the remove button
            self.hbox.children = self.hbox.children[:-1] + (self.opts, self.hbox.children[-1])

    def rmBtnFunc(self, a):
        """Remove this row from the container."""
        # remove by identity: a stored position becomes stale as soon as
        # another row is removed
        self.rows.children = tuple(
            child for child in self.rows.children if child is not self.hbox)

def buildMetaRef(dfdic):
    """Build the container of reference rows for the meta-analysis.

    Returns a VBox whose last child is the bar holding the 'Add reference'
    button; each click adds a `RowReference` row before that bar.
    """
    addRefBtn = Button(description='Add reference')
    refs = VBox([HBox([addRefBtn])])

    def addRefBtnFunc(b):
        # the row registers itself in `refs`
        RowReference(refs, dfdic)

    addRefBtn.on_click(addRefBtnFunc)
    return refs

#display(buildMetaRef(dfdic))

In [12]:
# select columns that must be the same in pairwise comparision
# select type of effect sizes (diff or ratio) and if log or not
# colCheck: one checkbox per 'choice' column, labelled 'sheet | column'
# numChoices: 'sheet | column' labels of the 'number' columns of the 'data-' sheets
colCheck = []
numChoices = []
for sheet in dbdic:
    for key in dbdic[sheet]:
        if dbdic[sheet][key] == 'choice':
            colCheck.append(Checkbox(value=False, 
                                     description=sheet + ' | ' + key,
                                     indent=False))
        elif dbdic[sheet][key] == 'number' and 'data-' in sheet:
            numChoices.append(sheet + ' | ' + key)
# numeric column on which the effect sizes are computed
numRadio = RadioButtons(options=numChoices)
# how the effect size is computed from the treatment and control values
esType = RadioButtons(options=['difference', 'ratio'])
# NOTE: the value of this widget is the string 'yes' or 'no', not a boolean
esLog = RadioButtons(options=['yes', 'no'], value='no')
# buildMetaDf reads the checkboxes as metaBox.children[0].children[1:]
metaBox = HBox([VBox([Label('Identical column in pairwise:')] + colCheck),
                VBox([Label('Numerical column for effect sizes:'), numRadio]),
                VBox([Label('How to compute ES?'), esType,
                      Label('Apply log on numeric value?'), esLog])
               ])

#display(metaBox)

In [13]:
# columns with reference/control
def buildMetaDf(refs, dfdic):
    """Assemble the table used for the meta-analysis.

    The selection is read from the widgets: the reference rows of `refs`
    (control column and control value), the ticked checkboxes of the global
    `metaBox` (columns that must be identical within a pair) and the global
    `numRadio` (numeric column for the effect size).

    Parameters
    ----------
    refs : ipywidgets.VBox
        Container built by `buildMetaRef`; its last child is skipped.
    dfdic : dict
        ``{sheet name: DataFrame}``.

    Returns
    -------
    dfmeta : pandas.DataFrame
        Merged table with one 'pc...' column per control, initialised to 0.
    numcol : str
        Name of the numeric column.
    cols : list of str
        Columns that must be identical within a pair.
    controls : list of tuple
        ``(column, control value, name of the 'pc...' column)`` per reference row.
    """
    controls = []
    controlSheets = []
    for child in refs.children[:-1]:
        sheet = child.children[0].value
        col = child.children[1].value
        ref = child.children[3].value
        # name of the pairwise column: 'pc' + column name in CamelCase
        h = camelCase(col)
        controls.append((col, ref, 'pc' + h[0].upper() + h[1:]))
        controlSheets.append(sheet)

    # columns that should be similar
    cols = []
    sheets = []
    for child in metaBox.children[0].children[1:]:
        if child.value is True:
            sheet, col = child.description.split(' | ')
            cols.append(col)
            sheets.append(sheet)

    # column with numeric type for effect-size
    numsheet, numcol = numRadio.value.split(' | ')

    # create merged df
    ccols = [a[0] for a in controls]
    ucols = np.unique(cols + ccols + 
                      ['Experiment ID', 'Treatment ID'] + [numcol]) # TODO include Rotation?
    usheets = np.unique(sheets + controlSheets + [numsheet])
    for i, sheet in enumerate(usheets):
        # keep only the columns of interest present in this sheet
        ie = dfdic[sheet].columns.isin(ucols)
        scols = dfdic[sheet].columns[ie].tolist()
        df = dfdic[sheet][scols].copy()
        # we clean the table by ensuring that each Treatment ID has only
        # one row. For this we identify the row that contains reference/control
        # values, to be sure to keep them. If a treatment does not contain any,
        # the first row of it is kept
        df['iref'] = 0
        for ccol, ccval, pcColumn in controls:
            if ccol in df.columns:
                # check which rows contains the reference value
                df.loc[:, 'iref'] = df['iref'].values + df[ccol].isin([ccval]).astype(int)
        # per (experiment, treatment), keep the row with the highest count of control values
        df = df.loc[df.groupby(['Experiment ID', 'Treatment ID'])['iref'].idxmax(),:]
        df = df.drop(['iref'], axis=1).reset_index(drop=True)
        if i == 0:
            dfmeta = df
        else:
            # no key given: the merge is done on the columns common to both tables
            dfmeta = pd.merge(dfmeta, df, how='outer')
    dfmeta = dfmeta[ucols]
    print('shrunk:', dfmeta.shape[0], '-> ', end='')
    dfmeta = dfmeta.dropna(subset=ccols + [numcol], axis=0).reset_index(drop=True)
    # INFO we enforce not nan in the control columns too
    print(dfmeta.shape[0], 'after NaN drop')

    # two issues:
    # 1) the selected 'same columns' are not enough to differentiate the treatments,
    # hence, only the last detected control is compared against all other treatments
    # -> the user need to add more 'same columns' to differentiate the treatments
    # 2) because of the outer merge, and many to many relationship, multiple amendment
    # can be entered event if they refer to the same treatment -> for each treatment
    # and for each control column, identify if the treatment contains the control
    # value, if yes, keep the row and discard other rows within this treatment
    # if no, just keep the first row of it (implemented above)

    # build pair-wise comparison
    # (0 means 'not part of any pair'; the columns are filled by buildPairwise)
    for a, b, c in controls:
        dfmeta[c] = 0

    return dfmeta, numcol, cols, controls

#dfmeta, numcol, cols, controls = buildMetaDf(refs, dfdic)
#dfmeta

In [14]:
# build pairwise inside each publication
# TODO decide weather to include rotation or not
def buildPairwise(dfmeta, numcol, cols, controls):
    """Pair, inside each experiment, every control row with its treatment rows.

    For each control column, the rows whose value equals the control value are
    numbered 1, 2, ... within the experiment. A control row is paired with the
    other rows of the experiment whose control column holds another value
    (neither missing nor 'unknown') and whose 'same' columns are equal
    wherever both rows have a value. The control row receives ``-c`` and the
    paired rows ``c`` in the 'pc...' column of that control. The table is
    modified in place and also returned.

    Parameters
    ----------
    dfmeta : pandas.DataFrame
        Table built by `buildMetaDf`; rows are addressed by position through
        ``.loc``, so a default integer index is assumed.
    numcol : str
        Name of the numeric column (not used in this function).
    cols : list of str
        Columns that must be identical within a pair.
    controls : list of tuple
        ``(column, control value, name of the 'pc...' column)``.

    Returns
    -------
    pandas.DataFrame
        `dfmeta` with the 'pc...' columns filled.
    """
    scols = np.array(['Experiment ID'] + cols)
    for expid in dfmeta['Experiment ID'].unique():
        ie = dfmeta['Experiment ID'] == expid
        irows = np.where(ie)[0]
        for ccol, cval, pcColumn in controls:
            c = 0 # number of control rows met so far in this experiment
            # the control column itself is excluded from the 'same' columns
            colSame = scols[scols != ccol].copy()
            for i in irows:
                row1 = dfmeta.loc[i, colSame]
                inan1 = row1.isna()
                # if the columns is a controlled value
                if dfmeta.loc[i, ccol] == cval:
                    flag = False
                    c = c + 1
                    for j in irows:
                        # candidate treatment: another row whose control column
                        # is a known value different from the control value
                        if ((j != i) & (dfmeta.loc[j, ccol] != cval) &
                           (pd.isna(dfmeta.loc[j, ccol]) is False) &
                           (dfmeta.loc[j, ccol] != 'unknown')):
                            row2 = dfmeta.loc[j, colSame]
                            inan2 = row2.isna()
                            # compare only where both rows have a value
                            if row1.eq(row2)[~inan1 & ~inan2].all():
                                dfmeta.loc[i, pcColumn] = -c
                                dfmeta.loc[j, pcColumn] = c

    # number of rows flagged as treatment, per control
    for a, b, c in controls:
        print(c, np.sum(dfmeta[c] > 0))

    return dfmeta

#dfmeta = buildPairwise(dfmeta, numcol, cols, controls)
#dfmeta

In [15]:
# compute effect sizes
def buildES(dfmeta, numcol, cols, controls):
    """Compute the effect size of every treatment row against its control.

    For each control, an 'es...' column is added (NaN by default). Within each
    experiment, the rows holding ``k`` in the 'pc...' column are treatments of
    the control row holding ``-k``; their effect size is the difference or the
    ratio (depending on the global `esType`) between the treatment and the
    control value of `numcol`. When the global `esLog` is set to 'yes',
    `numcol` is first replaced by its base-10 logarithm. The table is modified
    in place and also returned.

    Parameters
    ----------
    dfmeta : pandas.DataFrame
        Table returned by `buildPairwise` (default integer index assumed).
    numcol : str
        Name of the numeric column.
    cols : list of str
        Columns identical within a pair (not used in this function).
    controls : list of tuple
        ``(column, control value, name of the 'pc...' column)``.

    Returns
    -------
    pandas.DataFrame
        `dfmeta` with the 'es...' columns filled.
    """
    # esLog offers the strings 'yes'/'no': comparing its value with the
    # boolean True never matched, so the log was never applied
    if esLog.value == 'yes' or esLog.value is True:
        dfmeta[numcol] = dfmeta[numcol].apply(np.log10)
        print('applying log on numeric column')
    for ccol, cval, pcColumn in controls:
        dfmeta['es' + pcColumn[2:]] = np.nan
    for expid in dfmeta['Experiment ID'].unique():
        ie = dfmeta['Experiment ID'] == expid
        for ccol, cval, pcColumn in controls:
            esColumn = 'es' + pcColumn[2:]
            # pair numbers cannot exceed the number of rows of the experiment
            for i in range(np.sum(ie)):
                # identify the treatments (positive number)
                ieTrt = dfmeta[pcColumn] == (i+1)
                itrts = np.where(ie & ieTrt)[0]
                if len(itrts) > 0:
                    # and look for its control (negative number)
                    ieCtl = dfmeta[pcColumn] == -(i+1)
                    ictl = np.where(ie & ieCtl)[0][0]
                    # get the control value
                    controlValue = dfmeta.loc[ictl, numcol]
                    for itrt in itrts:
                        # get the treatment value
                        treatmentValue = dfmeta.loc[itrt, numcol]
                        # compute effect size
                        if esType.value == 'difference':
                            dfmeta.loc[itrt, esColumn] = treatmentValue - controlValue
                        else:
                            dfmeta.loc[itrt, esColumn] = treatmentValue / controlValue
    # a row is unassigned when it belongs to no pair, i.e. all its 'pc...'
    # columns are still 0 (axis must be given by keyword with pandas >= 2.0)
    pcColumns = [a[2] for a in controls]
    print('Number of unassigned rows:', np.sum((dfmeta[pcColumns] == 0).all(axis=1)))
    return dfmeta

#dfmeta = buildES(dfmeta, numcol, cols, controls)
#dfmeta

In [16]:
# plot effect sizes
def plotES(dfmeta, numcol, cols, controls):
    """Plot the mean effect size (with its standard error) of every control.

    One horizontal error bar is drawn per 'es...' column of `dfmeta`; the tick
    label gives the name of the control and the number of non-null effect
    sizes. A dashed vertical line marks the absence of effect (0 for a
    difference, 1 for a ratio, depending on the global `esType`).
    """
    fig, ax = plt.subplots()
    ax.set_title('Effect size on ' + numcol)
    noEffect = 0 if esType.value == 'difference' else 1
    ax.axvline(noEffect, linestyle='--', color='k')
    ylabs = []
    for position, control in enumerate(controls):
        shortName = control[2][2:]  # name of the 'pc...' column without its prefix
        effects = dfmeta['es' + shortName]
        ax.errorbar(effects.mean(), position, xerr=effects.sem(), marker='o', label=shortName)
        ylabs.append('{:s} ({:d})'.format(shortName, effects.notnull().sum()))
    ax.set_xlabel('Effect size') # units if difference, nothing if ratio
    ax.set_yticks(np.arange(len(ylabs)))
    ax.set_yticklabels(ylabs)
    plt.show()

#plotES(dfmeta, controls)

In [17]:
# identifier columns: never offered as a selectable column by buildSelection and
# always added (when present in the sheet) to the columns extracted by getMergedDf
ids = ['Experiment ID', 'Treatment ID', 'Crop ID', 'Rotation', 'Publication ID']

def buildSelection(dfdic):
    """Build one unticked checkbox per exportable column.

    The first two sheets and the last sheet of `dfdic` are skipped. Each
    checkbox is labelled 'sheet | column'; the identifier columns listed in
    `ids` are not offered.

    Returns
    -------
    ipywidgets.VBox
        The checkboxes, in sheet then column order.
    """
    labels = []
    for sheet in list(dfdic.keys())[2:-1]:
        columns = dfdic[sheet].columns
        selectable = columns[~columns.isin(ids)]
        labels.extend((sheet + ' | ' + selectable).tolist())
    return VBox([Checkbox(value=False, description=label, indent=False)
                 for label in labels])

def getMergedDf(dfdic, vbox):
    """Merge the columns ticked in `vbox` into a single table.

    For every sheet with at least one ticked column, the ticked columns plus
    the identifier columns of `ids` present in the sheet are extracted; the
    extracts are combined with outer merges on their common columns.

    Parameters
    ----------
    dfdic : dict
        ``{sheet name: DataFrame}``.
    vbox : container
        Object whose ``children`` have a 'sheet | column' ``description`` and
        a boolean ``value`` (as built by `buildSelection`).

    Returns
    -------
    pandas.DataFrame
        The merged table (empty when nothing is ticked).
    """
    ticked = [box.description.split(' | ') for box in vbox.children if box.value is True]
    dfsel = pd.DataFrame(ticked, columns=['sheet', 'column'])
    dfm = pd.DataFrame()
    for position, sheet in enumerate(dfsel['sheet'].unique()):
        table = dfdic[sheet]
        wanted = pd.Series(dfsel[dfsel['sheet'] == sheet]['column'].tolist() + ids)
        # identifier columns absent from this sheet are ignored
        present = wanted[wanted.isin(table.columns.tolist())]
        extract = table[present]
        if position == 0:
            dfm = extract
        else:
            dfm = pd.merge(dfm, extract, how='outer')
    return dfm


#vbox = buildSelection(dfdic)
#vbox

In [18]:
def prepareDownload(df, name='Download File'):
    """Return an HTML download button embedding `df` as an Excel workbook.

    The table is written to a workbook inside a temporary directory, read back
    and embedded in the link as a base64 data URI, so nothing is left on disk.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to export (written without its index).
    name : str, optional
        Text shown on the button.

    Returns
    -------
    IPython.display.HTML
        The button; the downloaded file is named 'df.xlsx'.
    """
    fname = 'df.xlsx'
    with tempfile.TemporaryDirectory() as td:
        # join the two parts: concatenating them created the file NEXT TO the
        # temporary directory, where it was never cleaned up
        fpath = os.path.join(td, fname)
        df.to_excel(fpath, index=False)
        with open(fpath,  'rb') as f:
            data = f.read()
    payload = base64.b64encode(data).decode()

    # the payload is an xlsx workbook, not a csv file
    html_button = '''
    <a download="{fname}" href="data:application/vnd.openxmlformats-officedocument.spreadsheetml.sheet;base64,{payload}">
    <button class="p-Widget jupyter-widgets jupyter-button widget-button mod-warning">{name}</button>
    </a>
    '''.format(payload=payload, fname=fname, name=name)

    return HTML(html_button)

#prepareDownload(dfdic['experiment'])

In [19]:
# module-level state shared by the widget callbacks below (rebound through `global`)
dfdic = pd.DataFrame() # placeholder; loadBtnFunc rebinds it to the value returned by readExcel
df = pd.DataFrame() # placeholder; extractBtnFunc rebinds it to the extracted table
dfmeta = pd.DataFrame() # placeholder; rebound to the meta-analysis table by runMetaBtnFunc

# upload button
def loadBtnFunc(btn):
    """Load the database and (re)build the whole interface below the load bar.

    The workbook is read from the Google Sheet URL when one is given,
    otherwise from the uploaded file. The global `dfdic` is rebound to the
    loaded data, then the filter panel, the exploratory graph, the
    meta-analysis panel and the export panel are appended to `mainLayout`.
    Messages and errors are shown in the `out` widget.

    Parameters
    ----------
    btn : ipywidgets.Button
        The clicked button (not used).
    """
    global dfdic, dfmeta
    out.clear_output()
    with out:
        if gsurl.value != '':
            dfdic = readExcel(gsurl.value)
        elif len(uploadBtn.data) > 0:
            dfdic = readExcel(uploadBtn.data[0])
        else:
            # nothing to load: building the interface would fail on the
            # missing 'experiment' sheet
            print('Please upload a file or provide a Google Sheet URL first.')
            return

        # remove the 'unnamed columns'
        for sheet in dfdic:
            cols = dfdic[sheet].columns
            # labels are converted to text first: the .str accessor is not
            # available when the labels are not strings
            unnamed = cols.astype(str).str.contains('Unnamed')
            dfdic[sheet] = dfdic[sheet].drop(cols[unnamed], axis=1)

        # make a backup to be used if we reset the filters
        dfdico = dfdic.copy()

        # build descriptive stats
        plotFactor(dfdic)

        def addMore():
            """(Re)build the exploratory and meta-analysis parts from `dfdic`."""
            global dfmeta
            # exploratory graph
            expHist = ExploratoryHist(dfdic)

            # build meta-analysis part
            refs = buildMetaRef(dfdic)
            # metaBox variable already instantiated
            metaOut = Output()
            def runMetaBtnFunc(a):
                global dfmeta
                metaOut.clear_output()
                with metaOut:
                    dfmeta, numcol, cols, controls = buildMetaDf(refs, dfdic)
                    dfmeta = buildPairwise(dfmeta, numcol, cols, controls)
                    dfmeta = buildES(dfmeta, numcol, cols, controls)
                    plotES(dfmeta, numcol, cols, controls)
            runMetaBtn = Button(description='Run Meta-analysis (β)',
                                style= {'button_color':'orange'})
            runMetaBtn.on_click(runMetaBtnFunc)

            # keep the first four children (load bar, output, filters label,
            # filters) and the last two (export label, exports); everything in
            # between is replaced
            mainLayout.children = mainLayout.children[:4] + (
                Label('Exploratory bar chart:'), expHist.hbox,
                Label('<h1>Meta-analysis</h1>'), refs, metaBox,
                runMetaBtn, metaOut,
                mainLayout.children[-2],
                mainLayout.children[-1]
            )

        # build filtering
        filters = buildFilters(dfdic, dfdico, addMore)
        # the addMore() function, updates the exploratory part and meta-analysis part
        # with the dfdic after it has been filtered out

        # build export part
        vbox = buildSelection(dfdic)

        def expBtnFunc(a):
            global dfmeta
            expOut.clear_output()
            dfm = getMergedDf(dfdic, vbox)
            with expOut:
                display(prepareDownload(dfm, 'Merged file'))
                display(prepareDownload(dfmeta, 'Meta-analysis file'))
        expBtn = Button(description='Generate exports')
        expBtn.on_click(expBtnFunc)
        expOut = Output()
        exports = HBox([vbox, VBox([expBtn, expOut])])

        # append elements to layout
        mainLayout.children = mainLayout.children[:2] + (
            Label('Filters:'), filters,
            Label('Export either excel for meta-analysis or with the selected columns:'),
            exports,
        )
        addMore()

# upload of a single .xlsx file (read by loadBtnFunc when no URL is given)
uploadBtn = FileUpload(accept='.xlsx', multiple=False)

# where to put the url of the Google Sheet
# (takes precedence over the uploaded file in loadBtnFunc)
gsurl = Text()

# button triggering the loading and the construction of the interface
loadBtn = Button(description='Load File/URL', style= {'button_color':'orange'})
loadBtn.on_click(loadBtnFunc)

# output for displaying processing
out = Output()

# case of headers
# (applied to the column names of the extracted table by extractBtnFunc)
headerRadio = RadioButtons(
    options=['Default (with space)', 'camelCase', 'under_case'],
    description="Headers:")

# list of sheets to include in the extracted table
# (first checkbox is 'Select all'; the others are the names sheetNames[1:-3])
sheetChecks = VBox([Checkbox(description='Select all', indent=False)] + 
                             [Checkbox(description=sheet, indent=False) for sheet in sheetNames[1:-3]])
def sheetChecksFunc(a):
    # copy the state of 'Select all' to every other checkbox
    children = sheetChecks.children
    for child in children[1:]:
        child.value = children[0].value
sheetChecks.children[0].observe(sheetChecksFunc, names='value')

# list of variable to include in the extracted table
# (only the 'Select all' checkbox is created here)
measChecks = VBox([
    Checkbox(description='Select all', indent=False)
])
def measChecksFunc(a):
    # copy the state of 'Select all' to every other checkbox
    children = measChecks.children
    for child in children[1:]:
        child.value = children[0].value
measChecks.children[0].observe(measChecksFunc, names='value')

# display extracted links here
# (cleared and filled by extractBtnFunc)
extracted = Output()


# choice between stacked or 'vs Control' layout for extracted table
# ('vs Control' merges the treatment rows with the rows of their reference treatment)
treatmentRadio = RadioButtons(
    options=['Stacked','vs Control'],
    description='Treatments:')


# extract button (perform database extraction and merging of tables)
def extractBtnFunc(a):
    """Merge the selected sheets into one table and offer it for download.

    The 'experiment' sheet is always included; the sheets ticked in
    `sheetChecks` are merged to it (outer merge on the common columns) as well
    as the ticked measurements of the 'data' sheet when it exists. With the
    'vs Control' layout, the rows having a reference treatment are put side by
    side with the rows of that reference (suffixes '_T' and '_C'). The headers
    are renamed according to `headerRadio`. The table is stored in the global
    `df`, written to 'extracted.xlsx' and a download link is displayed in the
    `extracted` widget.

    Parameters
    ----------
    a : ipywidgets.Button
        The clicked button (not used).
    """
    global dfdic, df
    stacked = treatmentRadio.value == 'Stacked'
    sheetCheck = [box.description for box in sheetChecks.children[1:] if box.value is True]
    measCheck = [box.description for box in measChecks.children[1:] if box.value is True]
    df = dfdic['experiment'] # always include 'experiment' tab
    for sheet in sheetCheck:
        # by default merge is performed on columns of same name (so perfect for our case)
        df = pd.merge(df, dfdic[sheet], how='outer')
    # add data to this
    if 'data' in dfdic.keys():
        dfdata = dfdic['data']
        ie = np.isin(dfdata['Measurement'], measCheck)
        df = pd.merge(df, dfdata[ie], how='outer')

    if stacked is False:
        # rows without reference treatment are the controls
        iref = df['Reference treatment'].isna()
        dft = df[~iref].copy() # treatment
        dfc = df[iref].copy() # control
        cols = np.array(['Rotation', 'Crops ID']) # experiment id included later
        ie = np.isin(cols, df.columns)
        on = cols[ie].tolist() + ['Reference treatment']
        on = on + dfdic['experiment'].columns.tolist()
        # the ID of a control becomes the key matching the 'Reference treatment'
        # of its treatments
        dfc = dfc.drop('Reference treatment', axis=1)
        dfc = dfc.rename(columns={'Treatment ID': 'Reference treatment'})
        # 'Measurement' only exists when a 'data' sheet was merged: using it
        # as a key otherwise makes the merge fail on the missing column
        if 'Measurement' in df.columns and 'Measurement' not in on:
            on += ['Measurement']
        df = pd.merge(dft, dfc, on=on, how='outer', suffixes=('_T', '_C'))
    if headerRadio.value == 'camelCase':
        df = df.rename(columns=dict(zip(df.columns, list(map(camelCase, df.columns)))))
    elif headerRadio.value == 'under_case':
        df = df.rename(columns=dict(zip(df.columns, list(map(underCase, df.columns)))))
    extracted.clear_output()
    with extracted:
        fname = 'extracted.xlsx'
        df.to_excel(fname, index=False)
        with open(fname,  'rb') as f:
            data = f.read()
        b64 = base64.b64encode(data)
        payload = b64.decode()
        # the payload is an xlsx workbook, not a csv file
        html = ('<a download="{filename}" '
                'href="data:application/vnd.openxmlformats-officedocument.spreadsheetml.sheet;base64,{payload}" '
                'target="_blank">{title}</a>')
        html = html.format(payload=payload, title=fname, filename=fname)
        display(HTML(html))


# button running the extraction (extractBtnFunc)
extractBtn = Button(description = 'Extract data')
extractBtn.on_click(extractBtnFunc)


# layout
# options of the extraction (this box is not added to mainLayout in this cell)
optionLayout = HBox([
    VBox([Label('Options:'), treatmentRadio, headerRadio]),
    VBox([Label('Sheets:'), sheetChecks]),
    VBox([Label('Measurements:'), measChecks], layout=Layout(width='40%')),
])

#accordion = Accordion(children=[], titles=())

# main container: load bar and processing output; loadBtnFunc appends the
# other panels once a database is loaded
mainLayout = VBox([
    HBox([uploadBtn, Label('OR Google Sheet URL:'), gsurl, loadBtn]),
    out,
    #accordion
])


display(mainLayout)

VBox(children=(HBox(children=(FileUpload(value={}, accept='.xlsx', description='Upload'), Label(value='OR Goog…