# EJP Common template - Data Query Module
Upload the database (or provide the URL of a Google Sheet), then use the options below to select the information you want to retrieve. A link to a temporary Excel/CSV file will be provided for you to download.

In [1]:
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
from matplotlib.collections import PolyCollection
import matplotlib.dates as mdates
from xlsx2csv import Xlsx2csv
import tempfile
import os
import base64
    
from ipywidgets import (FileUpload, Button, Output, Dropdown, RadioButtons,
                        SelectMultiple, VBox, HBox, Layout, Checkbox, Label, Text)
from IPython.display import FileLink, HTML

# names of the worksheets handled by this notebook, in workbook order
sheetNames = [
    'experiment',
    'reference',
    'treatment',
    'soil',
    'tillage',
    'crops',
    'fertilization',
    'amendment',
    'irrigation',
    'pest-weed',
    'grazing',
    'soil-crop-measurement',
    'data',
    'dropDownList',
]

# columns forced to the pandas 'string' dtype when sheets are read from CSV
dtypes = {
    'Experiment ID': 'string',
    'Treatment ID': 'string',
    'Reference treatment': 'string',
}

def camelCase(s):
    """Convert a whitespace-separated header to camelCase.

    The first word is lower-cased and every following word is capitalized
    (first letter upper-case, remaining letters lower-case), then the words
    are concatenated without separator, e.g. 'Experiment ID' -> 'experimentId'.

    An empty or whitespace-only string yields '' instead of raising.
    """
    words = s.split()
    if not words:
        # nothing to convert; avoids IndexError on words[0]
        return ''
    return words[0].lower() + ''.join(w.capitalize() for w in words[1:])
def underCase(s):
    """Convert a whitespace-separated header to lower-case under_case.

    Words are split on any whitespace, lower-cased and joined with '_',
    e.g. 'Experiment ID' -> 'experiment_id'.
    """
    return '_'.join(map(str.lower, s.split()))

In [2]:
def readExcel2(data):
    """Read the template workbook by first converting every sheet to CSV.

    The workbook is converted with Xlsx2csv into a temporary directory, then
    each sheet listed in ``sheetNames`` is read from '<sheet>.csv' with pandas.
    Rows 0, 1 and 3 of each sheet are skipped, so the row at index 2 is used
    as header. Fully empty rows are dropped from every sheet, and columns of
    the 'data' sheet containing any missing value are dropped. Known date
    columns are converted to datetime when present.

    Parameters
    ----------
    data : passed unchanged to ``Xlsx2csv`` (e.g. the path of an .xlsx file).

    Returns
    -------
    dict mapping each sheet name to its DataFrame.
    """
    t0 = time.time()
    print('Reading in Excel file...', end='')
    converter = Xlsx2csv(data, outputencoding="utf-8")
    dfdic = {}
    with tempfile.TemporaryDirectory() as td:
        # sheetid=0 requests all sheets; each one is expected as '<sheet>.csv' in td
        converter.convert(td, sheetid=0)
        for sheet in sheetNames:
            fname = os.path.join(td, sheet + '.csv')
            df = pd.read_csv(fname, skiprows=[0, 1, 3], dtype=dtypes)
            dfdic[sheet] = df.dropna(how='all')
    # keep only the 'data' columns that are completely filled
    dfdic['data'] = dfdic['data'].dropna(axis=1)

    # (sheet, column) pairs to parse as dates; the sheet must be a key of dfdic.
    # NOTE(review): 'pest-weed' and 'soil-crop-measurement' are assumed to be the
    # sheets holding the pesticide and sampling dates - confirm against the template.
    datetimeList = [('crops', 'Sowing date'),
                    ('crops', 'Harvesting date'),
                    ('tillage', 'Tillage date'),
                    ('amendment', 'Amendment date'),
                    ('fertilization', 'Fertilizer application date'),
                    ('irrigation', 'Irrigation date'),
                    ('pest-weed', 'Pesticide application date'),
                    ('soil-crop-measurement', 'Sampling date'),
                    ('data', 'Date')
                   ]
    for sheet, col in datetimeList:
        df = dfdic[sheet]
        # convert only when the column exists in this sheet
        if col in df.columns:
            # assign() builds a new frame, so no chained-assignment warning
            dfdic[sheet] = df.assign(**{col: pd.to_datetime(df[col])})
    print('done ({:.2f}s)'.format(time.time() - t0))
    return dfdic
#dfdic = readExcel2('../../ejp-common-template2.xlsx')

In [3]:
def readExcel(fname):
    """Read every sheet of the template workbook with ``pandas.read_excel``.

    Parameters
    ----------
    fname : str, bytes or any other input accepted by ``pandas.read_excel``
        - a string starting with 'http' is treated as a Google Sheet URL: its
          last path component is replaced by 'export?format=xlsx';
        - raw bytes (e.g. the content of an uploaded file) are wrapped in a
          ``BytesIO`` buffer;
        - anything else is passed to pandas unchanged.

    Returns
    -------
    dict mapping each sheet name to its DataFrame. Rows 0, 1 and 3 of each
    sheet are skipped, so the row at index 2 is used as header.
    """
    import io  # local import: only needed for in-memory (uploaded) workbooks

    t0 = time.time()
    print('Reading in Excel file...', end='')
    if isinstance(fname, (bytes, bytearray, memoryview)):
        # pandas expects a path or a file-like object, not raw bytes
        source = io.BytesIO(fname)
    elif isinstance(fname, str) and fname.startswith('http'): # it's a google sheet url
        source = '/'.join(fname.split('/')[:-1] + ['export?format=xlsx'])
    else:
        source = fname
    dfdic = pd.read_excel(source, sheet_name=None, skiprows=[0, 1, 3])
    print('done ({:.2f}s)'.format(time.time() - t0))
    return dfdic

#dfdic = readExcel('../../../ejp-wp7/ejp-common-template2.xlsx')

In [11]:
# proof of concept of web-based input file check (on binder for instance)
# dfdic: sheet name -> DataFrame of the loaded workbook; starts as an empty
# dict so it has the same type as the value returned by the reader functions
dfdic = {}
# df: last extracted (merged) table
df = pd.DataFrame()

# upload button
def uploadBtnFunc(btn):
    """Load the workbook and list its measurements as checkboxes.

    Callback for ``loadBtn``. The Google Sheet URL typed in ``gsurl`` takes
    precedence; otherwise the first file held by ``uploadBtn`` is read. The
    result is stored in the global ``dfdic``. If the workbook has a 'data'
    sheet, ``measChecks`` is rebuilt with its 'Select all' box followed by one
    unticked checkbox per distinct 'Measurement' value, in sorted order.

    Parameters
    ----------
    btn : the clicked button (unused, required by ``Button.on_click``).
    """
    global dfdic
    out.clear_output()
    with out:
        url = gsurl.value.strip()
        if url != '':
            dfdic = readExcel(url)
        elif len(uploadBtn.data) > 0:
            # NOTE: writing the upload to disk and using readExcel2 would be
            # faster but was found less robust
            dfdic = readExcel(uploadBtn.data[0])
        else:
            print('Nothing to load: upload a file or provide a Google Sheet URL.')
            return
        if 'data' in dfdic:
            # drop missing values (they cannot be used as checkbox labels) and
            # sort so that the list is easy to scan
            measurements = sorted(dfdic['data']['Measurement'].dropna().unique())
            measChecks.children = [measChecks.children[0]] + \
                [Checkbox(value=False, indent=False, description=m) for m in measurements]
# file picker restricted to a single .xlsx file
uploadBtn = FileUpload(accept='.xlsx', multiple=False)
# (disabled) automatic loading as soon as a file is uploaded
#uploadBtn.observe(uploadBtnFunc, names='_counter')

# where to put the url of the Google Sheet
gsurl = Text()
# (disabled) automatic loading when the url changes
#gsurl.observed(uploadBtnFunc, names='_')

# loading is triggered explicitly by this button (see uploadBtnFunc)
loadBtn = Button(description='Load File/URL', style= {'button_color':'orange'})
loadBtn.on_click(uploadBtnFunc)

# output for displaying processing
out = Output()

# choice between stacked or 'vs Control' layout for extracted table
treatmentRadio = RadioButtons(
    options=['Stacked','vs Control'],
    description='Treatments:')

# case of headers
headerRadio = RadioButtons(
    options=['Default (with space)','camelCase','under_case'],
    description="Headers:")

# list of sheets to include in the extracted table
# first child is the 'Select all' box, followed by one box per name in
# sheetNames[1:-3], i.e. without 'experiment' and without the last three names
sheetChecks = VBox([Checkbox(description='Select all', indent=False)] + 
                             [Checkbox(description=sheet, indent=False) for sheet in sheetNames[1:-3]])
def sheetChecksFunc(a):
    """Copy the value of the first box of ``sheetChecks`` to all other boxes.

    ``a`` is the change notification received from ``observe``; it is unused.
    """
    boxes = sheetChecks.children
    for follower in boxes[1:]:
        follower.value = boxes[0].value
# ticking/unticking 'Select all' propagates to every sheet checkbox
sheetChecks.children[0].observe(sheetChecksFunc, names='value')

# list of variable to include in the extracted table
# only holds the 'Select all' box at first; uploadBtnFunc appends one
# checkbox per measurement once a workbook with a 'data' sheet is loaded
measChecks = VBox([
    Checkbox(description='Select all', indent=False)
])
def measChecksFunc(a):
    """Copy the value of the first box of ``measChecks`` to all other boxes.

    ``a`` is the change notification received from ``observe``; it is unused.
    """
    boxes = measChecks.children
    for follower in boxes[1:]:
        follower.value = boxes[0].value
# ticking/unticking 'Select all' propagates to every measurement checkbox
measChecks.children[0].observe(measChecksFunc, names='value')

# display extracted links here
# (extractBtnFunc clears this output and writes the download link into it)
extracted = Output()

# extract button (perform database extraction and merging of tables)
def extractBtnFunc(a):
    """Merge the selected sheets and measurements and offer the result as CSV.

    Callback for ``extractBtn``. Starting from the 'experiment' sheet, every
    ticked sheet is outer-merged on the columns it shares with the running
    table; then the rows of the 'data' sheet whose 'Measurement' is ticked are
    merged the same way. With the 'vs Control' option the table is split into
    treatment rows and control rows (those whose 'Reference treatment' is
    missing) which are joined side by side with '_T' / '_C' suffixes. Headers
    are renamed according to ``headerRadio``, the table is stored in the
    global ``df``, written to 'extracted.csv' and a download link is shown in
    the ``extracted`` output. Problems (no workbook loaded, missing sheet or
    column) are reported in ``extracted`` instead of raising.

    Parameters
    ----------
    a : the clicked button (unused, required by ``Button.on_click``).
    """
    global dfdic, df
    extracted.clear_output()
    if 'experiment' not in dfdic:
        with extracted:
            print('No database loaded: load a file or URL first.')
        return

    stacked = treatmentRadio.value == 'Stacked'
    sheetCheck = [box.description for box in sheetChecks.children[1:] if box.value is True]
    measCheck = [box.description for box in measChecks.children[1:] if box.value is True]
    df = dfdic['experiment'] # always include 'experiment' tab
    for sheet in sheetCheck:
        if sheet not in dfdic:
            with extracted:
                print("Sheet '{}' not found in the loaded file, skipped.".format(sheet))
            continue
        # by default merge is performed on columns of same name (so perfect for our case)
        df = pd.merge(df, dfdic[sheet], how='outer')
    # add data to this
    if 'data' in dfdic:
        dfdata = dfdic['data']
        selected = dfdata['Measurement'].isin(measCheck)
        df = pd.merge(df, dfdata[selected], how='outer')

    if not stacked:
        required = ['Reference treatment', 'Treatment ID']
        missing = [col for col in required if col not in df.columns]
        if missing:
            with extracted:
                print("'vs Control' layout needs the column(s) {}: "
                      "select the sheet(s) providing them.".format(', '.join(missing)))
            return
        iref = df['Reference treatment'].isna()
        dft = df[~iref].copy() # treatment
        dfc = df[iref].copy() # control
        # merge keys: optional crop columns, the reference link, all
        # experiment columns and, when data were merged in, the measurement
        on = [col for col in ('Rotation', 'Crops ID') if col in df.columns]
        on += ['Reference treatment']
        on += dfdic['experiment'].columns.tolist()
        if 'Measurement' in df.columns:
            on.append('Measurement')
        on = list(dict.fromkeys(on)) # remove duplicated keys, keep order
        # a control row is matched through its own ID: expose it under the
        # name the treatment rows use to point to their control
        dfc = dfc.drop('Reference treatment', axis=1)
        dfc = dfc.rename(columns={'Treatment ID': 'Reference treatment'})
        df = pd.merge(dft, dfc, on=on, how='outer', suffixes=('_T', '_C'))
    if headerRadio.value == 'camelCase':
        df = df.rename(columns=camelCase)
    elif headerRadio.value == 'under_case':
        df = df.rename(columns=underCase)
    with extracted:
        fname = 'extracted.csv'
        df.to_csv(fname, index=False)
        with open(fname, 'rb') as f:
            payload = base64.b64encode(f.read()).decode()
        # embed the file in the link itself (data URI) so it can be downloaded
        # without the notebook server having to serve the file
        html = '<a download="{filename}" href="data:text/csv;base64,{payload}" target="_blank">{title}</a>'
        html = html.format(payload=payload, title=fname, filename=fname)
        display(HTML(html))
        #display(FileLink('extracted.csv'))
# button running the extraction (see extractBtnFunc)
extractBtn = Button(description = 'Extract data')
extractBtn.on_click(extractBtnFunc)


# layout
# three columns side by side: options, sheets, measurements
optionLayout = HBox([
    VBox([Label('Options:'), treatmentRadio, headerRadio]),
    VBox([Label('Sheets:'), sheetChecks]),
    VBox([Label('Measurements:'), measChecks], layout=Layout(width='40%')),
])
# top to bottom: input row, load button, loading log, options,
# extract button and download link
mainLayout = VBox([
    HBox([uploadBtn, Label('OR Google Sheet URL:'), gsurl]),
    loadBtn,
    out,
    optionLayout,
    extractBtn,
    extracted
])


# render the whole interface in the notebook
display(mainLayout)

VBox(children=(HBox(children=(FileUpload(value={}, accept='.xlsx', description='Upload'), Label(value='OR Goog…