# EJP Common template - Data Checking Module
Please upload your filled excel template and press 'Check' to see if there are any errors that required attention.

In [1]:
import pandas as pd
import numpy as np
import time
from ipywidgets import FileUpload, Button, Output
from IPython.display import FileLink

In [115]:
# proof of concept of web-based input file check (on binder for instance)

upload = FileUpload()
out = Output()
def processBtnFunc(btn):
    df = pd.read_excel(upload.data[0])
    df.to_excel('processedFile.xlsx', index=False)
    with out:
        print(df.head())
        display(FileLink('processedFile.xlsx'))
processBtn = widgets.Button(description = 'Check File')
processBtn.on_click(processBtnFunc)

# layout
display(upload)
display(processBtn)
display(out)

FileUpload(value={}, description='Upload')

Button(description='Check File', style=ButtonStyle())

Output()

In [105]:
fname = wd + 'ejp-common-template.xlsx'
sheetNames = ['experiment', 'reference', 'treatment', 'soil', 'tillage', 'crops',
             'fertilization', 'amendment', 'irrigation', 'pest', 'grazing',
             'measurement', 'data']
t0 = time.time()
dfdic = pd.read_excel(fname, sheet_name=sheetNames, skiprows=[0,1,3])
print('Read in {:.2f}s'.format(time.time() - t0))

Read in 19.81s


In [106]:
# list of the controlled vocabulary
dfdrop = pd.read_excel(fname, sheet_name='dropDownList', skiprows=[0])

## Data checking and sanitization
Check indexes make sense, that choices are from the Bonares list or the drop down menu, ...

In [107]:
def camelCase(s):
    s = s.split()
    return s[0].lower() + ''.join(i.capitalize() for i in s[1:])
camelCase('Experiment ID')

'experimentId'

In [109]:
# check that all ID colums contains unique values
tocheck = [
    ('experiment', 'Experiment ID'),
    ('treatment', 'Treatment ID'),
    ('crops', 'Crops ID'),
#     ('fertilization', 'Fertilizer ID'),
#     ('amendment', 'Amendment ID'),
#     ('irrigation', 'Irrigation ID'),
#     ('pest', 'Pest ID'),
#     ('weed', 'Weed ID'),
#     ('grazing', 'Grazing ID')
]
ok = True
for a in tocheck:
    df = dfdic[a[0]]
    if len(df[a[1]].unique()) != df.shape[0]:
        print(a[1], 'not unique!')
        ok = False
if ok:
    print('All indexes are unique.')

All indexes are unique.


In [110]:
# check that all Treatment ID are specified in subsequent sheet
treatmentIDs = dfdic['treatment']['Treatment ID'].unique()
tocheck = ['plant', 'residue', 'pest', 'weed', 'irrigation', 'tillage']
ok = True
for a in tocheck:
    specified = df['Treatment ID'].unique()
    ie = np.in1d(treatmentIDs, specified)
    if np.sum(~ie) > 0: # some treatment are not specified
        print(treatmentIDs[~ie], 'are not specified in', str(df))
        ok = False
if ok:
    print('All treatment IDs are specified in the different sheets.')

All treatment IDs are specified in the different sheets.


In [111]:
# check Reference treatment is part of treatmentID
df = dfdic['treatment']
ref = df['Reference treatment'].dropna().unique()
ids = df['Treatment ID'].unique()
ie = np.in1d(ref, ids)
if np.sum(~ie) > 0:
    print('Some "Reference treatment" do not refer to "Treatment ID"')

In [112]:
# check rotations do not appear suddently (not 100% sure about that)
rots = dfdic['tillage']['Rotation'].unique()
sheets = ['crops', 'pest', 'weed', 'irrigation', 'grazing']
for sheet in sheets:
    rots2 = dfdic[sheet]['Rotation'].unique()
    ie = np.in1d(rots2, rots)
    if np.sum(~ie) > 0:
        print('Rotations: {:s} from tab {:s} are not defined in other tabs.'.format(str(rots2[~ie])), sheet)

IndexError: Replacement index 1 out of range for positional args tuple

In [113]:
# check controlled vocabulary and raise new words introduced
newWords = {}
for key in dfdic.keys():
    df = dfdic[key]
    for col in df.columns:
        if col in dfdrop.columns:
            status = 'ok'
            voc = df[col].dropna().unique()
            cvoc = dfdrop[col].dropna().values
            ie = np.in1d(voc, cvoc)
            if np.sum(~ie) > 0:
                status = 'new words: ' + str(voc[~ie])
                newWords[col] = list(voc[~ie])
            print('check: {:14s} > {:34s}: {:s}'.format(key, col, status))

check: experiment     > Climate zone                      : ok
check: experiment     > European climatic zone            : ok
check: experiment     > Land use                          : ok
check: experiment     > Aspect                            : ok
check: experiment     > Specific research infrastructure  : new words: ['Flux tower, Rain out shelter, Lysimeter']
check: reference      > Publication type                  : ok
check: treatment      > Farming system                    : ok
check: soil           > Soil type WRB                     : ok
check: soil           > Soil type WRB qualifier           : ok
check: soil           > Soil type WRB specifier           : ok
check: soil           > Soil type USDA                    : ok
check: soil           > Soil type USDA qualifier          : ok
check: tillage        > Rotation                          : ok
check: tillage        > Tillage system                    : ok
check: tillage        > Tillage method                    : ok
che

In [None]:
# refactor ID (make them unique between experiment)


In [None]:
# multiply 'all treatments'

In [None]:
# identify which management practice is treatment specific
mgnt = ['soil','crops','tillage','fertilization','amendment','irrigation','pest','grazing']


## Crop rotation vizualization