# EJP Common template - Data Checking Module
Please upload your filled-in Excel template and press 'Check File' to see whether there are any errors that require attention.

In [None]:
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt

from ipywidgets import FileUpload, Button, Output
from IPython.display import FileLink

# Names of the workbook sheets that readExcel() asks pandas to load.
sheetNames = [
    'experiment',
    'reference',
    'treatment',
    'soil',
    'tillage',
    'crops',
    'fertilization',
    'amendment',
    'irrigation',
    'pest',
    'grazing',
    'measurement',
    'data',
    'dropDownList',
]

In [2]:
def readExcel(data):
    """Read all sheets listed in ``sheetNames`` from an Excel workbook.

    Parameters
    ----------
    data : str, path-like, file-like or bytes
        Source handed to ``pandas.read_excel``. Raw byte strings (for
        example the content of a file uploaded through a widget) are wrapped
        in an in-memory buffer first, because passing bytes directly to
        ``read_excel`` is deprecated in recent pandas releases.

    Returns
    -------
    dict
        One ``DataFrame`` per sheet, keyed by sheet name.

    Notes
    -----
    Rows 0, 1 and 3 of every sheet are skipped, which leaves row 2 as the
    header row.
    """
    from io import BytesIO  # local import keeps this cell self-contained

    if isinstance(data, (bytes, bytearray, memoryview)):
        data = BytesIO(bytes(data))

    # perf_counter is monotonic, so the elapsed time cannot go negative.
    t0 = time.perf_counter()
    # flush so the message is visible while the (slow) read is running
    print('Reading in Excel file...', end='', flush=True)
    dfdic = pd.read_excel(data, sheet_name=sheetNames, skiprows=[0, 1, 3])
    print('done ({:.2f}s)'.format(time.perf_counter() - t0))
    return dfdic

# Load the template from a path relative to the notebook's working directory.
dfdic = readExcel('../../../ejp-wp7/ejp-common-template.xlsx')

Reading in Excel file...done (15.28s)


In [3]:
# check that all ID columns contain unique values
def checkID(dfdic):
    """Print which ID columns contain repeated values.

    Parameters
    ----------
    dfdic : dict
        Maps sheet names to DataFrames; the 'experiment', 'treatment' and
        'crops' sheets are inspected.

    For every inspected column holding a repeated value (missing values
    count as a value) a '<column> not unique!' line is printed. When no
    column is reported, a single confirmation line is printed instead.
    """
    idColumns = (
        ('experiment', 'Experiment ID'),
        ('treatment', 'Treatment ID'),
        ('crops', 'Crops ID'),
    )
    nbDuplicated = 0
    for sheet, column in idColumns:
        if not dfdic[sheet][column].is_unique:
            print(column, 'not unique!')
            nbDuplicated += 1
    if nbDuplicated == 0:
        print('All indexes are unique.')

In [4]:
# check that all Treatment IDs are specified in the subsequent sheets
def checkTreatmentID(dfdic):
    """Check that each experiment's treatments appear in the management sheets.

    For every experiment listed in the 'experiment' sheet, the treatment IDs
    declared in the 'treatment' sheet are looked up in the 'crops', 'pest',
    'irrigation', 'tillage' and 'soil' sheets (restricted to rows of the same
    experiment). Treatments missing from a sheet are printed; when nothing is
    missing for an experiment, a confirmation line is printed.

    Parameters
    ----------
    dfdic : dict
        Maps sheet names to DataFrames. Each sheet used here needs an
        'Experiment ID' and a 'Treatment ID' column.
    """
    dftreat = dfdic['treatment']  # loop-invariant, looked up once
    tocheck = ['crops', 'pest', 'irrigation', 'tillage', 'soil']

    # Missing experiment IDs (e.g. blank rows) compare unequal to everything,
    # so there is nothing to check for them.
    experimentIDs = dfdic['experiment']['Experiment ID'].dropna().unique()
    for expid in experimentIDs:
        ie = dftreat['Experiment ID'] == expid
        treatmentIDs = np.asarray(dftreat.loc[ie, 'Treatment ID'].unique())
        ok = True
        for a in tocheck:
            df = dfdic[a]
            ie = df['Experiment ID'] == expid
            specified = np.asarray(df.loc[ie, 'Treatment ID'].unique())
            # np.isin replaces the deprecated np.in1d
            missing = treatmentIDs[~np.isin(treatmentIDs, specified)]
            if missing.size > 0:  # some treatments are not specified
                # '{}' rather than '{:s}': experiment IDs may be numbers
                print('{} are not specified for "{}" in experiment "{}"'.format(
                    str(missing), a, expid))
                ok = False
        if ok:
            print('{}: all treatment IDs are specified in the different sheets.'.format(expid))
# Run the treatment check on the workbook stored in the module-level dfdic.
checkTreatmentID(dfdic)

['zeroTill with N' 'zeroTill no N' 'normalTill with N' 'normalTill no N'] are not specified for "soil" in experiment "cc1"
cc2: all treatment IDs are specified in the different sheets.


In [5]:
# check that rotations do not appear suddenly (not 100% sure about that)
def checkRotation(dfdic):
    """Report rotations used in other sheets but absent from 'tillage'.

    The 'Rotation' values of the 'tillage' sheet are taken as the reference.
    For each of the 'crops', 'pest', 'weed', 'irrigation' and 'grazing'
    sheets that is present in ``dfdic``, rotations not found in the
    reference are printed.

    Parameters
    ----------
    dfdic : dict
        Maps sheet names to DataFrames; every inspected sheet needs a
        'Rotation' column.
    """
    print('not sure about that...')
    # Empty cells are not rotations, so they are dropped before comparing.
    rots = np.asarray(dfdic['tillage']['Rotation'].dropna().unique())
    sheets = ['crops', 'pest', 'weed', 'irrigation', 'grazing']
    for sheet in sheets:
        # Not every listed sheet is necessarily loaded (e.g. 'weed' is not in
        # the list of sheets read from the workbook); skip instead of failing.
        if sheet not in dfdic:
            continue
        rots2 = np.asarray(dfdic[sheet]['Rotation'].dropna().unique())
        # np.isin replaces the deprecated np.in1d
        unknown = rots2[~np.isin(rots2, rots)]
        if unknown.size > 0:
            # both values go to format(); the sheet name used to be passed to
            # print() instead, which made format() fail
            print('Rotations: {:s} from tab {:s} are not defined in other tabs.'.format(
                str(unknown), sheet))

In [6]:
# check controlled vocabulary and raise new words introduced
def checkVocabulary(dfdic):
    """Compare sheet columns against the controlled vocabulary.

    Every column whose name also exists in the 'dropDownList' sheet is
    checked: its non-missing values must all appear in the drop-down column
    of the same name. One status line is printed per checked column.

    Parameters
    ----------
    dfdic : dict
        Maps sheet names to DataFrames and must contain 'dropDownList'.

    Returns
    -------
    dict
        Maps a column name to the list of values not found in the controlled
        vocabulary. Words from several sheets sharing a column name are
        merged. Columns without new words are absent.
    """
    dfdrop = dfdic['dropDownList']
    newWords = {}
    for key, df in dfdic.items():
        for col in df.columns:
            if col not in dfdrop.columns:
                continue
            status = 'ok'
            voc = np.asarray(df[col].dropna().unique())
            cvoc = np.asarray(dfdrop[col].dropna().values)
            # np.isin replaces the deprecated np.in1d
            unknown = voc[~np.isin(voc, cvoc)]
            if unknown.size > 0:
                status = 'new words: ' + str(unknown)
                # merge rather than overwrite: the same column name can be
                # used in several sheets
                known = newWords.setdefault(col, [])
                known.extend(w for w in unknown if w not in known)
            print('check: {:14s} > {:34s}: {:s}'.format(key, col, status))
    return newWords

In [7]:
# multiply 'all treatments'?

In [8]:
# identify which management practice is treatment specific
def extractTreatment(dfdic):
    """Placeholder for finding the treatment-specific management practices.

    Not implemented yet: the function only lists the management sheet names
    and returns None. ``dfdic`` is currently unused.
    """
    managementSheets = [
        'soil',
        'crops',
        'tillage',
        'fertilization',
        'amendment',
        'irrigation',
        'pest',
        'grazing',
    ]
    # TODO: work out which management practice is unique or not


In [9]:
# check for unexpected increase in date for sowing or harvesting


In [10]:
def createRotation(dfdic):
    """Placeholder for the rotation display; does nothing and returns None."""
    return None

In [12]:
# proof of concept of web-based input file check (on binder for instance)

upload = FileUpload()
out = Output()


def _uploadedContent(widget):
    """Return the bytes of the first uploaded file, or None if there is none."""
    # Older ipywidgets releases expose the raw file contents as a list in
    # ``.data``; keep using it when it is available.
    data = getattr(widget, 'data', None)
    if data is not None:
        return data[0] if len(data) > 0 else None
    # NOTE(review): newer ipywidgets releases are expected to provide
    # ``.value`` as a sequence of dict-like entries with a 'content' buffer
    # instead -- confirm against the installed version.
    value = widget.value
    if not value:
        return None
    return bytes(value[0]['content'])


def processBtnFunc(btn):
    """Button callback: read the uploaded workbook and run every check.

    All messages are written to the ``out`` widget. ``btn`` is the clicked
    button, supplied by ipywidgets, and is not used.
    """
    from io import BytesIO  # local import keeps this cell self-contained

    # start from a clean report so results of earlier clicks do not pile up
    out.clear_output()
    with out:
        content = _uploadedContent(upload)
        if content is None:
            # clicking before uploading used to end in an IndexError traceback
            print('No file uploaded yet: please upload a file first.')
            return
        # wrap the bytes in a buffer: pandas deprecates raw byte strings
        dfdic = readExcel(BytesIO(content))
        print('\n\n--------------------------------- Check IDs ---------------------------------')
        checkID(dfdic)
        print('\n\n--------------------------------- Check Treatment ID ------------------------')
        checkTreatmentID(dfdic)
        #print('\n\n--------------------------------- Check Rotation ----------------------------')
        #checkRotation(dfdic)
        print('\n\n--------------------------------- Check Controlled Vocabulary ---------------')
        checkVocabulary(dfdic)
        print('\n\n--------------------------------- Extract Treatment -------------------------')
        extractTreatment(dfdic)
        print('\n\n--------------------------------- Create Rotation Display -------------------')
        createRotation(dfdic)


processBtn = Button(description='Check File')
processBtn.on_click(processBtnFunc)

# layout
display(upload)
display(processBtn)
display(out)

FileUpload(value={}, description='Upload')

Button(description='Check File', style=ButtonStyle())

Output()