# EJP Common template - Data Checking Module
Please upload your completed Excel template and press 'Check File' to see whether there are any errors that require attention.

In [1]:
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
from matplotlib.collections import PolyCollection
import matplotlib.dates as mdates
from xlsx2csv import Xlsx2csv
import tempfile
import os
    
from ipywidgets import FileUpload, Button, Output, Dropdown
from IPython.display import FileLink

# Worksheet names that readExcel2 loads (one converted CSV file per name).
sheetNames = [
    'experiment',
    'reference',
    'treatment',
    'soil',
    'tillage',
    'crops',
    'fertilization',
    'amendment',
    'irrigation',
    'pest',
    'grazing',
    'measurement',
    'data',
    'dropDownList',
]

In [2]:
def readExcel2(data):
    """Load the template workbook through xlsx2csv and return one DataFrame per sheet.

    The workbook is converted to CSV files in a temporary directory, every
    sheet listed in ``sheetNames`` is read with pandas (rows 0, 1 and 3 are
    skipped, fully empty rows are dropped) and the known date columns are
    converted to datetimes.

    Parameters
    ----------
    data : the workbook handed to ``Xlsx2csv`` (the notebook passes a file path).

    Returns
    -------
    dict mapping sheet name -> pandas.DataFrame
    """
    started = time.time()
    print('Reading in Excel file...', end='')
    converter = Xlsx2csv(data, outputencoding="utf-8")
    dfdic = {}
    with tempfile.TemporaryDirectory() as tmpdir:
        # sheetid=0: the loop below expects one '<sheet>.csv' per worksheet
        converter.convert(tmpdir, sheetid=0)
        for sheet in sheetNames:
            csvPath = os.path.join(tmpdir, sheet + '.csv')
            frame = pd.read_csv(csvPath, skiprows=[0,1,3])
            dfdic[sheet] = frame.dropna(how='all')

    # (sheet, column) pairs that hold dates
    dateColumns = (
        ('crops', 'Sowing date'),
        ('crops', 'Harvesting date'),
        ('tillage', 'Tillage date'),
        ('amendment', 'Amendment date'),
        ('fertilization', 'Fertilizer application date'),
        ('irrigation', 'Irrigation date'),
        ('pest', 'Pesticide application date'),
        ('measurement', 'Sampling date'),
        ('data', 'Date'),
    )
    for sheet, column in dateColumns:
        frame = dfdic[sheet]
        frame[column] = pd.to_datetime(frame[column])

    elapsed = time.time() - started
    print('done ({:.2f}s)'.format(elapsed))
    return dfdic
#dfdic = readExcel2('../../../ejp-wp7/ejp-common-template2.xlsx')

In [3]:
def readExcel(data):
    """Read every sheet of the workbook with pandas and time the operation.

    Rows 0, 1 and 3 of each sheet are skipped.

    Returns
    -------
    dict mapping sheet name -> pandas.DataFrame (``sheet_name=None`` loads all sheets).
    """
    started = time.time()
    print('Reading in Excel file...', end='')
    sheets = pd.read_excel(data, sheet_name=None, skiprows=[0,1,3])
    elapsed = time.time() - started
    print('done ({:.2f}s)'.format(elapsed))
    return sheets

#dfdic = readExcel('../../../ejp-wp7/ejp-common-template2.xlsx')

In [4]:
# check that all ID columns contain unique values
def checkID(dfdic):
    """Print a warning for each ID column holding repeated values.

    Looks at 'Experiment ID' (sheet 'experiment'), 'Treatment ID' (sheet
    'treatment') and 'Crops ID' (sheet 'crops'); prints a confirmation when
    none of them contains duplicates. Returns None.
    """
    idColumns = (
        ('experiment', 'Experiment ID'),
        ('treatment', 'Treatment ID'),
        ('crops', 'Crops ID'),
    )
    duplicated = []
    for sheet, column in idColumns:
        frame = dfdic[sheet]
        distinct = frame[column].unique()
        # fewer distinct values than rows means at least one ID is repeated
        if len(distinct) != len(frame):
            print(column, 'not unique!, multiple IDs defined')
            duplicated.append(column)
    if not duplicated:
        print('All indexes are unique.')

In [5]:
# check that all Treatment ID are specified in subsequent sheet
def checkTreatmentID(dfdic):
    """Report treatments of each experiment that are missing from the management sheets.

    For every 'Experiment ID' of the 'experiment' sheet, the 'Treatment ID'
    values declared in the 'treatment' sheet are looked up in the 'crops',
    'pest', 'irrigation', 'tillage' and 'soil' sheets. One line is printed per
    sheet with missing treatments, or a single confirmation line when every
    sheet is complete. Returns None.
    """
    tocheck = ['crops', 'pest', 'irrigation', 'tillage', 'soil']
    dftreat = dfdic['treatment']
    for expid in dfdic['experiment']['Experiment ID'].unique():
        inExperiment = dftreat['Experiment ID'] == expid
        treatmentIDs = np.asarray(dftreat.loc[inExperiment, 'Treatment ID'].unique())
        ok = True
        for a in tocheck:
            df = dfdic[a]
            specified = np.asarray(df.loc[df['Experiment ID'] == expid, 'Treatment ID'])
            # np.isin replaces the deprecated np.in1d
            missing = treatmentIDs[~np.isin(treatmentIDs, specified)]
            if len(missing) > 0:  # some treatments are not specified
                # '{}' instead of '{:s}': IDs may be numbers, which '{:s}' rejects
                print('{} are not specified for "{}" in experiment "{}"'.format(
                    str(missing), a, expid))
                ok = False
        if ok:
            print('{}: all treatment IDs are specified in the different sheets.'.format(expid))

In [6]:
# check rotations do not appear suddenly (not 100% sure about that)
def checkRotation(dfdic):
    """Report rotations used in other sheets that the 'tillage' sheet does not define.

    The 'Rotation' values of the 'tillage' sheet are taken as reference and
    compared with those of the 'crops', 'pest', 'weed', 'irrigation' and
    'grazing' sheets. Sheets that are absent from ``dfdic`` or that have no
    'Rotation' column are skipped. Returns None.
    """
    print('not sure about that...')
    # empty cells are dropped: NaN never compares equal, so it would always be reported
    rots = np.asarray(dfdic['tillage']['Rotation'].dropna().unique())
    sheets = ['crops', 'pest', 'weed', 'irrigation', 'grazing']
    for sheet in sheets:
        df = dfdic.get(sheet)
        if df is None or 'Rotation' not in df.columns:
            continue
        rots2 = np.asarray(df['Rotation'].dropna().unique())
        unknown = rots2[~np.isin(rots2, rots)]
        if len(unknown) > 0:
            # both placeholders are filled inside the same format() call
            print('Rotations: {:s} from tab {:s} are not defined in other tabs.'.format(
                str(unknown), sheet))

In [7]:
# check controlled vocabulary and raise new words introduced
def checkVocabulary(dfdic):
    """Compare sheet values with the controlled vocabulary of 'dropDownList'.

    Every column of every sheet whose name is also a column of the
    'dropDownList' sheet is checked: values absent from the corresponding
    'dropDownList' column are reported as new words. One status line is
    printed per checked column.

    Returns
    -------
    dict mapping column name -> list of new words found for that column
    (merged over all sheets, without repetition). Empty when nothing is new.
    """
    dfdrop = dfdic['dropDownList']
    newWords = {}
    for key, df in dfdic.items():
        for col in df.columns:
            if col not in dfdrop.columns:
                continue
            status = 'ok'
            voc = np.asarray(df[col].dropna().unique())
            cvoc = np.asarray(dfdrop[col].dropna())
            # np.isin replaces the deprecated np.in1d
            unknown = voc[~np.isin(voc, cvoc)]
            if len(unknown) > 0:
                status = 'new words: ' + str(unknown)
                # the same column can occur in several sheets: merge, do not overwrite
                collected = newWords.setdefault(col, [])
                collected.extend(w for w in unknown if w not in collected)
            print('check: {:14s} > {:34s}: {:s}'.format(key, col, status))
    return newWords

In [8]:
# multiply 'all treatments' to enter it in the relational database
def multiplyTreatments(dfdic):
    """Expand rows flagged 'all treatments' into one row per treatment.

    In the 'crops', 'pest', 'irrigation', 'tillage' and 'soil' sheets, each row
    whose 'Treatment ID' is 'all treatments' is replaced, in place in the row
    order, by one copy per 'Treatment ID' that the 'treatment' sheet lists for
    the row's 'Experiment ID'. Other rows are kept unchanged.

    Parameters
    ----------
    dfdic : dict mapping sheet name -> pandas.DataFrame

    Returns
    -------
    dict : shallow copy of ``dfdic`` in which the five sheets above are
    replaced by expanded DataFrames with a fresh 0..n-1 index. The input
    DataFrames are not modified.
    """
    tocheck = ['crops', 'pest', 'irrigation', 'tillage', 'soil']
    dftreat = dfdic['treatment']
    dfdic2 = dfdic.copy()
    for a in tocheck:
        df = dfdic[a]
        positions = []     # row position in df of each output row
        treatmentIDs = []  # 'Treatment ID' of each output row
        pairs = zip(df['Experiment ID'].tolist(), df['Treatment ID'].tolist())
        # positions (not labels) are used: the index may have gaps after dropna()
        for pos, (expid, treatmentID) in enumerate(pairs):
            if treatmentID == 'all treatments':
                # '{}' instead of '{:s}': experiment IDs may be numbers
                print('Sheet "{}" > experiment "{}" expanded'.format(a, expid))
                ie = dftreat['Experiment ID'] == expid
                expanded = dftreat.loc[ie, 'Treatment ID'].tolist()
                positions.extend([pos] * len(expanded))
                treatmentIDs.extend(expanded)
            else:
                positions.append(pos)
                treatmentIDs.append(treatmentID)
        # a single positional selection replaces the row-by-row DataFrame.append
        # (removed in pandas 2.0) and keeps column order and dtypes
        df2 = df.iloc[positions].copy()
        if positions:
            df2['Treatment ID'] = treatmentIDs
        dfdic2[a] = df2.reset_index(drop=True)
    return dfdic2

In [9]:
# identify which management practice is treatment specific
# TODO for crops, we need to take cropsID into account
def extractTreatment(dfdic):
    """Flag, per experiment, the management sheets whose rows differ from each other.

    For each 'Experiment ID' and each management sheet, the rows of that
    experiment are compared with the first one (ignoring 'Experiment ID',
    'Treatment ID' and every column containing a missing value). For the
    'soil' sheet the comparison is done separately for each 'Layer number'.

    Returns
    -------
    pandas.DataFrame with one row per experiment: column 'Experiment ID'
    followed by one boolean column per management sheet, True when at least
    one row differs from the first.
    """
    def rowsDiffer(frame):
        # True when any row after the first differs from the first row in some column
        first = frame.iloc[0]
        return any((frame.iloc[k] != first).any() for k in range(1, len(frame)))

    mgnt = ['soil','crops','tillage','fertilization','amendment','irrigation','pest','grazing']
    experimentIDs = dfdic['experiment']['Experiment ID'].unique()
    flags = np.zeros((len(experimentIDs), len(mgnt)), dtype=bool)
    dft = pd.DataFrame(flags, columns=mgnt) # True if the practice is part of treatment
    dft.insert(0, 'Experiment ID', experimentIDs)

    for expid in experimentIDs:
        for sheet in mgnt:
            full = dfdic[sheet]
            subset = full[full['Experiment ID'] == expid]
            subset = subset.drop(['Experiment ID', 'Treatment ID'], axis=1)
            subset = subset.reset_index(drop=True).dropna(axis=1)
            if subset.shape[0] == 0: # sheet might be empty for expid
                continue
            if sheet == 'soil':
                # for 'soil', rows are only comparable within the same layer
                isTreatment = False
                for soilLayer in subset['Layer number'].unique():
                    layer = subset[subset['Layer number'] == soilLayer]
                    layer = layer.drop('Layer number', axis=1)
                    layer = layer.reset_index(drop=True).dropna(axis=1)
                    if rowsDiffer(layer):
                        isTreatment = True
            else:
                isTreatment = rowsDiffer(subset)
            if isTreatment:
                dft.loc[dft['Experiment ID'] == expid, sheet] = True
    return dft

In [1]:
# check for missing sowing/harvesting information and for harvesting before sowing
def checkDates(dfdic):
    """Check the sowing and harvesting dates of the 'crops' sheet.

    For every row of ``dfdic['crops']``:
    - when both 'Sowing date' and 'Harvesting date' are missing, the
      'Sowing period' and 'Harvesting period' columns are consulted and a
      message is printed if both of these are missing too (a column that does
      not exist counts as missing);
    - when both dates are present, a message is printed if the harvesting date
      is earlier than the sowing date.

    Row numbers in the messages are 1-based positions in the sheet's data rows.
    Returns None.
    """
    dfcrop = dfdic['crops']
    # iterate by position: the index may have gaps after dropna() in readExcel2
    for i, (_, row) in enumerate(dfcrop.iterrows()):
        # these are scalars (Timestamp/NaT), so no '.values' here
        sowing = row['Sowing date']
        harvesting = row['Harvesting date']
        if pd.isnull(sowing) and pd.isnull(harvesting):
            sowingPeriod = row.get('Sowing period')
            harvestingPeriod = row.get('Harvesting period')
            if pd.isnull(sowingPeriod) and pd.isnull(harvestingPeriod):
                print('No sowing date/period AND no harvesting date/period specified for row:', row)
        elif not pd.isnull(sowing) and not pd.isnull(harvesting):
            if harvesting < sowing:
                print('Harvesting date is smaller than sowing date, please check row {:d} of the "crops" tab'.format(i+1))
    

In [11]:
def createRotation(dfdic, expid):
    """Draw the crop rotation chart of one experiment, save it and show a link to it.

    Uses the 'crops' sheet of ``dfdic``: for every 'Treatment ID' of experiment
    ``expid`` each crop ('Crops ID') is drawn as a coloured rectangle spanning
    its sowing and harvesting dates. The figure is saved as
    'rotation-<expid>.jpg' in the current working directory, shown with
    ``plt.show()`` and a ``FileLink`` to the image is displayed.

    Parameters
    ----------
    dfdic : dict mapping sheet name -> DataFrame; only 'crops' is read here.
    expid : value compared against the 'Experiment ID' column.

    NOTE(review): ``display`` is not imported in the visible code; presumably
    the notebook built-in is relied upon -- confirm.
    """
    df = dfdic['crops']
    ie1 = df['Experiment ID'] == expid
    treatments = df[ie1]['Treatment ID'].unique()
    ucrops = df[ie1]['Crop'].unique()
    # one tab10 colour per distinct crop name of the experiment
    colors = dict(zip(ucrops, [plt.cm.tab10(i) for i in range(len(ucrops))]))
    # latest harvesting date of the experiment; also the fallback for missing harvests
    xmax = df[ie1]['Harvesting date'].max()
    fig, ax = plt.subplots(figsize=(14,4))
    c = 0  # running vertical position
    tticks = []  # vertical position at which each treatment band starts
    for i, treatment in enumerate(treatments):
        # dotted separator at the top of the treatment band
        ax.axhline(c, color='k', linestyle=':')
        tticks.append(c)
        c += 1
        ie2 = df['Treatment ID'] == treatment
        crops = df[ie1 & ie2]['Crops ID'].unique()
        for j, crop in enumerate(crops):
            ie3 = df['Crops ID'] == crop
            row = df[ie1 & ie2 & ie3]
            # only the first matching row is used
            cropName = row['Crop'].values[0]
            sowing = row['Sowing date'].values[0]
            harvesting = row['Harvesting date'].values[0]         
            if pd.isnull(harvesting):
                harvesting = xmax
            if not pd.isnull(sowing) and not pd.isnull(harvesting):
                # closed rectangle: sowing..harvesting wide, 3 units tall
                xy = np.array([[sowing, c],
                              [sowing, c+3],
                              [harvesting, c+3],
                              [harvesting, c],
                              [sowing, c]])
                # convert the date column of the polygon to matplotlib date numbers
                xy[:,0] = mdates.date2num(xy[:,0])
                coll = PolyCollection([xy], facecolors=[colors[cropName]], alpha=0.5)
                ax.add_collection(coll)
                # crop name (with '> ' and '>' removed) centred between sowing and harvesting
                ax.text(sowing + np.abs(sowing - harvesting)/2, c+2,
                        cropName.replace('> ','').replace('>',''), ha='center', fontsize=8)
                # print(i, j, crop, sowing, harvesting)
                # check for overlap to see if we need to skip row?
                # the next rectangle is drawn 4 units lower; crops without a
                # sowing date are skipped and do not advance c
                c += 4
    # closing position, needed to compute the middle of the last band
    tticks.append(c)
    ax.autoscale()
    # add vertical bar for year
    #xmin, xmax = ax.get_xlim()
    #xmin, xmax = mdates.num2date(xmin), mdates.num2date(xmax)
    # x axis from 1 January of the first sowing year to 1 January of the year
    # following the last harvest
    xmin = df[ie1]['Sowing date'].min()
    xmin = xmin.replace(month=1, day=1)
    xmax = xmax.replace(year=xmax.year+1, month=1, day=1)
    ax.set_xlim([xmin, xmax])
    d = xmin
    # one vertical line per 1 January strictly inside the axis range (at most 100 steps)
    for i in range(100):
        d = d.replace(year=d.year + 1)
        if d < xmax:
            ax.axvline(d, color='k')
    loc = mdates.MonthLocator()
    ax.xaxis.set_major_locator(loc)
    ax.xaxis.set_major_formatter(mdates.AutoDateFormatter(loc)) 
    ax.invert_yaxis()
    ax.grid(axis='x')
    # one y tick per treatment, in the middle of its band
    ax.set_yticks(tticks[:-1] + np.diff(tticks)/2)
    ax.set_yticklabels(treatments)
    fig.autofmt_xdate()
    fname = 'rotation-' + str(expid) + '.jpg'
    fig.savefig(fname, dpi=500)
    plt.show() # needed to plot the graph in the 'out' context
    display(FileLink(fname))
    
#createRotation(dfdic, 'cc1')

In [12]:
# proof of concept of web-based input file check (on binder for instance)
dfdic = pd.DataFrame()  # placeholder, replaced by the dict of sheets once a file is checked
upload = FileUpload()
out = Output()
expDropdown = Dropdown(options=[''], description='Experiment ID:')
processBtn = Button(description = 'Check File')
rotBtn = Button(description='Rotation Graph')


def processBtnFunc(btn):
    """Handler of the 'Check File' button: run every check on the uploaded workbook.

    The uploaded bytes are written to 't.xlsx', read with readExcel2 and passed
    through the checks; all printed messages go to the ``out`` widget. The
    module-level ``dfdic`` and the options of ``expDropdown`` are updated so
    that the 'Rotation Graph' button works on the file just checked.
    """
    global dfdic
    out.clear_output()
    with out:
        # NOTE(review): 'upload.data' is assumed to be a list of the uploaded
        # files' bytes, as the original code indexes it -- confirm against the
        # installed ipywidgets version.
        if not upload.data:
            print('Please upload a file before pressing "Check File".')
            return
        #dfdic = readExcel(upload.data[0])
        with open('t.xlsx', 'wb') as f:
            f.write(upload.data[0])
        dfdic = readExcel2('t.xlsx')
        # updated inside the 'out' context, right after a successful read, so
        # that it never runs on an undefined 'dfdic'
        expDropdown.options = dfdic['experiment']['Experiment ID'].unique()
        print('\n\n--------------------------------- Check IDs ---------------------------------')
        checkID(dfdic)
        print('\n\n--------------------------------- Expanding Treatment ID --------------------')
        dfdic = multiplyTreatments(dfdic)
        print('\n\n--------------------------------- Check Treatment ID ------------------------')
        checkTreatmentID(dfdic)
        #print('\n\n--------------------------------- Check Rotation ----------------------------')
        #checkRotation(dfdic)
        print('\n\n--------------------------------- Check Controlled Vocabulary ---------------')
        checkVocabulary(dfdic)
        print('\n\n--------------------------------- Extract Treatments ------------------------')
        dft = extractTreatment(dfdic)
        print(dft)


def rotBtnFunc(a):
    """Handler of the 'Rotation Graph' button: plot the selected experiment."""
    with out:
        if 'crops' not in dfdic:  # nothing has been checked yet
            print('Please check a file before requesting the rotation graph.')
            return
        createRotation(dfdic, expDropdown.value)


# Each handler is registered exactly once. Registering rotBtnFunc inside
# processBtnFunc added one more handler per 'Check File' click, so a single
# click on 'Rotation Graph' produced several graphs.
processBtn.on_click(processBtnFunc)
rotBtn.on_click(rotBtnFunc)

# layout
display(upload)
display(processBtn)
display(expDropdown)
display(rotBtn)
display(out)

FileUpload(value={}, description='Upload')

Button(description='Check File', style=ButtonStyle())

Dropdown(description='Experiment ID:', options=('',), value='')

Button(description='Rotation Graph', style=ButtonStyle())

Output()