# EJP Common template - Data Checking Module
Please copy-paste the URL of your completed template or, alternatively, upload your filled Excel template (.xlsx) and press 'Check File' to see if there are any errors that require attention. If there are errors, please make the changes in the template and check it again here.

In [1]:
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
from matplotlib.collections import PolyCollection
import matplotlib.dates as mdates
from datetime import datetime
from xlsx2csv import Xlsx2csv
import tempfile
import os
import warnings
warnings.filterwarnings('ignore')
    
import ipywidgets as widgets
from IPython.display import FileLink, HTML

# Worksheet names that readExcel2 looks for (one '<name>.csv' per sheet).
sheetNames = [
    'experiment',
    'reference',
    'treatment',
    'soil-type',
    'tillage',
    'crops',
    'amendment',
    'irrigation',
    'pest-weed',
    'grazing',
    'soil-crop-measurement',
    'data',
    'dropDownList',
]

def dump(text, level='warning'):
    """Show `text` in the notebook output as an alert box.

    Parameters
    ----------
    text : str
        Message to display. It is HTML-escaped before being inserted in the
        markup, because the messages built by the checks embed values read
        from the user's spreadsheet (IDs, vocabulary words, ...), which may
        contain characters such as ``<``, ``>`` or ``&``.
    level : str, optional
        Suffix of the ``alert-<level>`` CSS class put on the ``<div>``
        (the checks in this notebook use 'warning', 'danger' and 'success').
    """
    import html  # local import so this cell does not depend on another one

    #print(text)
    safe_text = html.escape(str(text))
    # `display` is not imported in this notebook; presumably the IPython
    # built-in available in notebook sessions.
    display(HTML('<div class="alert alert-{:s}" role="alert">{:s}</div>'.format(level, safe_text)))

C:\Users\gblanchy\WPy64-3890\python-3.8.9.amd64\lib\site-packages\numpy\.libs\libopenblas.WCDJNK7YVMPZQ2ME2ZZHJJRJ3JIKNDB7.gfortran-win_amd64.dll
C:\Users\gblanchy\WPy64-3890\python-3.8.9.amd64\lib\site-packages\numpy\.libs\libopenblas.xwydx2ikjw2nmtwsfyngfuwkqu3lytcz.gfortran-win_amd64.dll


In [2]:
def readExcel2(data):
    """Read the template through an intermediate CSV conversion.

    The workbook `data` is converted with Xlsx2csv into a temporary
    directory, then one CSV per name in `sheetNames` is loaded with pandas
    (rows 0, 1 and 3 are skipped, fully empty rows are dropped). The known
    date columns are finally parsed with `pd.to_datetime`.

    Parameters
    ----------
    data : object
        Whatever `Xlsx2csv` accepts as its first argument (the commented
        example call passes a path to an .xlsx file).

    Returns
    -------
    dict
        Sheet name -> DataFrame.
    """
    start = time.time()
    print('Reading in Spreadsheet...', end='')
    converter = Xlsx2csv(data, outputencoding="utf-8")
    dfdic = {}
    with tempfile.TemporaryDirectory() as tmpdir:
        # NOTE(review): sheetid=0 presumably means "convert every sheet" -
        # confirm against the xlsx2csv documentation.
        converter.convert(tmpdir, sheetid=0)
        for sheet in sheetNames:
            csvPath = os.path.join(tmpdir, sheet + '.csv')
            frame = pd.read_csv(csvPath, skiprows=[0,1,3])
            dfdic[sheet] = frame.dropna(how='all')

    # columns to be parsed as dates, grouped by sheet
    dateColumns = {
        'crops': ['Sowing date', 'Harvesting/Termination date'],
        'tillage': ['Tillage date'],
        'amendment': ['Amendment date'],
        'irrigation': ['Irrigation date'],
        'pest-weed': ['Pesticide application date'],
        'soil-crop-measurement': ['Sampling date'],
        'data': ['Date'],
    }
    for sheet, columns in dateColumns.items():
        frame = dfdic[sheet]
        for column in columns:
            frame[column] = pd.to_datetime(frame[column])

    elapsed = time.time() - start
    print('done ({:.2f}s)'.format(elapsed))
    return dfdic
#dfdic = readExcel2('../../../ejp-wp7/ejp-common-template2.xlsx')

In [8]:
def readExcel(fname):
    """Read every sheet of the template into a dict of DataFrames.

    Parameters
    ----------
    fname : str, bytes-like or file-like
        - a string starting with 'http' is treated as a Google Sheet URL:
          its last path component is replaced by 'export?format=xlsx';
        - any other string is passed unchanged to `pd.read_excel`;
        - raw bytes (what the upload widget's `upload.data[0]` is expected
          to hold) are wrapped in a `BytesIO`, because handing bare bytes to
          `pd.read_excel` is deprecated in recent pandas versions;
        - anything else (e.g. an open file object) is passed unchanged.

    Returns
    -------
    dict
        Sheet name -> DataFrame. Rows 0, 1 and 3 of each sheet are skipped,
        'Experiment ID' / 'Treatment ID' columns are converted to `str` and
        columns whose label contains 'Unnamed' are dropped.
    """
    from io import BytesIO  # local import so this cell is self-contained

    if isinstance(fname, str):
        if fname.startswith('http'): # it's a google sheet url
            fname = '/'.join(fname.split('/')[:-1] + ['export?format=xlsx'])
    elif isinstance(fname, (bytes, bytearray, memoryview)):
        fname = BytesIO(bytes(fname))
    dfdic = pd.read_excel(fname, sheet_name=None, skiprows=[0, 1, 3])

    for key, df in list(dfdic.items()):
        # make all ID as string
        for idcol in ['Experiment ID', 'Treatment ID']:
            if idcol in df.columns:
                df[idcol] = df[idcol].astype(str)

        # remove Unnamed columns (labels are cast to str first so that a
        # non-string header cannot break the `.str` accessor)
        unnamed = df.columns.astype(str).str.contains('Unnamed')
        dfdic[key] = df.drop(df.columns[unnamed], axis=1)

    return dfdic

#dfdic = readExcel('https://docs.google.com/spreadsheets/d/1aVowuuz5-Ot2RiXViKudeZt9iKHmgkColuXVHWHl_sU/edit?usp=sharing')

In [4]:
# numeric types
# Expected dtype of the numeric columns: {sheet name: {column label: dtype}}.
# Every listed column is expected to be 'float'.
ntypes = {
    sheet: dict.fromkeys(columns, 'float')
    for sheet, columns in (
        ('experiment', (
            'Latitude',
            'Longitude',
        )),
        ('reference', (
            'Publication year',
        )),
        ('soil-type', (
            'Top depth of layer',
            'Bottom depth of layer',
            'Clay (< 0.002 mm)',
            'Silt (0.002 - 0.05 mm)',
            'Sand (0.05 - 2 mm)',
            'Gravel (> 2 mm)',
        )),
        ('treatment', (
            'Year started',
            # 'Year ended' is deliberately left out
        )),
        ('tillage', (
            'Tillage depth',
        )),
        ('crops', (
            'Harvesting frequency',
        )),
        ('amendment', (
            'Fertilizer/Amendment application rate',
            'Amendment water content',
            'Amendment C',
            'Amendment N',
            'Amendment P',
            'Amendment K',
        )),
        ('irrigation', (
            'Amount of water',
            'Irrigation frequency',
            # 'Drainage spacing' and 'Drainage depth' are deliberately left out
        )),
        ('data-crop', (
            'Sampling year',
            'Harvested yield',
            'Harvested yield water content amount',
            'Residue above-ground',
            'Residue stubble',
            'Residue roots',
            'Below-ground sampling depth',
        )),
        ('data-soil', (
            'Sampling year',
            'Depth from',
            'Depth to',
            'SOC conc',
            'SOC conc SD',
            'SOC conc SE',
            'SOC conc nb samples',
            'Bulk density',
            'Bulk density SD',
            'Bulk density SE',
            'Bulk density nb samples',
            'SOC stock',
            'SOC stock SD',
            'SOC stock SE',
            'SOC stock nb samples',
            'pH',
        )),
    )
}

In [5]:
# check dtype
def checkType(dfdic, dic=None):
    """Cast the columns listed in `ntypes` to their declared dtype.

    Any exception raised while accessing or converting a column is reported
    through `dump` (level 'danger') and the column is recorded as failed.

    Parameters
    ----------
    dfdic : dict
        Sheet name -> DataFrame; converted columns are written back into it.
    dic : dict, optional
        Report dictionary to fill in; a new one is created when omitted.

    Returns
    -------
    tuple
        `(dfdic, dic)` where `dic['wrong_types']` maps each sheet name to
        the list of its columns that could not be converted.
    """
    report = {} if dic is None else dic
    failed = {}
    for sheet, columns in ntypes.items():
        for col, target in columns.items():
            try:
                dfdic[sheet][col] = dfdic[sheet][col].astype(target)
            except Exception as err:
                dump('Wrong type: "{:s}" > "{:s}" should be float. Error: {:s}'.format(sheet, col, str(err)), 'danger')
                failed.setdefault(sheet, []).append(col)
    report['wrong_types'] = failed
    return dfdic, report
#checkType(dfdic)

In [6]:
# check that all ID columns contain unique values
def checkID(dfdic):
    """Report duplicated identifiers in the 'experiment', 'treatment' and 'crops' sheets.

    Three keys are checked:

    - 'Experiment ID' must be unique in 'experiment';
    - ('Experiment ID', 'Treatment ID') must be unique in 'treatment';
    - ('Experiment ID', 'Treatment ID', 'Crop ID') must be unique in 'crops'.

    Each violation is reported with `dump(..., 'danger')`. The success
    message is shown only when none of the three checks failed. The input
    DataFrames are not modified: no helper column is added to them.

    Parameters
    ----------
    dfdic : dict
        Sheet name -> DataFrame.
    """
    # (sheet, columns forming the key, message shown when the key repeats)
    # NOTE: 'reference' > 'Publication ID' is not checked here.
    checks = [
        ('experiment',
         ['Experiment ID'],
         'Experiment ID are not unique'),
        ('treatment',
         ['Experiment ID', 'Treatment ID'],
         'Identical Treatment ID for same Experiment ID'),
        ('crops',
         ['Experiment ID', 'Treatment ID', 'Crop ID'],
         'Multiple Crop ID defined for same Treatment ID'),
    ]
    ok = True
    for sheet, keys, message in checks:
        # duplicated() compares the key columns directly, so no temporary
        # concatenated 'id' column has to be written into the sheet
        if dfdic[sheet].duplicated(subset=keys).any():
            dump(message, 'danger')
            ok = False
    if ok:
        dump('All indexes are unique.', 'success')

In [7]:
# check that all Treatment ID are specified in subsequent sheet
def checkTreatmentID(dfdic):
    """Check that each treatment of each experiment appears in the management sheets.

    For every 'Experiment ID' of the 'experiment' sheet, the Treatment IDs
    declared in 'treatment' are looked up in the 'crops', 'pest-weed',
    'irrigation' and 'tillage' sheets (sheets absent from `dfdic` are
    skipped). Treatments missing from a sheet are reported with
    `dump(..., 'danger')`; if nothing is missing for an experiment, a
    'success' message is shown for it.

    Parameters
    ----------
    dfdic : dict
        Sheet name -> DataFrame.
    """
    tocheck = ['crops', 'pest-weed', 'irrigation', 'tillage']
    dftreat = dfdic['treatment']
    for expid in dfdic['experiment']['Experiment ID'].unique():
        inExperiment = dftreat['Experiment ID'] == expid
        treatmentIDs = np.asarray(dftreat[inExperiment]['Treatment ID'].unique())
        ok = True
        for a in tocheck:
            if a not in dfdic:
                continue
            df = dfdic[a]
            specified = np.asarray(df[df['Experiment ID'] == expid]['Treatment ID'].values)
            # np.isin replaces np.in1d, which is deprecated since NumPy 2.0
            missing = treatmentIDs[~np.isin(treatmentIDs, specified)]
            if len(missing) > 0: # some treatment are not specified
                dump('Treatments "{:s}" are not specified in tab "{:s}" for Experiment ID "{:s}"'.format(
                    '", "'.join(map(str, missing)), a, expid), 'danger')
                ok = False
        if ok:
            dump('{:s}: all treatment IDs are specified in the different sheets.'.format(expid), 'success')

In [8]:
# check rotations do not appear suddenly (not 100% sure about that)
def checkRotation(dfdic):
    """Report 'Rotation' values that are not present in the 'tillage' sheet.

    The unique 'Rotation' values of the 'crops', 'pest-weed', 'irrigation'
    and 'grazing' sheets are compared with those of 'tillage'; values absent
    from 'tillage' are reported with `dump(..., 'danger')`. Sheets missing
    from `dfdic` are skipped.

    Parameters
    ----------
    dfdic : dict
        Sheet name -> DataFrame; must contain 'tillage'.
    """
    print('not sure about that...')
    rots = np.asarray(dfdic['tillage']['Rotation'].unique())
    sheets = ['crops', 'pest-weed', 'irrigation', 'grazing']
    for sheet in sheets:
        if sheet not in dfdic:
            continue
        rots2 = np.asarray(dfdic[sheet]['Rotation'].unique())
        # np.isin replaces np.in1d, which is deprecated since NumPy 2.0
        unknown = rots2[~np.isin(rots2, rots)]
        if len(unknown) > 0:
            dump('Rotations: {:s} from tab {:s} are not defined in other tabs.'.format(str(unknown), sheet), 'danger')

In [9]:
# check controlled vocabulary and raise new words introduced
def checkVocabulary(dfdic):
    """Report values that are not part of the controlled vocabulary.

    Every column of every sheet whose label also exists in the
    'dropDownList' sheet is compared with the non-null values of that
    drop-down column. Values not found there are reported with
    `dump(..., 'warning')`; if no new word is found at all, a 'success'
    message is shown.

    Parameters
    ----------
    dfdic : dict
        Sheet name -> DataFrame; must contain 'dropDownList'.
    """
    dfdrop = dfdic['dropDownList']
    newWords = {}
    for key in dfdic:
        if key == 'dropDownList':
            # comparing the vocabulary with itself can never find new words
            continue
        df = dfdic[key]
        for col in df.columns:
            if col not in dfdrop.columns:
                continue
            voc = np.asarray(df[col].dropna().unique())
            cvoc = np.asarray(dfdrop[col].dropna().values)
            # np.isin replaces np.in1d, which is deprecated since NumPy 2.0
            unknown = voc[~np.isin(voc, cvoc)]
            if len(unknown) > 0:
                status = 'new words: ' + str(unknown) + ' not in drop-down list'# + str(cvoc)
                newWords[col] = list(unknown)
                dump('check: {:25s} > {:50s}: {:s}'.format(key, col, status), 'warning')
    if len(newWords) == 0:
        dump('All vocabulary used already in drop-down list', 'success')

In [10]:
# multiply 'all treatments' to enter it in the relational database
def multiplyTreatments(dfdic):
    """Expand 'all treatments' rows into one row per treatment of the experiment.

    In the 'crops', 'pest-weed', 'irrigation' and 'tillage' sheets (those
    present in `dfdic`), each row whose 'Treatment ID' equals
    'all treatments' is replaced, in place in the row order, by one copy per
    Treatment ID declared for the same 'Experiment ID' in the 'treatment'
    sheet. A row whose experiment has no treatment declared disappears.
    Other rows are kept unchanged.

    Rows are selected by position and duplicated in one step, so the
    function does not rely on `DataFrame.append` (removed in pandas 2.0),
    does not require a 0..n-1 index and keeps the column dtypes.

    Parameters
    ----------
    dfdic : dict
        Sheet name -> DataFrame; must contain 'treatment'.

    Returns
    -------
    dict
        Shallow copy of `dfdic` in which the expanded sheets are new
        DataFrames with a fresh 0..n-1 index. The input frames are not
        modified.
    """
    tocheck = ['crops', 'pest-weed', 'irrigation', 'tillage']
    dftreat = dfdic['treatment']

    # Treatment IDs of each experiment, in their order of appearance
    treatmentsOf = {}
    for expid, treatmentID in zip(dftreat['Experiment ID'], dftreat['Treatment ID']):
        treatmentsOf.setdefault(expid, []).append(treatmentID)

    dfdic2 = dfdic.copy()
    for a in tocheck:
        if a not in dfdic:
            continue
        df = dfdic[a]
        repeats = []  # number of output rows produced by each input row
        newIDs = []   # 'Treatment ID' of each output row
        for expid, treatmentID in zip(df['Experiment ID'], df['Treatment ID']):
            if treatmentID == 'all treatments':
                ids = treatmentsOf.get(expid, [])
            else:
                ids = [treatmentID]
            repeats.append(len(ids))
            newIDs.extend(ids)
        # explicit int dtype: an empty list would otherwise become float64,
        # which np.repeat refuses as repeat counts
        positions = np.repeat(np.arange(df.shape[0]), np.asarray(repeats, dtype=int))
        df2 = df.iloc[positions].reset_index(drop=True)
        df2['Treatment ID'] = newIDs
        dfdic2[a] = df2
    return dfdic2

In [11]:
# identify which management practice is treatment specific
# TODO for crops, we need to take cropsID into account
def extractTreatment(dfdic):
    """Flag, per experiment, the management sheets whose rows differ from each other.

    For each 'Experiment ID' and each management sheet present in `dfdic`,
    the rows of that experiment are taken without the 'Experiment ID' and
    'Treatment ID' columns and without any column containing a missing
    value. The sheet is flagged True when at least one row differs from the
    first one.

    Parameters
    ----------
    dfdic : dict
        Sheet name -> DataFrame; must contain 'experiment'.

    Returns
    -------
    pandas.DataFrame
        One row per experiment: an 'Experiment ID' column followed by one
        boolean column per management sheet.
    """
    mgnt = ['crops','tillage','amendment','irrigation','pest-weed','grazing']
    experimentIDs = dfdic['experiment']['Experiment ID'].unique()
    flags = np.zeros((len(experimentIDs), len(mgnt)), dtype=bool)
    dft = pd.DataFrame(flags, columns=mgnt) # True if the practice is part of treatment
    dft.insert(0, 'Experiment ID', experimentIDs)
    for expid in experimentIDs:
        for sheet in mgnt:
            if sheet not in dfdic.keys():
                continue
            full = dfdic[sheet]
            rows = full[full['Experiment ID'] == expid].copy()
            rows = rows.drop(['Experiment ID', 'Treatment ID'], axis=1).reset_index(drop=True)
            rows = rows.dropna(axis=1)
            if rows.shape[0] == 0: # sheet might be empty for expid
                continue
            first = rows.loc[0,:]
            # stops at the first row that differs from the reference row
            differs = any((rows.loc[k,:] != first).any()
                          for k in range(1, rows.shape[0]))
            if differs:
                dft.loc[dft['Experiment ID'] == expid, sheet] = True
    return dft

In [12]:
# check for unexpected increase in date for sowing or harvesting or for harvesting date after sowing date
def checkDates(dfdic):
    """Print a message for each 'crops' row with missing or inconsistent dates.

    For every row of the 'crops' sheet:

    - if both 'Sowing date' and 'Harvesting/Termination date' are missing,
      the 'Sowing period' and 'Harvesting/Termination period' cells are
      looked at instead (a missing column counts as a missing value), and a
      message is printed when both of those are missing as well;
    - if both dates are present and the harvesting/termination date is
      earlier than the sowing date, a message with the 1-based row position
      is printed.

    Rows with exactly one of the two dates are not reported.

    Parameters
    ----------
    dfdic : dict
        Sheet name -> DataFrame; must contain 'crops'.
    """
    dfcrop = dfdic['crops']
    # enumerate() gives the row position whatever the index labels are
    for i, (_, row) in enumerate(dfcrop.iterrows()):
        # each cell of a row is a scalar, so it is read directly
        sowing = row['Sowing date']
        harvesting = row['Harvesting/Termination date']
        if pd.isnull(sowing) and pd.isnull(harvesting):
            sowing = row.get('Sowing period')
            harvesting = row.get('Harvesting/Termination period')
            if pd.isnull(sowing) and pd.isnull(harvesting):
                print('No sowing date/period AND no harvesting/termination date/period specified for row:', row)
        elif not pd.isnull(sowing) and not pd.isnull(harvesting):
            if harvesting < sowing:
                print('Harvesting/Termination date is smaller than sowing date, please check row {:d} of the "crops" tab'.format(i+1))
    

In [13]:
def createRotation(dfdic, expid):
    """Draw the crop rotation chart of one experiment and save it as a JPEG.

    The rows of the 'crops' sheet belonging to `expid` are drawn as
    translucent rectangles spanning sowing to harvesting/termination, one
    rectangle per 'Crop ID', grouped by 'Treatment ID'. The figure is saved
    as 'rotation-<expid>.jpg' (dpi 500) in the current directory, shown, and
    a `FileLink` to the file is displayed.

    Parameters
    ----------
    dfdic : dict
        Sheet name -> DataFrame; only 'crops' is read.
    expid : object
        Value matched against the 'Experiment ID' column.

    Raises
    ------
    ValueError
        If the selected rows have neither a 'Sowing date' nor a
        'Sowing period' column.
    """
    df = dfdic['crops']
    ie1 = df['Experiment ID'] == expid
    treatments = df[ie1]['Treatment ID'].unique()
    ucrops = df[ie1]['Crop'].unique()
    # one colour per distinct crop name, taken from the tab10 colormap
    # NOTE(review): tab10 presumably holds ten colours, so more than ten
    # crops would not all get distinct colours - confirm.
    colors = dict(zip(ucrops, [plt.cm.tab10(i) for i in range(len(ucrops))]))
    # latest harvesting date of the experiment; used for open-ended crops
    # and for the right-hand axis limit
    xmax = df[ie1]['Harvesting/Termination date'].max()
    fig, ax = plt.subplots(figsize=(14,4))
    c = 0  # running vertical position
    tticks = []  # vertical position at which each treatment band starts
    for i, treatment in enumerate(treatments):
        # dotted separator at the top of the treatment band
        ax.axhline(c, color='k', linestyle=':')
        tticks.append(c)
        c += 1
        ie2 = df['Treatment ID'] == treatment
        crops = df[ie1 & ie2]['Crop ID'].unique()
        for j, crop in enumerate(crops):
            ie3 = df['Crop ID'] == crop
            row = df[ie1 & ie2 & ie3]
            cropName = row['Crop'].values[0]
            # `row` is a DataFrame, so these tests check that the COLUMN
            # exists, not that the cell holds a value: the 'period' branches
            # only run when the date column is absent from the sheet.
            if 'Sowing date' in row.keys():
                sowing = row['Sowing date'].values[0]
            elif 'Sowing period' in row.keys():
                sowing = datetime.strptime(row['Sowing period'].values[0], '%B')
            else:
                raise ValueError('No "Sowing date" or "Sowing period", impossible to do the rotation graph')
                # NOTE(review): unreachable, the raise above always exits
                return
            # NOTE(review): if neither harvesting column exists, `harvesting`
            # is not assigned here (it keeps the value of a previous
            # iteration, or is undefined on the first one).
            if 'Harvesting/Termination date' in row.keys():
                harvesting = row['Harvesting/Termination date'].values[0]
            elif 'Harvesting/Termination period' in row.keys():
                harvesting = datetime.strptime(row['Harvesting/Termination period'].values[0], '%B')
            # a crop without harvesting date is drawn up to the latest
            # harvesting date of the experiment
            if pd.isnull(harvesting):
                harvesting = xmax
            if not pd.isnull(sowing) and not pd.isnull(harvesting):
                # closed rectangle, 3 units high, from sowing to harvesting
                xy = np.array([[sowing, c],
                              [sowing, c+3],
                              [harvesting, c+3],
                              [harvesting, c],
                              [sowing, c]])
                xy[:,0] = mdates.date2num(xy[:,0])
                coll = PolyCollection([xy], facecolors=[colors[cropName]], alpha=0.5)
                ax.add_collection(coll)
                # crop name centred in the rectangle, with '>' markers removed
                ax.text(sowing + np.abs(sowing - harvesting)/2, c+2,
                        cropName.replace('> ','').replace('>',''), ha='center', fontsize=8)
                # print(i, j, crop, sowing, harvesting)
                # check for overlap to see if we need to skip row?
                # each drawn crop takes 4 vertical units (3 for the
                # rectangle plus 1 of spacing)
                c += 4
    # closing position, needed to compute the middle of the last band
    tticks.append(c)
    ax.autoscale()
    # add vertical bar for year
    #xmin, xmax = ax.get_xlim()
    #xmin, xmax = mdates.num2date(xmin), mdates.num2date(xmax)
    # x axis runs from 1 January of the earliest sowing year to 1 January of
    # the year following the latest harvesting date
    xmin = df[ie1]['Sowing date'].min()
    xmin = xmin.replace(month=1, day=1)
    xmax = xmax.replace(year=xmax.year+1, month=1, day=1)
    ax.set_xlim([xmin, xmax])
    d = xmin
    # one vertical line per 1 January inside the axis range; the loop always
    # runs 100 times, so at most 100 years are marked
    for i in range(100):
        d = d.replace(year=d.year + 1)
        if d < xmax:
            ax.axvline(d, color='k')
    loc = mdates.MonthLocator()
    ax.xaxis.set_major_locator(loc)
    ax.xaxis.set_major_formatter(mdates.AutoDateFormatter(loc)) 
    ax.invert_yaxis()
    ax.grid(axis='x')
    # one label per treatment, placed in the middle of its band
    ax.set_yticks(tticks[:-1] + np.diff(tticks)/2)
    ax.set_yticklabels(treatments)
    fig.autofmt_xdate()
    fname = 'rotation-' + str(expid) + '.jpg'
    fig.savefig(fname, dpi=500)
    plt.show() # needed to plot the graph in the 'out' context
    display(FileLink(fname))
    
#createRotation(dfdic, 'cc1')

In [15]:
# proof of concept of web-based input file check (on binder for instance)
# Input/output widgets: output area, file upload and Google Sheet URL field.
out = widgets.Output()
upload = widgets.FileUpload()
gsurl = widgets.Text(value='', placeholder='', description='', disabled=False)

# module-level placeholder (an empty DataFrame)
dfdic = pd.DataFrame()


def processBtnFunc(btn):
    """Callback of the 'Check File' button: load the template and run the checks.

    The template is read from the Google Sheet URL typed in `gsurl` if there
    is one, otherwise from the first uploaded file. All messages are written
    in the `out` output widget. When nothing was provided, an error message
    is shown and nothing else happens. After the checks, the experiment
    drop-down is filled and the 'Rotation Graph' button is bound to the
    freshly loaded data.

    Parameters
    ----------
    btn : object
        Argument passed by the button's click event (unused).
    """
    out.clear_output()
    dfdic = None
    with out:
        #dfdic = readExcel(upload.data[0])
        if gsurl.value != '':
            dfdic = readExcel(gsurl.value)
        elif len(upload.data) > 0:
            dfdic = readExcel(upload.data[0])
            #with open('t.xlsx', 'wb') as f: # faster but less robust
            #    f.write(upload.data[0])
            #dfdic = readExcel2('t.xlsx')
        else:
            # report and stop: there is nothing to check
            dump('Please upload a .xlsx or specify Google Sheet url', 'danger')
            return

        # drop automatically filled columns with the rest empty
        df = dfdic['crops']
        dfdic['crops'] = df[df['Crop ID'] != '___'].reset_index(drop=True)
        df = dfdic['reference']
        dfdic['reference'] = df[df['Publication ID'] != '__'].reset_index(drop=True)

        print('\n\n--------------------------------- Check IDs ---------------------------------')
        checkID(dfdic)
        dfdic, _ = checkType(dfdic)
        #print('\n\n--------------------------------- Expanding Treatment ID --------------------')
        dfdic = multiplyTreatments(dfdic)
        print('\n\n--------------------------------- Check Treatment ID ------------------------')
        checkTreatmentID(dfdic)
        #print('\n\n--------------------------------- Check Rotation ----------------------------')
        #checkRotation(dfdic)
        print('\n\n--------------------------------- Check Controlled Vocabulary ---------------')
        checkVocabulary(dfdic)
        #print('\n\n--------------------------------- Extract Treatments ------------------------')
        #dft = extractTreatment(dfdic)
        #print(dft)

    # NOTE(review): the Output context manager presumably swallows exceptions
    # raised inside the block above (showing the traceback in `out`) - confirm
    # against the ipywidgets documentation. If reading the file failed that
    # way, no data was loaded and there is nothing to offer for plotting.
    if dfdic is None:
        return

    expDropdown.options = dfdic['experiment']['Experiment ID'].unique()

    # Unregister the handler installed by a previous click, so that the
    # rotation button triggers a single plot, based on the latest data.
    previous = getattr(processBtnFunc, '_rotBtnFunc', None)
    if previous is not None:
        rotBtn.on_click(previous, remove=True)

    def rotBtnFunc(a):
        with out:
            createRotation(dfdic, expDropdown.value)
    rotBtn.on_click(rotBtnFunc)
    processBtnFunc._rotBtnFunc = rotBtnFunc
# controls for the (currently hidden) rotation graph
expDropdown = widgets.Dropdown(options=[''], description='')
rotBtn = widgets.Button(description='Rotation Graph')

# button launching the checks
processBtn = widgets.Button(description = 'Check File')
processBtn.on_click(processBtnFunc)

# layout
display(widgets.HBox([
    upload,
    widgets.Label('OR Google Sheet URL:'),
    gsurl,
]))
display(processBtn)
#display(widgets.HBox([widgets.Label('Select experiment for rotation graph:'), expDropdown]))
#display(rotBtn)
display(out)

HBox(children=(FileUpload(value={}, description='Upload'), Label(value='OR Google Sheet URL:'), Text(value='',…

Button(description='Check File', style=ButtonStyle())

Output()