# Validate your dataset

In [1]:
### Import required libraries

from IPython.display import Markdown, display
import ipywidgets as widgets # load library for interactive widgets (drop-down lists, button, etc.)
from ipywidgets import HBox, Label, Layout
from IPython.display import clear_output # clear output if you click on button several times
import os
from os import listdir, walk
from os.path import isfile, join
import pandas as pd
from pathlib import Path
import numpy as np

### Define some useful functions for the GUI

# Print with style (bold etc.)
def printmd(string):
    """Render ``string`` as Markdown in the cell output (bold, rules, etc.)."""
    rendered = Markdown(string)
    display(rendered)

# Dropdown list
def mydropdownlist(listoptions):
    """Create an enabled drop-down list offering ``listoptions``.

    Parameters
    ----------
    listoptions : sequence
        Options shown in the list. The first one is pre-selected.

    Returns
    -------
    widgets.Dropdown
        The drop-down widget. When ``listoptions`` is empty the widget is
        created without a selection instead of raising ``IndexError``.
    """
    # Indexing an empty sequence would raise, so fall back to "no selection".
    initial_value = listoptions[0] if len(listoptions) > 0 else None
    return widgets.Dropdown(
        options=listoptions,
        value=initial_value,
        disabled=False,
    )

# Button
def mybutton(mydescription):
    """Create an enabled, default-styled button labelled ``mydescription``."""
    # button_style accepts 'success', 'info', 'warning', 'danger' or '' (default look)
    settings = dict(description=mydescription, disabled=False, button_style='')
    return widgets.Button(**settings)

# Textbox
def mytextbox(placeholder):
    """Create an enabled text box showing ``placeholder`` as its hint text.

    The box takes 60% of the available width.
    """
    box_layout = Layout(width='60%')
    return widgets.Text(placeholder=placeholder, disabled=False, layout=box_layout)

### Validator selection box

# Options of the validator drop-down list: a 'Select' placeholder followed by
# the name (file name without extension) of every Excel validator found in the
# "validators" folder of the current working directory.
validatorlist = ['Select']
current_folder = os.getcwd()
validators_folder = r'validators'
validators_path = os.path.join(current_folder, validators_folder)  # full path of the validators folder
if os.path.isdir(validators_path):
    # Only .xlsx files are usable because the validation step opens
    # "<name>.xlsx". Names starting with '~' are temporary Excel lock files.
    # Sorting keeps the drop-down order stable (os.listdir order is arbitrary).
    validators_names = sorted(
        os.path.splitext(f)[0]
        for f in os.listdir(validators_path)
        if os.path.isfile(os.path.join(validators_path, f))
        and os.path.splitext(f)[1] == '.xlsx'
        and not f.startswith('~')
    )
else:
    # Keep the GUI usable (with no validator to pick) rather than crashing the cell.
    print('validators folder not found:', validators_path)
    validators_names = []
validatorlist = validatorlist + validators_names  # dropdown list options

# Input widgets of the form: validator choice, dataset location, and the
# button that starts the validation
select_validatortype = mydropdownlist(validatorlist)
folderlocation_textbox = mytextbox(r'e.g. C:\User\Desktop\DOPE-Validation\examples\Dataset-good')
checkdataset_button = mybutton('Validate dataset')

# Each input sits on one row next to its label
selection_box = HBox([Label('Select your validator:'), select_validatortype])
folderlocation_box = HBox([Label('Specify the path to your dataset:'), folderlocation_textbox])

# Show the rows from top to bottom, followed by the button
for form_row in (selection_box, folderlocation_box, checkdataset_button):
    display(form_row)

### Validate dataset

# Show checklist as a table with link to useful webpages, form templates, and interactive notebooks (when applicable)
# Output area capturing everything the validation prints, so that each click
# replaces the previous report instead of appending to it.
output = widgets.Output()

# Entries that do not count as content when looking for empty folders.
_IGNORED_ENTRIES = ('.DS_Store', 'manifest.xlsx', 'Readme')


def _list_files(folderpath):
    """Return the sorted names of the files located directly in ``folderpath``."""
    return sorted(f for f in os.listdir(folderpath)
                  if os.path.isfile(os.path.join(folderpath, f)))


def _list_folders(folderpath):
    """Return the sorted names of the folders located directly in ``folderpath``."""
    return sorted(f for f in os.listdir(folderpath)
                  if os.path.isdir(os.path.join(folderpath, f)))


def _print_section(title):
    """Print a blank line, a horizontal rule and the section ``title``."""
    printmd("")
    printmd('-------------------------')
    printmd(title)


def _check_required_items(validatorfile, myfiles, myfolders):
    """Report which files/folders required by the validator are present or missing."""
    if not os.path.isfile(validatorfile):
        printmd('**Validator file not found**')
        return
    df = pd.read_excel(validatorfile)
    # Rows of the validator sheet are tagged 'file' or 'folder' in the 'type'
    # column; files are reported first, then folders.
    for kind, present in (('file', myfiles), ('folder', myfolders)):
        # Blank 'name' cells are read as NaN and cannot be looked up or printed.
        for name in df.loc[df['type'] == kind, 'name'].dropna():
            if name in present:
                print(name, ': Checked')
            else:
                printmd('**' + str(name) + ': MISSING!**')


def _check_empty_folders(mydataset_path, myfolders):
    """Report the folders holding nothing besides the ignored entries."""
    emptyfound = False
    for foldername in myfolders:
        folderpath = os.path.join(mydataset_path, foldername)
        content = [e for e in _list_files(folderpath) + _list_folders(folderpath)
                   if e not in _IGNORED_ENTRIES]
        if not content:
            emptyfound = True
            printmd('**' + foldername + ' folder is EMPTY!**')
    if not emptyfound:
        printmd('No empty folder was found')


def _check_manifests(mydataset_path, myfolders):
    """Report the folders that do not contain a manifest.xlsx file."""
    missingfound = False
    for foldername in myfolders:
        folderpath = os.path.join(mydataset_path, foldername)
        if 'manifest.xlsx' not in _list_files(folderpath):
            missingfound = True
            printmd('**Manifest is MISSING in folder ' + foldername + '**')
    if not missingfound:
        printmd('No manifest is missing')


def _check_submission(mydataset_path, myfiles):
    """Report whether submission.xlsx exists and has its 'Value' column filled out."""
    if 'submission.xlsx' not in myfiles:
        printmd('**submission.xlsx is missing**')
        return
    df = pd.read_excel(os.path.join(mydataset_path, 'submission.xlsx'))
    if 'Value' not in df.columns:
        printmd('**submission.xlsx has no Value column**')
        return
    values = df['Value']
    # A column without any row is as unfilled as one with blank cells.
    if values.empty or values.isnull().any():
        printmd('**One or more information is missing in submission.xlsx**')
    else:
        printmd('submission.xlsx is filled out')


@output.capture()
def on_button_clicked(b):
    """Validate the dataset whose path was typed in the form and print a report.

    Called when the 'Validate dataset' button is clicked; ``b`` is the clicked
    button and is not used. The report is written to ``output`` and covers:
    required files/folders, empty folders, missing manifest.xlsx files, and
    whether submission.xlsx is filled out.
    """
    clear_output()  # drop the report of a previous click

    if select_validatortype.value == 'Select':
        printmd('**Please select a validator**')
        return

    # Tolerate surrounding blanks and quotes (e.g. a path pasted as "C:\...").
    mydataset_path = folderlocation_textbox.value.strip().strip('"\'')

    # The dataset must be a folder: a path to a regular file cannot be listed.
    if not os.path.isdir(mydataset_path):
        printmd('**Dataset not found - please check path**')
        return

    printmd('**Checking your dataset**')
    myfiles = _list_files(mydataset_path)      # files in the main folder of the dataset
    myfolders = _list_folders(mydataset_path)  # folders in the main folder of the dataset

    # 1 - Check that everything expected by the validator is included in the dataset
    _print_section('**1 - Verifying that the required files and folders are included**')
    validatorfile = os.path.join(validators_path, select_validatortype.value + '.xlsx')
    _check_required_items(validatorfile, myfiles, myfolders)

    # 2 - Identify empty folders (no file, or only manifest.xlsx and/or Readme)
    _print_section('**2 - Checking for empty folders (no files, or only manifest and/or Readme)**')
    _check_empty_folders(mydataset_path, myfolders)

    # 3 - Check that a manifest.xlsx file is included in each folder
    _print_section('**3 - Checking that manifest.xlsx file is included in all folders**')
    _check_manifests(mydataset_path, myfolders)

    # 4 - Check that submission.xlsx is filled out
    _print_section('**4 - Checking that submission.xlsx is filled out**')
    _check_submission(mydataset_path, myfiles)

    # TODO: check dataset_description.xlsx


checkdataset_button.on_click(on_button_clicked)
display(output)



HBox(children=(Label(value='Select your validator:'), Dropdown(options=('Select', 'minimum SPARC requirements'…

HBox(children=(Label(value='Specify the path to your dataset:'), Text(value='', layout=Layout(width='60%'), pl…

Button(description='Validate dataset', style=ButtonStyle())

Output()