# ShEX shapes creation from Bioschemas YAML

* Authors: Leyla Garcia (1)
* (1) ZBMED Information Centre for Life Sciences, Cologne, Germany

* GitHub repository:  https://github.com/biotea/validation-shapes-bioschemas
* License: Apache 2.0

* Acknowledgements: This notebook was created during the NBDC / DBCLS BioHackathon 2019. We thank the organizers for their invitation to participate in this event. We also thank the Schemas group created during the event, with special thanks to Jose Labra.

## Input
* Bioschemas YAML file 
* Example at https://github.com/biotea/validation-shapes-bioschemas/blob/master/journal.yaml

## Output
* ShEX shape
* Example at https://github.com/biotea/validation-shapes-bioschemas/blob/master/generatedJournal.shex

## Process
* Make sure journal.yaml is in the same directory as this notebook
* This notebook works as follows:
  * Load a YAML file generated from https://github.com/BioSchemas/bioschemas-goweb (or a compatible one created by any other means)
  * For now, only local files are supported (ToDo: allow loading from URL, and from Bioschemas HTML pages in github.io)
  * Parse profile properties in order to generate shapes
  * Call main function parseProperties three times, for minimum, recommended and optional properties
  * Add partial shapes to profile final shape
* Disclaimer: We have tested this ShEx shapes creator with the Biotea-Bioschemas profile for Journal only; further testing and adjustments are needed. Please report any bugs via GitHub issues


## Trying out the shape validation
* Go to http://rdfshape.weso.es/validate
* Run the validator with the generated shapes and the input example, everything should pass


In [35]:
#Import libraries
import json
from yaml import load
try:
    from yaml import CLoader as Loader
except ImportError:
    from yaml import Loader
import pandas as pd

In [55]:
#Parse expected types and populate arrays for data and object types
def parseExpectedTypes(elem, exprDataType, exprObjType):
    """Sort a property's expected types into datatype and object-type lists.

    Every entry of ``elem['expected_types']`` that names one of the known
    primitive types is translated to the expression used for it in the
    generated shape and appended to ``exprDataType``. Any other entry is
    appended unchanged to ``exprObjType``.

    Both lists are filled in place; the function returns nothing.
    """
    # Known primitive type names and the expression emitted for each one.
    # 'URL' is emitted as a reference to a shape labelled <URL>, not as an
    # xsd datatype.
    primitiveExpressions = {
        'Boolean': 'xsd:boolean',
        'Date': 'xsd:date',
        'DateTime': 'xsd:dateTime',
        'Number': 'xsd:double',
        'Text': 'xsd:string',
        'Time': 'xsd:time',
        'Float': 'xsd:float',
        'Integer': 'xsd:integer',
        'URL': '@<URL>',
    }

    for typeName in elem['expected_types']:
        # Only a string can name a primitive; anything else is handed on
        # untouched as an object type.
        expression = None
        if isinstance(typeName, str):
            expression = primitiveExpressions.get(typeName)

        if expression is None:
            exprObjType.append(typeName)
        else:
            exprDataType.append(expression)

In [30]:
#Parse object type properties to get information for the main shape and additional supporting shapes 
#(containing all possible types when multiple are allowed for a property)
def parseObjProperties (exprObjType, addShapes):
    """Build the value expression for a property's object (non-primitive) types.

    Parameters:
        exprObjType: list of schema.org type names (without prefix) accepted
            by the property. It is only read, never modified.
        addShapes: list collecting supporting shape declarations. When several
            object types are allowed, one supporting shape listing all of them
            is appended here, unless an identical declaration is already
            present.

    Returns:
        ''                              when there are no object types,
        ' {a [schema:T]} OR IRI'        when there is exactly one type T,
        ' @<AOrB...> OR IRI'            when there are several types; the
                                        referenced shape is put in addShapes.
    """
    if not exprObjType:
        return ''

    if len(exprObjType) == 1:
        return ' {a [schema:' + exprObjType[0] + ']} OR IRI'

    # Several types: reference a supporting shape named after all of them.
    unionName = '<' + 'Or'.join(exprObjType) + '>'
    # Build the value set without touching the caller's list.
    typeValues = ''.join(' schema:' + typeName for typeName in exprObjType)
    unionShape = '\n' + unionName + '{\n  rdf:type [' + typeValues + ']\n}'

    # The same combination of types can occur on several properties; declaring
    # the supporting shape twice would repeat its label in the output.
    if unionShape not in addShapes:
        addShapes.append(unionShape)

    return ' @' + unionName + ' OR IRI'

In [31]:
#Parse properties corresponding to a particular group Minimum, Recommended or Optional
def parseProperties (mainShapeName, profileType, propList, symbolOne, symbolMany, addShapes):
    """Render one shape covering every property in ``propList``.

    The shape is labelled ``<mainShapeName>``, starts with an ``rdf:type``
    constraint on ``profileType`` and then has one ``schema:`` constraint per
    property. Each constraint lists the property's datatypes and object types
    joined with ``OR`` and ends with ``symbolOne`` when the property's
    ``cardinality`` is ``'ONE'``, or ``symbolMany`` otherwise.

    Supporting shapes needed for multi-type properties are collected in
    ``addShapes`` by parseObjProperties. The shape text is returned.
    """
    pieces = ['\n<' + mainShapeName + '> {\n  rdf:type [' + profileType + '] ;']

    for prop in propList:
        constraint = '\n  schema:' + prop['property']

        dataTypes = []
        objTypes = []
        parseExpectedTypes(prop, dataTypes, objTypes)

        constraint += ' ' + ' OR '.join(dataTypes)

        # A connecting OR is only needed when both kinds of type are present;
        # decide this before the object types are handed on.
        if dataTypes and objTypes:
            constraint += ' OR'

        constraint += parseObjProperties (objTypes, addShapes)

        cardinalitySymbol = symbolOne if prop['cardinality'] == 'ONE' else symbolMany
        constraint += ' ' + cardinalitySymbol
        constraint += ' ;'

        pieces.append(constraint)

    pieces.append('\n}\n')
    return ''.join(pieces)


In [32]:
#Parse some properties by marginality from the JSON array obtained from YAML file
def parseMarginality(profileName, profileType, marginality, marginalShapes, symbolOne, symbolMany, data):
    """Render the shape for the properties of one marginality group.

    Keeps the entries of ``data`` whose ``'marginality'`` equals
    ``marginality`` and passes them to parseProperties. The shape is named
    ``profileName + marginality``; supporting shapes are collected in
    ``marginalShapes``. Returns the shape text produced by parseProperties.
    """
    selected = []
    for entry in data:
        if entry['marginality'] == marginality:
            selected.append(entry)

    shapeName = profileName + marginality
    return parseProperties (shapeName, profileType, selected, symbolOne, symbolMany, marginalShapes)

In [33]:
#Create all required shapes to validate a profile
def createShapesFromProfile(profileName, profileType, data):
    """Create the complete ShEx document used to validate one profile.

    Parameters:
        profileName: base name for the shape labels; the three main shapes
            are labelled profileName + 'Minimum' / 'Recommended' / 'Optional'.
        profileType: prefixed type the shapes constrain rdf:type to.
        data: iterable of property entries, filtered by their 'marginality'
            value in parseMarginality.

    Returns:
        The prefix declarations and the <URL> shape, followed by the three
        main shapes and then every supporting shape, each declared once.
    """
    fullShape = (
        'PREFIX schema: <http://schema.org/> \n'
        '    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> \n'
        '    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> \n'
        '    PREFIX xsd: <http://www.w3.org/2001/XMLSchema#> \n'
        '    <URL> \n'
        '      xsd:string OR IRI\n'
        '    '
    )

    # (marginality, symbol for cardinality ONE, symbol for any other cardinality).
    # Minimum properties are mandatory (exactly one / one or more);
    # Recommended and Optional ones may be absent (zero or one / zero or more).
    groups = (
        ('Minimum', '', '+'),
        ('Recommended', '?', '*'),
        ('Optional', '?', '*'),
    )

    supportShapes = []
    for marginality, symbolOne, symbolMany in groups:
        # Each group collects into its own list, so no group depends on
        # another group's list.
        groupShapes = []
        fullShape += parseMarginality(profileName, profileType, marginality, groupShapes, symbolOne, symbolMany, data)

        # The same supporting shape may be needed by several groups; keep the
        # first occurrence only so no shape label is declared twice.
        for supportShape in groupShapes:
            if supportShape not in supportShapes:
                supportShapes.append(supportShape)

    fullShape += '\n'.join(supportShapes)
    return fullShape

In [59]:
#Load config profiles file
# profiles.csv has no header row; its three columns are the profile name, the
# schema.org type (without prefix) and the YAML file describing the profile.
profiles = pd.read_csv('profiles.csv', delimiter=',', names=['profileName','profileType','fileName'])

for _, row in profiles.iterrows():
    # PyYAML requires an explicit Loader (mandatory since PyYAML 6); use the
    # one imported at the top of the notebook.
    # NOTE(review): this is the full loader, which can construct arbitrary
    # Python objects - only run it on trusted YAML files, or switch to a
    # safe loader.
    with open(row['fileName'], 'r', encoding='utf-8') as stream:
        data = load(stream, Loader=Loader)

    fullShape = createShapesFromProfile(row['profileName'], 'schema:' + row['profileType'], data)

    # One .shex file per profile, named after the profile.
    with open(row['profileName'] + '.shex', 'w', encoding='utf-8') as output:
        output.write(fullShape)

print('end')


end
