# Fill the FDP with relevant data from the Molgenis

## `Todo` further discribtions

This Jupyter notebook takes information regarding catalogs, datasets and distributions from a Molgenis instance and adds it to a Fair Data Point. 

The information from the Molgenis instance must be present in a FDP table, following the schema shown in the template files. (TODO @PN)

## API Libraries 
### Molgenis
[Documentation](https://molgenis.gitbook.io/molgenis/interoperability/guide-client-python)

[PyPI](https://pypi.org/project/molgenis-py-client/)

### FDP our own
[Code](https://github.com/bibbox/fdpAPIconnector.py)

[PyPI](https://pypi.org/project/fdpAPIconnector/)

In [69]:
pip install rdflib requests fdpAPIconnector molgenis-py-client

Note: you may need to restart the kernel to use updated packages.


In [70]:
# imports
import requests
import rdflib
import molgenis.client as mg
from fdpAPIconnector.fdpclient import FDPClient
import json

## Get relevent Env variables URLS, Logins etc.

In [61]:
# TODO EJ (einheitlich für alle apps)
# TODO PN (für docker molgenis umbauen)

# FDP ()
fdp_baseurl = "http://bibbox-molgenis-fair-fdp-client:80"
# fdp_email = "abmin@FDP.org"
# fdp_password = "password"
fdp_api_key = "rEuD5N1eNdZIDPbgLXiiO7pIEwuOdfYygvbMpgCwRmp3OxRttQsszyfFa8vJpesf6Gpa17OFhHxE06ywRWOHD9UnL54sDZRWF8KotU1WNh81a5NZEA3OejAcHWU7B19V"
fdp_catalog_isPartOf = 'http://localhost:8088'

# MOLGENIS
molgenis_fdp_base_url = "http://bibbox-molgenis-frontend/api/fdp/"
# molgenis_fdp_base_url = "http://molgenis-fdp.silicolabv4.bibbox.org/api/fdp/"
molgenis_fdp_package_catalog = 'fdp_Catalog'
molgenis_fdp_package_dataset = 'fdp_Dataset'
molgenis_fdp_package_distribution = 'fdp_Distribution'
molgenis_api_base_url = "http://bibbox-molgenis-frontend/api/"
# molgenis_api_base_url = "http://molgenis-fdp.silicolabv4.bibbox.org/api/"
molgenis_user = "admin"
molgenis_pswd = "admin"

## Define Helper Functions

In [62]:
def parse_response(data, format='turtle'):
    g = rdflib.Graph()
    g.parse(data=data, format=format)
    return g


def add_isPartOf(graph, id, isPartOf, namespace = rdflib.namespace.DCTERMS.isPartOf):
    try:
        return graph.add((id, namespace, isPartOf))
    except Exception as e:
        print(f"Could not add isPartOf attribute. {e}")


def get_entity_from_molgenis_fdp(entity_id):
    r = requests.get(entity_id)
    if r.ok:
        return parse_response(r.text)
    else:
        print(f"## Error. Could not retrieve entity {entity_id} from fdp endpoint.")


def create_fdp_catalog(catalog_id, graph):
    catalog_URI = rdflib.URIRef(catalog_id)
    isPartOf = rdflib.URIRef(fdp_catalog_isPartOf)
    graph = add_isPartOf(graph, catalog_URI, isPartOf)
    try:
        return fdpclient.create(type='catalog', data=graph.serialize())
    except Exception as e:
        print(f"unexpected error creating fdp entity. {e}")


def create_fdp_dataset(dataset_id, graph, parent_id):
    # print(str(fdp_catalog_isPartOf) + '/catalog/' + str(parent_id))
    dataset_URI = rdflib.URIRef(dataset_id)
    isPartOf = rdflib.URIRef(str(fdp_catalog_isPartOf) + '/catalog/' + str(parent_id))
    graph = add_isPartOf(graph, dataset_URI, isPartOf)
    try:
        return fdpclient.create(type='dataset', data=graph.serialize())
    except Exception as e:
        print(f"unexpected error creating fdp entity. {e}")

def create_fdp_distribution(distribution_id, graph, parent_id):
    distribution_URI = rdflib.URIRef(distribution_id)
    isPartOf = rdflib.URIRef(str(fdp_catalog_isPartOf) + '/dataset/' + str(parent_id))
    graph = add_isPartOf(graph, distribution_URI, isPartOf)
    try:
        return fdpclient.create(type='distribution', data=graph.serialize())
    except Exception as e:
        print(f"unexpected error creating fdp entity. {e}")

## Get the API clients/connectors

In [63]:
# init clients
# FDP
fdpclient = FDPClient(fdp_baseurl, api_token=fdp_api_key, publicurl=fdp_catalog_isPartOf)
# MOLGENIS
mg_session = mg.Session(molgenis_api_base_url)
mg_session.login(molgenis_user, molgenis_pswd)

## Set Index FDP

In [None]:
fdp_ping={"metadataMetrics":[
    {
        "metricUri":"https://purl.org/fair-metrics/FM_F1A",
        "resourceUri":"https://www.ietf.org/rfc/rfc3986.txt"
        },
        {
            "metricUri":"https://purl.org/fair-metrics/FM_A1.1",
            "resourceUri":"https://www.wikidata.org/wiki/Q8777"}
            ],
            "ping":{
                "enabled":True,
                "endpoints":["http://bibbox-sys-commander-master-fdp-client"],
                "interval":60000}
                }


fdpclient.update(type='settings',data=json.dumps(fdp_ping),format="json-ld")
#json.dumps(fdp_ping)

## Get all Catalogs from Molgenis

In [64]:
catalogs = mg_session.get(molgenis_fdp_package_catalog)
# print(catalogs)

## Loop through Catalog ids and create catalogs, datasets and distribution in FDP

In [65]:
    # TODO PN (store IDs in array or dict to later publish everything; alternatively)
    fdp_catalog_ids = []
    fdp_dataset_ids = []
    fdp_distribution_ids = []
    
    
    for catalog in catalogs:
        catalog_molgenis_fdp_id = molgenis_fdp_base_url + molgenis_fdp_package_catalog + '/' + catalog['identifier']
        g = get_entity_from_molgenis_fdp(catalog_molgenis_fdp_id)
        fdp_catalog_id = create_fdp_catalog(catalog_molgenis_fdp_id, g)
        fdp_catalog_ids.append(fdp_catalog_id)
        #print(f'fdp_catalog_id: {fdp_catalog_id}')
        
        # Dataset
        for ds in catalog['dataset']:
            dataset = mg_session.get_by_id(molgenis_fdp_package_dataset, ds['identifier'])
            dataset_molgenis_fdp_id = molgenis_fdp_base_url + molgenis_fdp_package_dataset + '/' + dataset['identifier']
            g = get_entity_from_molgenis_fdp(dataset_molgenis_fdp_id)
            fdp_dataset_id = create_fdp_dataset(dataset_molgenis_fdp_id, g, fdp_catalog_id)
            fdp_dataset_ids.append(fdp_dataset_id)
            #print(f'fdp_dataset_id: {fdp_dataset_id}')

            # Distribution
            for dis in dataset['distribution']:
                distribution = mg_session.get_by_id(molgenis_fdp_package_distribution, dis['identifier'])
                distribution_molgenis_fdp_id = molgenis_fdp_base_url + molgenis_fdp_package_distribution + '/' + distribution['identifier']
                g = get_entity_from_molgenis_fdp(distribution_molgenis_fdp_id)
                fdp_distribution_id = create_fdp_distribution(distribution_molgenis_fdp_id, g, fdp_dataset_id)
                fdp_distribution_ids.append(fdp_distribution_id)
                # print(f'fdp_distribution_id: {fdp_distribution_id}')
                
    print(f'Catalog IDs: {fdp_catalog_ids}')
    print(f'Dataset IDs: {fdp_dataset_ids}')
    print(f'Distribution IDs: {fdp_distribution_ids}')

Catalog IDs: ['05c0efc4-a63d-49d6-be0c-080d298b1ffb']
Dataset IDs: ['79de4c18-9008-4583-875d-6b090f417048', '5a09f183-8cf3-40dd-921c-2b4c4993c22b']
Distribution IDs: ['f114a07b-c5c4-4be5-805e-dc638b5b26a7']


## Publish everything

In [72]:
# for loops publish stored ids
state_published=json.dumps({"current":"PUBLISHED"})

for catalog_id in fdp_catalog_ids:
    fdpclient.update(type='catalog',id=catalog_id,subtype='meta/state',data=state_published,format="json-ld")
for dataset_id in fdp_dataset_ids:
    fdpclient.update(type='dataset',id=dataset_id,subtype='meta/state',data=state_published,format="json-ld")
for distribution_id in fdp_distribution_ids:
    fdpclient.update(type='distribution',id=distribution_id,subtype='meta/state',data=state_published,format="json-ld")
    
print('Done')

HTTP error: 400  for http://bibbox-molgenis-fair-fdp-client:80/catalog/05c0efc4-a63d-49d6-be0c-080d298b1ffb/meta/state 
Response message: Metadata is already published


RuntimeError: No active exception to reraise