# Fill the FDP with relevant data from the OpenSpecimen collection protocols


## FDP <-> OpenSpecimen Field mapping for type **Catalog**
| FDP | OS|
|----------|:-------------|
|publishername| 'Mr. Catalog' |
|DESCRIPTION| 'Main catalog Biobank Graz' |
|title| 'Main catalog Biobank Graz'|
|version| '1.0.0' |
|LANGUAGE|'http://id.loc.gov/vocabulary/iso639-1/en' |
|ispartof|fdpclient.publicurl|
|HOMEPAGE|os_homepage|


## FDP <-> OpenSpecimen Field mapping for type **Dataset**
| FDP | OS|
|----------|:-------------|
|publishername| CPForm.PUBLISHER.NAME |
|PUBLISHEREMAIL| CPForm.PUBLISHER.EMAIL|
|PUBLISHERUID| CPForm.PUBLISHER.UID |
|DESCRIPTION|CPForm.DESCRIPTION|
|title|CP.title|
|version|CPForm.VERSION|
|LANGUAGE|CPForm.LANGUAGE|
|LICENSE|CPForm.LICENSE|
|RIGHTS|CPForm.RIGHTS|
|ISSUED|CP.startDate|
|MODIFIED|CPForm.MODIFIED|
|themes_list|CPForm.THEMES_SUBFORM|
|CONTACTPOINT|CP.principalInvestigator.emailAddress|
|KEYWORDS|CPForm.KEYWORDS_SUBFORM|
|LANDINGPAGE|CPForm.LANDING_PAGE|
|catalogid|*id of created catalog*|

## FDP <-> OpenSpecimen Field mapping for type **Distribution**
| FDP | OS|
|----------|:-------------|
|title|"HTML distribution"|
|version| "1.0.0" |
|datasetid| *dat_id: id of the created dataset* |
|publishername| "Mr. Catalog" |
|mediatype| "text/html" |
|ACCESSURL| *URL to CP in OS* |


 - *OS -> OpenSpecimen*
 - *CP -> Collection Protocol*
 - *FDP -> FAIR Data Point*



## API Libraries
### OpenSpecimen - OpenSpecimenAPIconnector
[Code](https://github.com/bibbox/OpenSpecimenAPIconnector.py)

[Docu](https://openspecimenapiconnectorpy.readthedocs.io/en/latest/index.html)

### FDP - fdpAPIconnector (our own library)
[Code](https://github.com/bibbox/fdpAPIconnector.py)

[PyPI](https://pypi.org/project/fdpAPIconnector/)

In [1]:
pip install OpenSpecimenAPIconnector rdflib requests fdpAPIconnector


Note: you may need to restart the kernel to use updated packages.


In [2]:
import OpenSpecimenAPIconnector as OSconn
import OpenSpecimenAPIconnector.os_core as os_core
import OpenSpecimenAPIconnector.os_util as os_util
from fdpAPIconnector.fdpclient import FDPClient
from pprint import pprint
import datetime
import json

## Get the API clients/connectors

In [10]:
# Connection settings for OpenSpecimen (OS) and the FAIR Data Point (FDP).
#
# NOTE(review): the credentials used to be hard-coded in this cell. Every
# setting can now be overridden through an environment variable so that real
# secrets do not have to be committed with the notebook. The fallbacks below
# are the original values, so the cell behaves as before when nothing is set.
import os

# Set Login for OS
baseurl = os.environ.get('OPENSPECIMEN_URL',
                         'http://oseutops.silicolabv4.bibbox.org/openspecimen/rest/ng')
loginname = os.environ.get('OPENSPECIMEN_USER', "admin")
password = os.environ.get('OPENSPECIMEN_PASSWORD', "KqfEUNtpz665h2c")
auth = (loginname, password)
OSconn.config_manager.set_login(url=baseurl, auth=auth)

# Log in to the FDP; the *.ttl templates are the RDF skeletons for each
# resource type and are passed to the client by path.
fdpclient = FDPClient(
    os.environ.get('FDP_URL', "http://openspecimen-fdp-fdp:80"),
    os.environ.get('FDP_USER', "albert.einstein@example.com"),
    os.environ.get('FDP_PASSWORD', "password"),
    publicurl=os.environ.get('FDP_PUBLIC_URL', "http://localhost:8088"),
    catalog_template='./templates/catalog_template.ttl',
    dataset_template='./templates/dataset_template.ttl',
    distribution_template='./templates/distribution_template.ttl',
)

## Get all CPs from OS

In [4]:
# Get all collection protocols.
# `collection_protocol_util` is kept at module level because the later cell
# that fetches the details of each CP calls it again; `collection_prots` is
# the list that cell iterates over (only each entry's 'id' is read there).
collection_protocol_util = os_core.collection_protocol()
collection_prots = collection_protocol_util.get_all_collection_protocols()

## Helper functions to extract the relevant data from the OS CP

In [5]:
# Maps each FDP dataset field to the dotted path of its value in OpenSpecimen.
# The first path segment selects the source record:
#   'CP.<...>'     -> looked up in the collection protocol detail record
#   'CPForm.<...>' -> looked up in the attributes of the FDP custom form
# 'catalogid' initially carries the OpenSpecimen CP id; the dataset-creation
# cell reads it for the access URL and then supplies the FDP catalog id.
fdp_os_attr_dict = {
    'title': 'CP.title',
    'catalogid': 'CP.id',
    'version': 'CPForm.VERSION',
    'publishername': 'CPForm.PUBLISHER.NAME',
    'PUBLISHEREMAIL': 'CPForm.PUBLISHER.EMAIL',
    'PUBLISHERUID': 'CPForm.PUBLISHER.UID',
    'themes_list': 'CPForm.THEMES_SUBFORM',
    'DESCRIPTION': 'CPForm.DESCRIPTION',
    'ISSUED': 'CP.startDate',
    'MODIFIED': 'CPForm.MODIFIED',
    'LANGUAGE': 'CPForm.LANGUAGE',
    'LICENSE': 'CPForm.LICENSE',
    'RIGHTS': 'CPForm.RIGHTS',
    'CONTACTPOINT': 'CP.principalInvestigator.emailAddress',
    'KEYWORDS': 'CPForm.KEYWORDS_SUBFORM',
    'LANDINGPAGE': 'CPForm.LANDING_PAGE',
}





def getFDPExtensionDetails(extension_details, form_caption = 'CP FDP Attributes'):
    """Return the attributes of the custom form captioned *form_caption*.

    Args:
        extension_details: Either a single extension-detail dict (with the
            keys 'formCaption' and 'attrs') or a list of such dicts. ``None``
            is accepted and treated as "no form attached".
        form_caption: Caption of the form whose attributes are wanted.

    Returns:
        The result of ``extension_attrs_to_dict`` for the first entry whose
        'formCaption' equals *form_caption*, or ``None`` when no entry matches.
    """
    if extension_details is None:
        return None

    # Normalise to a list so both input shapes share one lookup path.
    # Previously a list without a matching entry fell through to the
    # single-dict branch and raised TypeError instead of returning None.
    if isinstance(extension_details, list):
        candidates = extension_details
    else:
        candidates = [extension_details]

    for detail in candidates:
        if detail['formCaption'] == form_caption:
            return extension_attrs_to_dict(detail['attrs'])

    return None
    
def extension_attrs_to_dict(attrs):
    """Flatten a list of form attributes into a ``name -> value`` dict.

    Each element of *attrs* is a dict with the keys 'name' and 'value'.
    A non-list value is copied as is. A list value is read as a sequence of
    rows, each row being a list of ``{'name', 'value'}`` dicts:

    * a row with exactly one field appends that field's value to a list;
    * a row with more than one field replaces the entry with a
      ``field name -> field value`` dict built from that row.

    Returns:
        The flattened dict (empty when *attrs* is empty).
    """
    # TODO: ask Patrick why 'value' is a list (of rows) here.
    flattened = {}
    for entry in attrs:
        raw_value = entry['value']
        key = entry['name']

        if not isinstance(raw_value, list):
            flattened[key] = raw_value
            continue

        flattened[key] = []
        for row in raw_value:
            if len(row) > 1:
                # Multi-field row: the entry becomes a dict of that row.
                flattened[key] = {field['name']: field['value'] for field in row}
            else:
                flattened[key].append(row[0]['value'])
    return flattened


def getDictValueFromSubLevel(dictionary,keys):
    """Look up a value in a nested mapping.

    Args:
        dictionary: The (possibly nested) mapping to read from.
        keys: Either a list of keys, followed one level at a time, or any
            non-list object, which is used directly as a single key.

    Returns:
        The value found at the end of the key path.
    """
    if not isinstance(keys, list):
        return dictionary[keys]

    # Walk down one level per key; the first key is always consumed.
    node = dictionary[keys[0]]
    for key in keys[1:]:
        node = node[key]
    return node

def extract_fdp_attr_from_cp(cp_detail,attr_key_dict, extension_details_field = 'extensionDetail'):
    """Collect the FDP field values for one collection protocol.

    Args:
        cp_detail: Detail record of the collection protocol (a dict).
        attr_key_dict: Maps each FDP field name to a dotted source path.
            A path starting with 'CP' is resolved against *cp_detail*; a path
            starting with 'CPForm' is resolved against the FDP form
            attributes returned by ``getFDPExtensionDetails``.
        extension_details_field: Key of *cp_detail* that holds the extension
            (custom form) details.

    Returns:
        A dict ``FDP field name -> value``. The 'ISSUED' and 'MODIFIED'
        values are divided by 1000 and converted with
        ``datetime.datetime.fromtimestamp``, i.e. they are read as epoch
        milliseconds and returned as naive local datetimes.
    """
    form_values = getFDPExtensionDetails(cp_detail[extension_details_field])

    fdp_data = {}
    for fdp_key, os_key in attr_key_dict.items():
        # First segment names the source record, the rest is the path in it.
        source, *path = os_key.split('.')
        if source == 'CP':
            fdp_data[fdp_key] = getDictValueFromSubLevel(cp_detail, path)
        elif source == 'CPForm':
            fdp_data[fdp_key] = getDictValueFromSubLevel(form_values, path)

        if fdp_key in ('ISSUED', 'MODIFIED'):
            millis = int(fdp_data[fdp_key])
            fdp_data[fdp_key] = datetime.datetime.fromtimestamp(millis / 1e3)

    return fdp_data



## Get details on each CP
  - [x] get detail on each CP
  - [x] check if FDP subform is present
  - [x] extract all details needed for the FDP

In [6]:
# Build one dict of FDP field values per collection protocol.
# NOTE(review): the checklist above says the presence of the FDP subform is
# checked, but this loop has no such check — a CP without the
# 'CP FDP Attributes' form makes extract_fdp_attr_from_cp raise, which stops
# the whole loop. TODO confirm whether such CPs should be skipped instead.
fdp_datasets = []
for cp in collection_prots:
    # The list entry is only used for its id; the full record is fetched here.
    cp_details=collection_protocol_util.get_collection_protocol(cp['id'])
    fdp_data = extract_fdp_attr_from_cp(cp_details,fdp_os_attr_dict)
    #print(fdp_data)
    fdp_datasets.append(fdp_data)
    

## Write data to the FDP

### Create a Catalog for the Entire OpenSpecimen

In [7]:
os_homepage = 'http://oseutops.silicolabv4.bibbox.org'

# Field values for the single catalog that stands for the whole
# OpenSpecimen instance; passed to the RDF builder as keyword arguments.
catalog_fields = {
    'DESCRIPTION': 'Main catalog Biobank Graz',
    'title': 'Main catalog Biobank Graz',
    'version': "1.0.0",
    'ispartof': fdpclient.publicurl,  # repository the catalog is a part of
    'publishername': 'Mr. Catalog',
    'LANGUAGE': 'http://id.loc.gov/vocabulary/iso639-1/en',
    'HOMEPAGE': os_homepage,
}
os_cat = fdpclient.createCatalogRDF(**catalog_fields)

# NOTE: `id` shadows the builtin of the same name; it is kept because the
# following cells read the catalog id under exactly this name.
id = fdpclient.create(type='catalog', data=os_cat)
print(id)

154d46a3-4be5-4e7f-b616-58ff55abda2f


### Create the Datasets for each Collection Protocol

In [8]:
# Create one FDP dataset and one HTML distribution per collection protocol.
# The ids returned by the FDP are collected so they can be published later.
dataset_ids = []
distribution_ids = []
for fdp_dataset in fdp_datasets:
    # 'catalogid' still carries the OpenSpecimen CP id at this point
    # (see fdp_os_attr_dict); it is needed for the link back to the CP.
    internal_cp_id = fdp_dataset['catalogid']

    # Build the keyword arguments on a copy instead of overwriting the entry
    # in fdp_datasets. Overwriting made this cell non-repeatable: on a second
    # run 'catalogid' already held the catalog id, so the access URL below
    # was built from the catalog UUID instead of the CP id.
    dataset_fields = {**fdp_dataset, 'catalogid': id}
    newdat = fdpclient.createDatasetRDF(**dataset_fields)
    dat_id = fdpclient.create(type='dataset', data=newdat)

    newdis = fdpclient.createDistributionRDF(
        title="HTML distribution",
        version="1.0.0",
        datasetid=dat_id,
        publishername="Mr. Catalog",
        mediatype="text/html",
        # Overview page of the collection protocol in the OpenSpecimen UI.
        ACCESSURL=f"{os_homepage}/openspecimen/#/cps/{internal_cp_id}/overview",
    )
    dis_id = fdpclient.create(type='distribution', data=newdis)

    print(f"New Dataset: {dat_id}")
    dataset_ids.append(dat_id)
    print(f"New Distribution: {dis_id}")
    distribution_ids.append(dis_id)
    

New Dataset: 632fd50d-a770-4cbd-8ade-9b47ac731f6d
New Distribution: b7bbe92a-192a-4ade-85e1-6e9f30916fc7


## Let's publish everything

In [9]:
# JSON body that sets a resource's state to PUBLISHED.
sate_published = json.dumps({"current": "PUBLISHED"})

# Publish in the same order as the resources were created:
# the catalog first, then every dataset, then every distribution.
publish_targets = [
    ('catalog', [id]),
    ('dataset', dataset_ids),
    ('distribution', distribution_ids),
]
for fdp_type, fdp_ids in publish_targets:
    for fdp_id in fdp_ids:
        fdpclient.update(type=fdp_type, id=fdp_id, subtype='meta/state',
                         data=sate_published, format="json-ld")