# Create RO-Crate from NIRD dataset

In [1]:
import requests
import json
from rocrate.rocrate import ROCrate
from rocrate.model.person import Person
import pandas as pd
from datetime import datetime
import geopandas
import shapely

## Get Institutions

In [2]:
filename = "/Users/annef/Documents/NIRD-Archive/instituion-v3.json"
with open(filename, 'r') as file:
    data = file.read()
    institutions = json.loads(data)
institutions = pd.DataFrame.from_dict(institutions)
institutions.head()

Unnamed: 0,id,name,realm,support_url,support_email,type,ror
0,54,"{'nb': 'Politihøgskolen', 'nn': 'Politihøgskul...",phs.no,{},it@phs.no,"[home_organization, higher_education, service_...",https://ror.org/05486d596
1,55,"{'nb': 'Universitetet i Agder', 'nn': 'Univers...",uia.no,{'nb': 'https://www.uia.no/student/uia-hjelp/b...,hjelp@uia.no,"[home_organization, higher_education, service_...",https://ror.org/03x297z98
2,58,"{'nb': 'NIFU', 'nn': 'NIFU', 'en': 'NIFU'}",nifu.no,{},bibliotek@nifu.no,"[home_organization, higher_education]",https://ror.org/032jh6m87
3,61,"{'nb': 'Universitetssenteret på Svalbard', 'en...",unis.no,{},support@unis.no,"[home_organization, higher_education]",https://ror.org/03cyjf656
4,70,"{'nb': 'Høgskolen i Østfold', 'nn': 'Høgskulen...",hiof.no,{'nb': 'https://www.hiof.no/tjenester/it/'},itvakt@hiof.no,"[home_organization, higher_education, service_...",https://ror.org/04gf7fp41


### Split name in several columns

In [3]:
rename_cols = {}
for name in institutions["name"].apply(pd.Series).columns:
    rename_cols[name] = "name-" + name
rename_cols

{'nb': 'name-nb', 'nn': 'name-nn', 'en': 'name-en', 'se': 'name-se'}

In [4]:
institutions = institutions.join( institutions["name"].apply(pd.Series))
institutions = institutions.rename(columns=rename_cols)
institutions.head()

Unnamed: 0,id,name,realm,support_url,support_email,type,ror,name-nb,name-nn,name-en,name-se
0,54,"{'nb': 'Politihøgskolen', 'nn': 'Politihøgskul...",phs.no,{},it@phs.no,"[home_organization, higher_education, service_...",https://ror.org/05486d596,Politihøgskolen,Politihøgskulen,The Norwegian Police University College,
1,55,"{'nb': 'Universitetet i Agder', 'nn': 'Univers...",uia.no,{'nb': 'https://www.uia.no/student/uia-hjelp/b...,hjelp@uia.no,"[home_organization, higher_education, service_...",https://ror.org/03x297z98,Universitetet i Agder,Universitetet i Agder,University of Agder,Agder Universitehta
2,58,"{'nb': 'NIFU', 'nn': 'NIFU', 'en': 'NIFU'}",nifu.no,{},bibliotek@nifu.no,"[home_organization, higher_education]",https://ror.org/032jh6m87,NIFU,NIFU,NIFU,
3,61,"{'nb': 'Universitetssenteret på Svalbard', 'en...",unis.no,{},support@unis.no,"[home_organization, higher_education]",https://ror.org/03cyjf656,Universitetssenteret på Svalbard,,The University Center in Svalbard,
4,70,"{'nb': 'Høgskolen i Østfold', 'nn': 'Høgskulen...",hiof.no,{'nb': 'https://www.hiof.no/tjenester/it/'},itvakt@hiof.no,"[home_organization, higher_education, service_...",https://ror.org/04gf7fp41,Høgskolen i Østfold,Høgskulen i Østfold,Østfold University College,Østfold Allaskuvla


## Access the NIRD Archive via its API

In [5]:
try:
    with open("/Users/annef/nird.json") as config:
        token = json.load(config)["token"]
except FileNotFoundError:
    print("config.json not found!")

In [6]:
url = "https://admin.ckan-archive-test.sigma2.no/api/3/action/package_search"
response = requests.get(url, headers={ "Authorization": token})

list_records = response.json()["result"]["results"]
list_records

[{'access_rights': 'public',
  'author': None,
  'author_email': None,
  'conforms_to': 'https://w3id.org/ro/crate/1.1',
  'creator_user_id': '566de054-6efa-40ee-ba2a-373ef6e1657f',
  'cron': {'state': 'succeeded',
   'message': '',
   'submitted_date': '2025-01-04T12:51:02',
   'completed_date': '2025-01-04T12:51:05'},
  'dataset_status': 'in_work',
  'doi': '10.82969/2025.u7avlttz',
  'id': 'fb71af3f-b9fd-437d-9d4a-c9d9450d6727',
  'import_done': True,
  'import_source': 'local',
  'isopen': True,
  'language': 'en',
  'license_id': 'CC-BY-4.0',
  'license_title': 'Creative Commons Attribution 4.0',
  'license_url': 'https://creativecommons.org/licenses/by/4.0/',
  'maintainer': None,
  'maintainer_email': None,
  'metadata_created': '2025-01-04T11:49:02.714258',
  'metadata_modified': '2025-01-04T11:51:05.770223',
  'name': 'test-for-ro-crate',
  'notes': 'This dataset is meant to test the creation of RO-Crate. It contains one netcdf file.',
  'num_resources': 2,
  'num_tags': 2,
  

## DOI to find in the NIRD archive

In [7]:
doi = "10.82969/2024.35d8m6eo"
doi = "10.82969/2024.kwjopkn1"
doi = "10.82969/2025.u7avlttz"

In [8]:
for rec in (x for x in list_records if x["doi"] == doi):
    pass

In [9]:
print(rec.keys())

dict_keys(['access_rights', 'author', 'author_email', 'conforms_to', 'creator_user_id', 'cron', 'dataset_status', 'doi', 'id', 'import_done', 'import_source', 'isopen', 'language', 'license_id', 'license_title', 'license_url', 'maintainer', 'maintainer_email', 'metadata_created', 'metadata_modified', 'name', 'notes', 'num_resources', 'num_tags', 'organization', 'owner_org', 'private', 'project', 'provenance', 'publisher', 'release_date', 'source', 'spatial', 'state', 'subject', 'temporal', 'theme', 'title', 'type', 'url', 'version', 'version_notes', 'version_type', 'contact_points', 'contributors', 'creators', 'dataset_owner', 'groups', 'related_resources', 'resources', 'tags', 'relationships_as_subject', 'relationships_as_object'])


## Create a new RO-Crate

In [10]:
crate = ROCrate()

## Add the license for the RO-Crate

In [11]:
crate.update_jsonld(
{
    "@id": "./",
    "license": { "@id":  rec["license_url"]},
})
license = {
                "@id": rec["license_url"],
                "@type": "CreativeWork",
                "name": rec["license_id"],
                "description": rec["license_title"],
                }
crate.add_jsonld(license)

<https://creativecommons.org/licenses/by/4.0/ CreativeWork>

## Add creators and their Organizations

We also need the ROR to create an affiliation for each organization found

In [12]:
def get_ror(institutions, org):
    ror = {}
    for name in rename_cols.values():
        if not institutions.loc[institutions[name].isin([org])].empty:
            ror_url = institutions.loc[institutions[name].isin([org])]["ror"].values[0]
            ror = {
                "@id": ror_url,
                "@type": "Organization",
                "name": org,
                "url": ror_url
                }
    return ror

In [13]:
list_authors = []
list_orcids = []
for creator in rec["creators"]:
    creator['givenName'] = creator.pop('first_name')
    creator['familyName'] = creator.pop('last_name') 
    list_authors.append(creator['givenName'] + " " +  creator['familyName'])
    org_name = creator.pop("organisation")
    ror = get_ror(institutions, org_name)
    creator["affiliation"] = {"@id": ror["url"]}
    orcid = creator.pop("orcid")
    print(creator)
    crate.add(Person(crate, "https://orcid.org/" + orcid, properties=creator))
    list_orcids.append({ "@id":  "https://orcid.org/" + orcid })
    crate.add_jsonld(ror)

# Add the list of authors
crate.update_jsonld({
    "@id": "./",
    "author": list_orcids,
})

rec.pop("creators")

{'email': 'annef@simula.no', 'givenName': 'Anne', 'familyName': 'Fouilloux', 'affiliation': {'@id': 'https://ror.org/00vn06n10'}}
{'email': 'adilhasan2@gmail.com', 'givenName': 'Adil', 'familyName': 'Hasan', 'affiliation': {'@id': 'https://ror.org/04q12yn84'}}


[{'email': 'annef@simula.no',
  'givenName': 'Anne',
  'familyName': 'Fouilloux',
  'affiliation': {'@id': 'https://ror.org/00vn06n10'}},
 {'email': 'adilhasan2@gmail.com',
  'givenName': 'Adil',
  'familyName': 'Hasan',
  'affiliation': {'@id': 'https://ror.org/04q12yn84'}}]

## Add data

In [14]:
for toc in (x for x in rec["resources"] if x["name"] == "table_of_contents"):
    pass
toc["url"]

'https://admin.ckan-archive-test.sigma2.no/dataset/fb71af3f-b9fd-437d-9d4a-c9d9450d6727/download/table_of_contents_10.82969_2025.u7avlttz.csv'

In [15]:
for resource in rec["resources"]:
    print(resource["name"])

table_of_contents
summary_contents


In [16]:
df = pd.read_csv(toc["url"], sep="|")
df = df.drop([0, 0])
df.head()

Unnamed: 0,filename,format,size,modified_date,fixity,http_url,s3_url
1,/archive/fb71af3f-b9fd-437d-9d4a-c9d9450d6727/...,application/x-hdf; charset=binary,939782.0,2025-01-04T12:49:34.230329310 +0100,c11041bf9133147d714eddc05d0712e0,https://admin.ckan-archive-test.sigma2.no/dat...,/dataset/fb71af3f-b9fd-437d-9d4a-c9d9450d6727...


In [17]:
df.columns

Index(['filename ', ' format ', ' size ', ' modified_date ', ' fixity ',
       ' http_url ', ' s3_url'],
      dtype='object')

### Prepare Temporal coverage if available

In [18]:
if "temporal" in rec.keys():
    temporal_coverage = rec["temporal"]["start"] + "/" + rec["temporal"]["end"] 
else:
    temporal_coverage = None

### Prepare Spatial coverage if available

In [19]:
def get_geoshape(spatial):
    if spatial["spatial_type"] == "wkt":
        geo = shapely.wkt.loads(spatial["value"])
        if hasattr(geo, 'geoms'):
            # take the first one
             geo = geo.geoms[0]
        geo = geo.wkt.replace("POLYGON", "").replace("(","").replace(")","").strip()   
        geolocation = { "@type": "GeoShape", "@id": geo, "polygon": geo}
        return geolocation
    else:
        # Not implemented yet
        return None
geolocation = get_geoshape(rec["spatial"])

In [20]:
geolocation

{'@type': 'GeoShape',
 '@id': '10.125819658606247 60.74450455347338, 10.125819658606247 59.2253827166615, 12.04672567350471 59.2253827166615, 12.04672567350471 60.74450455347338, 10.125819658606247 60.74450455347338',
 'polygon': '10.125819658606247 60.74450455347338, 10.125819658606247 59.2253827166615, 12.04672567350471 59.2253827166615, 12.04672567350471 60.74450455347338, 10.125819658606247 60.74450455347338'}

### Go through each data and add it in the RO-Crate 

In [21]:
# "https://admin.ckan-archive-test.sigma2.no" + 
for row in df.itertuples(index=False, name=None):
    resource2add = {df.columns[i].strip() : row[i] for i, _ in enumerate(row)}
    url = resource2add.pop("http_url").strip()
    resource2add["name"] = resource2add["filename"].strip()
    resource2add["s3_url"] = "s3:/" + resource2add["s3_url"].strip()
    resource2add["sdDatePublished"] = rec["cron"]["completed_date"]     # Is it correct?
    resource2add["dateCreated"] = rec["metadata_created"]
    resource2add["dateModified"] = rec["metadata_modified"]
    resource2add["contentSize"] = resource2add.pop("size")
    resource2add["encodingFormat"] = resource2add.pop("format")
    if geolocation is not None:
        resource2add["location"] = geolocation
    if temporal_coverage is not None:
        resource2add["temporalCoverage"] = temporal_coverage
    
    ## properties we remove because we do not know where to fit them
    resource2add.pop("filename")   # was put in "name"
    resource2add.pop("fixity") # Checksum?
    resource2add.pop("s3_url")  # may be ut in different distribution but only valid for datasets and not files
    print("properties = ", resource2add)
    resource = crate.add_file(url, fetch_remote = False, properties=resource2add)

properties =  {'modified_date': ' 2025-01-04T12:49:34.230329310 +0100 ', 'name': '/archive/fb71af3f-b9fd-437d-9d4a-c9d9450d6727/healpix.nc', 'sdDatePublished': '2025-01-04T12:51:05', 'dateCreated': '2025-01-04T11:49:02.714258', 'dateModified': '2025-01-04T11:51:05.770223', 'contentSize': 939782.0, 'encodingFormat': ' application/x-hdf; charset=binary ', 'location': {'@type': 'GeoShape', '@id': '10.125819658606247 60.74450455347338, 10.125819658606247 59.2253827166615, 12.04672567350471 59.2253827166615, 12.04672567350471 60.74450455347338, 10.125819658606247 60.74450455347338', 'polygon': '10.125819658606247 60.74450455347338, 10.125819658606247 59.2253827166615, 12.04672567350471 59.2253827166615, 12.04672567350471 60.74450455347338, 10.125819658606247 60.74450455347338'}, 'temporalCoverage': '1975-04-15T23:00:00.000Z/2025-01-03T23:00:00.000Z'}


## Add metadata to RO

### Add the title and description

In [22]:
title = rec.pop("title")
description = rec.pop("notes")
crate.update_jsonld({
    "@id": "./",
    "description": description,
    "title": title,
    "name": title,
})

<./ Dataset>

### Add the publisher and creator

In [23]:
publisher_name = rec.pop("publisher")  ## WEIRD! 
publisher_name = "Sigma2 AS"
publisher_url = "https://www.wikidata.org/wiki/Q12008197"
publisher = {
                "@id": publisher_url,
                "@type": "Organization",
                "name": publisher_name,
                "url": publisher_url
                }
crate.add_jsonld(publisher)
crate.update_jsonld(
{
    "@id": "./",
    "publisher": { "@id": publisher_url },
})

<./ Dataset>

### Add the creator of the RO-Crate

In [24]:
crate.update_jsonld(
{
    "@id": "ro-crate-metadata.json",
    "creator": { "@id": publisher_url },
})

<ro-crate-metadata.json CreativeWork>

### Add Publication date

In [25]:
date_published =  datetime.strptime(rec.pop("release_date"), '%Y-%m-%dT%H:%M:%S.%f')

crate.update_jsonld({
    "@id": "./",
    "datePublished":  date_published.strftime("%Y-%m-%d") ,
})

<./ Dataset>

### Add citation

In [26]:
doi = "https://doi.org/" + rec.pop("doi")
cite_as = " and ".join(list_authors) + ", " + title + ", " + publisher_name + ", " + date_published.strftime("%Y") + ". " +  doi + "."

crate.update_jsonld({
    "@id": "./",
    "identifier": doi,
    "url": doi,
    "cite-as":  cite_as ,
})


<./ Dataset>

### Add studySubject, keywords, etc.

In [27]:
def find_subject(subject, groups):
    for group in groups:
        if subject == group["id"]:
            return group
    return None

In [28]:
study_subjects = []
for subject in rec["subject"]:
    info_on_subject = find_subject(subject, rec["groups"])     
    study_subjects.append({
         "@id": "http://inspire.ec.europa.eu/metadata-codelist/TopicCategory/" + info_on_subject["name"]
    })

In [29]:
keywords = []
for tag in rec["tags"]:
    keywords.append(tag["display_name"])
    
keywords = ", ".join(keywords)

crate.update_jsonld({
    "@id": "./",
    "about": study_subjects,
    "keywords":  keywords,
})

<./ Dataset>

### Add version

In [30]:
crate.update_jsonld({
    "@id": "./",
    "version": rec["version"],
})

<./ Dataset>

### Add Language

In [31]:
#crate.update_jsonld({
#    "@id": ,
#    "@type": "Language",
#})

In [32]:
rec.keys()

dict_keys(['access_rights', 'author', 'author_email', 'conforms_to', 'creator_user_id', 'cron', 'dataset_status', 'id', 'import_done', 'import_source', 'isopen', 'language', 'license_id', 'license_title', 'license_url', 'maintainer', 'maintainer_email', 'metadata_created', 'metadata_modified', 'name', 'num_resources', 'num_tags', 'organization', 'owner_org', 'private', 'project', 'provenance', 'source', 'spatial', 'state', 'subject', 'temporal', 'theme', 'type', 'url', 'version', 'version_notes', 'version_type', 'contact_points', 'contributors', 'dataset_owner', 'groups', 'related_resources', 'resources', 'tags', 'relationships_as_subject', 'relationships_as_object'])

In [33]:
rec["theme"]

[{'id': 'e1520f0b-3361-4a82-91e6-94a26e0f3c3b',
  'name': 'age_of_stratospheric_air',
  'vocabulary_id': 'c4773028-d7d7-44f7-9e9f-cd5f0f77357c',
  'title': 'age of stratospheric air',
  'url': 'https://vocab.nerc.ac.uk/standard_name/age_of_stratospheric_air',
  'display_name': 'age_of_stratospheric_air'},
 {'id': 'b61c20ac-eda6-42b0-bab7-72b947e08d6c',
  'name': 'aggregate_quality_flag',
  'vocabulary_id': 'c4773028-d7d7-44f7-9e9f-cd5f0f77357c',
  'title': 'aggregate quality flag',
  'url': 'https://vocab.nerc.ac.uk/standard_name/aggregate_quality_flag',
  'display_name': 'aggregate_quality_flag'},
 {'id': 'a0b31d6c-14f3-42b2-ab66-83573680ef68',
  'name': 'Buildings',
  'vocabulary_id': '863a5b7c-3a29-4b5a-9fe7-6c874f3e474f',
  'title': 'Buildings',
  'url': 'https://www.eionet.europa.eu/gemet/en/inspire-theme/bu',
  'display_name': 'Buildings'},
 {'id': '363f9750-0846-4ed5-9cba-ad0ded4928cc',
  'name': 'Geographical names',
  'vocabulary_id': '863a5b7c-3a29-4b5a-9fe7-6c874f3e474f',
  

## Write to disk

In [34]:
crate.write("ro-crate")

In [35]:
from rocrateValidator import validate as validate

In [36]:
v = validate.validate("ro-crate")
v.validator()

This is an INVALID RO-Crate
{
    "File existence": [
        true
    ],
    "File size": [
        true
    ],
    "Metadata file existence": [
        true
    ],
    "Json check": [
        true
    ],
    "Json-ld check": [
        true
    ],
    "File descriptor check": [
        true
    ],
    "Direct property check": [
        true
    ],
    "Referencing check": [
        true
    ],
    "Encoding check": [
        true
    ],
    "Web-based data entity check": [
        true
    ],
    "Person entity check": [
        true
    ],
    "Organization entity check": [
        true
    ],
    "Contact information check": [
        true
    ],
    "Citation property check": [
        true
    ],
    "Publisher property check": [
        true
    ],
    "Funder property check": [
        true
    ],
    "Licensing property check": [
        false,
        "Semantic Error: Invalid ID Value at https://creativecommons.org/licenses/by/4.0/. It must be an URL."
    ],
    "Places prope