# Create RO-Crate from NIRD dataset

In [1]:
import requests
import json
from rocrate.rocrate import ROCrate
from rocrate.model.person import Person
import pandas as pd
from datetime import datetime

## Get Institutions

In [2]:
filename = "/Users/annef/Documents/NIRD-Archive/instituion-v3.json"
with open(filename, 'r') as file:
    data = file.read()
    institutions = json.loads(data)
institutions = pd.DataFrame.from_dict(institutions)
institutions.head()

Unnamed: 0,id,name,realm,support_url,support_email,type,ror
0,54,"{'nb': 'Politihøgskolen', 'nn': 'Politihøgskul...",phs.no,{},it@phs.no,"[home_organization, higher_education, service_...",https://ror.org/05486d596
1,55,"{'nb': 'Universitetet i Agder', 'nn': 'Univers...",uia.no,{'nb': 'https://www.uia.no/student/uia-hjelp/b...,hjelp@uia.no,"[home_organization, higher_education, service_...",https://ror.org/03x297z98
2,58,"{'nb': 'NIFU', 'nn': 'NIFU', 'en': 'NIFU'}",nifu.no,{},bibliotek@nifu.no,"[home_organization, higher_education]",https://ror.org/032jh6m87
3,61,"{'nb': 'Universitetssenteret på Svalbard', 'en...",unis.no,{},support@unis.no,"[home_organization, higher_education]",https://ror.org/03cyjf656
4,70,"{'nb': 'Høgskolen i Østfold', 'nn': 'Høgskulen...",hiof.no,{'nb': 'https://www.hiof.no/tjenester/it/'},itvakt@hiof.no,"[home_organization, higher_education, service_...",https://ror.org/04gf7fp41


### Split name in several columns

In [3]:
rename_cols = {}
for name in institutions["name"].apply(pd.Series).columns:
    rename_cols[name] = "name-" + name
rename_cols

{'nb': 'name-nb', 'nn': 'name-nn', 'en': 'name-en', 'se': 'name-se'}

In [4]:
institutions = institutions.join( institutions["name"].apply(pd.Series))
institutions = institutions.rename(columns=rename_cols)
institutions.head()

Unnamed: 0,id,name,realm,support_url,support_email,type,ror,name-nb,name-nn,name-en,name-se
0,54,"{'nb': 'Politihøgskolen', 'nn': 'Politihøgskul...",phs.no,{},it@phs.no,"[home_organization, higher_education, service_...",https://ror.org/05486d596,Politihøgskolen,Politihøgskulen,The Norwegian Police University College,
1,55,"{'nb': 'Universitetet i Agder', 'nn': 'Univers...",uia.no,{'nb': 'https://www.uia.no/student/uia-hjelp/b...,hjelp@uia.no,"[home_organization, higher_education, service_...",https://ror.org/03x297z98,Universitetet i Agder,Universitetet i Agder,University of Agder,Agder Universitehta
2,58,"{'nb': 'NIFU', 'nn': 'NIFU', 'en': 'NIFU'}",nifu.no,{},bibliotek@nifu.no,"[home_organization, higher_education]",https://ror.org/032jh6m87,NIFU,NIFU,NIFU,
3,61,"{'nb': 'Universitetssenteret på Svalbard', 'en...",unis.no,{},support@unis.no,"[home_organization, higher_education]",https://ror.org/03cyjf656,Universitetssenteret på Svalbard,,The University Center in Svalbard,
4,70,"{'nb': 'Høgskolen i Østfold', 'nn': 'Høgskulen...",hiof.no,{'nb': 'https://www.hiof.no/tjenester/it/'},itvakt@hiof.no,"[home_organization, higher_education, service_...",https://ror.org/04gf7fp41,Høgskolen i Østfold,Høgskulen i Østfold,Østfold University College,Østfold Allaskuvla


## Access the NIRD Archive via its API

In [5]:
try:
    with open("/Users/annef/nird.json") as config:
        token = json.load(config)["token"]
except FileNotFoundError:
    print("config.json not found!")

In [6]:
url = "https://admin.ckan-archive-test.sigma2.no/api/3/action/package_search"
response = requests.get(url, headers={ "Authorization": token})

list_records = response.json()["result"]["results"]
list_records

[{'access_rights': 'public',
  'author': None,
  'author_email': None,
  'creator_user_id': '04024ef3-9efe-4b51-86a9-599cc4f6b1a2',
  'cron': {'state': 'succeeded',
   'message': '',
   'submitted_date': '2025-01-01T12:24:07',
   'completed_date': '2025-01-01T12:24:11'},
  'doi': '10.82969/2024.kwjopkn1',
  'id': '98a8b149-9bc3-4017-a1ac-7e9a56d3c7d4',
  'import_done': True,
  'import_source': 'external',
  'isopen': True,
  'language': 'en',
  'license_id': 'CC-BY-4.0',
  'license_title': 'Creative Commons Attribution 4.0',
  'license_url': 'https://creativecommons.org/licenses/by/4.0/',
  'maintainer': None,
  'maintainer_email': None,
  'metadata_created': '2024-12-30T07:14:46.649692',
  'metadata_modified': '2025-01-01T11:24:11.940074',
  'name': 'test-dropbox-upload',
  'notes': 'Test Dropbox Upload dataset description',
  'num_resources': 2,
  'num_tags': 0,
  'organization': None,
  'owner_org': None,
  'private': False,
  'project': [],
  'publisher': 'NIRD RDA',
  'release_dat

## DOI to find in the NIRD archive

In [7]:
doi = "10.82969/2024.35d8m6eo"
doi = "10.82969/2024.kwjopkn1"

In [8]:
for rec in (x for x in list_records if x["doi"] == doi):
    pass

In [9]:
print(rec.keys())

dict_keys(['access_rights', 'author', 'author_email', 'creator_user_id', 'cron', 'doi', 'id', 'import_done', 'import_source', 'isopen', 'language', 'license_id', 'license_title', 'license_url', 'maintainer', 'maintainer_email', 'metadata_created', 'metadata_modified', 'name', 'notes', 'num_resources', 'num_tags', 'organization', 'owner_org', 'private', 'project', 'publisher', 'release_date', 'source', 'state', 'subject', 'theme', 'title', 'type', 'url', 'version', 'version_notes', 'version_type', 'creators', 'groups', 'resources', 'tags', 'relationships_as_subject', 'relationships_as_object'])


In [10]:
rec

{'access_rights': 'public',
 'author': None,
 'author_email': None,
 'creator_user_id': '04024ef3-9efe-4b51-86a9-599cc4f6b1a2',
 'cron': {'state': 'succeeded',
  'message': '',
  'submitted_date': '2025-01-01T12:24:07',
  'completed_date': '2025-01-01T12:24:11'},
 'doi': '10.82969/2024.kwjopkn1',
 'id': '98a8b149-9bc3-4017-a1ac-7e9a56d3c7d4',
 'import_done': True,
 'import_source': 'external',
 'isopen': True,
 'language': 'en',
 'license_id': 'CC-BY-4.0',
 'license_title': 'Creative Commons Attribution 4.0',
 'license_url': 'https://creativecommons.org/licenses/by/4.0/',
 'maintainer': None,
 'maintainer_email': None,
 'metadata_created': '2024-12-30T07:14:46.649692',
 'metadata_modified': '2025-01-01T11:24:11.940074',
 'name': 'test-dropbox-upload',
 'notes': 'Test Dropbox Upload dataset description',
 'num_resources': 2,
 'num_tags': 0,
 'organization': None,
 'owner_org': None,
 'private': False,
 'project': [],
 'publisher': 'NIRD RDA',
 'release_date': '2025-01-01T11:24:11.875547

## Create a new RO-Crate

In [11]:
crate = ROCrate()

## Add the license for the RO-Crate

In [12]:
crate.update_jsonld(
{
    "@id": "ro-crate-metadata.json",
    "license": { "@id":  "https://creativecommons.org/licenses/by/4.0/"},
})

<ro-crate-metadata.json CreativeWork>

## Add creators and their Organizations

We also need the ROR to create an affiliation for each organization found

In [13]:
def get_ror(institutions, org):
    ror = {}
    for name in rename_cols.values():
        if not institutions.loc[institutions[name].isin([org])].empty:
            ror_url = institutions.loc[institutions[name].isin([org])]["ror"].values[0]
            ror = {
                "@id": ror_url,
                "@type": "Organization",
                "name": org,
                "url": ror_url
                }
    return ror

In [14]:
list_authors = []
list_orcids = []
for creator in rec["creators"]:
    creator['givenName'] = creator.pop('first_name')
    creator['familyName'] = creator.pop('last_name') 
    list_authors.append(creator['givenName'] + " " +  creator['familyName'])
    org_name = creator.pop("organisation")
    ror = get_ror(institutions, org_name)
    creator["affiliation"] = {"@id": ror["url"]}
    orcid = creator.pop("orcid")
    print(creator)
    crate.add(Person(crate, "https://orcid.org/" + orcid, properties=creator))
    list_orcids.append({ "@id":  "https://orcid.org/" + orcid })
    crate.add_jsonld(ror)

# Add the list of authors
crate.update_jsonld({
    "@id": "./",
    "author": list_orcids,
})

rec.pop("creators")

{'email': 'gaga68@gmail.com', 'givenName': 'Dragan', 'familyName': 'Avramović', 'affiliation': {'@id': 'https://ror.org/05y8hw592'}}


[{'email': 'gaga68@gmail.com',
  'givenName': 'Dragan',
  'familyName': 'Avramović',
  'affiliation': {'@id': 'https://ror.org/05y8hw592'}}]

## Add data

In [15]:
for toc in (x for x in rec["resources"] if x["name"] == "table_of_contents"):
    pass
toc["url"]

'https://admin.ckan-archive-test.sigma2.no/dataset/98a8b149-9bc3-4017-a1ac-7e9a56d3c7d4/download/table_of_contents_10.82969_2024.kwjopkn1.csv'

In [16]:
for resource in rec["resources"]:
    print(resource["name"])

table_of_contents
summary_contents


In [17]:
df = pd.read_csv(toc["url"], sep="|")
df = df.drop([0, 0])
df.head()

Unnamed: 0,filename,format,size,modified_date,fixity,http_url,s3_url
1,/archive/98a8b149-9bc3-4017-a1ac-7e9a56d3c7d4/...,text/html; charset=us-ascii,8415.0,2024-12-30T08:16:44.942908000 +0100,87528a955215c0f5dfab4a2586004779,https://admin.ckan-archive-test.sigma2.no/dat...,/dataset/98a8b149-9bc3-4017-a1ac-7e9a56d3c7d4...
2,/archive/98a8b149-9bc3-4017-a1ac-7e9a56d3c7d4/...,text/html; charset=us-ascii,8415.0,2024-12-30T08:16:43.711558000 +0100,4aab5c412fa23ed83198c61554076566,https://admin.ckan-archive-test.sigma2.no/dat...,/dataset/98a8b149-9bc3-4017-a1ac-7e9a56d3c7d4...
3,/archive/98a8b149-9bc3-4017-a1ac-7e9a56d3c7d4/...,text/html; charset=us-ascii,8415.0,2024-12-30T08:16:43.102482000 +0100,2a474fc2136ed65d3fe9d3468948ef8b,https://admin.ckan-archive-test.sigma2.no/dat...,/dataset/98a8b149-9bc3-4017-a1ac-7e9a56d3c7d4...
4,/archive/98a8b149-9bc3-4017-a1ac-7e9a56d3c7d4/...,text/html; charset=us-ascii,8415.0,2024-12-30T08:16:45.145904000 +0100,bb0688cb2a552304a8f4f6ca41b1d13c,https://admin.ckan-archive-test.sigma2.no/dat...,/dataset/98a8b149-9bc3-4017-a1ac-7e9a56d3c7d4...
5,/archive/98a8b149-9bc3-4017-a1ac-7e9a56d3c7d4/...,text/html; charset=us-ascii,8415.0,2024-12-30T08:16:54.833488000 +0100,965ca3c02a574e0e353db91ff48383a2,https://admin.ckan-archive-test.sigma2.no/dat...,/dataset/98a8b149-9bc3-4017-a1ac-7e9a56d3c7d4...


In [18]:
df.columns

Index(['filename ', ' format ', ' size ', ' modified_date ', ' fixity ',
       ' http_url ', ' s3_url'],
      dtype='object')

In [19]:
for row in df.itertuples(index=False, name=None):
    resource2add = {df.columns[i].strip() : row[i] for i, _ in enumerate(row)}
    url = "https://admin.ckan-archive-test.sigma2.no" + resource2add.pop("http_url").strip()
    resource2add["filename"] = "https://admin.ckan-archive-test.sigma2.no" + resource2add["filename"].strip()
    resource2add["s3_url"] = "s3:/" + resource2add["s3_url"].strip()
    print(url)
    print("properties = ", resource2add)
    resource = crate.add_file(url, fetch_remote = False, properties=resource2add)

https://admin.ckan-archive-test.sigma2.nohttps://admin.ckan-archive-test.sigma2.no/dataset/98a8b149-9bc3-4017-a1ac-7e9a56d3c7d4/download/notes_2010_01_07_v3.txt
properties =  {'filename': 'https://admin.ckan-archive-test.sigma2.no/archive/98a8b149-9bc3-4017-a1ac-7e9a56d3c7d4/notes_2010_01_07_v3.txt', 'format': ' text/html; charset=us-ascii ', 'size': 8415.0, 'modified_date': ' 2024-12-30T08:16:44.942908000 +0100 ', 'fixity': ' 87528a955215c0f5dfab4a2586004779 ', 's3_url': 's3://dataset/98a8b149-9bc3-4017-a1ac-7e9a56d3c7d4/download/notes_2010_01_07_v3.txt'}
https://admin.ckan-archive-test.sigma2.nohttps://admin.ckan-archive-test.sigma2.no/dataset/98a8b149-9bc3-4017-a1ac-7e9a56d3c7d4/download/notes_2010_01_08_v1.txt
properties =  {'filename': 'https://admin.ckan-archive-test.sigma2.no/archive/98a8b149-9bc3-4017-a1ac-7e9a56d3c7d4/notes_2010_01_08_v1.txt', 'format': ' text/html; charset=us-ascii ', 'size': 8415.0, 'modified_date': ' 2024-12-30T08:16:43.711558000 +0100 ', 'fixity': ' 4aab5c

## Add metadata to RO

### Add the title and description

In [20]:
title = rec.pop("title")
description = rec.pop("notes")
crate.update_jsonld({
    "@id": "./",
    "description": description,
    "title": title,
    "name": title,
})

<./ Dataset>

### Add the publisher and creator

In [21]:
publisher_name = rec.pop("publisher")  ## WEIRD! 
publisher_name = "Sigma2 AS"
publisher_url = "https://www.wikidata.org/wiki/Q12008197"
publisher = {
                "@id": publisher_url,
                "@type": "Organization",
                "name": publisher_name,
                "url": publisher_url
                }
crate.add_jsonld(publisher)
crate.update_jsonld(
{
    "@id": "./",
    "publisher": { "@id": publisher_url },
})

<./ Dataset>

### Add the creator of the RO-Crate

In [22]:
crate.update_jsonld(
{
    "@id": "ro-crate-metadata.json",
    "creator": { "@id": publisher_url },
})


<ro-crate-metadata.json CreativeWork>

### Add the license of the dataset

In [23]:
crate.update_jsonld(
{
    "@id": "./",
    "license": { "@id":  rec["license_url"]},
})

<./ Dataset>

### Add Publication date

In [24]:
date_published =  datetime.strptime(rec.pop("release_date"), '%Y-%m-%dT%H:%M:%S.%f')

crate.update_jsonld({
    "@id": "./",
    "datePublished":  date_published.strftime("%Y-%m-%d") ,
})

<./ Dataset>

### Add citation

In [25]:
doi = "https://doi.org/" + rec.pop("doi")
cite_as = " and ".join(list_authors) + ", " + title + ", " + publisher_name + ", " + date_published.strftime("%Y") + ". " +  doi + "."

crate.update_jsonld({
    "@id": "./",
    "identifier": doi,
    "url": doi,
    "cite-as":  cite_as ,
})


<./ Dataset>

### Add studySubject, keywords, etc.

In [26]:
def find_subject(subject, groups):
    for group in groups:
        if subject == group["id"]:
            return group
    return None

In [27]:
study_subjects = []
for subject in rec["subject"]:
    info_on_subject = find_subject(subject, rec["groups"])     
    study_subjects.append({
         "@id": "http://inspire.ec.europa.eu/metadata-codelist/TopicCategory/" + info_on_subject["name"]
    })

In [28]:
keywords = ", ".join(rec["tags"])

crate.update_jsonld({
    "@id": "./",
    "about": study_subjects,
    "keywords":  keywords,
})

<./ Dataset>

### Add version

In [29]:
crate.update_jsonld({
    "@id": "./",
    "version": rec["version"],
})

<./ Dataset>

### Add Language

In [30]:
#crate.update_jsonld({
#    "@id": ,
#    "@type": "Language",
#})

In [31]:
rec.keys()

dict_keys(['access_rights', 'author', 'author_email', 'creator_user_id', 'cron', 'id', 'import_done', 'import_source', 'isopen', 'language', 'license_id', 'license_title', 'license_url', 'maintainer', 'maintainer_email', 'metadata_created', 'metadata_modified', 'name', 'num_resources', 'num_tags', 'organization', 'owner_org', 'private', 'project', 'source', 'state', 'subject', 'theme', 'type', 'url', 'version', 'version_notes', 'version_type', 'groups', 'resources', 'tags', 'relationships_as_subject', 'relationships_as_object'])

In [32]:
rec["theme"]

[{'id': '4194019f-9996-453f-9f69-610a5bc91beb',
  'name': 'acoustic_area_backscattering_strength_in_sea_water',
  'vocabulary_id': 'c4773028-d7d7-44f7-9e9f-cd5f0f77357c',
  'title': 'acoustic area backscattering strength in sea water',
  'url': 'https://vocab.nerc.ac.uk/standard_name/acoustic_area_backscattering_strength_in_sea_water',
  'display_name': 'acoustic_area_backscattering_strength_in_sea_water'},
 {'id': '9bd5127a-28ee-472f-a435-5d0f3d2554f6',
  'name': 'Statistical units',
  'vocabulary_id': '863a5b7c-3a29-4b5a-9fe7-6c874f3e474f',
  'title': 'Statistical units',
  'url': 'https://www.eionet.europa.eu/gemet/en/inspire-theme/su',
  'display_name': 'Statistical units'}]

## Write to disk

In [33]:
crate.write("ro-crate")