# Retrieval of vocabularies and mappings for CLARIN

In [1]:
# Preamble

import re
import requests
import shutil

# Define some common constants
# Base URL handed to SkosmosClient and used to build the concept scheme download URL
CCR_API_BASE = 'https://vocabularies.clarin.eu/clavas/rest/v1/'
# Vocabulary id passed to get_vocabulary(); also a path segment of the download URL
CCR_VOCAB = 'ccr'
# Base URL for the '/facets' endpoints queried in this notebook
# NOTE(review): the host is clarin-dev.eu - presumably a development instance; confirm this is intended
VLO_API_BASE = 'https://vlo.clarin-dev.eu/api'
# Base URL for '/registry/profiles/<profileId>/xml' requests
COMPONENT_REGISTRY_BASE = 'https://catalog.clarin.eu/ds/ComponentRegistry/rest'

# Create output directories for data
# All downloads in this notebook are written below DATA_DIR, one
# sub-directory per kind of artefact.

import os
DATA_DIR = 'data'
CCR_DATA_DIR = DATA_DIR + '/ccr'
PROFILES_DATA_DIR = DATA_DIR + '/profiles'
VLO_FACETS_DATA_DIR = DATA_DIR + '/facets'
VLO_MAPPING_DATA_DIR = DATA_DIR + '/mappings'
for dataDir in [DATA_DIR, CCR_DATA_DIR, PROFILES_DATA_DIR, VLO_FACETS_DATA_DIR, VLO_MAPPING_DATA_DIR]:
    # Only announce directories that are actually missing
    if not os.path.isdir(dataDir):
        print("Creating data directory: ", dataDir)
    # makedirs creates missing parent directories as well, and exist_ok makes
    # the call a no-op for an existing directory. If the path is occupied by
    # a regular file this raises FileExistsError here instead of failing
    # later when a download is written.
    os.makedirs(dataDir, exist_ok=True)

## Vocabularies

In [2]:
############################################################
### Concepts from CLARIN Concept Registry (CCR)
############################################################

from skosmos_client import SkosmosClient
from urllib.parse import urlencode, quote_plus

# Client for the Skosmos REST API located at CCR_API_BASE
skosmos = SkosmosClient(CCR_API_BASE)
# Look up the vocabulary identified by CCR_VOCAB; the result is indexed by key below
ccrVocab = skosmos.get_vocabulary(CCR_VOCAB)

# Debugging aid (disabled): a hand-written single-entry list using the same
# 'label' / 'uri' keys that the download loop reads from each concept scheme
# schemes = [{'label':'test', 'uri':'http://hdl.handle.net/11459/CCR_P-LexicalResources_ce3edd5c-07a7-b345-dcfa-789a9b4bc980'}]
# Concept schemes of the vocabulary; each entry is later read via its 'uri' and 'label' keys
ccrConceptSchemes = ccrVocab['conceptschemes']

In [3]:
# Download every CCR concept scheme as RDF/XML into CCR_DATA_DIR.
# A scheme whose output file already exists is not downloaded again, so a
# file must only ever be written for a successful response - otherwise an
# error page would be cached and skipped forever on later runs.

# The endpoint is the same for all schemes; the scheme is selected through
# the 'uri' request parameter.
url = CCR_API_BASE + CCR_VOCAB + '/data'

for scheme in ccrConceptSchemes:
    # File name is the last path segment of the scheme URI
    outfileName = CCR_DATA_DIR + '/' + re.sub('.*/','', scheme['uri']) + '.xml'
    if os.path.exists(outfileName):
        print('- Skipping download of concept scheme', scheme['label'], 'to', outfileName, '(file already exists)')
        continue

    print('- Downloading concept scheme', scheme['label'], 'to', outfileName)
    # retrieve concept scheme
    payload = {'uri': scheme['uri'], 'format': 'application/rdf+xml'}
    response = requests.get(url, params=payload)
    if response.status_code == 200:
        # Save the raw bytes: writing response.text in text mode would
        # re-encode the document with the platform default encoding, which
        # may not match the encoding declared inside the XML.
        with open(outfileName, 'wb') as f:
            f.write(response.content)
    else:
        print('Warning: concept scheme could not be retrieved for', scheme['uri'],
              '- HTTP status', response.status_code)

    ### If we wanted, we could also read the concept scheme into a graph
    # graph = rdflib.Graph()
    # sourceUrl=url + '?' + urlencode(payload)
    # graph = graph.parse(source=sourceUrl, format='xml')
    # for subject in graph.subjects():
    #     print(subject)

- Skipping download of concept scheme Dialogue Acts to data/ccr/CCR_P-DialogueActs_955814a6-6c07-c143-94eb-b2551c2d51cb.xml (file already exists)
- Skipping download of concept scheme Language Codes to data/ccr/CCR_P-LanguageCodes_a122c1a3-1912-fecd-07a2-8685522dfeca.xml (file already exists)
- Skipping download of concept scheme Language Resource Ontology to data/ccr/CCR_P-LanguageResourceOntology_05399e1d-4f23-8fc1-5088-eb5afbf0cd91.xml (file already exists)
- Skipping download of concept scheme Lexical Resources to data/ccr/CCR_P-LexicalResources_ce3edd5c-07a7-b345-dcfa-789a9b4bc980.xml (file already exists)
- Skipping download of concept scheme Lexical Semantics to data/ccr/CCR_P-LexicalSemantics_2906bf39-6db8-90ed-da57-c7a3cc03c9c9.xml (file already exists)
- Skipping download of concept scheme Lexicography to data/ccr/CCR_P-Lexicography_630911fa-0687-1e55-c846-37ebe24f3182.xml (file already exists)
- Skipping download of concept scheme Metadata to data/ccr/CCR_P-Metadata_6f3f84d1

In [4]:
############################################################
### Controlled vocabularies from profile specifications
############################################################

## Retrieve profile IDs from the VLO '_componentProfileId' facet
vloProfileIdsFacetUrl = VLO_API_BASE + '/facets/_componentProfileId'
# the endpoint answers with JSON; decode it straight away
response = requests.get(vloProfileIdsFacetUrl).json()
# every entry of response['values'] holds one profile id under 'value'
profileIds = [facetValue['value'] for facetValue in response['values']]
print("Found", len(profileIds), "profile identifiers")

Found 145 profile identifiers


In [5]:
## Retrieve profile specifications
# Download the XML specification of each profile from the component registry
# into PROFILES_DATA_DIR. Existing files are kept, and nothing is written
# for a response other than HTTP 200.
for profileId in profileIds:
    outfileName = PROFILES_DATA_DIR + '/' + profileId + '.xml'
    if os.path.exists(outfileName):
        print('- Skipping download of profile specification', profileId, 'to', outfileName, '(file already exists)')
        continue

    print('- Downloading profile specification', profileId, 'to', outfileName)
    # retrieve and save to data directory
    url = COMPONENT_REGISTRY_BASE + '/registry/profiles/' + profileId + '/xml'
    response = requests.get(url)
    if response.status_code == 200:
        # Save the raw bytes so the file keeps the encoding announced in its
        # XML declaration; text mode would re-encode it with the platform
        # default encoding.
        with open(outfileName, 'wb') as f:
            f.write(response.content)
    else:
        # Report the actual status: not every failure is a 404
        print('Warning: profile specification not found for', profileId,
              '(HTTP status ' + str(response.status_code) + ')')

- Skipping download of profile specification clarin.eu:cr1:p_1288172614026 to data/profiles/clarin.eu:cr1:p_1288172614026.xml (file already exists)
- Skipping download of profile specification clarin.eu:cr1:p_1328259700954 to data/profiles/clarin.eu:cr1:p_1328259700954.xml (file already exists)
- Skipping download of profile specification clarin.eu:cr1:p_1610707853541 to data/profiles/clarin.eu:cr1:p_1610707853541.xml (file already exists)
- Skipping download of profile specification clarin.eu:cr1:p_1475136016208 to data/profiles/clarin.eu:cr1:p_1475136016208.xml (file already exists)
- Skipping download of profile specification clarin.eu:cr1:p_1407745712035 to data/profiles/clarin.eu:cr1:p_1407745712035.xml (file already exists)
- Skipping download of profile specification clarin.eu:cr1:p_1349361150622 to data/profiles/clarin.eu:cr1:p_1349361150622.xml (file already exists)
- Skipping download of profile specification clarin.eu:cr1:p_1288172614023 to data/profiles/clarin.eu:cr1:p_1288

In [6]:
## Collect XPaths of vocabs

from lxml import etree
# profileIds = ['clarin.eu:cr1:p_1380106710826']

# Loop over profiles
# Result: allVocabs maps a profile id to a list of
# {'concept': <concept link or None>, 'path': <XPath of the enumeration>}
# dicts. Profiles without any enumeration are left out.
allVocabs = {}
for profileId in profileIds:
    #Parse XML
    infileName = PROFILES_DATA_DIR + '/' + profileId + '.xml'
    try:
        xmlTree = etree.parse(infileName)
    except OSError:
        # e.g. the specification was never downloaded; move on to the next
        # profile instead of abandoning all remaining ones
        print("Skipping", profileId, " - specification file could not be read")
        continue
    except etree.LxmlError as err:
        # lxml's base exception class; covers files that are not well-formed XML
        print("Skipping", profileId, " - file could not be processed:", err)
        continue

    root = xmlTree.getroot()

    # Find vocabularies

    # Enumerations declared on elements and on attributes
    elementEnums = root.findall('.//CMD_Element/ValueScheme/enumeration')
    attributeEnums = root.findall('.//AttributeList/Attribute/ValueScheme/enumeration')

    vocabs = []

    # Process enums
    for enumElement in elementEnums + attributeEnums:
        # Grandparent of the enumeration: the CMD_Element or Attribute that owns it
        containers = enumElement.xpath('../..')
        if len(containers) > 0:
            container = containers[0]
            conceptLink = None
            # Look for concept link
            if container.tag == 'CMD_Element':
                # on elements the link is read from the 'ConceptLink' XML attribute
                conceptLink = container.get('ConceptLink')
            elif container.tag == 'Attribute':
                # on attributes the link is read from a 'ConceptLink' child element
                conceptLinkEl = container.xpath('./ConceptLink')
                if len(conceptLinkEl) > 0:
                    conceptLink = conceptLinkEl[0].text
            vocabs.append({'concept': conceptLink, 'path': xmlTree.getpath(enumElement)})

    print('- Found',len(vocabs),'vocabularies in',profileId)

    if len(vocabs) > 0:
        # Add info for profile to collection
        allVocabs[profileId] = vocabs

- Found 60 vocabularies in clarin.eu:cr1:p_1288172614026
- Found 4 vocabularies in clarin.eu:cr1:p_1328259700954
- Found 12 vocabularies in clarin.eu:cr1:p_1610707853541
- Found 2 vocabularies in clarin.eu:cr1:p_1475136016208
- Found 27 vocabularies in clarin.eu:cr1:p_1407745712035
- Found 105 vocabularies in clarin.eu:cr1:p_1349361150622
- Found 0 vocabularies in clarin.eu:cr1:p_1288172614023
- Found 11 vocabularies in clarin.eu:cr1:p_1407745712064
- Found 4 vocabularies in clarin.eu:cr1:p_1328259700947
- Found 0 vocabularies in clarin.eu:cr1:p_1456409483189
- Found 7 vocabularies in clarin.eu:cr1:p_1369140737145
- Found 11 vocabularies in clarin.eu:cr1:p_1633000337997
- Found 0 vocabularies in clarin.eu:cr1:p_1498745062850
- Found 4 vocabularies in clarin.eu:cr1:p_1328259700945
- Found 41 vocabularies in clarin.eu:cr1:p_1562754657370
- Found 4 vocabularies in clarin.eu:cr1:p_1328259700946
- Found 4 vocabularies in clarin.eu:cr1:p_1328259700951
- Found 76 vocabularies in clarin.eu:cr1

In [7]:
# Example: collect the distinct vocabulary items of one concept across all profiles
concept = 'http://hdl.handle.net/11459/CCR_C-2571_2be2e583-e5af-34c2-3673-93359ec1f7df'

conceptItems = set()
for profileId, profileVocabs in allVocabs.items():
    # XPaths of the enumerations in this profile that are linked to the concept
    matchingPaths = [info['path'] for info in profileVocabs if info['concept'] == concept]
    if not matchingPaths:
        # nothing to look up, so the specification does not need to be parsed
        continue

    # parse the specification once per profile
    infileName = PROFILES_DATA_DIR + '/' + profileId + '.xml'
    xmlTree = etree.parse(infileName)
    for path in matchingPaths:
        enumeration = xmlTree.xpath(path)
        if len(enumeration) > 0:
            conceptItems.update(item.text for item in enumeration[0].xpath('./item'))

{'concept': concept, 'items': conceptItems}
# TODO(?): we could materialize all vocabularies per concept

{'concept': 'http://hdl.handle.net/11459/CCR_C-2571_2be2e583-e5af-34c2-3673-93359ec1f7df',
 'items': {'application/epub+zip',
  'application/json',
  'application/msword',
  'application/pdf',
  'application/ssff',
  'application/vnd.adobe.flash-movie',
  'application/vnd.ms-excel',
  'application/vnd.ms-powerpoint',
  'application/vnd.oasis.opendocument.graphics',
  'application/vnd.oasis.opendocument.presentation',
  'application/vnd.oasis.opendocument.spreadsheet',
  'application/vnd.oasis.opendocument.text',
  'application/x-binary',
  'application/x-tar',
  'application/x-tex',
  'application/x-texinfo',
  'application/x-zip-compressed',
  'application/xhtml+xml',
  'application/zip',
  'audio/aiff',
  'audio/basic',
  'audio/midi',
  'audio/mod',
  'audio/mp4',
  'audio/mpeg',
  'audio/mpeg3',
  'audio/ogg',
  'audio/raw',
  'audio/vnd.wave',
  'audio/voc',
  'audio/vorbis',
  'audio/wav',
  'audio/webm',
  'audio/x-adpcm',
  'audio/x-aiff',
  'audio/x-au',
  'audio/x-esps',
  'a

In [8]:
############################################################
### VLO facets
############################################################

## Ask the VLO API which facets exist
vloFacetsUrl = VLO_API_BASE + '/facets'
# the answer is JSON; every entry carries the facet name under 'name'
response = requests.get(vloFacetsUrl).json()
facets = [facetEntry['name'] for facetEntry in response]
print("Found", len(facets), "facets")

# Query each facet in turn and gather its values in one list of
# {'facet': ..., 'values': [...]} records
facetsInfo=[]
for facet in facets:
    facetUrl = VLO_API_BASE + '/facets/' + facet
    response = requests.get(facetUrl).json()
    values = [valueEntry['value'] for valueEntry in response['values']]
    print('Facet info from', facetUrl, ': found', len(values), 'values')
    facetsInfo.append({'facet': facet, 'values': values})

Found 10 facets
Facet info from https://vlo.clarin-dev.eu/api/facets/languageCode : found 9111 values
Facet info from https://vlo.clarin-dev.eu/api/facets/collection : found 944 values
Facet info from https://vlo.clarin-dev.eu/api/facets/resourceClass : found 669 values
Facet info from https://vlo.clarin-dev.eu/api/facets/modality : found 192 values
Facet info from https://vlo.clarin-dev.eu/api/facets/format : found 210 values
Facet info from https://vlo.clarin-dev.eu/api/facets/keywords : found 8607 values
Facet info from https://vlo.clarin-dev.eu/api/facets/genre : found 1476 values
Facet info from https://vlo.clarin-dev.eu/api/facets/subject : found 100000 values
Facet info from https://vlo.clarin-dev.eu/api/facets/country : found 336 values
Facet info from https://vlo.clarin-dev.eu/api/facets/organisation : found 1779 values


In [9]:
# Write VLO facet info to a file
import json

# Destination of the JSON dump inside the facets data directory
facetsOutFile = VLO_FACETS_DATA_DIR + '/vlo-facets.json'

# Serialise the collected facet records with a 4-space indent, then store them
facetsJson = json.dumps(facetsInfo, indent=4)
with open(facetsOutFile, mode="w") as jsonFile:
    jsonFile.write(facetsJson)

print('VLO facet values written to', facetsOutFile)

VLO facet values written to data/facets/vlo-facets.json


## Mappings

In [10]:
############################################################
### Concept - facet mapping
############################################################

# Source (raw file on GitHub) and local destination of the concept - facet mapping
facetConceptsMappingUrl = 'https://raw.githubusercontent.com/clarin-eric/VLO-mapping/refs/heads/master/mapping/facetConcepts.xml'
facetConceptsMappingOutFile = VLO_MAPPING_DATA_DIR + '/facetConcepts.xml'

## Retrieve from GitHub and store a copy
response = requests.get(facetConceptsMappingUrl)
# Stop on an HTTP error before the output file is opened: the file is
# overwritten on every run, so an error page must never replace a good copy.
response.raise_for_status()
# Store the raw bytes so the file keeps the encoding announced in its XML
# declaration; text mode would re-encode it with the platform default.
with open(facetConceptsMappingOutFile, "wb") as outfile:
    outfile.write(response.content)

print('VLO concept - facet mapping written to', facetConceptsMappingOutFile)

VLO concept - facet mapping written to data/mappings/facetConcepts.xml


In [11]:
############################################################
### Value maps
############################################################

from zipfile import ZipFile
from tempfile import TemporaryDirectory
from io import BytesIO
from zipfile import ZipFile

## Retrieve from GitHub
# The whole repository is downloaded as a zip archive; only the directory
# 'sourceDir' inside it is copied to 'targetDir'.
vloMappingRepoUrl = 'https://github.com/clarin-eric/VLO-mapping/archive/refs/heads/master.zip'
sourceDir = 'VLO-mapping-master/value-maps/dist'
targetDir = VLO_MAPPING_DATA_DIR + '/value-maps'

# Download and validate first; the existing copy is only replaced once the
# new content has been extracted successfully.
response = requests.get(vloMappingRepoUrl)
response.raise_for_status()
# Kept in a variable so the post-processing maps cell can reuse the archive
vloMappingRepoZip = response.content

with TemporaryDirectory() as tmpDir:
    with ZipFile(BytesIO(vloMappingRepoZip)) as zip_ref:
        # The trailing '/' restricts the match to entries inside sourceDir
        # (a bare prefix would also match sibling names that merely start
        # with the same characters)
        files = [f for f in zip_ref.namelist() if f.startswith(sourceDir + '/')]
        if not files:
            raise FileNotFoundError('No entries found under ' + sourceDir + ' in the downloaded archive')
        zip_ref.extractall(members=files, path=tmpDir)

    # Extraction succeeded: now it is safe to drop the previous copy
    if os.path.exists(targetDir):
        print('deleting existing target dir', targetDir)
        shutil.rmtree(targetDir)
    shutil.move(tmpDir + '/' + sourceDir, targetDir)

print('Value map definitions extracted to', targetDir)

deleting existing target dir data/mappings/value-maps
Value map definitions extracted to data/mappings/value-maps


In [13]:
############################################################
### Post-processing maps
############################################################

## Retrieve from GitHub (same repository as value maps)
sourceDir = 'VLO-mapping-master/uniform-maps'
targetDir = VLO_MAPPING_DATA_DIR + '/uniform-maps'

# This cell reuses the archive bytes kept in vloMappingRepoZip by the value
# maps cell. To run it on its own, uncomment the download below.
# response = requests.get(vloMappingRepoUrl)
# vloMappingRepoZip = response.content

# Extract first; the existing copy is only replaced once extraction has
# succeeded, so a missing or broken archive does not wipe earlier results.
with TemporaryDirectory() as tmpDir:
    with ZipFile(BytesIO(vloMappingRepoZip)) as zip_ref:
        # The trailing '/' restricts the match to entries inside sourceDir
        files = [f for f in zip_ref.namelist() if f.startswith(sourceDir + '/')]
        if not files:
            raise FileNotFoundError('No entries found under ' + sourceDir + ' in the repository archive')
        zip_ref.extractall(members=files, path=tmpDir)

    # Extraction succeeded: now it is safe to drop the previous copy
    if os.path.exists(targetDir):
        print('deleting existing target dir', targetDir)
        shutil.rmtree(targetDir)
    shutil.move(tmpDir + '/' + sourceDir, targetDir)

print('Post-processing map definitions extracted to', targetDir)

deleting existing target dir data/mappings/uniform-maps
Post-processing map definitions extracted to data/mappings/uniform-maps
