# Extract all unique head entities from MSE turtle file

In [20]:
!pip install rdflib



In [21]:
import rdflib
from collections import defaultdict
import pandas as pd

# Path to the RDF file (Turtle format)
file_path = 'output.ttl'

# Create a new RDF graph
g = rdflib.Graph()

# Try to parse the RDF file
try:
    g.parse(file_path, format='turtle')
except Exception as e:
    print(f"Error parsing RDF file: {e}")

# Initialize data structures
entities = set()
subject_entities = set()
relationships = set()
label_dict = {}
descriptions = defaultdict(list)  # Initialize descriptions with default empty lists

# Set of URIs for description
description_info_set = {
    rdflib.URIRef("http://purl.org/dc/terms/description"),
    rdflib.URIRef("http://purls.helmholtz-metadaten.de/mwo/description"),
    rdflib.URIRef("http://www.w3.org/2000/01/rdf-schema#comment"),
    rdflib.URIRef("http://purl.obolibrary.org/obo/IAO_0000115"),
    rdflib.URIRef("http://www.w3.org/2004/02/skos/core#definition")
}

# Do we need more Literals?

# Iterate over the graph
for subject, predicate, obj in g:
    relationships.add(predicate)  # Add predicate to relationships

    # Add entities
    if isinstance(subject, rdflib.URIRef):
        subject_entities.add(subject)
        entities.add(subject)

    # Extract labels for subject entities
    if predicate == rdflib.RDFS.label or predicate == rdflib.URIRef("http://purl.org/dc/terms/title") and isinstance(subject, rdflib.URIRef):
        label_dict[subject] = str(obj)

    # Append descriptions
    if predicate in description_info_set:
        descriptions[subject].append(str(obj))

# Combine multiple descriptions into a single string for each entity
for entity, desc_list in descriptions.items():
    descriptions[entity] = ' '.join(desc_list)

# Use URI as fallback label for subject entities without explicit label
for entity in entities:
    if entity not in label_dict:
        label_dict[entity] = str(entity) if entity in subject_entities else None

# Create DataFrame for entities with labels and descriptions
entity_list = []
for entity in entities:
    entity_label = label_dict.get(entity, "")
    entity_description = descriptions.get(entity, "")
    entity_list.append([str(entity), entity_label, entity_description])

df_entity = pd.DataFrame(entity_list, columns=['entity_uri', 'entity_label', 'entity_description'])

# To export the DataFrame 'df_entity' to an Excel file
df_entity.to_excel( 'extracted_headentity_list.xlsx' , index=False)

df_entity


Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/rdflib/term.py", line 2119, in _castLexicalToPython
    return conv_func(lexical)  # type: ignore[arg-type]
  File "/usr/local/lib/python3.10/dist-packages/isodate/isotime.py", line 148, in parse_time
    raise ISO8601Error('Unrecognised ISO 8601 time format: %r' % timestring)
isodate.isoerror.ISO8601Error: Unrecognised ISO 8601 time format: '2x90 min/week'
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/rdflib/term.py", line 2119, in _castLexicalToPython
    return conv_func(lexical)  # type: ignore[arg-type]
  File "/usr/local/lib/python3.10/dist-packages/isodate/isotime.py", line 148, in parse_time
    raise ISO8601Error('Unrecognised ISO 8601 time format: %r' % timestring)
isodate.isoerror.ISO8601Error: Unrecognised ISO 8601 time format: '1,5x90 min/week'
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/rdflib/term.py", line 2119, i

Unnamed: 0,entity_uri,entity_label,entity_description
0,http://demo.fiz-karlsruhe.de/matwerk/E221117,https://github.com/Materials-Consortia/optimad...,
1,http://demo.fiz-karlsruhe.de/matwerk/E512567,Aachen (Aix) Virtual Platform for Materials Pr...,AixViPMaP is a platform on which different sof...
2,http://www.ebi.ac.uk/swo/license/SWO_1000061,free to use license,A license which allows any form of usage of th...
3,http://demo.fiz-karlsruhe.de/matwerk/E294762,https://doi.org/10.1098/rspa.2020.0568,
4,http://demo.fiz-karlsruhe.de/matwerk/E1002821,https://doi.org/10.1107/S1600576714027575,
...,...,...,...
1818,http://demo.fiz-karlsruhe.de/matwerk/E337611,https://osf.io/,
1819,http://purl.org/dc/terms/FileFormat,File Format,A digital resource format. Examples include th...
1820,http://demo.fiz-karlsruhe.de/matwerk/E1247528,Thermocouples,Temperature calibration
1821,http://demo.fiz-karlsruhe.de/matwerk/E59654,Computer Science,


In [22]:
from rdflib import Graph
import pandas as pd

# Define the namespaces
namespaces = {
    "dc": "http://purl.org/dc/elements/1.1/",
    "dcterms": "http://purl.org/dc/terms/",
    "default1": "https://nfdi.fiz-karlsruhe.de/ontology/",
    "emmo": "http://emmo.info/emmo#",
    "foaf": "http://xmlns.com/foaf/0.1/",
    "modsci": "https://w3id.org/skgo/modsci#",
    "mwo": "http://purls.helmholtz-metadaten.de/mwo/",
    "nfdicore": "http://nfdi.fiz-karlsruhe.de/ontology/",
    "ns1": "https://w3id.org/scholarlydata/ontology/conference-ontology.owl#",
    "ns2": "http://purl.obolibrary.org/obo/",
    "ns3": "http://www.ebi.ac.uk/swo/",
    "ns4": "http://www.geneontology.org/formats/oboInOwl#",
    "owl": "http://www.w3.org/2002/07/owl#",
    "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
    "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
    "schema": "https://schema.org/",
    "skos": "http://www.w3.org/2004/02/skos/core#",
    "void": "http://rdfs.org/ns/void#",
    "xsd": "http://www.w3.org/2001/XMLSchema#"
}

# Load the Turtle file into an RDF graph
g = Graph()
g.parse("output.ttl", format="turtle")

# Extract predicates
predicates_with_namespace = set()
predicate_uri = set()

for s, p, o in g:
    for prefix, namespace in namespaces.items():
        if p.startswith(namespace):
            predicates_with_namespace.add(f"{prefix}:{p.replace(namespace, '')}")
            predicate_uri.add(p)
            break

# Create a DataFrame
df_relation = pd.DataFrame({
    'Predicate with Namespace': list(predicates_with_namespace),
    'Predicate_uri': list(predicate_uri)
})

# Save DataFrame to Excel file
df_relation.to_excel("extracted_relation_list.xlsx", index=False)
df_relation

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/rdflib/term.py", line 2119, in _castLexicalToPython
    return conv_func(lexical)  # type: ignore[arg-type]
  File "/usr/local/lib/python3.10/dist-packages/isodate/isotime.py", line 148, in parse_time
    raise ISO8601Error('Unrecognised ISO 8601 time format: %r' % timestring)
isodate.isoerror.ISO8601Error: Unrecognised ISO 8601 time format: '2x90 min/week'
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/rdflib/term.py", line 2119, in _castLexicalToPython
    return conv_func(lexical)  # type: ignore[arg-type]
  File "/usr/local/lib/python3.10/dist-packages/isodate/isotime.py", line 148, in parse_time
    raise ISO8601Error('Unrecognised ISO 8601 time format: %r' % timestring)
isodate.isoerror.ISO8601Error: Unrecognised ISO 8601 time format: '1,5x90 min/week'
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/rdflib/term.py", line 2119, i

Unnamed: 0,Predicate with Namespace,Predicate_uri
0,mwo:hasDOI,http://purls.helmholtz-metadaten.de/mwo/hasRep...
1,dcterms:isPartOf,http://www.w3.org/2002/07/owl#someValuesFrom
2,mwo:hasExpertiseIn,http://purls.helmholtz-metadaten.de/mwo/descri...
3,mwo:hasRORID,http://purls.helmholtz-metadaten.de/mwo/mainTa...
4,mwo:hasSerialNumber,http://nfdi.fiz-karlsruhe.de/ontology/fileExte...
...,...,...
107,rdf:first,http://xmlns.com/foaf/0.1/depiction
108,mwo:usesTechnology,http://purls.helmholtz-metadaten.de/mwo/confor...
109,mwo:hasORCID,http://purl.obolibrary.org/obo/BFO_0000180
110,owl:annotatedSource,http://emmo.info/emmo#EMMO_967080e5_2f42_4eb2_...
