# Ontology for the metadata

This notebook creates the ontology for the metadata of the JudaicaLink project. The ontology is expressed in RDF and describes the metadata of the books and journals in the JudaicaLink project. It is built with the RDFLib library in Python.

Important namespaces are:
- GNDO - https://d-nb.info/standards/elementset/gnd#
- JudaicaLink - http://data.judaicalink.org/ontology/

The generator is based on the Ontology-Builder for the JudaicaLink project:
https://github.com/judaicalink/judaicalink-ontology


@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
@prefix gndo: <https://d-nb.info/standards/elementset/gnd#> .
@prefix jl: <https://ontology.judaicalink.org/> .
@prefix cm: <https://sammlungen.ub.uni-frankfurt.de/cm/> .

# Literal-valued properties use the XML Schema datatypes (xsd:) as their range.
# NOTE(review): keep the ontology.ttl file in sync with this listing.

# Define the class Journal
cm:Journal a owl:Class ;
    rdfs:label "Journal" ;
    rdfs:subClassOf gndo:Periodical .

# Define properties of Journal
cm:title a owl:DatatypeProperty ;
    rdfs:label "Title" ;
    rdfs:domain cm:Journal .

cm:governingBody a owl:ObjectProperty ;
    rdfs:label "Governing Body" ;
    rdfs:domain cm:Journal ;
    rdfs:range gndo:CorporateBody .

cm:description a owl:DatatypeProperty ;
    rdfs:label "Description" ;
    rdfs:domain cm:Journal .

cm:language a owl:DatatypeProperty ;
    rdfs:label "Language" ;
    rdfs:domain cm:Journal ;
    rdfs:range xsd:string .

cm:onlineEdition a owl:DatatypeProperty ;
    rdfs:label "Online Edition" ;
    rdfs:domain cm:Journal ;
    rdfs:range xsd:string .

cm:URN a owl:DatatypeProperty ;
    rdfs:label "URN" ;
    rdfs:domain cm:Journal ;
    rdfs:range xsd:string .

# Define the class Publication
cm:Publication a owl:Class ;
    rdfs:label "Publication" .

# Define properties of Publication
cm:place a owl:DatatypeProperty ;
    rdfs:label "Place" ;
    rdfs:domain cm:Publication ;
    rdfs:range xsd:string .

cm:publisher a owl:DatatypeProperty ;
    rdfs:label "Publisher" ;
    rdfs:domain cm:Publication ;
    rdfs:range xsd:string .

cm:printingPress a owl:DatatypeProperty ;
    rdfs:label "Printing Press" ;
    rdfs:domain cm:Publication ;
    rdfs:range xsd:string .

cm:publicationDate a owl:DatatypeProperty ;
    rdfs:label "Publication Date" ;
    rdfs:domain cm:Publication ;
    rdfs:range xsd:date .

# Define the class Volume
cm:Volume a owl:Class ;
    rdfs:label "Volume" .

# Define properties of Volume
cm:hasIssue a owl:ObjectProperty ;
    rdfs:label "Has Issue" ;
    rdfs:domain cm:Volume ;
    rdfs:range cm:Issue .

# Define the class Issue
cm:Issue a owl:Class ;
    rdfs:label "Issue" .

# Define properties of Issue
cm:issueNumber a owl:DatatypeProperty ;
    rdfs:label "Issue Number" ;
    rdfs:domain cm:Issue ;
    rdfs:range xsd:integer .

cm:issueYear a owl:DatatypeProperty ;
    rdfs:label "Issue Year" ;
    rdfs:domain cm:Issue ;
    rdfs:range xsd:gYear .

cm:issueLink a owl:DatatypeProperty ;
    rdfs:label "Issue Link" ;
    rdfs:domain cm:Issue ;
    rdfs:range xsd:anyURI .

cm:digitalizationType a owl:DatatypeProperty ;
    rdfs:label "Digitalization Type" ;
    rdfs:domain cm:Issue ;
    rdfs:range xsd:string .

# Define additional properties for Journal
cm:hasVolume a owl:ObjectProperty ;
    rdfs:label "Has Volume" ;
    rdfs:domain cm:Journal ;
    rdfs:range cm:Volume .

cm:hasTitlePage a owl:ObjectProperty ;
    rdfs:label "Has Title Page" ;
    rdfs:domain cm:Journal ;
    rdfs:range cm:TitlePage .

cm:hasTableOfContents a owl:ObjectProperty ;
    rdfs:label "Has Table of Contents" ;
    rdfs:domain cm:Journal ;
    rdfs:range cm:TableOfContents .

cm:alternativeTitle a owl:DatatypeProperty ;
    rdfs:label "Alternative Title" ;
    rdfs:domain cm:Journal ;
    rdfs:range xsd:string .

cm:corpusBelongsTo a owl:ObjectProperty ;
    rdfs:label "Corpus Belongs To" ;
    rdfs:domain cm:Journal ;
    rdfs:range gndo:Corpus .

cm:continuedUnder a owl:ObjectProperty ;
    rdfs:label "Continued Under" ;
    rdfs:domain cm:Journal ;
    rdfs:range cm:Journal .

cm:linkedJournal a owl:ObjectProperty ;
    rdfs:label "Linked Journal" ;
    rdfs:domain cm:Journal ;
    rdfs:range cm:Journal .

# Define the class TitlePage
cm:TitlePage a owl:Class ;
    rdfs:label "Title Page" .

# Define properties of TitlePage
cm:titleDescription a owl:DatatypeProperty ;
    rdfs:label "Title Description" ;
    rdfs:domain cm:TitlePage ;
    rdfs:range xsd:string .

cm:titleLink a owl:DatatypeProperty ;
    rdfs:label "Title Link" ;
    rdfs:domain cm:TitlePage ;
    rdfs:range xsd:anyURI .

# Define the class TableOfContents
cm:TableOfContents a owl:Class ;
    rdfs:label "Table of Contents" .

# Define properties of TableOfContents
cm:contentsTitle a owl:DatatypeProperty ;
    rdfs:label "Contents Title" ;
    rdfs:domain cm:TableOfContents ;
    rdfs:range xsd:string .

cm:contentsLink a owl:DatatypeProperty ;
    rdfs:label "Contents Link" ;
    rdfs:domain cm:TableOfContents ;
    rdfs:range xsd:anyURI .


**Note:** The ontology is saved in the file `ontology.ttl`.

In [None]:
# Load the Ontology

In [1]:
!pip install rdflib
import pandas as pd
import numpy as np
import rdflib
from rdflib import Graph, Literal, RDF, URIRef, Namespace


Collecting rdflib
  Downloading rdflib-7.0.0-py3-none-any.whl (531 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m531.9/531.9 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting isodate<0.7.0,>=0.6.0
  Using cached isodate-0.6.1-py2.py3-none-any.whl (41 kB)
Installing collected packages: isodate, rdflib
Successfully installed isodate-0.6.1 rdflib-7.0.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [3]:
from rdflib import Graph

# Start from an empty graph and read the Turtle ontology file into it.
# After parsing, `g` holds every triple of the ontology.
g = Graph()
g.parse(source="ontology.ttl", format="ttl")


<Graph identifier=N3f9d7da539814fbb938211c92458ad54 (<class 'rdflib.graph.Graph'>)>

# Zeitschriften in Compact Memory


## Load the metadata into pandas from the CSV file

In [38]:
import pandas as pd

# Column dtypes for the tab-separated page-level export: the four numeric
# columns are read with the nullable integer dtype so that missing values
# survive, every other column is read as text. 'Datum' is read as text here
# and converted by `parse_dates` below.
dtype_options = {
    **dict.fromkeys(
        ['VLID_Seite', 'VLID_Zs', 'VLID_Parent', 'Seite (OT_SORT)'],
        'Int64',
    ),
    **dict.fromkeys(
        [
            'OT_PATH',
            'Parent Knotentyp',
            'Parent-Type',
            'Zs_Caption',
            'Volume_Caption',
            'Heft_Caption',
            'Aufsatz_Caption',
            'Datum',
            'Seite_Caption',
            'Seitenzahl_Caption',
        ],
        'str',
    ),
}

# Read the export and show the first rows as a quick sanity check
df = pd.read_csv(
    "metadata/CM_Seiten_Metadaten.csv",
    sep="\t",
    dtype=dtype_options,
    parse_dates=['Datum'],
)

df.head()



### Data Cleaning
Drop unnecessary columns.
We don't need the info on the files and the pages.

Dropped columns: VLID_Seite, OT_PATH, VLID_Zs, VLID_Parent, Parent Knotentyp, Parent-Type, Aufsatz_Caption, Seite (OT_SORT), Seite_Caption, Seitenzahl_Caption

Zs_Caption, Volume_Caption, Heft_Caption and Datum are kept.

In [39]:
# Drop the page- and file-level columns. The journal, volume and issue
# captions and the date (Zs_Caption, Volume_Caption, Heft_Caption, Datum)
# are kept.
# The result is assigned instead of using inplace=True, and errors='ignore'
# skips columns that are already gone, so re-running this cell does not raise
# a KeyError.
df = df.drop(
    columns=[
        'VLID_Seite',
        'OT_PATH',
        'VLID_Zs',
        'VLID_Parent',
        'Parent Knotentyp',
        'Parent-Type',
        'Aufsatz_Caption',
        'Seite (OT_SORT)',
        'Seite_Caption',
        'Seitenzahl_Caption',
    ],
    errors='ignore',
)

# Show a random sample of the remaining columns
df.sample(5)

In [36]:
# Leave the frame as the cell's last expression so the notebook renders it
# with its rich table display instead of print()'s wrapped plain text.
df.sample(5)

                                               Zs_Caption Volume_Caption  \
606465                  Allgemeine Zeitung des Judenthums       8 (1835)   
902077                        Israelitischer Jugendfreund       8 (1835)   
200869  Mitteilungen der Arbeitsgemeinschaft Jüdisch-L...       8 (1835)   
365136  Monatsschrift für Geschichte und Wissenschaft ...       8 (1835)   
670276                               Der jüdische Student       8 (1835)   

                   Heft_Caption       Datum  
606465      Heft 33 (15.8.1902)  1902-08-15  
902077                24 (1897)  1897-01-01  
200869  Heft 11 (November 1920)  1920-11-01  
365136                  Heft 10  1868-01-01  
670276        Heft 3 (Mai 1923)  1923-05-01  


In [28]:
# Distinct journal titles found in the 'Zs_Caption' column
pd.unique(df['Zs_Caption'])

array(['... Bericht ueber den Verein für die Provinz Westfalen zur Bildung von Elementar-Lehrern und Befoerderung von Handwerken und Kuensten unter den Juden',
       'Führer durch die jüdische Gemeindeverwaltung und Wohlfahrtspflege in Deutschland',
       'Sechzehnter Bericht über die Religionsschule der jüdischen Gemeinde, womit zu der Sonntag den 26. März in dem Hörsaale der jüdischen Knaben-Schule ... öffentlichen Prüfung ... einladet',
       '... Bericht ueber den Verein für Westfalen und Rheinprovinz zur Bildung von Elementar-Lehrern und Befoerderung von Handwerken und Kuensten unter den Juden',
       "... Verwaltungsbericht des Haupt-Grenz-Comité's zu Königsberg i. Pr. für Beseitigung der Nothstände unter den Israeliten West-Rußlands",
       'Führer durch die jüdische Wohlfahrtspflege in Deutschland',
       'Bericht der Isr. Religionsschule zu Kiel',
       'Actes et conférences de la Société des Études Juives',
       'Jüdischer Almanach', 'Österreichisch-ungarische Cantor

### Save the data to the graph

In [40]:
import pandas as pd
from urllib.parse import quote
from rdflib import Graph, Namespace, Literal, URIRef
from rdflib.namespace import RDF, RDFS, XSD


def _resource_uri(namespace, *captions):
    """Build a URI below ``namespace`` from one or more caption strings.

    Every caption is stripped and percent-encoded, so spaces, parentheses,
    slashes and non-ASCII characters cannot produce an invalid URI. The
    encoded parts are joined with '/', which lets a volume be addressed
    below its journal and an issue below its volume.
    """
    path = "/".join(quote(str(caption).strip(), safe="") for caption in captions)
    return URIRef(f"{namespace}{path}")


# Define namespaces. The trailing slash on `cm` keeps the namespace and the
# resource name apart (otherwise URIs would read ".../cmSomeTitle").
gndo = Namespace("https://d-nb.info/standards/elementset/gnd#")
cm = Namespace("https://data.judaicalink.org/ontology/cm/")

caption_columns = ['Zs_Caption', 'Volume_Caption', 'Heft_Caption']

# Normalise the date column; values that cannot be read as a date become NaT
publication_dates = pd.to_datetime(df['Datum'], errors='coerce')

# Keep only rows that have all three captions and a usable date
has_all_captions = (
    df[caption_columns].notna().all(axis=1)
    & df[caption_columns].ne('').all(axis=1)
)
df = df[has_all_captions & publication_dates.notna()]

# The input has one row per page; every distinct
# (journal, volume, issue, date) combination only needs to be added once.
issue_records = (
    df[caption_columns]
    .assign(Datum=publication_dates.dt.strftime('%Y-%m-%d'))  # xsd:date lexical form
    .drop_duplicates()
)

# Create an RDF graph
g = Graph()

# NOTE(review): the gndo terms Volume, Issue, partOf and publicationDate are
# kept as they were; the ontology listing at the top of this notebook defines
# cm:Volume, cm:Issue, cm:hasVolume and cm:hasIssue instead. Confirm which
# vocabulary is intended.
for journal, volume, issue, date in issue_records.itertuples(index=False, name=None):
    # Volume and issue URIs include their parents, so that e.g. "Heft 10" of
    # two different journals does not collapse into a single node.
    journal_uri = _resource_uri(cm, journal)
    volume_uri = _resource_uri(cm, journal, volume)
    issue_uri = _resource_uri(cm, journal, volume, issue)

    g.add((journal_uri, RDF.type, gndo.Periodical))
    g.add((journal_uri, RDFS.label, Literal(journal, datatype=XSD.string)))

    g.add((volume_uri, RDF.type, gndo.Volume))
    g.add((volume_uri, RDFS.label, Literal(volume, datatype=XSD.string)))
    g.add((volume_uri, gndo.partOf, journal_uri))

    g.add((issue_uri, RDF.type, gndo.Issue))
    g.add((issue_uri, RDFS.label, Literal(issue, datatype=XSD.string)))
    g.add((issue_uri, gndo.partOf, volume_uri))
    g.add((issue_uri, gndo.publicationDate, Literal(date, datatype=XSD.date)))

# Save the graph as a Turtle file
g.serialize(destination='output.ttl', format='turtle')


## Load the Journal Metadata from the JSON file

In [None]:
import json

# Read the journal metadata. The encoding is given explicitly so that
# non-ASCII characters are decoded the same way on every platform instead of
# depending on the locale default.
with open("metadata/journal_metadata/journal_metadata_title_lang.json", encoding="utf-8") as f:
    data = json.load(f)

# Build the Graph

In [None]:
from rdflib import Graph, Literal, RDF, URIRef, Namespace
from rdflib.namespace import DC, FOAF

# create GRAPH
g = Graph()

# create Namespaces
