### Weber Vulgata: creazione del grafo RDF

Questo notebook implementa una procedura per la creazione del grafo RDF del testo della Vulgata secondo gli assiomi dell'ontologia [TresOnt](https://docs.google.com/document/d/1n-OlAy1KleovGgHV4ZHhOSgkgMfsOKPDWT603_fic5k/edit?tab=t.0#heading=h.ng7sdvi05k6u).  

__Grafo della struttura__ Il testo della Vulgata è trasformato in formato tabellare facendo il parsing dei file <a href="https://en.wikipedia.org/wiki/Open_Scripture_Information_Standard">Open Scripture Information Standard (OSIS)</a> presenti nella <a href="https://drive.google.com/drive/folders/1KEFTwR1kLz1Ec_-vU3k-yoFcaynBoP7x">cartella</a> del drive condiviso del WP8. Alle righe della tabella sono applicate le regole di trasformazione in triple.  
__Grafo delle caratteristiche linguistiche__ Alle risorse della struttura sono associate le caratteristiche linguistiche ottenute da LiLa.


####  Librerie e configurazione

In [1]:
import errno
import os
import re

import pandas as pd
import rdflib
import yaml
from tqdm import tqdm

from itserr_lib import OsisParser as op

In [2]:
# Load the notebook configuration from config.yaml in the working directory.
# Every later cell reads `conf`, so any failure here stops the notebook instead of
# leaving `conf` undefined.
configfile = "config.yaml"
if not os.path.isfile(configfile):
    print('Warning config.yaml file not present! Please create it and set the values, store it in the main directory')
    raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), configfile)

with open(configfile, 'r') as stream:
    try:
        conf = yaml.safe_load(stream)
    except yaml.YAMLError as exc:
        # Show the parser diagnostic, then propagate: a malformed file cannot be used.
        print(exc)
        raise

#### 1 Creo il dataframe con il testo della Vulgata

In [3]:
# Settings handed to the parser's getDataFrame(source, files) call below.
files, source = conf['TEXTFILES'], conf['LIBRI_Vulgata']

In [4]:
# Instantiate the parser (OsisParser from itserr_lib, imported as `op`).
opars=op()

In [5]:
# Build the DataFrame of the work through the parser; the graph-building loop below
# reads its libro, numcap, numverso, testo and irifrag columns.
df_opera=opars.getDataFrame(source, files)

100%|██████████████████████████████████████████████████████████████████████████████████████| 79/79 [00:00<00:00, 85.02it/s]


Creating IRIs...
done.


#### 2 Applico le regole per la creazione del grafo

Inizializzo la tabella con i lemmi

In [6]:
# Read the lemma table (JSON written with orient='table') and print its column summary.
df_lemmi = pd.read_json('data/output/lemmi.json', orient='table')
df_lemmi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 661358 entries, 0 to 661357
Data columns (total 7 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   index       661358 non-null  int64 
 1   token       661358 non-null  object
 2   lemma       661358 non-null  object
 3   upos        661358 non-null  object
 4   spaceAfter  661358 non-null  object
 5   linking     661358 non-null  object
 6   irifrag     661358 non-null  int64 
dtypes: int64(2), object(5)
memory usage: 35.3+ MB


In [7]:
# Preview the first rows of the lemma table.
df_lemmi.head()

Unnamed: 0,index,token,lemma,upos,spaceAfter,linking,irifrag
0,0,In,in,ADP,,[lilaLemma:106748],0
1,1,principio,principium,NOUN,,[lilaLemma:119505],0
2,2,creavit,creo,VERB,,[lilaLemma:96898],0
3,3,Deus,deus,PROPN,,[lilaLemma:4810],0
4,4,caelum,caelum,NOUN,,"[lilaLemma:92231, lilaLemma:97622]",0


Inizializzo i namespaces

In [8]:
# rdflib term constructors and the standard vocabularies used by the following cells.
from rdflib import BNode, Literal, Namespace, URIRef
from rdflib.namespace import (
    DC, DCAT, DCTERMS, OWL,
    RDF, RDFS, SKOS,
    XMLNS, XSD,
)

# Project namespaces: the base IRIs come from the NAMESPACES section of the configuration.
its = Namespace(conf['NAMESPACES']['its'])
ecrm = Namespace(conf['NAMESPACES']['ecrm'])
ontolex = Namespace(conf['NAMESPACES']['ontolex'])
tresont = Namespace(conf['NAMESPACES']['tresont'])
lila = Namespace(conf['NAMESPACES']['lila'])
orl = Namespace(conf['NAMESPACES']['orl'])
lilaLemma = Namespace(conf['NAMESPACES']['lilaLemma'])
lilaIpoLemma = Namespace(conf['NAMESPACES']['lilaIpoLemma'])

# Work-level settings read from the configuration.
siglaopera = conf['SIGLAOPERA']
titoloopera = conf['TITOLOOPERA']
idencodes = conf['TITOLILIBRI']
sigleCat = conf['SIGLEPOS']

# Strings removed from each verse before it is split into tokens.
punct = ['.', ',', ':', ';', '?', '!', '†', '※', '(', ')', '-', '..', '"', '[', ']', '•']

In [9]:
# Two separate graphs, each with its own identifier: one for the text structure and
# one for the linguistic (lemma) layer.
synt_rdf = rdflib.Graph(identifier=URIRef('https://itserr.it/struct'))
lang_rdf = rdflib.Graph(identifier=URIRef('https://itserr.it/lang'))

# Prefixes of the structural graph (each declared once; "orl" used to be bound twice).
synt_rdf.bind("its", its)
synt_rdf.bind("ecrm", ecrm)
synt_rdf.bind("tresont", tresont)
synt_rdf.bind("orl", orl)
synt_rdf.bind("lila", lila)
synt_rdf.bind("ontolex", ontolex)

# Prefixes of the linguistic graph.
# NOTE(review): lang_rdf also receives its: IRIs (lemma and lexical-entry nodes) but
# "its" is not bound on it - confirm whether that is intended.
lang_rdf.bind("ecrm", ecrm)
lang_rdf.bind("tresont", tresont)
lang_rdf.bind("lila", lila)
lang_rdf.bind("ontolex", ontolex)
lang_rdf.bind("lilaLemma", lilaLemma)
lang_rdf.bind("lilaIpoLemma", lilaIpoLemma)


In [10]:
# IRIs shared by the graph-building cells. Namespace lookups already return URIRef
# objects, so no extra URIRef(...) wrapping is needed.
workiri = its[f'{siglaopera}']
opirititle = its[f'{siglaopera}_ti']
uritf = tresont.TextFragment
uriinterval = tresont.Interval
uritpi = tresont.TextualPositionByIndex
urile = ontolex.LexicalEntry
uriale = tresont.AlphabetElement
uripunct = tresont.PunctuationMark
uritilemma = ontolex.Form

# Structural-type individuals (verse, chapter, book) with their English labels.
verseindividual = tresont['verseType']
chapterindividual = tresont['chapterType']
bookindividual = tresont['bookType']

synt_rdf.add((verseindividual, RDF.type, tresont.StructuralType))
synt_rdf.add((chapterindividual, RDF.type, tresont.StructuralType))
synt_rdf.add((bookindividual, RDF.type, tresont.StructuralType))

synt_rdf.add((verseindividual, RDFS.label, Literal('Verse', lang='en')))
synt_rdf.add((chapterindividual, RDFS.label, Literal('Chapter', lang='en')))
synt_rdf.add((bookindividual, RDFS.label, Literal('Book', lang='en')))

# Broader-term chain: verse -> chapter -> book.
synt_rdf.add((verseindividual, ecrm.P127_has_broader_term, chapterindividual))
synt_rdf.add((chapterindividual, ecrm.P127_has_broader_term, bookindividual))

# The work itself: an ordered collection with a Latin title and one position per book.
libri = df_opera.libro.unique()
synt_rdf.add((workiri, RDF.type, tresont.OrderedTextualCollection))
# NOTE(review): CIDOC-CRM / Erlangen CRM spell this class "E35_Title" (capital T);
# "E35_title" is kept unchanged here because the book titles are typed with the same
# IRI in the loop cell - confirm against the ontology and fix both places together.
synt_rdf.add((opirititle, RDF.type, ecrm.E35_title))
synt_rdf.add((opirititle, RDFS.label, Literal(titoloopera, lang='la')))
synt_rdf.add((workiri, ecrm.P102_has_title, opirititle))
synt_rdf.add((workiri, tresont.hasLength, Literal(len(libri), datatype=XSD.integer)))

<Graph identifier=https://itserr.it/struct (<class 'rdflib.graph.Graph'>)>

In [11]:
# Build the structural graph of the work (books, chapters, verses, tokens) in synt_rdf
# and the lemma graph in lang_rdf.
bs=0
idOp=siglaopera
print(libri)

# Row positions of df_lemmi grouped by verse fragment, computed once so the lemma rows
# of a verse are fetched directly instead of rescanning the whole table for every verse.
lemmi_positions=df_lemmi.groupby('irifrag', sort=False).indices

for idlibro, libro in tqdm(enumerate(libri), total=len(libri)):
    libroLength=0  # number of tokens of this book already covered by closed verses
    idbasekey=libro

    # TITOLILIBRI entry of the book: [0] short code used in IRIs, [1] Latin title,
    # optional [2] alternative Latin title.
    idbase=idencodes[idbasekey][0]
    titololibro=f'{idbasekey[0].upper()}{idbasekey[1:]}'
    titololibrolat=idencodes[idbasekey][1]
    titololibrolatalt=''
    if (len(idencodes[idbasekey])==3):
        titololibrolatalt=idencodes[idbasekey][2]

    bs=idlibro+1  # 1-based position of the book inside the work

    # BOOK
    libroiri=URIRef(its[f'{idOp}_{idbase}'])
    libroirititle=URIRef(its[f'{idOp}_{idbase}_ti'])
    libroiriseq=URIRef(its[f'{idOp}_{idbase}_{bs}_seq'])
    # NOTE(review): CIDOC-CRM / Erlangen CRM spell this class "E35_Title" (capital T);
    # kept as "E35_title" because the work title is typed with the same IRI in the
    # previous cell - confirm against the ontology and fix both places together.
    synt_rdf.add((libroirititle, RDF.type, ecrm.E35_title))
    synt_rdf.add((libroirititle, RDFS.label, Literal(titololibro, lang='en')))
    synt_rdf.add((libroirititle, RDFS.label, Literal(titololibrolat, lang='la')))
    if (titololibrolatalt!=''):
        synt_rdf.add((libroirititle, SKOS.altLabel, Literal(titololibrolatalt, lang='la')))
    synt_rdf.add((libroiri, RDF.type, tresont.Book))
    synt_rdf.add((libroiri, ecrm.P102_has_title, libroirititle))

    # Position of the book in the sequence of the work.
    synt_rdf.add((libroiriseq, RDF.type, tresont.TextSequenceElement))
    synt_rdf.add((libroiriseq, tresont.occurrenceOf, libroiri))
    synt_rdf.add((libroiriseq,tresont.hasPosition, Literal(bs, datatype=XSD.integer)))
    synt_rdf.add((libroiriseq,tresont.inSequence, workiri))
    synt_rdf.add((workiri, tresont.hasSequenceElement, libroiriseq))

    testdf=df_opera[df_opera.libro==libro].copy()  # verses of the current book
    testdf.reset_index(drop=True, inplace=True)    # labels == positions, so iloc[i] matches items()
    cps=set()          # chapters already emitted for this book
    prevchap=None      # interval IRI of the chapter that is still open
    chaptokenindex=0   # running token counter; reset per book, not per chapter
    for i, v in testdf['testo'].items():
        versorow=testdf.iloc[i]
        vnumt=versorow.numverso
        idfrag=versorow.irifrag

        # CHAPTER: emitted the first time its numcap value is seen.
        if (versorow.numcap not in cps):
            idcapitolo=(versorow.numcap).split(".")[1]
            capoccintiri=URIRef(its[f'{idOp}_{idbase}_{idcapitolo}_i'])
            capocciri=URIRef(its[f'{idOp}_{idbase}_{idcapitolo}_o'])
            capiri=URIRef(its[f'{idOp}_{idbase}_{idcapitolo}_c_se'])
            synt_rdf.add((capocciri, RDF.type, uritf))
            synt_rdf.add((capocciri, tresont.hasStructuralType, chapterindividual))
            synt_rdf.add((capocciri, tresont.hasNumber, Literal(idcapitolo, datatype=XSD.integer)))
            synt_rdf.add((capocciri, tresont.inSequence, libroiri))
            synt_rdf.add((capocciri, tresont.occurrenceOf, capiri))
            synt_rdf.add((capiri, RDF.type, tresont.SyntacticEntity))

            # Chapter interval: opens at the next token; the previous chapter closes here.
            synt_rdf.add((capocciri, tresont.hasInterval, capoccintiri))
            synt_rdf.add((capoccintiri, RDF.type, uriinterval))
            synt_rdf.add((capoccintiri, tresont.intervalFrom, Literal(libroLength+1, datatype=XSD.integer)))
            if (prevchap is not None):
                synt_rdf.add((prevchap, tresont.intervalTo, Literal(libroLength, datatype=XSD.integer)))
            prevchap=capoccintiri
            cps.add(versorow.numcap)

        # VERSE OCCURRENCE
        versoocciri=URIRef(its[f'{idOp}_{idbase}_{idcapitolo}_{vnumt}_o'])
        versooccintiri=URIRef(its[f'{idOp}_{idbase}_{idcapitolo}_{vnumt}_i'])
        versoiri=URIRef(its[f'{idOp}_{idfrag}_v_se'])

        synt_rdf.add((versoocciri, RDF.type, uritf))
        synt_rdf.add((versoocciri, tresont.hasStructuralType, verseindividual))
        synt_rdf.add((versoocciri, tresont.hasNumber, Literal(vnumt, datatype=XSD.integer)))
        synt_rdf.add((versoocciri, tresont.inSequence, libroiri))
        synt_rdf.add((versoocciri, tresont.inSequence, capocciri))

        # Verse interval (closed after its tokens have been counted).
        synt_rdf.add((versoocciri, tresont.hasInterval, versooccintiri))
        synt_rdf.add((versooccintiri, RDF.type, uriinterval))
        synt_rdf.add((versooccintiri, tresont.intervalFrom, Literal(libroLength+1, datatype=XSD.integer)))

        synt_rdf.add((versoocciri, tresont.occurrenceOf, versoiri))
        versotext=Literal(v, lang='la')
        synt_rdf.add((versoiri, RDF.type, tresont.SyntacticEntity))
        synt_rdf.add((versoiri, ecrm.P190_has_symbolic_content, versotext))
        synt_rdf.add((versoiri, tresont.isPartOf, capiri))

        # TEXT: strip the punctuation strings, then split on whitespace.
        verso=v
        for si in punct:
            verso=verso.replace(si, '')
        listaitems=verso.split()
        # Lemma rows of this verse; sorted() keeps them in table order so that row idpa
        # lines up with token idpa, exactly as a boolean-mask selection would.
        mylemmi_df=df_lemmi.iloc[sorted(lemmi_positions.get(idfrag, []))]
        for idpa, pa in enumerate(listaitems):
            chaptokenindex+=1
            lemmarow=mylemmi_df.iloc[idpa]
            upos=lemmarow.upos.strip()
            pairi=URIRef(its[f'{idOp}_{idbase}_{idfrag}_{idpa+1}_f_se'])
            pairiseq=URIRef(its[f'{idOp}_{idbase}_{idcapitolo}_{vnumt}_{idfrag}_{chaptokenindex}_seq'])
            pale=pa.strip()
            # Always take the lemma from the aligned row. Previously a token mismatch left
            # `lemma` holding the previous token's value (or undefined on the first token)
            # while POS and LiLa link below were read from the current row.
            lemma=lemmarow.lemma
            if (pale==lemmarow.token):
                pale=f"{pale}_{lemma.strip()}"
            else:
                print(f" omg {pa}... {idpa} - {lemmarow.token} - {versorow.testo}")
                print(listaitems)

            # IRI suffix: POS code plus up to three upper-case letters found in `pale`.
            indicat=''
            if (upos!='X'):
                ciri=f"{sigleCat[upos][1]}"
                adde=('').join(re.findall(r'[A-Z]',pale)).lower()
                if (adde.strip()!=''):
                    if (len (adde)>3):
                        adde=adde[0:3]
                    adde=f"-{adde.strip()}"
                indicat=f"_{ciri}{adde}"

            # Lexical entry
            lepairi=URIRef(its[f'{idOp}_{pale}{indicat}'])

            synt_rdf.add((lepairi, RDF.type, urile))
            synt_rdf.add((lepairi, ontolex.lexicalForm, pairi))

            # NOTE(review): every string in `punct` has just been removed from the verse,
            # so this membership test cannot succeed - confirm whether punctuation
            # tokens were meant to be kept.
            if (pa in punct):
                synt_rdf.add((pairi, RDF.type, uripunct))
            else:
                synt_rdf.add((pairi, RDF.type, ontolex['Form']))

            synt_rdf.add((pairi, ontolex.writtenRep, Literal(pa, lang='la')))
            synt_rdf.add((pairi, tresont.isPartOf, versoiri))

            # Position of the token within the book.
            synt_rdf.add((pairiseq, RDF.type, tresont.TextSequenceElement))
            synt_rdf.add((pairiseq, tresont.occurrenceOf, pairi))
            synt_rdf.add((pairiseq, tresont.hasPosition, Literal(chaptokenindex, datatype=XSD.integer)))
            synt_rdf.add((pairiseq, tresont.inSequence, libroiri))
            synt_rdf.add((pairiseq, tresont.inSequence, capocciri))

            # Lemma
            irilemma=URIRef(its[f'{idOp}_lm_{lemma}{indicat}'])

            lang_rdf.add((irilemma, RDF.type, uritilemma))
            lang_rdf.add((irilemma, ontolex.writtenRep, Literal(lemma, lang='la')))
            lang_rdf.add((lepairi, ontolex.canonicalForm, irilemma))
            if (upos!='X'):
                categoriav=sigleCat[upos][0]
                uricatv=URIRef(lila[categoriav])
                lang_rdf.add((irilemma, lila['hasPOS'], uricatv))
                if (len(lemmarow.linking)>0):
                    # Only the first linked LiLa identifier is used.
                    sas=lemmarow.linking[0]
                    irisas=''
                    if ('lilaLemma:' in sas):
                        sas=sas.replace('lilaLemma:', '')
                        irisas=URIRef(lilaLemma[sas])

                    if ('lilaIpoLemma:' in sas):
                        sas=sas.replace('lilaIpoLemma:', '')
                        irisas=URIRef(lilaIpoLemma[sas])
                    if (irisas!=''):
                        lang_rdf.add((irilemma, OWL.sameAs, irisas))

        libroLength+=len(listaitems)
        synt_rdf.add((versooccintiri, tresont.intervalTo, Literal(libroLength, datatype=XSD.integer)))

    # Close the interval of the last chapter: inside the loop a chapter is only closed
    # when the next one starts, so the final chapter of each book was left open.
    if (prevchap is not None):
        synt_rdf.add((prevchap, tresont.intervalTo, Literal(libroLength, datatype=XSD.integer)))
    synt_rdf.add((libroiri, tresont.hasLength, Literal(chaptokenindex, datatype=XSD.integer)))

['genesis' 'exodus' 'leviticus' 'numbers' 'deuteronomy' 'joshua' 'judges'
 'ruth' 'samuel-1' 'samuel-2' 'kings-1' 'kings-2' 'chronicles-1'
 'chronicles-2' 'esdrae-1' 'esdrae-2' 'tobit' 'judith' 'esther' 'job'
 'psalms' 'psalms-iuxtra-hebraicum' 'proverbs' 'ecclesiastes'
 'song-of-solomon' 'wisdom' 'sirach' 'isaiah' 'jeremiah' 'lamentations'
 'baruch' 'ezekiel' 'daniel' 'hosea' 'joel' 'amos' 'obadiah' 'jonah'
 'micah' 'nahum' 'habakkuk' 'zephaniah' 'haggai' 'zachariah' 'malachi'
 'maccabees-1' 'maccabees-2' 'matthew' 'mark' 'luke' 'john' 'acts'
 'romans' 'corinthias-1' 'corinthias-2' 'galatians' 'ephesians'
 'philippians' 'colossians' 'thessalonians-1' 'thessalonians-2'
 'timothy-1' 'timothy-2' 'titus' 'philemon' 'hebrews' 'james' 'peter-1'
 'peter-2' 'john-1' 'john-2' 'john-3' 'jude' 'revelation'
 'prayer-of-manasseh' 'esdrae-3' 'esdrae-4' 'psalm-151' 'laodicenses']


79it [03:12,  2.44s/it]


In [None]:
# Write both graphs under data/output/: the structural graph with the "n3" serializer,
# the lemma graph as RDF/XML. The trailing semicolon suppresses the cell output.
synt_rdf.serialize(destination=f'data/output/{siglaopera}_str_prod_test.ttl', format="n3")
lang_rdf.serialize(destination=f'data/output/{siglaopera}_lemmi1__prod_test.rdf', format="xml");

In [None]:
# Number of triples in the structural graph.
len(synt_rdf)