### S_Vl: creazione del grafo RDF

Questo notebook implementa una procedura per la creazione del grafo RDF del testo della S_Vl secondo gli assiomi dell'ontologia [TresOnt](https://docs.google.com/document/d/1n-OlAy1KleovGgHV4ZHhOSgkgMfsOKPDWT603_fic5k/edit?tab=t.0#heading=h.ng7sdvi05k6u).  

__Grafo della struttura__ Il testo è trasformato in formato tabellare facendo il parsing dei file <a href="https://en.wikipedia.org/wiki/Open_Scripture_Information_Standard">Open Scripture Information Standard (OSIS)</a> presenti nella <a href="https://drive.google.com/drive/folders/1KEFTwR1kLz1Ec_-vU3k-yoFcaynBoP7x">cartella</a> del drive condiviso del WP8. Alle righe della tabella sono applicate le regole di trasformazione in triple.  
__Grafo delle caratteristiche linguistiche__ Alle risorse della struttura sono associate le caratteristiche linguistiche (lemma, parte del discorso e collegamento al lemma corrispondente) ottenute da LiLa.


####  Librerie e configurazione

In [1]:
import errno
import os
import re

import pandas as pd
import rdflib
import yaml
from tqdm import tqdm

from itserr_lib import OsisParser as op

In [2]:
# Load the notebook configuration from config-s_vl.yaml in the working directory.
# The result is bound to `conf`, which later cells index for file lists,
# namespaces, titles and POS codes.
configfile = "config-s_vl.yaml"

if not os.path.isfile(configfile):
    # Fail fast with a standard ENOENT error naming the file that is actually expected.
    raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), configfile)

with open(configfile, 'r') as stream:
    try:
        conf = yaml.safe_load(stream)
    except yaml.YAMLError as exc:
        # Report the parser message, then stop: continuing would leave `conf`
        # undefined and make the next cell fail with an unrelated NameError.
        print(f"Cannot parse {configfile}: {exc}")
        raise

if not isinstance(conf, dict):
    # An empty or scalar YAML document parses without error but cannot be indexed by key.
    raise ValueError(f"{configfile} must contain a YAML mapping, got {type(conf).__name__}")

#### 1 Creo il dataframe con il testo della Vulgata

In [3]:
# Read the two config entries that are handed to `opars.getDataFrame(source, files)` below.
files, source = conf['TEXTFILES'], conf['LIBRI_S_VL']

In [4]:
# Instantiate the OSIS parser (`OsisParser` from itserr_lib, imported as `op`).
opars=op()

In [5]:
# Parse the configured sources into the verse table used by the rest of the notebook.
# Judging by the previews further down, each row carries libro, idverso, numcap,
# numverso, testo and irifrag.
df_opera=opars.getDataFrame(source, files)

100%|█████████████████████████████████████████████████████████████████████████████████████| 72/72 [00:00<00:00, 135.35it/s]


Creating IRIs...
done.


#### 2 Applico le regole per la creazione del grafo

Inizializzo la tabella con i lemmi

In [6]:
# The lemmatised tokens are stored as three pickled chunks; load them in order
# and stack them into a single table (the original row labels are kept).
# NOTE: unpickling runs arbitrary code, so only load files produced by this project.
lemmi_chunk_paths = [
    'data/output/s_vl_lemmi_0k-5k.pickle',
    'data/output/s_vl_lemmi_5k-10k.pickle',
    'data/output/s_vl_lemmi_10k-e.pickle',
]
mydf0, mydf1, mydf2 = (pd.read_pickle(chunk_path) for chunk_path in lemmi_chunk_paths)
df_lemmi = pd.concat([mydf0, mydf1, mydf2])
df_lemmi.info()

<class 'pandas.core.frame.DataFrame'>
Index: 366314 entries, 0 to 208564
Data columns (total 6 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   token       366314 non-null  object
 1   lemma       366314 non-null  object
 2   upos        366314 non-null  object
 3   spaceAfter  366314 non-null  object
 4   linking     366314 non-null  object
 5   irifrag     366314 non-null  int64 
dtypes: int64(1), object(5)
memory usage: 19.6+ MB


In [7]:
# Spot check: last 10 verses whose `libro` contains 'acts' and whose text contains 'quemquam'.
df_opera[df_opera.libro.str.contains('acts') & df_opera.testo.str.contains('quemquam')].tail(10)

Unnamed: 0,libro,idverso,numcap,numverso,testo,irifrag
18104,acts,acts.8.16,acts.8,16,Nondum enim' in quemquam eorum de ciderat: sol...,18104


In [8]:
# Spot check: positional rows 1370-1389 of the tokens containing the substring 'enim'
# (substring match, so forms such as 'invenimus' are included too).
df_lemmi[df_lemmi.token.str.contains('enim')].iloc[1370:1390]

Unnamed: 0,token,lemma,upos,spaceAfter,linking,irifrag
137101,enim,enim,ADV,,[lilaLemma:101119],17955
137124,enim,enim,ADV,,[lilaLemma:101119],17956
137131,enim,enim,ADV,,[lilaLemma:101119],17957
137221,enim,enim,ADV,,[lilaLemma:101119],17962
137362,enim,enim,ADV,,[lilaLemma:101119],17969
137369,enim,enim,ADV,,[lilaLemma:101119],17969
137798,invenimus,inuenio,VERB,,[lilaLemma:108806],17995
137812,invenimus,inuenio,VERB,,[lilaLemma:108806],17995
137870,enim,enim,ADV,,[lilaLemma:101119],17998
138011,enim,enim,ADV,,[lilaLemma:101119],18008


In [9]:
# Spot check: lemma rows with index labels 139709 through 139720 (label slice, both ends included).
df_lemmi.loc[139709:139720]

Unnamed: 0,token,lemma,upos,spaceAfter,linking,irifrag
139709,ut,ut,SCONJ,,[lilaLemma:130906],18103
139710,acciperent,accipio,VERB,,[lilaLemma:87119],18103
139711,Spiritum,spiritus,NOUN,,[lilaLemma:125406],18103
139712,sanctum,sanctus,ADJ,,[lilaIpoLemma:39324],18103
139713,Nondum,nondum,ADV,,[lilaLemma:114073],18104
139714,enim,enim,ADV,,[lilaLemma:101119],18104
139715,',',PUNCT,,[],18104
139716,in,in,ADP,,[lilaLemma:106748],18104
139717,quemquam,quisquam,PRON,,[lilaLemma:121315],18104
139718,eorum,is,DET,,[lilaLemma:109083],18104


Inizializzo i namespaces

In [10]:
from rdflib.namespace import DC, DCAT, DCTERMS, OWL, \
                            RDF, RDFS, SKOS,  \
                           XMLNS, XSD, XMLNS
from rdflib import Namespace
from rdflib import URIRef, BNode, Literal
# One rdflib Namespace per prefix declared under NAMESPACES in the config file.
ns_conf = conf['NAMESPACES']
its = Namespace(ns_conf['its'])
ecrm = Namespace(ns_conf['ecrm'])
ontolex = Namespace(ns_conf['ontolex'])
tresont = Namespace(ns_conf['tresont'])
lila = Namespace(ns_conf['lila'])
orl = Namespace(ns_conf['orl'])
lilaLemma = Namespace(ns_conf['lilaLemma'])
lilaIpoLemma = Namespace(ns_conf['lilaIpoLemma'])

# Work-level settings and lookup tables, also taken from the config file.
siglaopera = conf['SIGLAOPERA']
titoloopera = conf['TITOLOOPERA']
idencodes = conf['TITOLILIBRI']
sigleCat = conf['SIGLEPOS']

# Strings removed from each verse before it is split into tokens.
punct = ['.', ',', ':', ';', '?', '!', '†', '※', '(', ')', '-', '..', '"', '[', ']', '•']

In [11]:
# Two graphs: `synt_rdf` receives the text structure, `lang_rdf` the lemma/POS data.
synt_rdf = rdflib.Graph(identifier=URIRef('https://itserr.it/struct'))
lang_rdf = rdflib.Graph(identifier=URIRef('https://itserr.it/lang'))

# Declare prefixes. Each graph's prefixes are listed once, in the order they are bound;
# the second, redundant `orl` binding on `synt_rdf` has been dropped.
synt_prefixes = (
    ("its", its),
    ("ecrm", ecrm),
    ("tresont", tresont),
    ("orl", orl),
    ("lila", lila),
    ("ontolex", ontolex),
)
lang_prefixes = (
    ("ecrm", ecrm),
    ("tresont", tresont),
    ("lila", lila),
    ("ontolex", ontolex),
    ("lilaLemma", lilaLemma),
    ("lilaIpoLemma", lilaIpoLemma),
)
for _prefix, _ns in synt_prefixes:
    synt_rdf.bind(_prefix, _ns)
for _prefix, _ns in lang_prefixes:
    lang_rdf.bind(_prefix, _ns)


In [12]:
# IRIs shared by the graph-building cell. Indexing or attribute access on an rdflib
# Namespace already yields a URIRef, so no extra wrapping is applied.
workiri = its[f'{siglaopera}']
uritf = tresont.TextFragment
uriinterval = tresont.Interval
opirititle = its[f'{siglaopera}_ti']
uritpi = tresont.TextualPositionByIndex
urile = ontolex.LexicalEntry
uriale = tresont.AlphabetElement
uripunct = tresont.PunctuationMark
uritilemma = ontolex.Form

# Individuals describing the three structural levels of the text.
verseindividual = tresont['verseType']
chapterindividual = tresont['chapterType']
bookindividual = tresont['bookType']

# Book identifiers in order of first appearance in the verse table.
libri = df_opera.libro.unique()

# Triples describing the structural types and the work itself.
# NOTE(review): ECRM appears to spell this class `E35_Title` (capital T) — confirm
# before relying on `E35_title` in queries; the same spelling is used for book titles.
base_triples = (
    (verseindividual, RDF.type, tresont.StructuralType),
    (verseindividual, RDFS.label, Literal('Verse', lang='en')),
    (chapterindividual, RDF.type, tresont.StructuralType),
    (chapterindividual, RDFS.label, Literal('Chapter', lang='en')),
    (bookindividual, RDF.type, tresont.StructuralType),
    (bookindividual, RDFS.label, Literal('Book', lang='en')),
    # verse -> chapter -> book hierarchy
    (verseindividual, ecrm.P127_has_broader_term, chapterindividual),
    (chapterindividual, ecrm.P127_has_broader_term, bookindividual),
    # the work: an ordered collection with a Latin title and as many elements as books
    (workiri, RDF.type, tresont.OrderedTextualCollection),
    (opirititle, RDF.type, ecrm.E35_title),
    (opirititle, RDFS.label, Literal(titoloopera, lang='la')),
    (workiri, ecrm.P102_has_title, opirititle),
    (workiri, tresont.hasLength, Literal(len(libri), datatype=XSD.integer)),
)
for triple in base_triples:
    synt_rdf.add(triple)

# Show the graph's repr as the cell output.
synt_rdf

<Graph identifier=https://itserr.it/struct (<class 'rdflib.graph.Graph'>)>

In [13]:
# List the book identifiers; each one is used as a key into `idencodes` by the graph-building loop.
print(libri)

['Gen' 'Ex' 'Lev' 'Num' 'Deut' 'Josh' 'Judg' 'Ruth' '1Sam' '2Sam' '1Kings'
 '2Kings' '1Chr' '2Chr' 'esdrae-2' 'Tob' 'Judith' 'Esth' 'Job' 'Prov'
 'Eccl' 'Song' 'Wis' 'Sir' 'Is' 'Jeremiah' 'Ier' 'Bar' 'Ezek' 'Dan' 'Hos'
 'Joel' 'Amos' 'Obad' 'Jonah' 'Mic' 'Nah' 'Hab' 'Zeph' 'Hag' 'Zech' 'Mal'
 '1Mac' '2Mac' 'matthew' 'mark' 'luke' 'john' 'acts' 'romans'
 'corinthians-1' 'corinthians-2' 'galatians' 'ephesians' 'philippians'
 'colossians' 'thessalonians-1' 'thessalonians-2' 'timothy-1' 'timothy-2'
 'titus' 'philemon' 'hebrews' 'james' 'peter-1' 'peter-2' 'john-1'
 'john-2' 'john-3' 'jude' 'revelation' 'esdrae-3']


In [14]:
# Build the structural graph (synt_rdf) and the lemma graph (lang_rdf) of the work.
#
# For every book: emit the book, its title and its position in the work; then, verse by
# verse, emit chapter and verse occurrences with their token intervals, split the verse
# text into tokens and emit one form + sequence element + lexical entry per token.
# When a token lines up with its row in `df_lemmi`, the lemma, POS and LiLa link are
# written to lang_rdf.
#
# Token positions (`chaptokenindex`, `libroLength`) are counted from 1 within each book.
bs=0
idOp=siglaopera
# Wrapping the iterable (not the enumerate object) lets tqdm know the total and draw a bar.
for idlibro, libro in enumerate(tqdm(libri)):
    libroLength=0
    idbasekey=f'{libro}'
    # idencodes[key] is a list: [0] short id, [1] Latin title, [2] alternative Latin
    # title (only when the list has 4 items), [-1] title labelled as English.
    idbase=idencodes[idbasekey][0].lower()
    titololibro=idencodes[idbasekey][-1]
    titololibrolat=idencodes[idbasekey][1]
    titololibrolatalt=''
    if (len(idencodes[idbasekey])==4):
        titololibrolatalt=idencodes[idbasekey][2]

    bs=idlibro+1  # 1-based position of the book within the work

    # BOOK
    libroiri=URIRef(its[f'{idOp}_{idbase}'])
    libroirititle=URIRef(its[f'{idOp}_{idbase}_ti'])
    libroiriseq=URIRef(its[f'{idOp}_{idbase}_{bs}_seq'])
    # NOTE(review): ECRM appears to spell this class `E35_Title` (capital T) — confirm;
    # the work title elsewhere in the notebook uses the same `E35_title` spelling.
    synt_rdf.add((libroirititle, RDF.type, ecrm.E35_title))
    synt_rdf.add((libroirititle, RDFS.label, Literal(titololibro, lang='en')))
    synt_rdf.add((libroirititle, RDFS.label, Literal(titololibrolat, lang='la')))
    if (titololibrolatalt!=''):
        synt_rdf.add((libroirititle, SKOS.altLabel, Literal(titololibrolatalt, lang='la')))
    synt_rdf.add((libroiri, RDF.type, tresont.Book))
    synt_rdf.add((libroiri, ecrm.P102_has_title, libroirititle))

    synt_rdf.add((libroiriseq, RDF.type, tresont.TextSequenceElement))
    synt_rdf.add((libroiriseq, tresont.occurrenceOf, libroiri))
    synt_rdf.add((libroiriseq,tresont.hasPosition, Literal(bs, datatype=XSD.integer)))
    synt_rdf.add((libroiriseq,tresont.inSequence, workiri))
    synt_rdf.add((workiri, tresont.hasSequenceElement, libroiriseq))

    # Verses of this book, re-indexed from 0 so that label `i` equals the row position.
    testdf=df_opera[df_opera.libro==libro].copy()
    testdf.reset_index(drop=True, inplace=True)
    cps={'test'}     # chapters already emitted ('test' is only a placeholder member)
    prevchap=''      # interval IRI of the chapter still waiting for its intervalTo
    chaptokenindex=0 # running token counter, reset per book
    for i, v in testdf['testo'].items():
        riga=testdf.iloc[i]  # fetch the verse row once instead of on every access

        # CHAPTER: emitted the first time its id is seen
        if (not riga.numcap in cps):
            idcapitolo=(riga.numcap).split(".")[1]
            capoccintiri=URIRef(its[f'{idOp}_{idbase}_{idcapitolo}_i'])
            capocciri=URIRef(its[f'{idOp}_{idbase}_{idcapitolo}_o'])
            capiri=URIRef(its[f'{idOp}_{idbase}_{idcapitolo}_c_se'])
            synt_rdf.add((capocciri, RDF.type, uritf))
            synt_rdf.add((capocciri, tresont.hasStructuralType, chapterindividual))
            synt_rdf.add((capocciri, tresont.hasNumber, Literal(idcapitolo, datatype=XSD.integer)))
            synt_rdf.add((capocciri, tresont.inSequence, libroiri))
            synt_rdf.add((capocciri, tresont.occurrenceOf, capiri))
            synt_rdf.add((capiri, RDF.type, tresont.SyntacticEntity))

            # Chapter interval: opens at the next token; the previous chapter (if any)
            # is closed at the last token emitted so far.
            synt_rdf.add((capocciri, tresont.hasInterval, capoccintiri))
            synt_rdf.add((capoccintiri, RDF.type, uriinterval))
            synt_rdf.add((capoccintiri, tresont.intervalFrom, Literal(libroLength+1, datatype=XSD.integer)))
            if(prevchap==''):
                prevchap=capoccintiri
            else:
                synt_rdf.add((prevchap, tresont.intervalTo, Literal(libroLength, datatype=XSD.integer)))
                prevchap=capoccintiri
            cps.add(riga.numcap)

        # VERSE OCCURRENCE
        versoocciri=URIRef(its[f'{idOp}_{idbase}_{idcapitolo}_{riga.numverso}_o'])
        versooccintiri=URIRef(its[f'{idOp}_{idbase}_{idcapitolo}_{riga.numverso}_i'])
        versoiri=URIRef(its[f'{idOp}_{riga.irifrag}_v_se'])

        synt_rdf.add((versoocciri, RDF.type, uritf))
        synt_rdf.add((versoocciri, tresont.hasStructuralType, verseindividual))
        synt_rdf.add((versoocciri, tresont.hasNumber, Literal(riga.numverso, datatype=XSD.integer)))
        synt_rdf.add((versoocciri, tresont.inSequence, libroiri))
        synt_rdf.add((versoocciri, tresont.inSequence, capocciri))

        # Verse interval (intervalTo is added once the verse's tokens have been counted)
        synt_rdf.add((versoocciri, tresont.hasInterval, versooccintiri))
        synt_rdf.add((versooccintiri, RDF.type, uriinterval))
        synt_rdf.add((versooccintiri, tresont.intervalFrom, Literal(libroLength+1, datatype=XSD.integer)))

        synt_rdf.add((versoocciri, tresont.occurrenceOf, versoiri))
        versotext=Literal(v, lang='la')
        synt_rdf.add((versoiri, RDF.type, tresont.SyntacticEntity))
        synt_rdf.add((versoiri, ecrm.P190_has_symbolic_content, versotext))
        synt_rdf.add((versoiri, tresont.isPartOf, capiri))

        # TEXT: strip punctuation, then adjust spacing around '*', apostrophes and quotes so
        # that whitespace splitting yields the same tokens as the lemma table.
        # The order of these replacements matters: later rules rely on (or undo) earlier ones.
        verso=v
        for si in punct:
            verso=verso.replace(si, '')
        verso=verso.replace("irri’dentes","irri ’dentes")
        verso=verso.replace("et partes* Libyae","et partes * Libyae")
        verso=verso.replace("docerent* ei populum","docerent * ei populum")
        verso=verso.replace("multiplicantium discipulorum*","multiplicantium discipulorum *")
        verso=verso.replace("servitute* redigent","servitute * redigent")
        verso=verso.replace("me* quem","me * quem")
        verso=verso.replace("impetum* fecerunt","impetum * fecerunt")
        verso=verso.replace("Nondum enim'","Nondum enim '")
        verso=verso.replace("dantes* eum", "dantes * eum")
        verso=verso.replace("potestatem a*", "potestatem a *")
        verso=verso.replace("ecce* venit","ecce * venit")
        verso=verso.replace(" ut permanent*"," ut permanent *")
        verso=verso.replace(" et contumeliis*", " et contumeliis *")
        verso=verso.replace(" vobis* ab ", " vobis * ab ")
        verso=verso.replace("David* quod ", "David * quod ")
        verso=verso.replace(" saeculo* sunt ", " saeculo * sunt ")
        verso=verso.replace(" nobis* coactis "," nobis * coactis " )
        verso=verso.replace(" enim sancto* "," enim sancto * ")
        verso=verso.replace(" permisit eos* "," permisit eos * ")
        verso=verso.replace(" protinus* ostia "," protinus * ostia ")
        verso=verso.replace(" viros* quosdam "," viros * quosdam ")
        verso=verso.replace(" philosophorum* disserebant "," philosophorum * disserebant ")
        verso=verso.replace(" Sosthenem* principem "," Sosthenem * principem ")
        verso=verso.replace(" Aculas* qui "," Aculas * qui ")
        verso=verso.replace(" Iudaei* principis "," Iudaei * principis ")
        verso=verso.replace(" destrui* maiestas "," destrui * maiestas ")
        verso=verso.replace(" Asiae constituti* "," Asiae constituti * ")
        verso=verso.replace(" Ecclesia* absolvi"," Ecclesia * absolvi")
        verso=verso.replace(" seditionis hodiernae* "," seditionis hodiernae * ")
        verso=verso.replace(" Iudaeis* incipienti "," Iudaeis * incipienti ")
        verso=verso.replace("Omnia ostendi* ","Omnia ostendi * ")
        verso=verso.replace(" tibi contra* ", " tibi contra * ")
        verso=verso.replace(" licet vobis* ", " licet vobis * ")
        verso=verso.replace(" sedes* iudicans ", " sedes * iudicans ")
        verso=verso.replace(" quam adpropiet* ", " quam adpropiet * ")
        verso=verso.replace(" nos accusamus* ", " nos accusamus * ")
        verso=verso.replace(" dicunt* haeresim ", " dicunt * haeresim ")
        verso=verso.replace(" die* deservientes ", " die * deservientes ")
        verso=verso.replace(" Romae* vocatis ", " Romae * vocatis ")
        verso=verso.replace(" Deum* glorificaverunt ", " Deum * glorificaverunt ")
        # Generic rules: detach a '*' that follows a word-final letter.
        verso=verso.replace("d* ", "d * ").replace("m* ", "m * ").replace("o* ", "o * ")
        verso=verso.replace("r* ", "r * ").replace("t* ", "t * ").replace("s* ", "s * ")
        verso=verso.replace("i* ", "i * ").replace("e* ", "e * ").replace('m“ ','m “ ').replace ("Tamen* ","Tamen * ")
        # Exception: re-attach the '*' that the generic rules just split off.
        verso=verso.replace(" per * inobedientiam", " per* inobedientiam")
        verso=verso.replace(" liberabitur a* ", " liberabitur a * ")
        verso=verso.replace(" gloriae** filiorum ", " gloriae * * filiorum ")
        verso=verso.replace("a* ", "a * ").replace(" et’ "," et ’ ")
        # Exception: re-attach the '*' that the generic rules just split off.
        verso=verso.replace(" Solutus es * "," Solutus es* ")
        verso=verso.replace(" hoc* induerit "," hoc * induerit ")
        verso=verso.replace(" hoc* ipsum "," hoc * ipsum ")
        verso=verso.replace(" haec* custodias "," haec * custodias ")
        verso=verso.replace(" enim haec* dicitur "," enim haec * dicitur ")
        verso=verso.replace(" conscientiâ* "," conscientiâ * ")
        # Exception: re-attach the '*' that the generic rules just split off.
        verso=verso.replace(" sumus per * oblationem "," sumus per* oblationem ")

        listaitems=verso.split()
        idfrag=riga.irifrag
        # Lemma rows for this verse, matched to the tokens by position.
        mylemmi_df=df_lemmi[df_lemmi.irifrag==idfrag]
        n_lemma_rows=len(mylemmi_df)
        for idpa, pa in enumerate(listaitems):
            chaptokenindex+=1
            pairi=URIRef(its[f'{idOp}_{idbase}_{idfrag}_{idpa+1}_f_se'])
            pairiseq=URIRef(its[f'{idOp}_{idbase}_{idfrag}_{chaptokenindex}_seq'])
            pale=pa.strip()

            # A token is "aligned" when a lemma row exists at the same position and carries
            # the same surface form. Only aligned tokens get lemma/POS information; for the
            # others there is no trustworthy lemma, so a warning is printed and the
            # linguistic triples are skipped instead of reusing the previous token's lemma.
            lemma_row=mylemmi_df.iloc[idpa] if idpa<n_lemma_rows else None
            aligned=lemma_row is not None and pale==lemma_row.token
            lemma=None
            upos=None
            if aligned:
                lemma=lemma_row.lemma
                upos=lemma_row.upos.strip()
                pale=f"{pale}_{lemma.strip()}"
            else:
                expected_token=lemma_row.token if lemma_row is not None else '<missing>'
                print(f" omg {pa}... {idpa} - {expected_token} - {riga.numverso}")
                print(listaitems)

            # IRI suffix: POS code, plus up to three lower-cased capital letters found in
            # the token/lemma pair.
            indicat=''
            if (aligned and upos!='X'):
                ciri=f"{sigleCat[upos][1]}"
                adde=('').join(re.findall(r'[A-Z]',pale)).lower()
                if (adde.strip()!=''):
                    if (len (adde)>3):
                        adde=adde[0:3]
                    adde=f"-{adde.strip()}"
                indicat=f"_{ciri}{adde}"

            # Lexical entry
            lepairi=URIRef(its[f'{idOp}_{pale}{indicat}'])

            synt_rdf.add((lepairi, RDF.type, urile))
            synt_rdf.add((lepairi, ontolex.lexicalForm, pairi))

            # NOTE(review): every string in `punct` was stripped from the verse above, so this
            # test cannot succeed and all tokens (including ' and *) are typed ontolex:Form.
            # Left as is because changing it would alter the emitted graph — confirm intent.
            if (pa in punct):
                synt_rdf.add((pairi, RDF.type, uripunct))
            else:
                synt_rdf.add((pairi, RDF.type, ontolex['Form']))

            synt_rdf.add((pairi, ontolex.writtenRep, Literal(pa, lang='la')))
            synt_rdf.add((pairi, tresont.isPartOf, versoiri))

            # Token occurrence, positioned by its running index within the book
            synt_rdf.add((pairiseq, RDF.type, tresont.TextSequenceElement))
            synt_rdf.add((pairiseq, tresont.occurrenceOf, pairi))
            synt_rdf.add((pairiseq, tresont.hasPosition, Literal(chaptokenindex, datatype=XSD.integer)))
            synt_rdf.add((pairiseq, tresont.inSequence, libroiri))
            synt_rdf.add((pairiseq, tresont.inSequence, capocciri))

            # Lemma, POS and LiLa link (aligned tokens only)
            if aligned:
                irilemma=URIRef(its[f'{idOp}_lm_{lemma}{indicat}'])

                lang_rdf.add((irilemma, RDF.type, uritilemma))
                lang_rdf.add((irilemma, ontolex.writtenRep, Literal(lemma, lang='la')))
                lang_rdf.add((lepairi, ontolex.canonicalForm, irilemma))
                if (upos!='X'):
                    categoriav=sigleCat[upos][0]
                    uricatv=URIRef(lila[categoriav])
                    lang_rdf.add((irilemma, lila['hasPOS'], uricatv))
                    linking=lemma_row.linking
                    if (len(linking)>0):
                        # Only the first link is used; its prefix selects the target namespace.
                        sas=linking[0]
                        irisas=''
                        if ('lilaLemma:' in sas):
                            sas=sas.replace('lilaLemma:', '')
                            irisas=URIRef(lilaLemma[sas])

                        if ('lilaIpoLemma:' in sas):
                            sas=sas.replace('lilaIpoLemma:', '')
                            irisas=URIRef(lilaIpoLemma[sas])
                        if (irisas!=''):
                            lang_rdf.add((irilemma, OWL.sameAs, irisas))

        libroLength+=len(listaitems)
        synt_rdf.add((versooccintiri, tresont.intervalTo, Literal(libroLength, datatype=XSD.integer)))

    # Close the interval of the book's last chapter: inside the loop a chapter is only
    # closed when the next one starts, so the final chapter would otherwise stay open.
    if (prevchap!=''):
        synt_rdf.add((prevchap, tresont.intervalTo, Literal(libroLength, datatype=XSD.integer)))
    synt_rdf.add((libroiri, tresont.hasLength, Literal(chaptokenindex, datatype=XSD.integer)))

72it [01:39,  1.38s/it]


In [None]:
# Write both graphs to data/output: the structure graph with the "n3" serializer and the
# lemma graph as RDF/XML.
# NOTE(review): the structure file has a .ttl extension but is written with format="n3" —
# confirm that consumers expecting Turtle accept it.
serialization_jobs = (
    (synt_rdf, f'data/output/{siglaopera}_str5_prod.ttl', "n3"),
    (lang_rdf, f'data/output/{siglaopera}_lemmi5__prod.rdf', "xml"),
)
for graph, out_path, rdf_format in serialization_jobs:
    graph.serialize(destination=(out_path), format=rdf_format)

In [None]:
# Number of triples in the structure graph.
len(synt_rdf)