In [49]:
from datetime import datetime
import pandas as pd
from rdflib import Graph, URIRef, Literal, Namespace, XSD
from rdflib.namespace import RDF, RDFS

In [50]:

def edition2rdf(data, g, eb):
    edition = URIRef("https://w3id.org/eb/i/Edition/"+str(data["MMSID"]))
    edition_title= "Edition "+ str(data["editionNum"])+"," +str(data["year"])
    g.add((edition, RDF.type, eb.Edition))
    g.add((edition, eb.number, Literal(data["editionNum"], datatype=XSD.integer)))
    g.add((edition, eb.title, Literal(edition_title, datatype=XSD.string)))
    g.add((edition, eb.subtitle, Literal(data["editionSubTitle"], datatype=XSD.string)))
    g.add((edition, eb.publicationYear, Literal(data["year"], datatype=XSD.integer)))
    g.add((edition, eb.printedAt, Literal(data["place"], datatype=XSD.string)))
    g.add((edition, eb.mmsid, Literal(str(data["MMSID"]), datatype=XSD.string)))
    g.add((edition, eb.physicalDescription, Literal(data["physicalDescription"], datatype=XSD.string)))
    g.add((edition, eb.genre, Literal(data["genre"], datatype=XSD.string)))
    g.add((edition, eb.language, Literal(data["language"], datatype=XSD.string)))
    g.add((edition, eb.shelfLocator, Literal(data["shelfLocator"], datatype=XSD.string)))
    g.add((edition, eb.numberOfVolumes, Literal(data["numberOfVolumes"], datatype=XSD.integer)))

    #### Editor 

    name=data["editor"].replace(" ", "")
    editor = URIRef("https://w3id.org/eb/i/Person/"+str(name))
    g.add((editor, RDF.type, eb.Person))
    g.add((editor, eb.name, Literal(data["editor"], datatype=XSD.string)))

    if data["editor_date"]!=0:
        tmpDate=data["editor_date"].split("-")
        birthDate=datetime.strptime(tmpDate[0], '%Y')
        deathDate=datetime.strptime(tmpDate[1], '%Y')
        g.add((editor, eb.birthDate, Literal(birthDate, datatype=XSD.dateTime)))
        g.add((editor, eb.deathDate, Literal(deathDate, datatype=XSD.dateTime)))
    
    if data["termsOfAddress"] != 0:
        g.add((editor, eb.termsOfAddress, Literal(data["termsOfAddress"], datatype=XSD.string)))

    g.add((edition, eb.editor, editor))

    #### Publishers Persons 

    #This was the result to pass entity recognition to publisher

    if data["publisherPersons"] != 0:
        publisherPersons=name=data["publisherPersons"]
        for p in publisherPersons: 
            name=p.replace(" ", "")
            publisher = URIRef("https://w3id.org/eb/i/Person/"+name)
            g.add((publisher, RDF.type, eb.Person))
            g.add((publisher, eb.name, Literal(p, datatype=XSD.string)))
            g.add((edition, eb.publisher, publisher))
        
    #### Is Referenced by  

    if data["referencedBy"] != 0:
        references=data["referencedBy"]
        for r in references: 
            name=r.replace(" ", "")
            book = URIRef("https://w3id.org/eb/i/Book/"+name)
            g.add((book, RDF.type, eb.Book))
            g.add((book, eb.title, Literal(r, datatype=XSD.string)))
            g.add((edition, eb.referencedBy, book))
            
    return g, edition

In [52]:
df= pd.read_json('../data/final_eb_7_dataframe_clean', orient="index") 

In [53]:
df.loc[0]

term                                                                   A
definition             The first letter of the alphabet in every know...
MMSID                                                   9910796273804340
edTitle                                   Seventh edition, General index
editor                                                   Stewart, Dugald
editor_date                                                    1753-1828
genre                                                       encyclopedia
language                                                             eng
termsOfAddress                                                       Sir
numberOfPages                                                          1
physicalDescription                                   21 v. in 22 ; 4to.
place                                                          Edinburgh
publisher                                                  A. & C. Black
referencedBy                                       

In [54]:
print(df.shape)

(23122, 39)


In [58]:
df = df.sort_values(by="term")
df.head(10)

Unnamed: 0,term,definition,MMSID,edTitle,editor,editor_date,genre,language,termsOfAddress,numberOfPages,...,numberOfVolumes,relatedTerms,header,startsAt,endsAt,numberOfTerms,numberOfWords,positionPage,typeTerm,altoXML
0,A,The first letter of the alphabet in every know...,9910796273804340,"Seventh edition, General index","Stewart, Dugald",1753-1828,encyclopedia,eng,Sir,1,...,20,Not specified,Not specified,1,1,Not specified,1440,1,Article,eb07-v1.2-TXT/a2/kp-eb0702-000101-9822-v1.txt
1,A,as an abbreviation is likewise of frequent occ...,9910796273804340,"Seventh edition, General index","Stewart, Dugald",1753-1828,encyclopedia,eng,Sir,1,...,20,Not specified,Not specified,1,1,Not specified,135,2,Topic,eb07-v1.2-TXT/a2/kp-eb0702-000101-9822-v1.txt
2925,A',= 4 rf . Γ rn.^dχ . J J [ 1 + (m — 1) x 2] . ...,9910796273804340,"Seventh edition, General index","Stewart, Dugald",1753-1828,encyclopedia,eng,Sir,1,...,20,Not specified,Not specified,185,185,Not specified,1800,1,Article,eb07-v1.2-TXT/a4/kp-eb0704-018501-1720-v1.txt
3,AA,a river in the province of Overyssel. in the N...,9910796273804340,"Seventh edition, General index","Stewart, Dugald",1753-1828,encyclopedia,eng,Sir,1,...,20,Not specified,Not specified,2,2,Not specified,16,2,Topic,eb07-v1.2-TXT/a2/kp-eb0702-000201-9835-v1.txt
4,AA,a river of the province of Antwerp in the Neth...,9910796273804340,"Seventh edition, General index","Stewart, Dugald",1753-1828,encyclopedia,eng,Sir,1,...,20,Not specified,Not specified,2,2,Not specified,17,3,Topic,eb07-v1.2-TXT/a2/kp-eb0702-000201-9835-v1.txt
5,AA,a river of France rising in the Pas de Calais;...,9910796273804340,"Seventh edition, General index","Stewart, Dugald",1753-1828,encyclopedia,eng,Sir,1,...,20,Not specified,Not specified,2,2,Not specified,59,4,Topic,eb07-v1.2-TXT/a2/kp-eb0702-000201-9835-v1.txt
6,AA,a river in the Russian government of Courland ...,9910796273804340,"Seventh edition, General index","Stewart, Dugald",1753-1828,encyclopedia,eng,Sir,1,...,20,Not specified,Not specified,2,2,Not specified,37,5,Topic,eb07-v1.2-TXT/a2/kp-eb0702-000201-9835-v1.txt
7,AA,a river in the Russian province of Livonia whi...,9910796273804340,"Seventh edition, General index","Stewart, Dugald",1753-1828,encyclopedia,eng,Sir,1,...,20,Not specified,Not specified,2,2,Not specified,19,6,Topic,eb07-v1.2-TXT/a2/kp-eb0702-000201-9835-v1.txt
2,AA,"a river of the province of Groningen, in the k...",9910796273804340,"Seventh edition, General index","Stewart, Dugald",1753-1828,encyclopedia,eng,Sir,1,...,20,Not specified,Not specified,2,2,Not specified,28,1,Topic,eb07-v1.2-TXT/a2/kp-eb0702-000201-9835-v1.txt
8,AAHUS,"a little town of Germany, in the circle of Wes...",9910796273804340,"Seventh edition, General index","Stewart, Dugald",1753-1828,encyclopedia,eng,Sir,1,...,20,Not specified,Not specified,2,2,Not specified,41,1,Topic,eb07-v1.2-TXT/a2/kp-eb0702-000202-9835-v1.txt


In [55]:
df_year=df[df['year'] == 1771].reset_index(drop=True)
related_df_entries=df_year[df_year["term"] == "DRAWING"].reset_index(drop=True)
related_df_entries["relatedTerms"]
vl=related_df_entries["volumeNum"].unique()
vl

array([], dtype=int64)

In [56]:
# Create a Graph
g = Graph()

g.namespace_manager.bind('eb', Namespace("https://w3id.org/eb#"), override="False")
eb = Namespace("https://w3id.org/eb#")

#### Edition-1771 Information

list_years=df["year"].unique()
ed_revisions=[]

for y in range(0, len(list_years)):
    
    ### EDITION
    print("YEAR %s" %list_years[y])
    
    df_year=df[df['year'] == list_years[y]].reset_index(drop=True)
    edition_data = df_year.loc[0]
    g, edition = edition2rdf(edition_data,g, eb)
    ed_revisions.append(edition)
    
    ### VOLUMES 
    list_vols = df_year["volumeNum"].unique()
    for v in range(0,len(list_vols)):
        print("Vol %s" % list_vols[v])
        df_year_vl=df_year[df_year["volumeNum"] == list_vols[v]].reset_index(drop=True)
        volume_data=df_year_vl.loc[0]
        volume_id=volume_data["volumeId"]
        volume = URIRef("https://w3id.org/eb/i/Volume/"+str(volume_data["MMSID"])+"_"+str(volume_data["volumeId"]))
        g.add((volume, RDF.type, eb.Volume))
        g.add((volume, eb.number, Literal(volume_data["volumeNum"], datatype=XSD.integer)))
        g.add((volume, eb.letters, Literal(volume_data["letters"], datatype=XSD.string)))
        g.add((volume, eb.volumeId, Literal(volume_data["volumeId"], datatype=XSD.int)))
        g.add((volume, eb.title, Literal(volume_data["volumeTitle"], datatype=XSD.string)))
        
        if volume_data["part"]!=0:
            g.add((volume, eb.part, Literal(volume_data["part"], datatype=XSD.string)))
    
        g.add((volume, eb.metsXML, Literal(volume_data["metsXML"], datatype=XSD.string)))
        g.add((volume, eb.permanentURL, Literal(volume_data["permanentURL"], datatype=XSD.string)))
        g.add((volume, eb.numberOfPages, Literal(volume_data["numberOfPages"], datatype=XSD.string)))
    
        g.add((edition, eb.hasPart, volume))
    
        df_by_term=df_year_vl.groupby(['term'],)["term"].count().reset_index(name='counts')
                        
        #### TERMS
        for t_index in range(0, len(df_by_term)):
            t=df_by_term.loc[t_index]["term"]
            c=df_by_term.loc[t_index]["counts"]
            df_entries= df_year_vl[df_year_vl["term"] == t].reset_index(drop=True)
            for t_count in range(0, c):
                df_entry= df_entries.loc[t_count]
                if df_entry["typeTerm"] == "Article" :
                    term= URIRef("https://w3id.org/eb/i/Article/"+str(df_entry["MMSID"])+"_"+str(df_entry["volumeId"])+"_"+t.replace(" ", "_")+"_"+str(t_count))
                    g.add((term, RDF.type, eb.Article))
                elif df_entry["typeTerm"] == "Topic" :
                    term= URIRef("https://w3id.org/eb/i/Topic/"+str(df_entry["MMSID"])+"_"+str(df_entry["volumeId"])+"_"+t.replace(" ", "_")+"_"+str(t_count))
                    g.add((term, RDF.type, eb.Topic))
                else:
                    pass
                g.add((term, eb.name, Literal(t, datatype=XSD.string)))
                g.add((term, eb.definition, Literal(df_entry["definition"], datatype=XSD.string)))
                g.add((term, eb.position, Literal(df_entry["positionPage"], datatype=XSD.int)))
                g.add((term, eb.numberOfWords, Literal(df_entry["numberOfWords"], datatype=XSD.int)))
                g.add((volume, eb.hasPart, term))
            
                ## startsAt
                page_startsAt= URIRef("https://w3id.org/eb/i/Page/"+ str(df_entry["MMSID"])+"_"+str(df_entry["volumeId"])+"_"+str(df_entry["startsAt"]))
                g.add((page_startsAt, RDF.type, eb.Page))
                g.add((page_startsAt, eb.number, Literal(df_entry["startsAt"], datatype=XSD.int)))
                g.add((page_startsAt, eb.header, Literal(df_entry["header"], datatype=XSD.string)))
                g.add((page_startsAt, eb.numberOfTerms, Literal(df_entry["numberOfTerms"], datatype=XSD.stri)))
                g.add((volume, eb.hasPart, page_startsAt))
                g.add((term, eb.startsAtPage, page_startsAt))
                g.add((page_startsAt, eb.hasPart, term))
                g.add((page_startsAt, eb.altoXML, Literal(df_entry["altoXML"], datatype=XSD.string)))
            
                ## endsAt
                page_endsAt= URIRef("https://w3id.org/eb/i/Page/"+ str(df_entry["MMSID"])+"_"+str(df_entry["volumeId"])+"_"+str(df_entry["endsAt"]))
                g.add((page_endsAt, RDF.type, eb.Page))
                g.add((page_endsAt, eb.number, Literal(df_entry["endsAt"], datatype=XSD.int)))
                g.add((volume, eb.hasPart, page_endsAt))
                g.add((term, eb.endsAtPage, page_endsAt))
                g.add((page_endsAt, eb.hasPart, term))
               
                #altoXML
                
                
                ## related terms
                
                if df_entry["relatedTerms"]:
                    for rt in df_entry["relatedTerms"]:
                        if rt!= t:
                            related_df_entries= df_year[df_year["term"] == rt].reset_index(drop=True)
                            list_r_vl=related_df_entries["volumeNum"].unique()
                            for r_vl in list_r_vl:
                                df_r_vl=related_df_entries[related_df_entries["volumeNum"] == r_vl].reset_index(drop=True)
                                for r_c in range (0, len(df_r_vl)):
                                    r_entry= df_r_vl.loc[r_c]
                                    if r_entry["typeTerm"] == "Article" :
                                        r_term= URIRef("https://w3id.org/eb/i/Article/"+str(r_entry["MMSID"])+"_"+str(r_entry["volumeId"])+"_"+rt+"_"+str(r_c))
                                    elif r_entry["typeTerm"] == "Topic" :
                                        r_term= URIRef("https://w3id.org/eb/i/Topic/"+str(r_entry["MMSID"])+"_"+str(r_entry["volumeId"])+"_"+rt+"_"+str(r_c))
                                        
                                    g.add((term, eb.relatedTerms, r_term))
                        


try:
    g.add((ed_revisions[1], eb.revisionOf, ed_revisions[0]))
except:
    pass

YEAR 1842
Vol 2
Vol 4
Vol 3
Vol 5
Vol 7
Vol 6
Vol 8
Vol 9
Vol 10
Vol 11
Vol 12
Vol 13
Vol 15
Vol 14
Vol 16
Vol 18
Vol 17
Vol 19
Vol 20
Vol 21


In [61]:
g.serialize(format="turtle", destination="edition7_clean.ttl")

<Graph identifier=Nda11a3debe174fbb87190203b30810c6 (<class 'rdflib.graph.Graph'>)>

: 

In [59]:
for s,p,o in g.triples((edition, None, None)):
  print(s,p,o)

https://w3id.org/eb/i/Edition/9910796273804340 http://www.w3.org/1999/02/22-rdf-syntax-ns#type https://w3id.org/eb#Edition
https://w3id.org/eb/i/Edition/9910796273804340 https://w3id.org/eb#number 7
https://w3id.org/eb/i/Edition/9910796273804340 https://w3id.org/eb#title Edition 7,1842
https://w3id.org/eb/i/Edition/9910796273804340 https://w3id.org/eb#subtitle 0
https://w3id.org/eb/i/Edition/9910796273804340 https://w3id.org/eb#publicationYear 1842
https://w3id.org/eb/i/Edition/9910796273804340 https://w3id.org/eb#printedAt Edinburgh
https://w3id.org/eb/i/Edition/9910796273804340 https://w3id.org/eb#mmsid 9910796273804340
https://w3id.org/eb/i/Edition/9910796273804340 https://w3id.org/eb#physicalDescription 21 v. in 22 ; 4to.
https://w3id.org/eb/i/Edition/9910796273804340 https://w3id.org/eb#genre encyclopedia
https://w3id.org/eb/i/Edition/9910796273804340 https://w3id.org/eb#language eng
https://w3id.org/eb/i/Edition/9910796273804340 https://w3id.org/eb#shelfLocator EB.15
https://w3id