<h1> Create Documents </h1>

This notebook takes the [Knowledge Graph](https://github.com/alexyoung13/frances_dissertation_ay55/blob/main/Notebooks/DataFrame2RDF_7thEdition.ipynb) and queries it to create documents, URIs, and details files for all terms, and again for topics only; the topics-only files are needed for summarization later.

Currently this file is set up to read in the ttl file directly, but it can easily be adapted to read from a Fuseki server, as shown in the last cell.

Output:
- terms_definitions.txt/topics_definitions.txt -> the definitions of either all terms or just topics
- terms_details.txt/topics_details.txt -> the term name, the edition number, the year, the part number (empty if not specified), the volume number, and the letters of all terms or just topics
- terms_uris.txt/topics_uris.txt -> the term and topic URIs from the KG

Note: despite the .txt extension, every output file is written with `pickle` (each one is a pickled Python list, not plain text).

In [17]:
from datetime import datetime
import pandas as pd
from rdflib import Graph, URIRef, Literal, Namespace, XSD
from rdflib.namespace import RDF, RDFS
from rdflib.plugins.sparql import prepareQuery
import pickle


In [18]:
# Build an empty in-memory graph and fill it from the cleaned Turtle export.
g = Graph()
g.parse(
    source='../data/edition7_clean.ttl',
    format='turtle',
)
# Namespace object for the eb: vocabulary; handed to prepareQuery via initNs below.
eb = Namespace("https://w3id.org/eb#")

In [19]:
# Query the KG for all terms, i.e. every resource typed eb:Article or eb:Topic
# (the two UNION branches are identical apart from the type).
# Each row carries the term's definition, URI and name, the number and letters
# of the volume (?v) that has the term as a part, and the publication year and
# number of the edition (?e) that has that volume as a part.
# ?part is inside OPTIONAL, so it can be unbound in a result row.

query_string="""
PREFIX eb: <https://w3id.org/eb#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
SELECT ?definition ?uri ?term ?vnum ?year ?enum ?letters ?part
        WHERE {{
    	?uri a eb:Article .
    	?uri eb:name ?term .
        ?uri eb:definition ?definition . 
        ?v eb:hasPart ?uri.
        ?v eb:number ?vnum.
        ?v eb:letters ?letters .
        ?e eb:hasPart ?v.
        ?e eb:publicationYear ?year.
        ?e eb:number ?enum.
        OPTIONAL {?v eb:part ?part; }
        }
  		UNION {
    	?uri a eb:Topic .
    	?uri eb:name ?term . 
        ?uri eb:definition ?definition .
        ?v eb:hasPart ?uri.
        ?v eb:number ?vnum.
        ?v eb:letters ?letters .
        ?e eb:hasPart ?v.
        ?e eb:publicationYear ?year.
        ?e eb:number ?enum.
        OPTIONAL {?v eb:part ?part; }
        
        }

   } 
""" 
# Parse the query once up front; the eb prefix is supplied through initNs in
# addition to the PREFIX line inside the query text.
query = prepareQuery(query_string, initNs = { "eb": eb})
# Execute the query on the graph
results = g.query(query)
# Show how many result rows came back.
print(len(results))


23122


In [20]:
# Split the all-terms query results into three parallel lists and pickle each:
#   documents  -> the definition of every term
#   uris       -> the term URIs exactly as returned by the query (rdflib terms, not str)
#   terms_info -> [term, edition number, year, part, volume number, letters]

documents=[]
terms_info=[]
uris=[]
results_len = len(results)
for r in results:
    documents.append(r.definition.value)
    uris.append(r.uri)
    # ?part is OPTIONAL in the query: rdflib yields None for an unbound variable,
    # so guard before touching .value (previously this raised AttributeError).
    part = "" if r.part is None else r.part.value
    # The KG also uses the literal "Not specified" to mean "no part"; store both
    # cases as an empty string.
    if part == "Not specified":
        part = ""
    terms_info.append([r.term.value, r.enum.value, r.year.value, part, r.vnum.value, r.letters.value])

# NOTE: despite the .txt extension these are binary pickle files; read them back
# with pickle.load (and only from a trusted location).
with open('../data/terms_definitions.txt', 'wb') as fp:
    pickle.dump(documents, fp)
    
with open('../data/terms_details.txt', 'wb') as fp2:
    pickle.dump(terms_info, fp2)
    
with open('../data/terms_uris.txt', 'wb') as fp3:
    pickle.dump(uris, fp3)

In [21]:
# Query the KG for topics only (resources typed eb:Topic); these results feed
# the topics_* files used for summarization.
# The selected variables match the all-terms query: definition, URI, name,
# volume number/letters, edition year/number, and the OPTIONAL ?part, which
# can be unbound in a result row.

query_string2="""
PREFIX eb: <https://w3id.org/eb#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
SELECT ?definition ?uri ?term ?vnum ?year ?enum ?letters ?part
        WHERE {
    	?uri a eb:Topic .
    	?uri eb:name ?term . 
        ?uri eb:definition ?definition .
        ?v eb:hasPart ?uri.
        ?v eb:number ?vnum.
        ?v eb:letters ?letters .
        ?e eb:hasPart ?v.
        ?e eb:publicationYear ?year.
        ?e eb:number ?enum.
        OPTIONAL {?v eb:part ?part; }
        
        }
""" 
# Parse the query once up front; the eb prefix is supplied through initNs in
# addition to the PREFIX line inside the query text.
query2 = prepareQuery(query_string2, initNs = { "eb": eb})
# Execute the query on the graph
results2 = g.query(query2)
# Show how many result rows came back.
print(len(results2))

2024


In [22]:
# Split the topics-only query results into three parallel lists and pickle each:
#   documents   -> the definition of every topic
#   uris        -> the topic URIs exactly as returned by the query (rdflib terms, not str)
#   topics_info -> [term, edition number, year, part, volume number, letters]
# Note that `documents` and `uris` are rebound here, replacing any earlier lists
# of the same name in the kernel.

documents=[]
topics_info=[]
uris=[]
for r in results2:
    documents.append(r.definition.value)
    uris.append(r.uri)
    # ?part is OPTIONAL in the query: rdflib yields None for an unbound variable,
    # so guard before touching .value (previously this raised AttributeError).
    part = "" if r.part is None else r.part.value
    # The KG also uses the literal "Not specified" to mean "no part"; store both
    # cases as an empty string.
    if part == "Not specified":
        part = ""
    topics_info.append([r.term.value, r.enum.value, r.year.value, part, r.vnum.value, r.letters.value])

# NOTE: despite the .txt extension these are binary pickle files; read them back
# with pickle.load (and only from a trusted location).
with open('../data/topics_definitions.txt', 'wb') as fp:
    pickle.dump(documents, fp)
    
with open('../data/topics_details.txt', 'wb') as fp2:
    pickle.dump(topics_info, fp2)
    
with open('../data/topics_uris.txt', 'wb') as fp3:
    pickle.dump(uris, fp3)

The following is example code showing how to adapt the cells above to query a Fuseki server instead of loading the ttl file directly.

In [23]:
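#from SPARQLWrapper import SPARQLWrapper, JSON
#
# sparql = SPARQLWrapper("<URL of the Fuseki dataset's SPARQL endpoint>")

##For the query cells, replace the prepareQuery/g.query lines with the lines below
##(setQuery takes the SPARQL text, i.e. query_string, not the prepared query object):
#
# sparql.setQuery(query_string)
# sparql.setReturnFormat(JSON)
# results = sparql.query().convert()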


#For the parsing cells, replace the parsing loop with the lines below
#(the JSON result is a dict; the rows live under results["results"]["bindings"],
# and an unbound OPTIONAL variable such as ?part is simply absent from the row):
#
# documents=[]
# terms_info=[]
# uris=[]
# results_len = len(results["results"]["bindings"])
# for r in results["results"]["bindings"]:
#     documents.append(r["definition"]["value"])
#     uris.append(r["uri"]["value"])
#     if "part" in r:
#         terms_info.append([r["term"]["value"], r["enum"]["value"], r["year"]["value"], r["part"]["value"], r["vnum"]["value"], r["letters"]["value"]])
#     else:
#         terms_info.append([r["term"]["value"], r["enum"]["value"], r["year"]["value"], "" , r["vnum"]["value"], r["letters"]["value"]])