In [1]:
import json, operator, requests, time, pycm, wikipedia, glob, pandas as pd
from langchain import HuggingFaceHub, OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.chains import LLMChain, SequentialChain
from langchain.prompts import PromptTemplate
from conceptual_engineering_toolkit import Concept, Entity
from cet_experiments import serialize, summarize, WIKIPEDIA_ARTICLE_QUERY, KNOWLEDGE_GRAPH
from datetime import datetime
from string import Template
from pathlib import Path

In [2]:
# HTTP headers sent with every SPARQL request; the User-Agent string
# identifies this agent and gives a contact address.
QUERY_HEADERS = {
    'User-Agent': 'ConceptualEngineeringAgent/0.2 (https://github.com/bradleypallen/conceptual-engineering-using-llms; b.p.allen@uva.nl)',
}

# Value substituted for $limit in ENTITY_TRIPLES_QUERY, i.e. the maximum
# number of triples requested per entity.
QUERY_LIMIT = 20

# SPARQL query template with two placeholders: $id (an IRI, wrapped in <...>)
# and $limit. It selects the distinct triples in which $id appears either as
# the subject (first UNION branch) or as the object (second UNION branch).
ENTITY_TRIPLES_QUERY = Template("""SELECT DISTINCT ?s ?p ?o WHERE {
{ 
  VALUES ?s { <$id> }
  ?s ?p ?o . 
}
UNION
{ 
  VALUES ?o { <$id> }
  ?s ?p ?o . 
}
}
LIMIT $limit
""")

In [3]:
def serialize(e, graph=KNOWLEDGE_GRAPH):
    """Fetch the triples that mention an entity and return them as TSV text.

    Fills ENTITY_TRIPLES_QUERY with the entity IRI and QUERY_LIMIT, sends it
    to the SPARQL endpoint with an HTTP GET, and asks for a
    tab-separated-values response.

    NOTE(review): this definition shadows the `serialize` name imported from
    `cet_experiments` at the top of the notebook -- confirm that is intended.

    Args:
        e: IRI of the entity; substituted for `$id` in the query template.
        graph: URL of the SPARQL endpoint to query.

    Returns:
        The response body with its first line removed (presumably the TSV
        header row -- confirm against the endpoint's output), as one string.

    Raises:
        requests.HTTPError: if the endpoint answers with an error status.
    """
    # Build a fresh dict per call. Aliasing QUERY_HEADERS and assigning into
    # it would write 'Accept' into the shared module-level constant.
    headers = {**QUERY_HEADERS, 'Accept': 'text/tab-separated-values'}
    query = ENTITY_TRIPLES_QUERY.substitute({"id": e, "limit": QUERY_LIMIT})
    response = requests.get(graph, params={'query': query}, headers=headers)
    response.raise_for_status()
    # Skip the first line of the body and keep the remaining rows as-is.
    return '\n'.join(response.text.split('\n')[1:])

In [4]:
# Load the experiment definitions. The context manager closes the file handle
# deterministically, and the explicit encoding keeps the JSON decoding
# independent of the platform's default locale.
with open('caligraph_experiments.json', 'r', encoding='utf-8') as experiments_file:
    experiments = json.load(experiments_file)

In [5]:
# Attach a 'serialization' string to each experiment's concept and to every
# entity in its data list, all fetched from the same SPARQL endpoint.
caligraph_endpoint = 'http://caligraph.org/sparql'
for experiment in experiments:
    concept = experiment['concept']
    concept['serialization'] = serialize(concept['id'], graph=caligraph_endpoint)
    for entity in experiment['data']:
        entity_triples = serialize(entity['id'], graph=caligraph_endpoint)
        entity['serialization'] = entity_triples

In [6]:
# Write the enriched experiments to disk. The context manager guarantees the
# file is flushed and closed; a bare open() leaves that to garbage collection.
# Plain write mode is enough because nothing is read back.
with open('caligraph_benchmark_data.json', 'w', encoding='utf-8') as benchmark_file:
    json.dump(experiments, benchmark_file)