In [1]:
import json, requests, time, wikipedia, pycm, numpy as np, pandas as pd, matplotlib.pyplot as plt, seaborn as sns
from conceptual_engineering_toolkit import Benchmark, Experiment
from string import Template
from pathlib import Path

In [49]:
def qid(wd_url):
    """Return the bare Wikidata identifier for an entity URI.

    The leading ``http://www.wikidata.org/entity/`` is stripped from
    ``wd_url``; a string that does not start with that prefix is returned
    unchanged.
    """
    entity_prefix = "http://www.wikidata.org/entity/"
    if wd_url.startswith(entity_prefix):
        return wd_url[len(entity_prefix):]
    return wd_url

def _yaml_quote(text):
    """Render ``text`` as a double-quoted YAML scalar (quotes included)."""
    # A JSON string literal is also a valid YAML double-quoted scalar: every
    # escape json.dumps emits (\", \\, \n, \t, \u00XX, ...) is understood by YAML.
    # ensure_ascii=False keeps non-ASCII text as-is instead of producing
    # surrogate-pair \uXXXX escapes, which YAML would not recombine.
    return json.dumps(str(text), ensure_ascii=False)


def concept_to_yaml(cls):
    """Build the YAML text describing one candidate class.

    Args:
        cls: Mapping with the keys ``"class"`` (entity URI, reduced to its
            identifier with ``qid``), ``"classLabel"``,
            ``"classWikipediaName"`` (page title passed to
            ``wikipedia.summary`` with ``auto_suggest=False``) and
            ``"classArticle"``.

    Returns:
        A string with four lines -- ``id``, ``label``, ``definition`` and
        ``reference`` -- each value written as a double-quoted YAML scalar.

    Every value is escaped, so quotes, backslashes and line breaks in the
    label, the Wikipedia summary or the reference cannot break the document.
    Line breaks in the summary are written as escapes and therefore survive
    a YAML round trip unchanged.
    """
    # The placeholders carry no quotes of their own: _yaml_quote supplies them.
    concept_template = Template("""id: $id
label: $label
definition: $definition
reference: $reference
""")
    summary = wikipedia.summary(cls["classWikipediaName"], auto_suggest=False)
    return concept_template.substitute({
        "id": _yaml_quote(qid(cls["class"])),
        "label": _yaml_quote(cls["classLabel"]),
        "definition": _yaml_quote(summary),
        "reference": _yaml_quote(cls["classArticle"]),
    })

def class_positives_query(cls):
    """Return the SPARQL query text for the positive examples of a class.

    The query selects ``?item``, ``?name`` and ``?article`` for items whose
    P31 statement value is the class itself or reaches it through a chain of
    P279 links, and which have an English Wikipedia article.

    Args:
        cls: Mapping whose ``"class"`` entry is the entity URI of the class.
    """
    positives_sparql = Template("""SELECT ?item ?name ?article WHERE {
    ?item p:P31/ps:P31/wdt:P279* wd:$classId .
    ?article schema:about ?item ;
            schema:inLanguage "en" ;
            schema:isPartOf <https://en.wikipedia.org/> ;
            schema:name ?name .
}""")
    return positives_sparql.substitute(classId=qid(cls["class"]))

def class_negatives_query(cls):
    """Return the SPARQL query text for the negative examples of a class.

    The query selects ``?item``, ``?name`` and ``?article`` for items that
    match the superclass pattern (P31 value equal to the superclass or linked
    to it through P279) and have an English Wikipedia article, while the
    ``FILTER NOT EXISTS`` clause drops every item that matches the same
    pattern for the class itself.

    Args:
        cls: Mapping whose ``"class"`` and ``"superclass"`` entries are the
            entity URIs of the class and of its superclass.
    """
    class_id = qid(cls["class"])
    superclass_id = qid(cls["superclass"])
    negatives_sparql = Template("""SELECT ?item ?name ?article WHERE {
    ?item p:P31/ps:P31/wdt:P279* wd:$superclassId .
    ?article schema:about ?item ;
            schema:inLanguage "en" ;
            schema:isPartOf <https://en.wikipedia.org/> ;
            schema:name ?name .
    FILTER NOT EXISTS { ?item p:P31/ps:P31/wdt:P279* wd:$classId }
}""")
    return negatives_sparql.substitute(classId=class_id, superclassId=superclass_id)

def run_wd_experiment(cls, limit=50, model="gpt-4", sample_size=20):
    """Write the inputs for one class, build its benchmark and run an experiment.

    Steps, in order:
      1. Create ``wd_experiments/<QID>/`` and write ``definition.yaml``,
         ``positives.sparql`` and ``negatives.sparql`` into it.
      2. Construct a ``Benchmark`` from the two query files, call
         ``retrieve()`` and ``save(benchmark_dir='wd_experiments')``.
      3. Construct an ``Experiment`` from the saved benchmark and the
         definition file, draw a sample, run it and save the experiment into
         the class directory.

    Args:
        cls: Candidate mapping; read here for ``"class"`` and passed on to
            ``concept_to_yaml``, ``class_positives_query`` and
            ``class_negatives_query``.
        limit: Forwarded to ``Benchmark`` as its ``limit`` keyword.
        model: Third positional argument of ``Experiment``; defaults to the
            previously hard-coded ``"gpt-4"``.
        sample_size: Passed to ``experiment.sample`` as ``n``; defaults to
            the previously hard-coded ``20``.

    Returns:
        Whatever ``experiment.save`` returns.
    """
    class_id = qid(cls["class"])
    # Named experiment_dir rather than dir so the builtin is not shadowed.
    experiment_dir = f'wd_experiments/{class_id}'
    def_file = Path(f'{experiment_dir}/definition.yaml')
    positives_path = f'{experiment_dir}/positives.sparql'
    negatives_path = f'{experiment_dir}/negatives.sparql'

    def_file.parent.mkdir(parents=True, exist_ok=True)
    # Explicit UTF-8: the Wikipedia summary may contain non-ASCII text and the
    # platform default encoding is not guaranteed to be able to represent it.
    # NOTE(review): assumes the toolkit reads these files as UTF-8 -- confirm.
    def_file.write_text(concept_to_yaml(cls), encoding="utf-8")
    Path(positives_path).write_text(class_positives_query(cls), encoding="utf-8")
    Path(negatives_path).write_text(class_negatives_query(cls), encoding="utf-8")

    benchmark = Benchmark(class_id, positives_path, negatives_path, limit=limit)
    benchmark.retrieve()
    data_file = benchmark.save(benchmark_dir='wd_experiments')

    experiment = Experiment(data_file, def_file, model)
    sample = experiment.sample(n=sample_size)
    experiment.run(sample)
    return experiment.save(experiment_dir=experiment_dir)

In [8]:
# Load the candidate classes. The context manager closes the file handle as
# soon as the JSON has been parsed instead of leaving it to the garbage collector.
with open('benchmarks/candidates.json', 'r', encoding='utf-8') as candidates_file:
    candidates = json.load(candidates_file)
len(candidates)

533

In [58]:
# Run the whole pipeline for the candidate at index 287, using the default
# limit of run_wd_experiment; experiment_file keeps the value it returns.
experiment_file = run_wd_experiment(candidates[287])



  lis = BeautifulSoup(html).find_all('li')
