In [1]:
import json, requests, time, pycm, yaml, glob, time, random, pandas as pd
from string import Template

In [3]:
# URL of the SPARQL endpoint that serialize() sends its GET requests to.
SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"

# HTTP headers sent with every SPARQL request; identifies this client via User-Agent.
QUERY_HEADERS = {
    'User-Agent': 'ConceptualEngineeringAgent/0.2 (https://github.com/bradleypallen/conceptual-engineering-using-llms; b.p.allen@uva.nl)'
}

# Input: earlier benchmark file read with json.load; the notebook uses its
# 'positives' and 'negatives' sections.
EXPERIMENTS_V0_FILENAME = 'benchmarks/planet/data.json'

# Output: JSON file the assembled experiments are written to at the end of the notebook.
EXPERIMENTS_FILENAME = 'planet_experiment_data.json'

# Directory searched for '*.yaml' concept definition files.
CONCEPTS_DIRECTORY_NAME = 'concepts/planet'

# Value substituted for $limit in the query template (maximum rows per entity).
QUERY_LIMIT = 20

# SPARQL query template with two placeholders: $id (an entity IRI) and $limit.
# It selects distinct triples in which <$id> appears as subject (first branch)
# or as object (second branch), keeping rows where LANG(?o) = "en".
# NOTE(review): in the second branch ?o is bound to the IRI <$id>; LANG() is
# defined on literals, so that FILTER presumably never passes and the branch
# likely contributes no rows -- confirm whether incoming triples were intended.
DESCRIPTION_QUERY_TEMPLATE = Template("""SELECT DISTINCT ?s ?p ?o WHERE {
{ 
  VALUES ?s { <$id> }
  ?s ?p ?o . 
  FILTER(LANG(?o) = "en") .
}
UNION
{ 
  VALUES ?o { <$id> }
  ?s ?p ?o . 
  FILTER(LANG(?o) = "en") .
}
}
LIMIT $limit
""")
                                      
def serialize(id, timeout=None):
    """Fetch a tab-separated serialization of the triples describing an entity.

    Fills DESCRIPTION_QUERY_TEMPLATE with ``id`` and QUERY_LIMIT, sends the
    query to SPARQL_ENDPOINT as an HTTP GET asking for
    'text/tab-separated-values', and returns the response text without its
    first line.

    Args:
        id: Entity IRI substituted for ``$id`` in the query template.
        timeout: Optional timeout passed through to ``requests.get``. The
            default ``None`` keeps the previous behaviour (wait indefinitely).

    Returns:
        The response body as a string with the first line removed
        (presumably the TSV header row of variable names).

    Raises:
        requests.HTTPError: if the endpoint answers with an error status.
    """
    # Build a fresh dict per call: assigning QUERY_HEADERS directly and then
    # setting "Accept" on it would permanently modify the shared module-level
    # headers for every other user of QUERY_HEADERS.
    headers = {**QUERY_HEADERS, "Accept": 'text/tab-separated-values'}
    query = DESCRIPTION_QUERY_TEMPLATE.substitute({"id": id, "limit": QUERY_LIMIT})
    response = requests.get(
        SPARQL_ENDPOINT,
        params={'query': query},
        headers=headers,
        timeout=timeout,
    )
    response.raise_for_status()
    # Drop the first line of the body and re-join the remaining rows.
    return '\n'.join(response.text.split('\n')[1:])

In [4]:
# Build one experiment record per concept definition found in the concepts
# directory. Each YAML file is opened in a `with` block so its handle is
# closed as soon as it has been parsed, instead of being left to the
# garbage collector.
experiments = []
for file in glob.glob(f'{CONCEPTS_DIRECTORY_NAME}/*.yaml'):
    with open(file, 'r') as concept_file:
        experiments.append({ "concept": yaml.safe_load(concept_file) })

In [5]:
# Display the loaded concept records for inspection.
experiments

[{'concept': {'id': 'iau_2006_08_24_planet',
   'label': 'planet',
   'definition': 'A planet [1] is a celestial body that (a) is in orbit around the Sun, (b) has sufficient mass for its self-gravity to overcome rigid body forces so that it assumes a hydrostatic equilibrium (nearly round) shape, and (c) has cleared the neighbourhood around its orbit.',
   'reference': 'https://www.iau.org/static/resolutions/Resolution_GA26-5-6.pdf'}},
 {'concept': {'id': 'iau_2006_08_16_planet',
   'label': 'planet',
   'definition': 'A planet is a celestial body that (a) has sufficient mass for its self-gravity to overcome rigid body forces so that it assumes a hydrostatic equilibrium (nearly round) shape, and (b) is in orbit around a star, and is neither a star nor a satellite of a planet.',
   'reference': 'https://www.iau.org/news/pressreleases/detail/iau0601/'}},
 {'concept': {'id': 'oed_planet',
   'label': 'planet',
   'definition': 'Any of various rocky or gaseous bodies that revolve in approxi

In [6]:
# Load the earlier benchmark file. The `with` block guarantees the file
# handle is closed once parsing finishes.
with open(EXPERIMENTS_V0_FILENAME, 'r') as experiments_v0_file:
    experiments_old = json.load(experiments_v0_file)

In [7]:
# Flatten the positive and negative sections of the old benchmark into a
# single list, positives first.
data = [
    instance
    for section in ('positives', 'negatives')
    for instance in experiments_old[section]['data']
]

In [8]:
# Normalise each instance's keys ('@id' -> 'id', 'name' -> 'label') and attach
# the serialization fetched for its id.
#
# The renames are guarded and already-serialized instances are skipped so the
# cell can be re-run safely: an unconditional pop('@id') raises KeyError on a
# second run, or on a retry after a request failed part-way through the list.
# NOTE(review): assumes the input records do not already carry a
# 'serialization' key -- confirm against the v0 data file.
for instance in data:
    if '@id' in instance:
        instance['id'] = instance.pop('@id')
    if 'name' in instance:
        instance['label'] = instance.pop('name')
    if 'serialization' in instance:
        # Fetched on a previous run; do not query the endpoint again.
        continue
    # Pause before each request (presumably to throttle load on the endpoint).
    time.sleep(0.25)
    print(f'{instance["label"]} ({instance["id"]})...')
    instance['serialization'] = serialize(instance['id'])

KELT-18b (http://www.wikidata.org/entity/Q30682840)...
Gliese 676 Ae (http://www.wikidata.org/entity/Q30899056)...
Ross 128 b (http://www.wikidata.org/entity/Q43196041)...
K2-155d (http://www.wikidata.org/entity/Q53616218)...
Kepler-19d (http://www.wikidata.org/entity/Q49080978)...
Kepler-223c (http://www.wikidata.org/entity/Q28503768)...
Gliese 163 b (http://www.wikidata.org/entity/Q28759375)...
Omicron Ursae Majoris b (http://www.wikidata.org/entity/Q28808186)...
TRAPPIST-1h (http://www.wikidata.org/entity/Q28822662)...
Lalande 21185 b (http://www.wikidata.org/entity/Q28773365)...
Kepler-1520b (http://www.wikidata.org/entity/Q27048791)...
Kepler-20g (http://www.wikidata.org/entity/Q27048793)...
HIP 57274 d (http://www.wikidata.org/entity/Q28453780)...
Kepler-167d (http://www.wikidata.org/entity/Q28502878)...
Kepler-174c (http://www.wikidata.org/entity/Q28502901)...
Kepler-160c (http://www.wikidata.org/entity/Q28502857)...
Mars (http://www.wikidata.org/entity/Q111)...
Kepler-24e (http

In [9]:
# Show how many instances were collected in total.
len(data)

1000

In [10]:
# Partition the instances by their 'actual' label.
positives = list(filter(lambda candidate: candidate["actual"] == 'positive', data))
negatives = list(filter(lambda candidate: candidate["actual"] == 'negative', data))

In [11]:
# Draw a fixed-size sample from each class.
# A dedicated, seeded generator makes the selection reproducible under
# Restart & Run All (given the same input order) without touching the global
# `random` state; the unseeded module-level random.sample picked a different
# subset on every run.
SAMPLE_SEED = 42
SAMPLE_SIZE = 50
_sample_rng = random.Random(SAMPLE_SEED)
# random.Random.sample raises ValueError if a class has fewer than SAMPLE_SIZE
# instances, exactly as the module-level random.sample does.
positives_sample = _sample_rng.sample(positives, SAMPLE_SIZE)
negatives_sample = _sample_rng.sample(negatives, SAMPLE_SIZE)

In [12]:
# Show the size of the positive sample.
len(positives_sample)

50

In [13]:
# Combine both samples into one list, positives first.
sampled_data = [*positives_sample, *negatives_sample]

In [14]:
# Display the combined sample for inspection.
sampled_data

[{'actual': 'positive',
  'description': 'HD 159868 b is an extrasolar planet that orbits HD 159868. It is a jovian planet. The orbit is nearly circular at the average distance of 2.25 AU.\n\n',
  'id': 'http://www.wikidata.org/entity/Q841870',
  'label': 'HD 159868 b',
  'serialization': '<http://www.wikidata.org/entity/Q841870>\t<http://schema.org/description>\t"extrasolar planet"@en\n<http://www.wikidata.org/entity/Q841870>\t<http://www.w3.org/2004/02/skos/core#altLabel>\t"HD 159868b"@en\n<http://www.wikidata.org/entity/Q841870>\t<http://www.w3.org/2000/01/rdf-schema#label>\t"HD 159868 b"@en\n'},
 {'actual': 'positive',
  'description': "HD 102272 c is an extrasolar planet approximately 1,200 light-years away in the constellation of Leo.  The planet is orbiting the K-type giant star HD 102272.  The planet was discovered by the radial velocity method, using the Hobby-Eberly Telescope.  Another planet, HD 102272 b, was also discovered orbiting the same star.  Although there is evidenc

In [15]:
# Attach the sampled instances to every experiment under the "data" key.
# Every experiment references the same list object (no copy is made).
for experiment in experiments:
    experiment.update({"data": sampled_data})

In [16]:
# Display the experiments with their attached data for inspection.
experiments

[{'concept': {'id': 'iau_2006_08_24_planet',
   'label': 'planet',
   'definition': 'A planet [1] is a celestial body that (a) is in orbit around the Sun, (b) has sufficient mass for its self-gravity to overcome rigid body forces so that it assumes a hydrostatic equilibrium (nearly round) shape, and (c) has cleared the neighbourhood around its orbit.',
   'reference': 'https://www.iau.org/static/resolutions/Resolution_GA26-5-6.pdf'},
  'data': [{'actual': 'positive',
    'description': 'HD 159868 b is an extrasolar planet that orbits HD 159868. It is a jovian planet. The orbit is nearly circular at the average distance of 2.25 AU.\n\n',
    'id': 'http://www.wikidata.org/entity/Q841870',
    'label': 'HD 159868 b',
    'serialization': '<http://www.wikidata.org/entity/Q841870>\t<http://schema.org/description>\t"extrasolar planet"@en\n<http://www.wikidata.org/entity/Q841870>\t<http://www.w3.org/2004/02/skos/core#altLabel>\t"HD 159868b"@en\n<http://www.wikidata.org/entity/Q841870>\t<http

In [17]:
# Write the assembled experiments to disk. The `with` block flushes and closes
# the file deterministically; passing a bare open() to json.dump leaves that
# to garbage collection. Plain 'w' is enough because the file is only written.
with open(EXPERIMENTS_FILENAME, 'w') as experiments_file:
    json.dump(experiments, experiments_file)