Permalink
Switch branches/tags
Nothing to show
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
executable file 243 lines (197 sloc) 7.52 KB
#!/usr/bin/env python
import urllib
import sys
from rdflib import Graph
import logging # see https://docs.python.org/2/howto/logging.html
from time import gmtime, strftime
import re
# global variables
# Pattern for a Wikidata entity URI wrapped in angle brackets; the named group
# localName captures the text between the entity/ prefix and the closing bracket.
wdRegEx = re.compile(r"\<http://www.wikidata.org/entity/(?P<localName>.+)\>")
g = Graph() # graph where we'll store triples
# Base URL every remote query is sent to (the query is appended as a URL parameter).
endpoint = "https://query.wikidata.org/sparql"
maxValues = 40 # maximum number of VALUES entries per query
def finishQuery(queryToSplit,newQuery,queryNumber,queryFooter):
    """Complete one batched query, run it remotely, and parse the result into g.

    queryToSplit -- label for the query family; used only in log messages.
    newQuery     -- query text built so far (header plus VALUES entries).
    queryNumber  -- sequence number of this batch; used only in log messages.
    queryFooter  -- text appended to newQuery to close the query.

    Any exception raised while fetching or parsing the response is logged
    together with the batch identity and then re-raised unchanged, so callers
    see exactly the same exception as before.
    """
    newQuery += queryFooter
    # The whole query travels as a URL parameter, so it must be URL-encoded.
    url = endpoint + "?" + urllib.urlencode({"query": newQuery})
    try:
        g.parse(url)
    except Exception:
        # Report which batch failed (timeout, malformed query, etc.) in the
        # log file before letting the original exception propagate.
        logging.exception('Failed to run ' + queryToSplit + ' query number ' + str(queryNumber))
        raise
    logging.info('Triples in graph g after ' + queryToSplit + ' query number ' + str(queryNumber) + ': ' + str(len(g)))
def splitAndRunRemoteQuery(queryToSplit,valuesList,queryHeader,queryFooter):
    """Run a remote query in batches of at most maxValues VALUES entries.

    queryToSplit -- label for the query family; passed on to finishQuery.
    valuesList   -- iterable of rows; each row is interpolated with '<%s>' to
                    form one VALUES entry (presumably single-column query
                    result rows -- verify against the callers).
    queryHeader  -- query text up to and including the opening of the VALUES
                    clause.
    queryFooter  -- query text that closes the VALUES clause and the query.

    Each batch is the header followed by one entry per line; finishQuery
    appends the footer and executes it. Nothing is executed when valuesList
    is empty.
    """
    valuesInThisQuery = 0 # In the current query being built
    queryNumber = 1
    newQuery = queryHeader
    for row in valuesList:
        ID = '<%s>' % row
        result = wdRegEx.search(ID) # Convert URI to qname if possible (shortens eventual URL)
        if result is not None:
            ID = 'wd:' + result.group('localName')
        newQuery = newQuery + ID + '\n'
        valuesInThisQuery += 1
        # Flush as soon as the batch is full. The test runs after the
        # increment, so it must be >= for a batch to hold exactly maxValues
        # entries (a plain > would let maxValues + 1 through).
        if valuesInThisQuery >= maxValues:
            # Finish up this query and execute it.
            finishQuery(queryToSplit,newQuery,queryNumber,queryFooter)
            newQuery = queryHeader
            valuesInThisQuery = 0
            queryNumber += 1
    # If there are some leftover entries
    if valuesInThisQuery > 0:
        finishQuery(queryToSplit,newQuery,queryNumber,queryFooter)
def main(cityLocalID):
    """Gather Wikidata triples about the geotagged entities of a city and print them.

    cityLocalID -- local name of the city's Wikidata URI (e.g. Q123766).

    The work is a four-stage pipeline over the module-level graph g:
      1. fetch the coordinates of entities located in the city,
      2. fetch further data about each of those entities (in batches),
      3. fetch labels and superclasses for the classes found (in batches),
      4. fetch English labels for entity objects that still lack one (in batches).
    The resulting graph is serialized and written to standard output.
    """
    targetQname = "wd:" + cityLocalID

    ####### Start of SPARQL query variables ######

    # Stage 1 query. CITY-QNAME is a placeholder that is swapped for the
    # city's qname right here. Several prefixes used in these queries (wd:,
    # wdt:, psv:, wikibase:, schema:) are not always declared in the query
    # text -- presumably the endpoint predefines them; confirm.
    geoPointsQuery = """
PREFIX wgs84: <http://www.w3.org/2003/01/geo/wgs84_pos#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX p: <http://www.wikidata.org/prop/>
PREFIX psv: <http://www.wikidata.org/prop/statement/value/>
PREFIX schema: <http://schema.org/>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
CONSTRUCT {
?s ?p ?o .
?s wgs84:lat ?lat .
?s wgs84:long ?long .
}
WHERE {
BIND(CITY-QNAME AS ?geoEntityWikidataID)
?s wdt:P131+ ?geoEntityWikidataID .
?s p:P625 ?statement . # coordinate-location statement
?statement psv:P625 ?coordinate_node .
?coordinate_node wikibase:geoLatitude ?lat .
?coordinate_node wikibase:geoLongitude ?long .
}
""".replace("CITY-QNAME",targetQname)

    # Local query: every subject in g that has a latitude.
    subjectsQuery ="""
PREFIX wgs84: <http://www.w3.org/2003/01/geo/wgs84_pos#>
SELECT ?s WHERE {
?s wgs84:lat ?lat
}
"""

    # Stage 2 header; the batched subject entries are appended after it.
    entityHeader = """
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX p: <http://www.wikidata.org/prop/>
PREFIX wgs84: <http://www.w3.org/2003/01/geo/wgs84_pos#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
CONSTRUCT
{ ?s ?p ?o.
?s ?p1 ?o1 .
?s wgs84:lat ?lat .
?s wgs84:long ?long .
?p rdfs:label ?pname .
?s wdt:P31 ?class .
}
WHERE {
VALUES ?s {
"""

    # Notes about entityFooter, kept out of the query text itself:
    #  - wdt:P131 means 'located in the administrative territorial entity';
    #    p:P625 is the coordinate-location statement.
    #  - The ?directClaimP pattern reduces the indirection used by Wikidata
    #    triples. Based on Tommy Potter query at
    #    http://www.snee.com/bobdc.blog/2017/04/the-wikidata-data-model-and-yo.html.
    #  - A VALUES clause after the ?p1 pattern is actually faster than
    #    writing specific triple patterns for those p1 values.
    #  - The 'en' filter restricts names to English. For English plus another
    #    language follow this pattern:
    #    FILTER (isURI(?o1) || lang(?o1) = 'en' || lang(?o1) = 'de')
    #  - P31 is class membership; this and higher level classes are pulled
    #    out by a later query.
    entityFooter = """
}
?s wdt:P131+ ?geoEntityWikidataID .
?s p:P625 ?statement .
?statement psv:P625 ?coordinate_node .
?coordinate_node wikibase:geoLatitude ?lat .
?coordinate_node wikibase:geoLongitude ?long .
?s ?directClaimP ?o .
?p wikibase:directClaim ?directClaimP .
?p rdfs:label ?pname .
?s ?p1 ?o1 .
VALUES ?p1 {
schema:description
rdfs:label
skos:altLabel
}
?s wdt:P31 ?class .
FILTER (isURI(?o1) || lang(?o1) = 'en' )
FILTER(lang(?pname) = 'en')
}
"""

    # Local query: the distinct classes referenced in g.
    classesQuery = """
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
SELECT DISTINCT ?class WHERE {
?instance wdt:P31 ?class .
}
"""

    # Stage 3 header and footer.
    # NOTE(review): the VALUES clause opened below binds ?instance, yet the
    # rows fed into it are the ?class values selected by classesQuery --
    # confirm that binding ?instance (rather than ?class) is intended.
    classInfoHeader = """
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
CONSTRUCT {
?class rdfs:subClassOf ?superclass ;
rdfs:label ?className .
?superclass rdfs:label ?superclassName .
}
WHERE {
?instance wdt:P31 ?class .
VALUES ?instance {
"""
    classInfoFooter = """
}
?class rdfs:label ?className .
?class wdt:P279+ ?superclass .
?superclass rdfs:label ?superclassName .
FILTER ( lang(?className) = 'en' )
FILTER ( lang(?superclassName) = 'en' )
}
"""

    # Local query: Wikidata entity objects in g that have no rdfs:label yet.
    unlabeledObjectsQuery = """
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
SELECT DISTINCT ?o WHERE {
?s ?p ?o .
MINUS { ?o rdfs:label ?label }
FILTER(strstarts(str(?o),'http://www.wikidata.org/entity/'))
}
"""

    # Stage 4 header and footer. The footer's FILTER keeps English labels
    # only; for English plus another language follow this pattern:
    # FILTER (lang(?o) = 'en' || lang(?o) = 'de')
    labelsHeader = """
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
CONSTRUCT { ?s rdfs:label ?o }
WHERE {
VALUES ?s {
"""
    labelsFooter = """
}
?s rdfs:label ?o .
FILTER (lang(?o) = 'en' )
}
"""
    ########## End of SPARQL queries #############

    # 1. Fetch the geotagged entities within the city and store them in graph g.
    geoUrl = endpoint + "?" + urllib.urlencode({"query": geoPointsQuery})
    g.parse(geoUrl)
    logging.info('Triples in graph g after queryRetrieveGeoPoints: ' + str(len(g)))

    # 2. Feed the subjects now in g, up to maxValues at a time, into VALUES
    #    clauses of the entity-data query.
    splitAndRunRemoteQuery("querySubjectData",g.query(subjectsQuery),
                           entityHeader,entityFooter)

    # 3. Look up names for the classes in use and for their superclasses.
    splitAndRunRemoteQuery("queryGetClassInfo",g.query(classesQuery),
                           classInfoHeader,classInfoFooter)

    # 4. Find the objects that still need labels and retrieve those labels.
    splitAndRunRemoteQuery("queryObjectsThatNeedLabel",g.query(unlabeledObjectsQuery),
                           labelsHeader,labelsFooter)

    # (Actually Turtle, which is what we want, not n3.)
    serializedGraph = g.serialize(format = "n3")
    print(serializedGraph)
# Script entry point: expects exactly one command line argument, the local
# name of the city's Wikidata URI. Progress is logged to pipelining.log.
if __name__ == "__main__":
    if len(sys.argv) == 2:
        logging.basicConfig(filename='pipelining.log',level=logging.DEBUG)
        cityLocalID = sys.argv[1]
        # Bracket the run with UTC start and finish timestamps in the log.
        logging.info(
            strftime("Starting at %Y-%m-%dT%H:%M:%S with ID ", gmtime()) + cityLocalID
        )
        main(cityLocalID)
        logging.info(strftime("Finishing at %Y-%m-%dT%H:%M:%S", gmtime()))
    else:
        # Wrong number of arguments: explain the expected usage.
        print("Enter a city's Wikidata URI local name (e.g. Q123766)")
        print("as a command line argument to retrieve data about that city.")