# Prepare Unbound Predicate Queries for WatDiv

The purpose of this notebook is to prepare a sample of WatDiv queries with unbound predicates. We sample a set of 500 queries for every unbound-predicate pattern (i.e., sUo, UUo, sUU), where a lowercase letter marks a bound position (s = subject, o = object) and U marks an unbound one.

In [1]:
# Read the WatDiv base table from HDFS, then draw 2000 rows without
# replacement using a fixed seed.
base_parquet_path = "hdfs://172.18.11.128:8020/user/amadkour/extvpdb-1B/base.parquet"
watdiv_file = spark.read.parquet(base_parquet_path)
sample = watdiv_file.rdd.takeSample(withReplacement=False, num=2000, seed=0)

In [2]:
# Collect the distinct predicate values that occur in the sample.
num_preds = {row.pred for row in sample}

In [3]:
# Number of distinct predicates found in the sample.
len(num_preds)

48

In [4]:
# Total number of rows drawn into the sample.
len(sample)

2000

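Fix the sample taken from the S2RDF dataset: drop entries whose object starts with a double quote but does not end with one, then split the remaining entries into three fixed-size samples of 500.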

In [5]:
# Keep every row except those whose object opens with a double quote but
# does not close with one; survivors are stored as (sub, pred, obj) tuples.
modifiedsample = [
    (row.sub, row.pred, row.obj)
    for row in sample
    if not row.obj.startswith('"') or row.obj.endswith('"')
]

In [6]:
# Carve the first 1500 cleaned triples into three consecutive,
# non-overlapping batches of 500.
batch_size = 500
sample1, sample2, sample3 = [
    modifiedsample[start:start + batch_size]
    for start in range(0, 3 * batch_size, batch_size)
]

In [7]:
# Report the size of the raw sample and of each 500-entry batch.
summary_template = "Original : {}\nSample 1 : {}\nSample 2 : {}\nSample 3 : {}"
batch_sizes = [len(batch) for batch in (sample, sample1, sample2, sample3)]
print(summary_template.format(*batch_sizes))

Original : 2000
Sample 1 : 500
Sample 2 : 500
Sample 3 : 500


In [8]:
# Namespace abbreviation -> base IRI, used to expand "abbr__local" terms.
prefix = {
    "dc": "http://purl.org/dc/terms/",
    "foaf": "http://xmlns.com/foaf/",
    "gr": "http://purl.org/goodrelations/",
    "gn": "http://www.geonames.org/ontology#",
    "mo": "http://purl.org/ontology/mo/",
    "og": "http://ogp.me/ns#",
    "rev": "http://purl.org/stuff/rev#",
    "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
    "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
    "sorg": "http://schema.org/",
    "wsdbm": "http://db.uwaterloo.ca/~galuc/wsdbm/",
}

In [9]:
def resolvePrefix(entry):
    """Expand an ``abbr__localname`` term into a full ``<IRI>`` string.

    Terms that start or end with a double quote are returned unchanged.
    Any other term is split at its first ``__``; the part before it is
    looked up in the module-level ``prefix`` dict and the part after it is
    appended to the resulting base IRI, wrapped in angle brackets.

    Args:
        entry: the term to resolve, as a string.

    Returns:
        The expanded ``<IRI>`` string, or ``entry`` itself for quoted terms.

    Raises:
        KeyError: if the abbreviation is not a key of ``prefix``.
        IndexError: if an unquoted term contains no ``__`` separator.
    """
    if entry.startswith('"') or entry.endswith('"'):
        return entry
    # Split on the first "__" only: a local name that itself contains "__"
    # must be kept whole rather than truncated at its second separator.
    parts = entry.split("__", 1)
    return "<" + prefix[parts[0]] + parts[1] + ">"

In [10]:
def createQuery(select_statement,triple):
    """Build a single-pattern SPARQL SELECT query string.

    Args:
        select_statement: the projection placed after ``SELECT``.
        triple: the triple pattern placed inside the ``WHERE`` braces.

    Returns:
        The query in the form ``SELECT <projection> WHERE { <triple> }``.
    """
    query_template = "SELECT %s WHERE { %s }"
    return query_template % (select_statement, triple)

In [11]:
# Placeholders carried over from the original cell; nothing here fills them.
sample1_ready = list()
sample2_ready = list()
sample3_ready = list()

# Directory that receives one query file per unbound-predicate pattern.
query_dir = "/home/amadkour/projects/worq/test/queries/watdiv-unbound/"


def _write_query_file(path, entries, select, build_triple):
    """Write one SPARQL query per entry to ``path``, one query per line.

    Args:
        path: destination file; it is created or truncated.
        entries: iterable of (sub, pred, obj) tuples.
        select: projection passed to ``createQuery``.
        build_triple: callable mapping an entry to its triple pattern.
    """
    # The context manager closes the file even if resolving or writing an
    # entry raises part-way through.
    with open(path, 'w') as query_file:
        for entry in entries:
            query_file.write(createQuery(select, build_triple(entry)) + "\n")


# sUo: subject and object are bound, the predicate is the variable.
_write_query_file(
    query_dir + "sUo.txt",
    sample1,
    "?predicate",
    lambda entry: "{} ?predicate {}".format(resolvePrefix(entry[0]), resolvePrefix(entry[2])),
)

# UUo: only the object is bound.
_write_query_file(
    query_dir + "UUo.txt",
    sample2,
    "?subject ?predicate",
    lambda entry: "?subject ?predicate {}".format(resolvePrefix(entry[2])),
)

# sUU: only the subject is bound.
_write_query_file(
    query_dir + "sUU.txt",
    sample3,
    "?predicate ?object",
    lambda entry: "{} ?predicate ?object".format(resolvePrefix(entry[0])),
)

print("Files created successfully")

Files created successfully
