In [29]:
import csv
from wikimapper import WikiMapper
import rdflib
from collections import Counter

# Shared WikiMapper instance backed by a local index database file.
# lookup_column_cells() uses it to turn page titles found in table cells
# into ids, which are later used as Wikidata entity ids in SPARQL queries.
mapper = WikiMapper("index_simplewiki-20230501.db")

def best_types(qs):
    """Count how often each Wikidata class occurs among the given entities.

    For every entity id in ``qs`` the ``wdt:P31`` (instance of) values are
    fetched from the public Wikidata SPARQL endpoint through a federated
    ``SERVICE`` query, grouped by class.

    Args:
        qs: Iterable of Wikidata entity ids without prefix (e.g. ``"Q5"``).

    Returns:
        A ``Counter`` mapping class id (entity URI prefix removed) to the
        number of distinct matches reported by the endpoint. The counter is
        empty when ``qs`` is empty.
    """
    # Materialise once so generators and sets are handled uniformly.
    entity_ids = list(qs)
    if not entity_ids:
        # An empty VALUES block cannot match anything; skip the remote call.
        return Counter()

    values = ' '.join('wd:' + q for q in entity_ids)
    query = """
        PREFIX wd: <http://www.wikidata.org/entity/>
        PREFIX wdt: <http://www.wikidata.org/prop/direct/>
        SELECT *
        WHERE {
          SERVICE <https://query.wikidata.org/sparql> {
            SELECT ?type (COUNT(DISTINCT *) as ?count) {
              VALUES ?s { %s }
              ?s wdt:P31 ?type .
            }
            GROUP BY (?type)
          }
        } 
        """ % values

    entity_prefix = 'http://www.wikidata.org/entity/'
    counts = Counter()
    for type_uri, count in rdflib.Graph().query(query):
        # Result rows hold rdflib terms: strip the URI prefix from the class
        # and convert the count literal to a plain Python number.
        counts[type_uri.replace(entity_prefix, '')] = count.toPython()
    return counts

def lookup_column_cells(fname):
    """Map each column of a CSV table to the ids found for its cell values.

    The first CSV record is treated as a header and skipped. Every distinct
    cell value of a column is looked up with ``mapper.title_to_id`` after
    replacing spaces with underscores.

    Args:
        fname: Path of the CSV file to read.

    Returns:
        A dict mapping the 0-based column index to the set of ids found in
        that column. Columns without any match are omitted.
    """
    # newline='' is what the csv module expects for file objects; the
    # ``with`` block makes sure the handle is closed again.
    with open(fname, newline='') as table_file:
        reader = csv.reader(table_file)
        next(reader, None)  # skip the header record
        # csv.reader yields [] for blank lines. A single empty row would make
        # zip(*rows) produce no columns at all, so drop such rows.
        rows = [row for row in reader if row]

    col_ids = {}
    # NOTE: zip() stops at the shortest row, so columns that are missing in
    # some row are ignored for the whole table.
    for ci, col in enumerate(zip(*rows)):
        wiki_ids = set()
        for val in set(col):  # look up every distinct value only once
            wiki_id = mapper.title_to_id(val.replace(' ', '_'))
            if wiki_id:
                wiki_ids.add(wiki_id)
        if wiki_ids:
            col_ids[ci] = wiki_ids
    return col_ids

def get_constrained_property_qualifier(column_classes):
    """Yield (property, qualifier) pairs whose constraints fit the classes.

    A federated query against the public Wikidata SPARQL endpoint looks for
    a property ``?p`` that has a subject type constraint, a value type
    constraint and an allowed-qualifiers constraint naming ``?q``, where
    ``?q`` itself has a value type constraint. Three pairwise different
    classes from ``column_classes`` must each be equal to, or a subclass
    (``wdt:P279*``) of, the subject, value and qualifier constraint class
    respectively.

    Args:
        column_classes: Iterable of Wikidata class ids without prefix
            (e.g. ``"Q5"``).

    Yields:
        ``(p, q)`` tuples of ids with the Wikidata entity URI prefix removed.
        Nothing is yielded when fewer than three distinct classes are given.
    """
    # Drop duplicates but keep the caller's order.
    distinct_classes = list(dict.fromkeys(column_classes))
    if len(distinct_classes) < 3:
        # The FILTER below demands three pairwise different classes, so the
        # query cannot match; avoid the pointless remote round trip.
        return

    column_classes = ' '.join('wd:' + c for c in distinct_classes)
    query = """
    PREFIX wd: <http://www.wikidata.org/entity/>
    PREFIX wdt: <http://www.wikidata.org/prop/direct/>
    PREFIX p: <http://www.wikidata.org/prop/>
    PREFIX ps: <http://www.wikidata.org/prop/statement/>
    PREFIX pq: <http://www.wikidata.org/prop/qualifier/>

    SELECT ?p ?q ?sub_column_class ?obj_column_class ?qual_column_class WHERE {
      SERVICE <https://query.wikidata.org/sparql> {
        SELECT DISTINCT ?p ?q ?sub_column_class ?obj_column_class ?qual_column_class WHERE {
          VALUES ?sub_column_class { %(column_classes)s }
          VALUES ?obj_column_class { %(column_classes)s }
          VALUES ?qual_column_class { %(column_classes)s }
          FILTER ((?sub_column_class != ?obj_column_class)
              && (?sub_column_class != ?qual_column_class)
              && (?qual_column_class != ?obj_column_class))
          ?p p:P2302 [ # property constraint
            ps:P2302 wd:Q21503250; # subject type constraint
            pq:P2309 wd:Q21503252; # relation: instance of
            pq:P2308 ?sub_constraint_class
          ], [
            ps:P2302 wd:Q21510865; # value type constraint
            pq:P2309 wd:Q21503252; # relation: instance of
            pq:P2308 ?obj_constraint_class
          ], [
            ps:P2302 wd:Q21510851; # allowed qualifiers
            pq:P2306 ?q # property
          ].
          ?q p:P2302 [ # property constraint
            ps:P2302 wd:Q21510865; # value type constraint
            pq:P2309 wd:Q21503252; # relation: instance of
            pq:P2308 ?qual_constraint_class
          ].
          ?sub_column_class wdt:P279* ?sub_constraint_class .
          ?obj_column_class wdt:P279* ?obj_constraint_class .
          ?qual_column_class wdt:P279* ?qual_constraint_class .
        }
      }
    }
    """ % {'column_classes': column_classes}

    entity_prefix = 'http://www.wikidata.org/entity/'
    # Only the first two result columns (?p, ?q) are needed by callers.
    for p, q, *_ in rdflib.Graph().query(query):
        yield p.replace(entity_prefix, ''), q.replace(entity_prefix, '')

# Property id -> label lookup, read from a tab-separated file whose first
# line is a header. Lines without a tab (e.g. blank lines) are skipped
# instead of aborting the whole load.
prop_label = {}
with open('wikidata-prop-label.tsv') as label_file:
    next(label_file, None)  # skip the header line
    for line in label_file:
        prop_id, tab, label = line.strip().partition('\t')
        if tab:
            prop_label[prop_id] = label

# Number of most frequent classes kept per column.
N_BEST_TYPES = 1

# Read the task list once: the raw header line is copied to the output and
# the remaining records are the tasks. Blank records are ignored.
with open('task.csv') as task_file:
    header_line = task_file.readline().strip()
    tasks = [task for task in csv.reader(task_file) if task]

with open('baseline.csv', 'w') as fw:
    print(header_line, file=fw)
    # csv.writer quotes fields that contain commas or quotes (labels may),
    # which plain print(sep=',') did not. lineterminator='\n' keeps the line
    # endings that print() produced.
    writer = csv.writer(fw, lineterminator='\n')

    for task in tasks:
        fname, sub_col, obj_col, qual_col, *_ = task
        print(fname)  # progress output

        sub_col, obj_col, qual_col = int(sub_col), int(obj_col), int(qual_col)
        col_ids = lookup_column_cells('tables/' + fname)

        # Get classes
        try:
            column_classes = set()
            for col in [sub_col, obj_col, qual_col]:
                types = best_types(col_ids.get(col, []))
                # Unpacking an empty zip() raises ValueError when no class
                # was found for the column; such tasks are skipped.
                best, _ = zip(*types.most_common(N_BEST_TYPES))
                column_classes |= set(best)
        except ValueError:
            continue

        # Only the first (property, qualifier) candidate is written.
        first_match = next(iter(get_constrained_property_qualifier(column_classes)), None)
        if first_match is None:
            continue
        p, q = first_match
        # Unknown labels are written as empty fields rather than "None".
        pLabel = prop_label.get(p, '')
        qLabel = prop_label.get(q, '')
        writer.writerow([fname, sub_col, obj_col, qual_col, p, pLabel, q, qLabel])

table_1.csv
wd:Q5 wd:Q11424 wd:Q19020
table_2.csv
wd:Q5 wd:Q11424 wd:Q19020
table_3.csv
table_4.csv
wd:Q5 wd:Q652965 wd:Q13027888
table_5.csv
table_6.csv
wd:Q5 wd:Q11424 wd:Q19020
table_7.csv
wd:Q5 wd:Q11424 wd:Q19020



KeyboardInterrupt

