In [41]:
import csv
from wikimapper import WikiMapper
import rdflib
from collections import Counter
import random

# Path of the local wikimapper index database. The file name suggests a
# Simple English Wikipedia dump dated 2023-05-01 (not verifiable from here).
WIKIMAPPER_INDEX_PATH = "index_simplewiki-20230501.db"

# Shared lookup object; lookup_column_cells() calls mapper.title_to_id() on it.
mapper = WikiMapper(WIKIMAPPER_INDEX_PATH)

def best_types(qs):
    """Count the Wikidata classes (``wdt:P31`` values) of a group of entities.

    The counting is done remotely: a federated SPARQL query is sent to
    ``https://query.wikidata.org/sparql`` through an empty ``rdflib.Graph``.

    Parameters
    ----------
    qs : iterable of str
        Entity identifiers that are valid after the ``wd:`` prefix
        (for example ``"Q42"``).

    Returns
    -------
    collections.Counter
        Maps each class identifier (the entity URI with the
        ``http://www.wikidata.org/entity/`` prefix removed) to the count the
        endpoint reports for that class. Empty when ``qs`` is empty.
    """
    values = ['wd:' + q for q in qs]
    if not values:
        # An empty VALUES block cannot match anything, so the answer is known
        # without a network round trip to the public endpoint.
        return Counter()

    query = """
        PREFIX wd: <http://www.wikidata.org/entity/>
        PREFIX wdt: <http://www.wikidata.org/prop/direct/>
        SELECT *
        WHERE {
          SERVICE <https://query.wikidata.org/sparql> {
            SELECT ?type (COUNT(DISTINCT *) as ?count) {
              VALUES ?s { %s }
              ?s wdt:P31 ?type .
            }
            GROUP BY (?type)
          }
        } 
        """ % ' '.join(values)
    # Each result row is (?type, ?count); strip the entity namespace from the
    # class URI and convert the count literal to a plain Python value.
    return Counter({t.replace('http://www.wikidata.org/entity/', ''):c.toPython()
                    for t,c in rdflib.Graph().query(query)})

def lookup_column_cells(fname):
    """Resolve the cells of every column of a CSV table via the wiki mapper.

    The first record of the file is treated as a header and skipped. Every
    distinct cell value has its spaces replaced by underscores and is passed
    to ``mapper.title_to_id``; truthy results are collected per column.

    Parameters
    ----------
    fname : str
        Path of the CSV file to read.

    Returns
    -------
    dict
        Zero-based column index -> set of identifiers returned by the mapper
        for that column. Columns in which no cell could be resolved are
        omitted.
    """
    # Collect the distinct values of each column row by row. Unlike
    # ``zip(*rows)`` this does not lose every column when the file contains a
    # blank line, and it does not truncate columns to the shortest row.
    col_values = {}
    # newline='' is what the csv module expects; the context manager makes
    # sure the handle is closed again.
    with open(fname, newline='') as table_file:
        reader = csv.reader(table_file)
        next(reader, None)  # skip the header record
        for row in reader:
            for ci, val in enumerate(row):
                col_values.setdefault(ci, set()).add(val)

    col_ids = {}
    for ci, values in col_values.items():
        wiki_ids = set()
        for val in values:
            i = mapper.title_to_id(val.replace(' ', '_'))
            if i:
                wiki_ids.add(i)
        if wiki_ids:
            col_ids[ci] = wiki_ids
    return col_ids

def get_constrained_property_qualifier(sub_classes, obj_classes, qlf_classes):
    """Yield (property, qualifier) pairs whose constraints fit three classes.

    A federated SPARQL query is sent to ``https://query.wikidata.org/sparql``.
    It looks for a property ``?p`` that carries all of these property
    constraints (``P2302``):

    * a subject type constraint (``Q21503250``), relation "instance of",
      on a class that one of ``sub_classes`` reaches via ``wdt:P279*``;
    * a value type constraint (``Q21510865``), relation "instance of",
      on a class that one of ``obj_classes`` reaches via ``wdt:P279*``;
    * an allowed qualifiers constraint (``Q21510851``) naming ``?q``, where
      ``?q`` itself has a value type constraint on a class that one of
      ``qlf_classes`` reaches via ``wdt:P279*``.

    Combinations in which two of the three column classes are identical are
    filtered out by the query.

    Parameters
    ----------
    sub_classes, obj_classes, qlf_classes : iterable of str
        Class identifiers that are valid after the ``wd:`` prefix.

    Yields
    ------
    tuple of (str, str)
        ``(p, q)`` with the ``http://www.wikidata.org/entity/`` prefix removed
        from both. Nothing is yielded if any of the three inputs is empty.
    """
    sub_values = ' '.join('wd:'+c for c in sub_classes)
    obj_values = ' '.join('wd:'+c for c in obj_classes)
    qlf_values = ' '.join('wd:'+c for c in qlf_classes)

    if not (sub_values and obj_values and qlf_values):
        # An empty VALUES block makes the whole pattern unsatisfiable, so do
        # not bother the remote endpoint with a query that cannot match.
        return

    query = """
    PREFIX wd: <http://www.wikidata.org/entity/>
    PREFIX wdt: <http://www.wikidata.org/prop/direct/>
    PREFIX p: <http://www.wikidata.org/prop/>
    PREFIX ps: <http://www.wikidata.org/prop/statement/>
    PREFIX pq: <http://www.wikidata.org/prop/qualifier/>

    SELECT ?p ?q ?sub_column_class ?obj_column_class ?qlf_column_class WHERE {
      SERVICE <https://query.wikidata.org/sparql> {
        SELECT DISTINCT ?p ?q ?sub_column_class ?obj_column_class ?qlf_column_class WHERE {
          VALUES ?sub_column_class { %(sub_classes)s }
          VALUES ?obj_column_class { %(obj_classes)s }
          VALUES ?qlf_column_class { %(qlf_classes)s }
          FILTER ((?sub_column_class != ?obj_column_class) 
              && (?sub_column_class != ?qlf_column_class) 
              && (?qlf_column_class != ?obj_column_class))
          ?p p:P2302 [ # property constraint
            ps:P2302 wd:Q21503250; # subject type constraint
            pq:P2309 wd:Q21503252; # relation: instance of
            pq:P2308 ?sub_constraint_class
          ], [
            ps:P2302 wd:Q21510865; # value type constraint
            pq:P2309 wd:Q21503252; # relation: instance of
            pq:P2308 ?obj_constraint_class
          ], [ 
            ps:P2302 wd:Q21510851; # allowed qualifiers
            pq:P2306 ?q # property
          ].
          ?q p:P2302 [ # property constraint
            ps:P2302 wd:Q21510865; # value type constraint
            pq:P2309 wd:Q21503252; # relation: instance of
            pq:P2308 ?qlf_constraint_class
          ].
          ?sub_column_class wdt:P279* ?sub_constraint_class .
          ?obj_column_class wdt:P279* ?obj_constraint_class .
          ?qlf_column_class wdt:P279* ?qlf_constraint_class .
        }
      }
    }
    """ % {'sub_classes': sub_values, 'obj_classes': obj_values, 'qlf_classes': qlf_values}
    # Only the first two selected variables are needed; the column classes
    # that produced the match are discarded.
    for p, q, *_ in rdflib.Graph().query(query):
        p = p.replace('http://www.wikidata.org/entity/', '')
        q = q.replace('http://www.wikidata.org/entity/', '')
        yield p, q

def _load_prop_labels(path):
    """Read a tab-separated file into a dict of first column -> rest of line.

    The first line is treated as a header and skipped. Each remaining line is
    stripped and split at its first tab. Lines that contain no tab after
    stripping (blank lines, rows with an empty second column) are ignored
    instead of aborting the whole load with a ``ValueError``.
    """
    labels = {}
    with open(path) as label_file:
        next(label_file, None)  # skip the header line
        for line in label_file:
            key, tab, label = line.strip().partition('\t')
            if tab:
                labels[key] = label
    return labels


# Looked up with the property / qualifier identifiers when writing results.
prop_label = _load_prop_labels('wikidata-prop-label.tsv')

# How many of the most frequent classes of each column are passed on to the
# property/qualifier search.
N_BEST_TYPES = 1


def _top_classes(type_counts, n):
    """Return the identifiers of the ``n`` most common entries of a Counter."""
    return tuple(cls for cls, _ in type_counts.most_common(n))


# Read the task list once and close the file again. The header line is kept
# verbatim because it is copied to the output file.
with open('task.csv', newline='') as task_file:
    header_line = task_file.readline().strip()
    tasks = list(csv.reader(task_file))

with open('baseline.csv', 'w', newline='') as fw:
    fw.write(header_line + '\n')
    # csv.writer quotes fields that contain commas or quotes (labels may),
    # which the former manual sep=',' join did not. '\n' keeps the line
    # endings the same as the header line above.
    writer = csv.writer(fw, lineterminator='\n')

    for task in tasks:
        if not task:
            continue  # blank line in task.csv
        fname, sub_col, obj_col, qlf_col, *_ = task
        print(fname)

        sub_col, obj_col, qlf_col = int(sub_col), int(obj_col), int(qlf_col)
        col_ids = lookup_column_cells('tables/' + fname)

        # Get classes
        sub_best = best_types(col_ids.get(sub_col, []))
        obj_best = best_types(col_ids.get(obj_col, []))
        qlf_best = best_types(col_ids.get(qlf_col, []))

        # Without a class for each of the three columns there is nothing to
        # search for; the task gets no output row.
        if not (sub_best and obj_best and qlf_best):
            continue

        candidates = get_constrained_property_qualifier(
            _top_classes(sub_best, N_BEST_TYPES),
            _top_classes(obj_best, N_BEST_TYPES),
            _top_classes(qlf_best, N_BEST_TYPES),
        )
        # Only the first candidate (in whatever order the query returns them)
        # is written for a task.
        first = next(iter(candidates), None)
        if first is None:
            continue
        p, q = first

        # p[1:] / q[1:] drop the leading letter of the identifier. A missing
        # label is written as an empty field rather than the text "None".
        writer.writerow([
            fname, sub_col, obj_col, qlf_col,
            p[1:], prop_label.get(p, ''), q[1:], prop_label.get(q, ''),
        ])
        fw.flush()

table_1.csv
table_2.csv
table_3.csv
table_4.csv
table_5.csv
table_6.csv
table_7.csv
table_8.csv
table_9.csv
table_10.csv
table_11.csv
table_12.csv
table_13.csv
table_14.csv
table_15.csv
table_16.csv
table_17.csv
table_18.csv
table_19.csv
table_20.csv
table_21.csv
table_22.csv
table_23.csv
table_24.csv
table_25.csv
table_26.csv
table_27.csv
table_28.csv
table_29.csv
table_30.csv
table_31.csv
table_32.csv
table_33.csv
table_34.csv
table_35.csv
table_36.csv
table_37.csv
table_38.csv
table_39.csv
table_40.csv
table_41.csv
table_42.csv
table_43.csv
table_44.csv
table_45.csv
table_46.csv
table_47.csv
table_48.csv
table_49.csv
table_50.csv
table_51.csv
table_52.csv
table_53.csv
table_54.csv
table_55.csv
table_56.csv
table_57.csv
table_58.csv
table_59.csv
table_60.csv
table_61.csv
table_62.csv
table_63.csv
table_64.csv
table_65.csv
table_66.csv
table_67.csv
table_68.csv
table_69.csv
table_70.csv
table_71.csv
table_72.csv
table_73.csv
table_74.csv
table_75.csv
table_76.csv
table_77.csv
table_78

In [42]:
# Shell escape: run the local evaluate.py script on solution.csv and
# baseline.csv (the latter is written by the previous cell). The number shown
# in the output is presumably the resulting score - confirm in evaluate.py.
!python evaluate.py solution.csv baseline.csv

0.3125
