In [1]:
import requests
import re
from collections import *
import bs4

In [2]:
# Freebase mids to Wikidata
# Build the lookup from Freebase topic URLs ('www.freebase.com/...') to Wikidata ids.
qid_for_mid = {}
with open('mid_to_qid.tsv', 'rt') as mapping_file:
    for row in mapping_file:
        # Each row is "<mid> <qid>" separated by whitespace; the mid's dots
        # become slashes so it matches the URL-style keys used below.
        freebase_mid, wikidata_qid = row.strip().split()
        topic_url = 'www.freebase.com/' + freebase_mid.replace('.', '/')
        qid_for_mid[topic_url] = wikidata_qid

# Hand-added entries for the two gender topics (these win over the TSV file).
qid_for_mid.update({
    'www.freebase.com/m/05zppz': 'Q6581097',  # sex: male
    'www.freebase.com/m/02zsn': 'Q6581072',  # sex: female
})

print('topic mapping with {} matches'.format(len(qid_for_mid)))

topic mapping with 3827703 matches


In [3]:
# Freebase properties mapping

def get_reverse(name, timeout=30):
    """Look up the reverse Freebase property of ``name`` via the Wayback Machine.

    The Wayback availability API is asked for the closest archived snapshot of
    the property's freebase.com page; that snapshot is then scraped for the
    first ``<td class="rev_predicate">`` cell and the text of its first link.

    Args:
        name: Freebase property, with or without the ``www.freebase.com``
            prefix (the prefix is stripped before building the page URL).
        timeout: Seconds to wait on each HTTP request. Without a timeout a
            single stalled connection would block the caller forever.

    Returns:
        ``'www.freebase.com'`` followed by the link text of the reverse
        predicate, or ``None`` when there is no available snapshot, the page
        has no ``rev_predicate`` cell, or that cell contains no link.

    Raises:
        Whatever ``requests`` raises on network failure or timeout; these are
        deliberately not swallowed so an incomplete mapping is not mistaken
        for "no reverse property".
    """
    path = name.replace('www.freebase.com', '')
    # very hacky way to retrieve the data
    availability = requests.get(
        'http://archive.org/wayback/available',
        {'url': 'http://www.freebase.com:80' + path},
        timeout=timeout,
    ).json()

    # Tolerate a payload without a usable snapshot instead of raising KeyError.
    closest = availability.get('archived_snapshots', {}).get('closest')
    if not closest or not closest.get('available'):
        return None

    page = requests.get(closest['url'], timeout=timeout)
    soup = bs4.BeautifulSoup(page.content, 'html.parser')
    cell = soup.find('td', class_='rev_predicate')
    if cell is None:
        return None

    # A cell without any <a> used to raise IndexError; treat it as "not found".
    anchor = cell.find('a')
    if anchor is None:
        return None
    return 'www.freebase.com' + anchor.text

# Maps a Freebase property URL to 'P<number>' (direct mapping listed on the wiki
# page) or 'R<number>' (the property is the reverse of a mapped one).
pid_for_prop = {}
wikitext = requests.get('https://www.wikidata.org/wiki/Wikidata:WikiProject_Freebase/Mapping?action=raw').text
# Only the text before the '(/key/ namespace)' marker is scanned for mappings.
(ns_part, key_part) = wikitext.split('(/key/ namespace)')

# One wiki table row: a freebase.com URL cell followed by the next cell's text.
row_pattern = re.compile(r'\|\-\n *\| *https?:\/\/www\.freebase\.com\/([a-zA-Z0-9\/_\-]+) *\n\| *(.*) *\n')
# A {{P|123}} template at the start of the second cell.
pid_pattern = re.compile(r'\{\{[pP]\|(\d+)\}\}')

for row in row_pattern.finditer(ns_part):
    freebase_path, wikidata_cell = row.groups()
    template = pid_pattern.match(wikidata_cell)
    if template is None:
        continue
    number = template.group(1)
    prop = 'www.freebase.com/' + freebase_path
    pid_for_prop[prop] = 'P' + number
    # Register the reverse property too, unless it already has an entry.
    inverse = get_reverse(prop)
    if inverse is None or inverse in pid_for_prop:
        continue
    pid_for_prop[inverse] = 'R' + number

print('property mappings with {} matches'.format(len(pid_for_prop)))

property mappings with 404 matches


In [5]:
def map_dataset(file_name):
    """Lazily translate a tab-separated question file from Freebase to Wikidata ids.

    Every non-blank line must hold ``subject<TAB>predicate<TAB>object<TAB>question``.
    The line is split on the first three tabs only, so a question that itself
    contains a tab is kept whole instead of aborting the run.

    A row is yielded only when its predicate is in the module-level
    ``pid_for_prop`` and both subject and object are in ``qid_for_mid``.
    Rows with an unmapped predicate are tallied; rows with a mapped predicate
    but an unmapped subject/object are dropped without being tallied.

    Args:
        file_name: Path of the input file.

    Yields:
        ``(subject_qid, property_id, object_qid, question)`` tuples. The
        question is passed through untouched, including its trailing line
        terminator when the input line had one.

    Raises:
        ValueError: If a non-blank line has fewer than four fields.

    Side effects:
        Once the generator is exhausted, prints the converted/total counts and
        the unmapped predicates seen more than 100 times.
    """
    before_conv_count = 0
    after_conv_count = 0
    missed_predicates = defaultdict(int)
    with open(file_name, 'rt') as fp:
        for line in fp:
            # A blank (e.g. trailing) line is not a question; skip it rather
            # than failing on the tuple unpacking below.
            if not line.strip():
                continue
            before_conv_count += 1
            # maxsplit=3 keeps any tab inside the question text intact.
            (subj, pred, obj, question) = line.split('\t', 3)
            if pred not in pid_for_prop:
                missed_predicates[pred] += 1
            elif subj in qid_for_mid and obj in qid_for_mid:
                after_conv_count += 1
                yield (qid_for_mid[subj], pid_for_prop[pred], qid_for_mid[obj], question)
    print('{} questions over {} have been converted for file {}'.format(after_conv_count, before_conv_count, file_name))
    print(dict((s,c) for s,c in missed_predicates.items() if c > 100))

def convert_dataset(input_file, output_file):
    """Write the Wikidata version of ``input_file`` to ``output_file``.

    Each tuple produced by ``map_dataset(input_file)`` becomes one
    tab-separated record. No line terminator is appended here: the question
    field coming from ``map_dataset`` is written exactly as received.
    """
    with open(output_file, 'wt') as out:
        out.writelines(
            '{}\t{}\t{}\t{}'.format(subj, pred, obj, question)
            for (subj, pred, obj, question) in map_dataset(input_file)
        )

# Convert the three SimpleQuestions files (train, test, valid), in that order.
for source_path, target_path in [
    ('SimpleQuestions_v2/annotated_fb_data_train.txt', 'annotated_wd_data_train.txt'),
    ('SimpleQuestions_v2/annotated_fb_data_test.txt', 'annotated_wd_data_test.txt'),
    ('SimpleQuestions_v2/annotated_fb_data_valid.txt', 'annotated_wd_data_valid.txt'),
]:
    convert_dataset(source_path, target_path)

34374 questions over 75910 have been converted for file SimpleQuestions_v2/annotated_fb_data_train.txt
{'www.freebase.com/medicine/manufactured_drug_form/color': 103, 'www.freebase.com/music/artist/track': 791, 'www.freebase.com/common/topic/notable_types': 1610, 'www.freebase.com/music/album/album_content_type': 892, 'www.freebase.com/education/educational_institution/colors': 177, 'www.freebase.com/business/industry/companies': 107, 'www.freebase.com/music/release/track': 732, 'www.freebase.com/music/release/region': 761, 'www.freebase.com/medicine/drug_formulation/legal_status': 159, 'www.freebase.com/cvg/game_version/publisher': 153, 'www.freebase.com/medicine/drug_formulation/active_ingredient_moieties': 159, 'www.freebase.com/cvg/game_version/platform': 163, 'www.freebase.com/music/release/track_list': 679, 'www.freebase.com/music/composition/form': 208, 'www.freebase.com/book/book_edition/author_editor': 143, 'www.freebase.com/music/single/versions': 113, 'www.freebase.com/book/