In [14]:
import os
import sys
import argparse
import pickle
import math
import unicodedata
import pandas as pd

from fuzzywuzzy import fuzz
from nltk.tokenize.treebank import TreebankWordTokenizer
from nltk.corpus import stopwords

In [15]:
# arguments
index_entpath = "../indexes/entity_2M.pkl"
index_reachpath = "../indexes/reachability_2M.pkl"
index_namespath = "../indexes/names_2M.pkl"

In [16]:
tokenizer = TreebankWordTokenizer()
stopwords = set(stopwords.words('english'))

def tokenize_text(text):
    tokens = tokenizer.tokenize(text)
    return tokens

def www2fb(in_str):
    if in_str.startswith("www.freebase.com"):
        return 'fb:%s' % (in_str.split('www.freebase.com/')[-1].replace('/', '.'))
    return in_str

def get_index(index_path):
    print("loading index from: {}".format(index_path))
    with open(index_path, 'rb') as f:
        index = pickle.load(f)
    return index

def strip_accents(text):
    return ''.join(c for c in unicodedata.normalize('NFKD', text) if unicodedata.category(c) != 'Mn')


In [17]:
def find_ngrams(input_list, n):
    ngrams = zip(*[input_list[i:] for i in range(n)])
    return set(ngrams)

In [18]:
def calc_tf_idf(query, cand_ent_name, cand_ent_count, num_entities, index_ent):
    query_terms = tokenize_text(query)
    doc_tokens = tokenize_text(cand_ent_name)
    common_terms = set(query_terms).intersection(set(doc_tokens))

    # len_intersection = len(common_terms)
    # len_union = len(set(query_terms).union(set(doc_tokens)))
    # tf = len_intersection / len_union
    tf = math.log10(cand_ent_count + 1)
    k1 = 0.5
    k2 = 0.5
    total_idf = 0
    for term in common_terms:
        df = len(index_ent[term])
        idf = math.log10( (num_entities - df + k1) / (df + k2) )
        total_idf += idf
    return tf * total_idf

def pick_best_name(question, names_list):
    best_score = None
    best_name = None
    for name in names_list:
        score =  fuzz.ratio(name, question)
        if best_score == None or score > best_score:
            best_score = score
            best_name = name

    return best_name

In [19]:
index_ent = get_index(index_entpath)
index_names = get_index(index_namespath)
index_reach = get_index(index_reachpath)

loading index from: ../indexes/entity_2M.pkl
loading index from: ../indexes/names_2M.pkl
loading index from: ../indexes/reachability_2M.pkl


In [20]:
# type in the question
question = "where was sasha vujacic born?"

In [21]:
# FIXME: import methods to get query text, rel
query_text = "sasha vujacic"
pred_relation = www2fb("www.freebase.com/people/person/place_of_birth")
print(pred_relation)

fb:people.person.place_of_birth


In [22]:
query_tokens = tokenize_text(query_text)
print(query_tokens)
N = min(len(query_tokens), 3)
print(N)

['sasha', 'vujacic']
2


In [23]:
C = []  # candidate entities
for n in range(N, 0, -1):
    ngrams_set = find_ngrams(query_tokens, n)
    print("ngrams_set: {}".format(ngrams_set))
    for ngram_tuple in ngrams_set:
        ngram = " ".join(ngram_tuple)
        ngram = strip_accents(ngram)
        # unigram stopwords have too many candidates so just skip over
        if ngram in stopwords:
            continue
        print("ngram: {}".format(ngram))
        ## PROBLEM! - ngram doesnt exist in index - at test-2592 - KeyError: 'p.a.r.c.e. parce'
        try:
            cand_mids = index_ent[ngram]  # search entities
        except:
            continue
        C.extend(cand_mids)
        # print("C: {}".format(C))
    if (len(C) > 0):
        print("early termination...")
        break
    break
print(C)

ngrams_set: {('sasha', 'vujacic')}
ngram: sasha vujacic
early termination...
['fb:m.07f3jg']


In [24]:
C_pruned = []
for mid in set(C):
    if mid in index_reach.keys():  # PROBLEM: don't know why this may not exist??
        count_mid = C.count(mid)  # count number of times mid appeared in C
        C_pruned.append((mid, count_mid))
        if pred_relation in index_reach[mid]:
            count_mid = C.count(mid)  # count number of times mid appeared in C
            C_pruned.append((mid, count_mid))

print(C_pruned)

[('fb:m.07f3jg', 1), ('fb:m.07f3jg', 1)]


In [25]:
num_entities_fbsubset = 1959820  # 2M - 1959820 , 5M - 1972702
C_tfidf_pruned = []
for mid, count_mid in C_pruned:
    if mid in index_names.keys():
        cand_ent_name = pick_best_name(question, index_names[mid])
        tfidf = calc_tf_idf(query_text, cand_ent_name, count_mid, num_entities_fbsubset, index_ent)
        C_tfidf_pruned.append((mid, cand_ent_name, tfidf))
# print("C_tfidf_pruned[:10]: {}".format(C_tfidf_pruned[:10]))
print(C_tfidf_pruned)

C_tfidf_pruned.sort(key=lambda t: -t[2])
pred_ent, name_ent, score = C_tfidf_pruned[0]
print(pred_ent)
print(name_ent)

[('fb:m.07f3jg', 'sasha vujacic', 2.9542261202994435), ('fb:m.07f3jg', 'sasha vujacic', 2.9542261202994435)]
fb:m.07f3jg
sasha vujacic


In [27]:
# FIXME: store the Freebase graph with the name field
fb_path = "../indexes/fb_graph.pkl"
fb_graph = get_index(fb_path)

loading index from: ../indexes/fb_graph.pkl


In [40]:
def get_names(fb_graph, mids):
    names = []
    for mid in mids:
        names.extend( fb_graph[(mid, 'fb:type.object.name')] )
        names.extend( fb_graph[(mid, 'fb:common.topic.alias')] )
    names.sort(key = lambda s: -len(s))
    return names

In [42]:
# FIXME: lookup Freebase for object of (ent, rel)
result_mid = fb_graph[(pred_ent, pred_relation)]
result_mid = list(result_mid)
# print(result_mid)

# FIXME: lookup Freebase for the name predicate of that object
result = get_names(fb_graph, result_mid)[0]
print("Answer: {}".format(result))


Answer: Slovenia, Maribor
