In [4]:
import os
import sys
import argparse
import pickle
import math
import unicodedata
import pandas as pd
import numpy as np

from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from nltk.tokenize.treebank import TreebankWordTokenizer
from nltk.corpus import stopwords

In [5]:
# arguments
index_entpath = "../indexes/entity_2M.pkl"
index_reachpath = "../indexes/reachability_2M.pkl"
index_namespath = "../indexes/names_2M.pkl"
train_ent_resultpath = "../entity_detection/query-text/train.txt"
valid_ent_resultpath = "../entity_detection/query-text/valid.txt"
test_ent_resultpath = "../entity_detection/query-text/test.txt"
gold_ent_resultpath = "../entity_detection/gold-query-text/valid.txt"
rel_resultpath = "../relation_prediction/results/topk-retrieval-valid-hits-3.txt"
outpath = "./tmp/results"

In [6]:
tokenizer = TreebankWordTokenizer()
stopwords = set(stopwords.words('english'))

def tokenize_text(text):
    tokens = tokenizer.tokenize(text)
    return tokens

def www2fb(in_str):
    if in_str.startswith("www.freebase.com"):
        return 'fb:%s' % (in_str.split('www.freebase.com/')[-1].replace('/', '.'))
    return in_str

def get_index(index_path):
    print("loading index from: {}".format(index_path))
    with open(index_path, 'rb') as f:
        index = pickle.load(f)
    return index

def strip_accents(text):
    return ''.join(c for c in unicodedata.normalize('NFKD', text) if unicodedata.category(c) != 'Mn')


In [59]:
def get_query_texts(ent_resultpath):
    print("getting query text...")
    lineids = []
    id2query = {}
    notfound = 0
    with open(ent_resultpath, 'r') as f:
        for line in f:
            items = line.strip().split(" %%%% ")
            try:
                lineid = items[0].strip()
                queries = items[1:]
                # mid = items[2].strip()
            except:
                # print("ERROR: line does not have >2 items  -->  {}".format(line.strip()))
                notfound += 1
                continue
            # print("{}   -   {}".format(lineid, query))
            lineids.append(lineid)
            id2query[lineid] = queries
    print("notfound (empty query text): {}".format(notfound))
    return lineids, id2query

def get_relations(rel_resultpath):
    print("getting relations...")
    lineids = []
    id2rels = {}
    with open(rel_resultpath, 'r') as f:
        for line in f:
            items = line.strip().split(" %%%% ")
            lineid = items[0].strip()
            rel = www2fb(items[1].strip())
            label = items[2].strip()
            score = items[3].strip()
            # print("{}   -   {}".format(lineid, rel))
            if lineid in id2rels.keys():
                id2rels[lineid].append( (rel, label, score) )
            else:
                id2rels[lineid] = [(rel, label, score)]
                lineids.append(lineid)
    return lineids, id2rels

In [60]:
def find_ngrams(input_list, n):
    ngrams = zip(*[input_list[i:] for i in range(n)])
    return set(ngrams)

In [61]:
def pick_best_name(question, names_list):
    best_score = None
    best_name = None
    for name in names_list:
        score =  fuzz.ratio(name, question)
        if best_score == None or score > best_score:
            best_score = score
            best_name = name

    return best_name

In [63]:
rel_lineids, id2rels = get_relations(rel_resultpath)

valid_ent_lineids, valid_id2queries = get_query_texts(valid_ent_resultpath)  # ent_lineids may have some examples missing
test_ent_lineids, test_id2queries = get_query_texts(test_ent_resultpath)

gold_ent_lineids, id2gold_query = get_query_text(gold_ent_resultpath)  # ent_lineids may have some examples missing

getting relations...
getting query text...
notfound (empty query text): 0
getting query text...
notfound (empty query text): 0
getting query text...
notfound (empty query text): 0


In [64]:
def get_questions(datapath):
    print("getting questions...")
    id2question = {}
    with open(datapath, 'r') as f:
        for line in f:
            items = line.strip().split("\t")
            lineid = items[0].strip()
            sub = items[1].strip()
            pred = items[2].strip()
            obj = items[3].strip()
            question = items[4].strip()
            id2question[lineid] = (sub, pred, question)
    return id2question

datapath = "../data/SimpleQuestions_v2_modified/all.txt"
id2question = get_questions(datapath)
print(len(id2question))
print(id2question['valid-1'])

getting questions...
108442
('fb:m.0f3xg_', 'fb:symbols.namesake.named_after', 'Who was the trump ocean club international hotel and tower named after')


In [67]:
print(len(valid_id2queries))
print(len(test_id2queries))
print(len(id2rels))

10845
21687
10845


In [20]:
num_entities_fbsubset = 200000  # 2M - 1959820 , 5M - 1972702
index_ent = get_index(index_entpath)
index_reach = get_index(index_reachpath)
index_names = get_index(index_namespath)

loading index from: ../indexes/entity_2M.pkl
loading index from: ../indexes/reachability_2M.pkl
loading index from: ../indexes/names_2M.pkl


In [22]:
def calc_tf_idf(question, query, cand_ent_name, cand_ent_count, num_entities, index_ent):
    query_terms = tokenize_text(cand_ent_name)
    doc_tokens = tokenize_text(question)
    common_terms = set(query_terms).intersection(set(doc_tokens))

    # len_intersection = len(common_terms)
    # len_union = len(set(query_terms).union(set(doc_tokens)))
    # tf = len_intersection / len_union
    tf = math.log10(cand_ent_count + 1)
    k1 = 0.5
    k2 = 0.5
    total_idf = 0
    for term in common_terms:
        df = len(index_ent[term])
        idf = math.log10( (num_entities + k1) / (df + k2) )
        total_idf += idf
    return tf * total_idf

def calc_idf(question, cand_ent_name, index_ent):
    query_terms = tokenize_text(cand_ent_name)
    doc_tokens = tokenize_text(question)
    common_terms = set(query_terms).intersection(set(doc_tokens))
    fix_terms = 80000
    total_idf = 0
    for term in common_terms:
        df = len(index_ent[term])
        if df > fix_terms:
            continue # too common term
        idf = math.log10( (fix_terms + 1) / (df + 1) )
        total_idf += idf
    return total_idf

In [36]:
import random
sample_lineids = random.sample(rel_lineids, 500)
sample_lineids[:10]

['valid-4249',
 'valid-2729',
 'valid-3909',
 'valid-4167',
 'valid-3363',
 'valid-3239',
 'valid-6960',
 'valid-4896',
 'valid-2431',
 'valid-3051']

In [47]:
def contains_rel(mid, rels, index_reach):
    found = False
    for rel in rels:
        if rel in index_reach[mid]:
            found = True
            break
    return found

In [68]:
with open('linking_lr.pkl', 'rb') as f:
    clf = pickle.load(f)

# features_order = [idf, length_name, length_query, length_question, pquer, pques, squer, sques, tf]

In [69]:
instance = np.array([[1, 1, 1, 1, 1, 1, 1, 1, 1]])
clf.predict_proba(instance)
# clf.predict(instance)

array([[ 0.31536798,  0.68463202]])

In [7]:
with open('df_valid_linking.pkl', 'rb') as f:
    df = pickle.load(f)

In [8]:
print(df.head())
print(df['true_label'].value_counts())
print(1-df['true_label'].mean())
y = df['name_match_label']
print(y.value_counts())
print(1-y.mean())
print(y.head())
# X = df[["idf", "length_name", "length_query", "length_question", "pquer", "pques", "squer", "sques", "tf"]]
fts = ["idf", "length_name", "length_query", "length_question", "sques", "tf"]
X = df[fts]
#print(X.head())                                                              

from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(C=0.01)
lr.fit(X, y)      

print(lr.score(X, y))
print(lr.coef_)

         idf  length_name  length_query  length_question       lineid  \
0   8.464138            5             7               12  valid-10845   
1   8.464138            5             7               12  valid-10845   
2   8.464138            8             7               12  valid-10845   
3  12.175283            7             7               12  valid-10845   
4   8.464138            5             7               12  valid-10845   

   name_match_label  pquer  pques         query  squer  sques  tf  true_label  
0                 0   0.89   0.89  billy corgan   0.86   0.67   2           0  
1                 0   0.89   0.89  billy corgan   0.86   0.67   2           0  
2                 0   0.76   0.77  billy corgan   0.75   0.67   2           0  
3                 1   1.00   1.00  billy corgan   1.00   0.79   5           1  
4                 0   0.89   0.89  billy corgan   0.86   0.67   2           0  
0    3991475
1      10254
Name: true_label, dtype: int64
0.9974376075941175
0    

In [1]:
id2mids = {}
HITS_TOP_ENTITIES = 20
sample_lineids = test_ent_lineids[:10]
for i, lineid in enumerate(sample_lineids):
    if lineid not in test_ent_lineids:
        notfound_ent += 1
        continue

    if i % 100 == 0:
        print("line {}".format(i))

    truth_mid, truth_rel, question = id2question[lineid]
    queries = test_id2queries[lineid]
#     try:
#         queries = id2gold_query[lineid.replace('valid', 'val')]    
#     except:
#         queries = [ id2question[lineid][2] ]
#     rels = [r[0] for r in id2rels[lineid]]
    
    C = []
    C_pruned = []
    C_tfidf_pruned = []
    
    for query in queries:   
        query_text = query.lower()  # lowercase the query
        query_tokens = tokenize_text(query_text)
        N = min(len(query_tokens), 3)
        # print("lineid: {}, query_text: {}, relation: {}".format(lineid, query_text, pred_relation))
        # print("query_tokens: {}".format(query_tokens))
        for n in range(N, 0, -1):
            ngrams_set = find_ngrams(query_tokens, n)
            # print("ngrams_set: {}".format(ngrams_set))
            for ngram_tuple in ngrams_set:
                ngram = " ".join(ngram_tuple)
                ngram = strip_accents(ngram)
                # unigram stopwords have too many candidates so just skip over
                if ngram in stopwords:
                    continue
                # print("ngram: {}".format(ngram))
                try:
                    cand_mids = index_ent[ngram]  # search entities
                except:
                    continue
                C.extend(cand_mids)
                # print("C: {}".format(C))
            if (len(C) > 0):
                # print("early termination...")
                break
        # print("C[:5]: {}".format(C[:5]))

        for mid in set(C):
            if mid in index_reach.keys():  # PROBLEM: don't know why this may not exist??
                count_mid = C.count(mid)  # count number of times mid appeared in C
                C_pruned.append((mid, count_mid))
#                 if contains_rel(mid, rels, index_reach):
#                     count_mid = C.count(mid)  # count number of times mid appeared in C
#                     C_pruned.append((mid, count_mid))

        for mid, count_mid in C_pruned:
            if mid in index_names.keys():
                cand_ent_name = pick_best_name(question, index_names[mid])                
                length_name = len(tokenize_text(cand_ent_name))
                length_question = len(tokenize_text(question))
                length_query = len(query_tokens)
                tf = count_mid
                idf = calc_idf(question, cand_ent_name, index_ent)
                sques = fuzz.ratio(cand_ent_name, question)/100.0
                squer = fuzz.ratio(cand_ent_name, query_text)/100.0
                pques = fuzz.partial_ratio(cand_ent_name, question)/100.0
                pquer = fuzz.partial_ratio(cand_ent_name, query_text)/100.0
                
                instance = np.array([[squer, sques]])
                score = lr.predict_proba(instance)[0][1] # get prob of pos-class for example 1

                C_tfidf_pruned.append((mid, cand_ent_name, score))
        # print("C_tfidf_pruned[:10]: {}".format(C_tfidf_pruned[:10]))

    if len(C_tfidf_pruned) == 0:
                continue

    C_tfidf_pruned.sort(key=lambda t: -t[2])
    cand_mids = C_tfidf_pruned[:HITS_TOP_ENTITIES]

    id2mids[lineid] = cand_mids


# evaluate on sample line ids
found = 0
notfound = 0
notfound_lineids = []

for lineid in sample_lineids:
    if lineid not in id2mids.keys():
        notfound_lineids.append( lineid )
        notfound += 1
        continue

    found_this_example = False
    truth_mid, truth_rel, question = id2question[lineid]
#     print(id2question[lineid])
    for (mid, mid_name, mid_score) in id2mids[lineid]:
        if mid == truth_mid:
                found_this_example = True
                break


    if found_this_example:
        found += 1
    else:
        notfound_lineids.append( lineid )
        notfound += 1    

retrieval = found / (found + notfound) * 100.0
print("retrieval: {}\t found: {}\tnotfound: {}".format(retrieval, found, notfound))
print("-" * 40)


print("done")

NameError: name 'test_ent_lineids' is not defined

In [109]:
print(id2question['test-1'])
print(test_id2queries['test-1'])
print(index_names['fb:m.01jp8ww'])
id2mids['test-1']

('fb:m.01jp8ww', 'fb:music.album.genre', 'Which genre of album is harder.....faster?')
['harder . . . . . faster']
['harder ... ..faster', 'harder ... faster']


[('fb:m.04wdfy6', 'dare to dream . . . then do it', 0.0068462255839989265),
 ('fb:m.04wcmmw', "the parade 's gone by . . .", 0.0060080172594988946),
 ('fb:m.030fpcs', 'unsafe . . at any speed', 0.0036289624043182368),
 ('fb:m.0f3j_7t', 'now and ... then', 0.0026530460829879382),
 ('fb:m.01jrzy8',
  "shoot loud , louder. . . i do n't understand",
  0.0022681977772409756),
 ('fb:m.04j32yl', "i 'm still here . . . damn it !", 0.0019640582615284958),
 ('fb:m.0b_1jj',
  'oh my gawd ! ! ! ... the flaming lips',
  0.0017672524230145556),
 ('fb:m.0gjbxhr', 'we who are about to ...', 0.0016897736487917312),
 ('fb:m.0413g4m', '. . . and some were human', 0.0015698947082201098),
 ('fb:m.04w6f_j', 'xxx sex . . . tonight', 0.0014125296028915291),
 ('fb:m.04w2xc7', 'true confessions . . .', 0.001161870239019192),
 ('fb:m.02q30_d',
  'm. choufleuri restera chez lui le . . .',
  0.00099311507215096417),
 ('fb:m.086l0b', 'and having writ ...', 0.00090504244921065988),
 ('fb:m.0ftbqy6',
  'christmas wit

In [50]:
from collections import defaultdict
data = defaultdict(list)

id2mids = {}
HITS_TOP_ENTITIES = 100
for i, lineid in enumerate(sample_lineids):
    if lineid not in ent_lineids:
        notfound_ent += 1
        continue

    if i % 100 == 0:
        print("line {}".format(i))

    truth_mid, truth_rel, question = id2question[lineid]
    queries = id2queries[lineid]    
    C = []
    C_pruned = []
    C_tfidf_pruned = []
    
    for query in queries:   
        query_text = query.lower()  # lowercase the query
        query_tokens = tokenize_text(query_text)
        N = min(len(query_tokens), 3)
        # print("lineid: {}, query_text: {}, relation: {}".format(lineid, query_text, pred_relation))
        # print("query_tokens: {}".format(query_tokens))
        for n in range(N, 0, -1):
            ngrams_set = find_ngrams(query_tokens, n)
            # print("ngrams_set: {}".format(ngrams_set))
            for ngram_tuple in ngrams_set:
                ngram = " ".join(ngram_tuple)
                ngram = strip_accents(ngram)
                # unigram stopwords have too many candidates so just skip over
                if ngram in stopwords:
                    continue
                # print("ngram: {}".format(ngram))
                try:
                    cand_mids = index_ent[ngram]  # search entities
                except:
                    continue
                C.extend(cand_mids)
                # print("C: {}".format(C))
            if (len(C) > 0):
                # print("early termination...")
                break
        # print("C[:5]: {}".format(C[:5]))

        for mid in set(C):
            if mid in index_reach.keys():  # PROBLEM: don't know why this may not exist??
                count_mid = C.count(mid)  # count number of times mid appeared in C
                C_pruned.append((mid, count_mid))

        for mid, count_mid in C_pruned:
            if mid in index_names.keys():
                cand_ent_name = pick_best_name(question, index_names[mid])
                try:
                    truth_name = pick_best_name(question, index_names[truth_mid])
                except:
                    continue
                if cand_ent_name == truth_name:  # if name is correct, we are good
                    data['label'].append(1)
                else:
                    data['label'].append(0)

    #             if mid == truth_mid:
    #                 data['label'].append(1)
    #             else:
    #                 data['label'].append(0)
                
                length_name = len(tokenize_text(cand_ent_name))
                length_question = len(tokenize_text(question))
                length_query = len(query_tokens)
                tf = count_mid
                idf = calc_idf(question, cand_ent_name, index_ent)
                sques = fuzz.ratio(cand_ent_name, question)/100.0
                squer = fuzz.ratio(cand_ent_name, query_text)/100.0
                pques = fuzz.partial_ratio(cand_ent_name, question)/100.0
                pquer = fuzz.partial_ratio(cand_ent_name, query_text)/100.0            
            
                data['length_name'].append(length_name)
                data['length_question'].append(length_question)
                data['length_query'].append(length_query)
                data['tf'].append(tf)
                data['idf'].append(idf)
                data['sques'].append(sques)
                data['squer'].append(squer)
                data['pques'].append(pques)
                data['pquer'].append()           

                C_tfidf_pruned.append((mid, cand_ent_name, data))
        # print("C_tfidf_pruned[:10]: {}".format(C_tfidf_pruned[:10]))

    if len(C_tfidf_pruned) == 0:
        continue
        
    id2mids[lineid] = C_tfidf_pruned

print("done")

line 0


KeyboardInterrupt: 

In [39]:
59353 / (59353 + 8002)

0.8822111021805077

In [29]:
df = pd.DataFrame(data)
print(df['label'].value_counts())

y = df['label']
print(y.head()) 
X = df.drop('label', axis=1)
print(X.head())

0    34026
1     4543
Name: label, dtype: int64
0    0
1    0
2    1
3    0
4    0
Name: label, dtype: int64
        idf  length_name  length_query  length_question  pquer  pques  squer  \
0  3.811336            3             4                7   1.00    1.0   0.93   
1  3.811336            3             4                7   1.00    1.0   0.93   
2  4.198213            4             4                7   1.00    1.0   1.00   
3  3.344607           13             4                7   0.88    0.6   0.36   
4  3.811336            3             4                7   1.00    1.0   0.93   

   sques  tf  
0   0.64   1  
1   0.64   1  
2   0.70   2  
3   0.41   1  
4   0.64   1  


In [50]:
# # create LR data
# data = []
# for lineid in sample_lineids:
#     if lineid not in id2mids.keys():
#         continue
    
    
#     found_this_example = False
#     truth_mid, truth_rel, question = id2question[lineid]
# #     print(id2question[lineid])
#     for (mid, mid_name, features) in id2mids[lineid]:
#         row = [length, tfidf, fuzzy]
#         try:
#             truth_name = pick_best_name(question, index_names[truth_mid])
#         except:
#             continue
# #         print(mid_name, truth_name)
#         if mid_name == truth_name:  # if name is correct, we are good
#             found_this_example = True
#             row.append(1) # pos example
#         else:
#             row.append(0) # neg example
#         data.append(row)
        
# df = pd.DataFrame(data)
# df.columns = ['length', 'tfidf', 'fuzzy', 'label']
# df.dropna(how='all')    #to drop if all values in the row are nan
# print(df.describe())
# print(df.head())

# X = df[['length', 'tfidf', 'fuzzy']]
# print(X.head())
# y = df['label']
# print(y.head()) 

In [38]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(C=0.01)
lr.fit(X, y)

print(lr.score(X, y))
print(lr.coef_)
print(lr.intercept_)

0.976950400581
[[ 1.51357524 -2.08075315  0.36106511 -0.3810684  -0.13058176  1.32223429
   1.62022576  0.06219903 -0.13805733]]
[-0.36577348]


In [37]:
len(index_ent['mark'])

4493

In [138]:
def fuzzy_intersection(tokens, text):
    common = []
    for tok in tokens:
        match_score = fuzz.partial_ratio(tok, text) / 100.0
        if match_score > 0.80:
            common.append( (tok, match_score) )
    return common

def custom_match(question, query_text, cand_ent_name, count_mid, num_entities_fbsubset, index_ent):
    query_terms = tokenize_text(cand_ent_name)
    common_terms = fuzzy_intersection(set(query_terms), question)

    # len_intersection = len(common_terms)
    # len_union = len(set(query_terms).union(set(doc_tokens)))
    # tf = len_intersection / len_union
    tf = math.log10(count_mid + 1)
    k1 = 0.5
    k2 = 0.5
    total_idf = 0
    for (term, fuzzy_score) in common_terms:
        df = len(index_ent[term])
        idf = fuzzy_score * math.log10( (num_entities_fbsubset + k1) / (df + k2) )
        total_idf += idf
    return tf*total_idf

def custom_weights(question, query_text, cand_ent_name, count_mid, num_entities_fbsubset, index_ent):
    x1 = calc_tf_idf(question, query_text, cand_ent_name, count_mid, num_entities_fbsubset, index_ent)
    x2 = fuzz.partial_ratio(cand_ent_name, question)
    

In [81]:
id2mids = {}
HITS_TOP_ENTITIES = 20
sims = ["simple", "partial"]
texts = ["question", "query"]
for USE_TEXT in texts:
    for SIM in sims:
        print("sim: {}, text: {}, hits: {}".format(SIM, USE_TEXT, HITS_TOP_ENTITIES))
        if i % 100 == 0:
            print("line {}".format(i))
        for i, lineid in enumerate(sample_lineids):
            if lineid not in ent_lineids:
                notfound_ent += 1
                continue

            truth_mid, truth_rel, question = id2question[lineid]
            try:
                query_text = id2gold_query[lineid.replace('valid', 'val')].lower()  # lowercase the query
            except:
                query_text = question.lower()
            query_tokens = tokenize_text(query_text)

            # print("lineid: {}, query_text: {}, relation: {}".format(lineid, query_text, pred_relation))
            # print("query_tokens: {}".format(query_tokens))

            N = min(len(query_tokens), 3)
            C = []  # candidate entities
            for n in range(N, 0, -1):
                ngrams_set = find_ngrams(query_tokens, n)
                # print("ngrams_set: {}".format(ngrams_set))
                for ngram_tuple in ngrams_set:
                    ngram = " ".join(ngram_tuple)
                    ngram = strip_accents(ngram)
                    # unigram stopwords have too many candidates so just skip over
                    if ngram in stopwords:
                        continue
                    # print("ngram: {}".format(ngram))
                    try:
                        cand_mids = index_ent[ngram]  # search entities
                    except:
                        continue
                    C.extend(cand_mids)
                    # print("C: {}".format(C))
                if (len(C) > 0):
                    # print("early termination...")
                    break
            # print("C[:5]: {}".format(C[:5]))

            # relation correction
            C_pruned = []
            for mid in set(C):
                if mid in index_reach.keys():  # PROBLEM: don't know why this may not exist??
                    count_mid = C.count(mid)  # count number of times mid appeared in C
                    C_pruned.append((mid, count_mid))

            C_tfidf_pruned = []
            for mid, count_mid in C_pruned:
                if mid in index_names.keys():
                    cand_ent_name = pick_best_name(question, index_names[mid])
                    if  USE_TEXT == "question":
                        text = question
                    else:
                        text = query_text

                    if SIM == "custom":
                        score = 0.8 * fuzz.token_set_ratio(cand_ent_name, question) + fuzzy_match_score(cand_ent_name, question)
                    elif SIM == "simple":
                        score = fuzz.ratio(cand_ent_name, text) / 100.0
                    elif SIM == "partial":
                        simple_question = fuzz.partial_ratio(cand_ent_name, text) / 100.0
                    else:
                        score = calc_tf_idf(question, query_text, cand_ent_name, count_mid, num_entities_fbsubset, index_ent)
                    C_tfidf_pruned.append((mid, cand_ent_name, score))
            # print("C_tfidf_pruned[:10]: {}".format(C_tfidf_pruned[:10]))

            if len(C_tfidf_pruned) == 0:
                continue

            C_tfidf_pruned.sort(key=lambda t: -t[2])
            cand_mids = C_tfidf_pruned[:HITS_TOP_ENTITIES]

            id2mids[lineid] = cand_mids


        # evaluate on sample line ids
        found = 0
        notfound = 0
        notfound_lineids = []

        for lineid in sample_lineids:
            if lineid not in id2mids.keys():
                notfound_lineids.append( lineid )
                notfound += 1
                continue

            found_this_example = False
            truth_mid, truth_rel, question = id2question[lineid]
        #     print(id2question[lineid])
            for (mid, mid_name, mid_score) in id2mids[lineid]:
                if mid == truth_mid:
                        found_this_example = True
                        break


            if found_this_example:
                found += 1
            else:
                notfound_lineids.append( lineid )
                notfound += 1    

        retrieval = found / (found + notfound) * 100.0
        print("retrieval: {}\t found: {}\tnotfound: {}".format(retrieval, found, notfound))
        print("-" * 40)


sim: simple, text: question, hits: 20
retrieval: 84.0	 found: 168	notfound: 32
----------------------------------------
sim: partial, text: question, hits: 20
retrieval: 81.0	 found: 162	notfound: 38
----------------------------------------
sim: simple, text: query, hits: 20
retrieval: 89.0	 found: 178	notfound: 22
----------------------------------------
sim: partial, text: query, hits: 20
retrieval: 81.0	 found: 162	notfound: 38
----------------------------------------


In [103]:
for id in random.sample(notfound_lineids, 10):
    ent, rel, ques = id2question[id]
    print(id, ques)
    try:
        print("gold entity id: {}\nnames: {}".format(ent, index_names[ent]))
    except:
        continue
    print("query text: {}".format(id2query[id].lower()))
    print("MIDS:")
    try:
        mids = id2mids[id]
    except:
        continue
    for mid in mids:
        print(mid)
    print("-" * 40)

valid-9773 Name one of the townships in indiana
gold entity id: fb:m.03v1s
names: ['in', 'ind', 'us-in', 'indiana', 'hoosier state']
query text: name one of the townships in indiana
MIDS:
('fb:m.05zms17', 'an army at dawn : the war in north africa 1942-1943', 0.259241157950361, 0.5)
----------------------------------------
valid-3981 what's out of eden about
gold entity id: fb:m.04t4n1z
names: ['out of eden']
query text: eden
MIDS:
('fb:m.015d6d', 'east of eden', 0.8968554509168298, 1.0)
----------------------------------------
valid-5775 which release was lemon on
gold entity id: fb:m.0dxyd5l
names: ['lemon']
query text: lemon
MIDS:
('fb:m.0cs3sg3', 'life of lemon', 0.8289980641994564, 1.0)
----------------------------------------
valid-4123 what is the formulation of clear defense hand sanitizing wipes
gold entity id: fb:m.0hqtqnf
names: ['alcohol 3.32 cloth']
query text: defense hand sanitizing wipes
MIDS:
('fb:m.0hqtf4n', 'advanced hand sanitizing wipes 0.62/0.003 swab', 3.64347594