In [1]:
import os
import sys
import argparse
import pickle
import math
import unicodedata
import pandas as pd

from fuzzywuzzy import fuzz
from nltk.tokenize.treebank import TreebankWordTokenizer
from nltk.corpus import stopwords

In [23]:
# arguments
index_entpath = "../indexes/entity_2M.pkl"
index_reachpath = "../indexes/reachability_2M.pkl"
index_namespath = "../indexes/names_2M.pkl"
ent_resultpath = "../entity_detection/query-text/val.txt"
rel_resultpath = "../relation_prediction/results/main-valid-results.txt"
outpath = "./tmp/results"

In [3]:
tokenizer = TreebankWordTokenizer()
stopwords = set(stopwords.words('english'))

def tokenize_text(text):
    tokens = tokenizer.tokenize(text)
    return tokens

def www2fb(in_str):
    if in_str.startswith("www.freebase.com"):
        return 'fb:%s' % (in_str.split('www.freebase.com/')[-1].replace('/', '.'))
    return in_str

def get_index(index_path):
    print("loading index from: {}".format(index_path))
    with open(index_path, 'rb') as f:
        index = pickle.load(f)
    return index

def strip_accents(text):
    return ''.join(c for c in unicodedata.normalize('NFKD', text) if unicodedata.category(c) != 'Mn')


In [4]:
def get_query_text(ent_resultpath):
    print("getting query text...")
    lineids = []
    id2query = {}
    notfound = 0
    with open(ent_resultpath, 'r') as f:
        for line in f:
            items = line.strip().split(" %%%% ")
            try:
                lineid = items[0].strip()
                query = items[1].strip()
                # mid = items[2].strip()
            except:
                # print("ERROR: line does not have >2 items  -->  {}".format(line.strip()))
                notfound += 1
                continue
            # print("{}   -   {}".format(lineid, query))
            lineids.append(lineid)
            id2query[lineid] = query
    print("notfound (empty query text): {}".format(notfound))
    return lineids, id2query

def get_relations(rel_resultpath):
    print("getting relations...")
    lineids = []
    id2rel = {}
    with open(rel_resultpath, 'r') as f:
        for line in f:
            items = line.strip().split(" %%%% ")
            lineid = items[0].strip()
            rel = items[1].strip()
            score = items[2].strip()
            # print("{}   -   {}".format(lineid, rel))
            lineids.append(lineid)
            id2rel[lineid] = rel
    return lineids, id2rel

In [5]:
def find_ngrams(input_list, n):
    ngrams = zip(*[input_list[i:] for i in range(n)])
    return set(ngrams)

In [33]:
def calc_tf_idf(query, cand_ent_name, cand_ent_count, num_entities, index_ent):
    query_terms = tokenize_text(query)
    doc_tokens = tokenize_text(cand_ent_name)
    common_terms = set(query_terms).intersection(set(doc_tokens))

    # len_intersection = len(common_terms)
    # len_union = len(set(query_terms).union(set(doc_tokens)))
    # tf = len_intersection / len_union
    tf = math.log10(cand_ent_count + 1)
    k1 = 0.5
    k2 = 0.5
    total_idf = 0
    for term in common_terms:
        df = len(index_ent[term])
        idf = math.log10( (num_entities - df + k1) / (df + k2) )
        total_idf += idf
    return tf * total_idf

def fuzzy_match(name, question):
    return fuzz.partial_ratio(name, question)

In [7]:
#outfile = open(os.path.join(outpath, "linking-results.txt"), 'w')
notfound_ent = 0
notfound_c = 0

index_ent = get_index(index_entpath)
num_entities_fbsubset = 1959820  # 2M - 1959820 , 5M - 1972702

loading index from: ../indexes/entity_2M.pkl


In [8]:
index_names = get_index(index_namespath)

loading index from: ../indexes/names_2M.pkl


In [9]:
index_reach = get_index(index_reachpath)

loading index from: ../indexes/reachability_2M.pkl


In [10]:
def pick_best_name(question, names_list):
    best_score = None
    best_name = None
    for name in names_list:
        score =  fuzz.ratio(name, question)
        if best_score == None or score > best_score:
            best_score = score
            best_name = name

    return best_name

In [24]:
rel_lineids, id2rel = get_relations(rel_resultpath)
ent_lineids, id2query = get_query_text(ent_resultpath)  # ent_lineids may have some examples missing

getting relations...
getting query text...
notfound (empty query text): 0


In [25]:
def get_questions(datapath):
    print("getting questions...")
    id2question = {}
    with open(datapath, 'r') as f:
        for line in f:
            items = line.strip().split("\t")
            lineid = items[0].strip()
            sub = items[1].strip()
            pred = items[2].strip()
            obj = items[3].strip()
            question = items[4].strip()
            # print("{}   -   {}".format(lineid, question))
            if lineid.startswith("valid"):
                id2question[lineid] = (sub, pred, question)
    return id2question

datapath = "../data/SimpleQuestions_v2_modified/all.txt"
id2question = get_questions(datapath)
print(id2question['valid-1'])

getting questions...
('fb:m.0f3xg_', 'fb:symbols.namesake.named_after', 'Who was the trump ocean club international hotel and tower named after')


In [26]:
def get_docs(query):
    docids = index_ent[query]
    docs = []
    for id in docids:
        try:
            docs.append( (id, index_names[id]) )
        except:
            continue
    return docs

In [27]:
print(len(id2query))
print(len(id2rel))

10845
10845


In [17]:
index_ent["carlos"]

{'fb:m.02rfnbr',
 'fb:m.0b6j1hb',
 'fb:m.0fvpvq',
 'fb:m.0ksvtcp',
 'fb:m.050vqm',
 'fb:m.05q5xv8',
 'fb:m.0ct040',
 'fb:m.0343fl',
 'fb:m.0crsgh3',
 'fb:m.02rtln4',
 'fb:m.0pd9syf',
 'fb:m.04cxkbv',
 'fb:m.0gfrl81',
 'fb:m.01wm909',
 'fb:m.0dgpc72',
 'fb:m.0rf67jw',
 'fb:m.09c2n2',
 'fb:m.05pt8m',
 'fb:m.03cggsq',
 'fb:m.0gvrcn9',
 'fb:m.0bbxwws',
 'fb:m.0hzrbzc',
 'fb:m.099tjm',
 'fb:m.0bxzcqv',
 'fb:m.0y4t4wr',
 'fb:m.0b6mljj',
 'fb:m.04x5g_',
 'fb:m.04n35ks',
 'fb:m.0x0fb8l',
 'fb:m.0bwkh5g',
 'fb:m.0qgr824',
 'fb:m.01jb5mb',
 'fb:m.06w9sy5',
 'fb:m.0qpwrxv',
 'fb:m.02810z6',
 'fb:m.0l1l3',
 'fb:m.0gccpz7',
 'fb:m.0fw6_f',
 'fb:m.05b30dm',
 'fb:m.0f12v7d',
 'fb:m.02x2sc',
 'fb:m.0bvqf0t',
 'fb:m.02vltgw',
 'fb:m.02vydb0',
 'fb:m.054wb66',
 'fb:m.02rhp9j',
 'fb:m.0cjlwk',
 'fb:m.0462gmv',
 'fb:m.0d0qws',
 'fb:m.0fzbtm',
 'fb:m.03grzh3',
 'fb:m.0gfrmff',
 'fb:m.064n6q',
 'fb:m.02wb41k',
 'fb:m.0crv7xt',
 'fb:m.0gn1zw',
 'fb:m.0bgr2k',
 'fb:m.09gqd5q',
 'fb:m.043lx1y',
 'fb:m.0byzkc',

In [32]:
# explore
lineid = 'valid-3981'
truth_mid, truth_rel, question = id2question[lineid]
print(id2question[lineid])
pred_relation = id2rel[lineid]
print(pred_relation)
query_text = id2query[lineid].lower()  # lowercase the query
print(query_text)
query_tokens = tokenize_text(query_text)
print(query_tokens)
N = min(len(query_tokens), 3)
print(N)

('fb:m.04t4n1z', 'fb:book.written_work.subjects', "what's out of eden about")
www.freebase.com/book/written_work/subjects
eden
['eden']
1


In [34]:
C = []  # candidate entities
for n in range(N, 0, -1):
    ngrams_set = find_ngrams(query_tokens, n)
    print("ngrams_set: {}".format(ngrams_set))
    for ngram_tuple in ngrams_set:
        ngram = " ".join(ngram_tuple)
        ngram = strip_accents(ngram)
        # unigram stopwords have too many candidates so just skip over
        if ngram in stopwords:
            continue
        print("ngram: {}".format(ngram))
        ## PROBLEM! - ngram doesnt exist in index - at test-2592 - KeyError: 'p.a.r.c.e. parce'
        try:
            cand_mids = index_ent[ngram]  # search entities
        except:
            continue
        C.extend(cand_mids)
        # print("C: {}".format(C))
    if (len(C) > 0):
        print("early termination...")
        break
    break
print(C)

ngrams_set: {('eden',)}
ngram: eden
early termination...
['fb:m.0f7znh', 'fb:m.01hgtl3', 'fb:m.07kczh3', 'fb:m.0bqscbs', 'fb:m.04t_mjw', 'fb:m.02l3xr', 'fb:m.0dx7k6s', 'fb:m.0fdwv0h', 'fb:m.0ncd4yy', 'fb:m.0nv26gk', 'fb:m.0llkzs', 'fb:m.05b3gkt', 'fb:m.010jpp', 'fb:m.04v84j8', 'fb:m.04t4n1z', 'fb:m.098gdxt', 'fb:m.04dz23h', 'fb:m.062sr57', 'fb:m.0h12n2s', 'fb:m.01p73zv', 'fb:m.04t_pm3', 'fb:m.0njj6z_', 'fb:m.0kyxsnv', 'fb:m.011396', 'fb:m.0cc6w4r', 'fb:m.0gtl8ks', 'fb:m.0cs54k5', 'fb:m.02q533b', 'fb:m.027t41y', 'fb:m.0822mz', 'fb:m.027vl0z', 'fb:m.04d_5vp', 'fb:m.04_0nx_', 'fb:m.04t_mlf', 'fb:m.0jwz6gk', 'fb:m.04t_pkq', 'fb:m.02pmg8m', 'fb:m.0cnb7t8', 'fb:m.03cxtpd', 'fb:m.05t3kx', 'fb:m.01jdm6m', 'fb:m.032q6y', 'fb:m.07kwdy6', 'fb:m.04t_mkx', 'fb:m.04ttl6z', 'fb:m.02b7_m8', 'fb:m.04j1n_s', 'fb:m.071kwv', 'fb:m.0gfg67', 'fb:m.07hqkw', 'fb:m.04lfvv4', 'fb:m.04w61_4', 'fb:m.0488_s4', 'fb:m.01phfbw', 'fb:m.05sxgcb', 'fb:m.0j3zh16', 'fb:m.01ssjjb', 'fb:m.04jf9x', 'fb:m.0hmwj_p', 'fb:m.03jg

In [183]:
print(index_names['fb:m.0504s2'])

['scotty', 'gomer', 'scott gomez', 'scott carlos gomez']


In [35]:
C_pruned = []
for mid in set(C):
    if mid in index_reach.keys():  # PROBLEM: don't know why this may not exist??
        count_mid = C.count(mid)  # count number of times mid appeared in C
        C_pruned.append((mid, count_mid))
#         if pred_relation in index_reach[mid]:
#             count_mid = C.count(mid)  # count number of times mid appeared in C
#             C_pruned.append((mid, count_mid))

print(C_pruned)

[('fb:m.0f7znh', 1), ('fb:m.01hgtl3', 1), ('fb:m.07kczh3', 1), ('fb:m.0bqscbs', 1), ('fb:m.04t_mjw', 1), ('fb:m.02l3xr', 1), ('fb:m.0dx7k6s', 1), ('fb:m.0fdwv0h', 1), ('fb:m.0ncd4yy', 1), ('fb:m.0nv26gk', 1), ('fb:m.0llkzs', 1), ('fb:m.05b3gkt', 1), ('fb:m.010jpp', 1), ('fb:m.04v84j8', 1), ('fb:m.04t4n1z', 1), ('fb:m.098gdxt', 1), ('fb:m.04dz23h', 1), ('fb:m.062sr57', 1), ('fb:m.0h12n2s', 1), ('fb:m.01p73zv', 1), ('fb:m.04t_pm3', 1), ('fb:m.0njj6z_', 1), ('fb:m.0kyxsnv', 1), ('fb:m.011396', 1), ('fb:m.0cc6w4r', 1), ('fb:m.0gtl8ks', 1), ('fb:m.0cs54k5', 1), ('fb:m.02q533b', 1), ('fb:m.027t41y', 1), ('fb:m.0822mz', 1), ('fb:m.027vl0z', 1), ('fb:m.04d_5vp', 1), ('fb:m.04_0nx_', 1), ('fb:m.04t_mlf', 1), ('fb:m.0jwz6gk', 1), ('fb:m.04t_pkq', 1), ('fb:m.02pmg8m', 1), ('fb:m.0cnb7t8', 1), ('fb:m.03cxtpd', 1), ('fb:m.05t3kx', 1), ('fb:m.01jdm6m', 1), ('fb:m.032q6y', 1), ('fb:m.07kwdy6', 1), ('fb:m.04t_mkx', 1), ('fb:m.04ttl6z', 1), ('fb:m.02b7_m8', 1), ('fb:m.04j1n_s', 1), ('fb:m.071kwv', 1), 

In [36]:
C_tfidf_pruned = []
for mid, count_mid in C_pruned:
    if mid in index_names.keys():
        cand_ent_name = pick_best_name(question, index_names[mid])
        score = fuzzy_match(cand_ent_name, question)
#         score = calc_tf_idf(query_text, cand_ent_name, count_mid, num_entities_fbsubset, index_ent)
        C_tfidf_pruned.append((mid, score))
# print("C_tfidf_pruned[:10]: {}".format(C_tfidf_pruned[:10]))
print(C_tfidf_pruned)

[('fb:m.0f7znh', 67), ('fb:m.01hgtl3', 100), ('fb:m.07kczh3', 64), ('fb:m.0bqscbs', 53), ('fb:m.04t_mjw', 55), ('fb:m.02l3xr', 71), ('fb:m.0dx7k6s', 100), ('fb:m.0fdwv0h', 100), ('fb:m.0ncd4yy', 47), ('fb:m.0nv26gk', 57), ('fb:m.0llkzs', 69), ('fb:m.05b3gkt', 43), ('fb:m.010jpp', 50), ('fb:m.04v84j8', 54), ('fb:m.04t4n1z', 100), ('fb:m.098gdxt', 48), ('fb:m.04dz23h', 38), ('fb:m.062sr57', 42), ('fb:m.0h12n2s', 50), ('fb:m.01p73zv', 53), ('fb:m.04t_pm3', 55), ('fb:m.0njj6z_', 55), ('fb:m.0kyxsnv', 67), ('fb:m.011396', 54), ('fb:m.0cc6w4r', 62), ('fb:m.0gtl8ks', 43), ('fb:m.0cs54k5', 46), ('fb:m.02q533b', 100), ('fb:m.027t41y', 69), ('fb:m.0822mz', 44), ('fb:m.027vl0z', 44), ('fb:m.04d_5vp', 50), ('fb:m.04_0nx_', 53), ('fb:m.04t_mlf', 55), ('fb:m.0jwz6gk', 42), ('fb:m.04t_pkq', 55), ('fb:m.02pmg8m', 42), ('fb:m.0cnb7t8', 42), ('fb:m.03cxtpd', 50), ('fb:m.05t3kx', 57), ('fb:m.01jdm6m', 50), ('fb:m.032q6y', 56), ('fb:m.07kwdy6', 42), ('fb:m.04t_mkx', 55), ('fb:m.04ttl6z', 75), ('fb:m.02b7_

In [197]:
notfound_ent = 0
notfound_c = 0
notfound_c_lineids = []
notfound_ent = 0
notcorrect_ent_lineids = []
id2pred_ent = {}

for i, lineid in enumerate(rel_lineids):
    if lineid not in ent_lineids:
        notfound_ent += 1
        continue

    pred_relation = www2fb(id2rel[lineid])
    query_text = id2query[lineid].lower()  # lowercase the query
    query_tokens = tokenize_text(query_text)

    # print("lineid: {}, query_text: {}, relation: {}".format(lineid, query_text, pred_relation))
    # print("query_tokens: {}".format(query_tokens))

    N = min(len(query_tokens), 3)
    C = []  # candidate entities
    for n in range(N, 0, -1):
        ngrams_set = find_ngrams(query_tokens, n)
        # print("ngrams_set: {}".format(ngrams_set))
        for ngram_tuple in ngrams_set:
            ngram = " ".join(ngram_tuple)
            ngram = strip_accents(ngram)
            # unigram stopwords have too many candidates so just skip over
            if ngram in stopwords:
                continue
            # print("ngram: {}".format(ngram))
            ## PROBLEM! - ngram doesnt exist in index - at test-2592 - KeyError: 'p.a.r.c.e. parce'
            try:
                cand_mids = index_ent[ngram]  # search entities
            except:
                continue
            C.extend(cand_mids)
            # print("C: {}".format(C))
        if (len(C) > 0):
            # print("early termination...")
            break
    # print("C[:5]: {}".format(C[:5]))

    # relation correction
    C_pruned = []
    for mid in set(C):
        if mid in index_reach.keys():  # PROBLEM: don't know why this may not exist??
            count_mid = C.count(mid)  # count number of times mid appeared in C
            C_pruned.append((mid, count_mid))
            ### NOT DOING RELATION CORRECTION HERE!!
#             if pred_relation in index_reach[mid]:
#                 count_mid = C.count(mid)  # count number of times mid appeared in C
#                 C_pruned.append((mid, count_mid))
    # print("C_pruned[:5]: {}".format(C_pruned[:5]))

    C_tfidf_pruned = []
    for mid, count_mid in C_pruned:
        if mid in index_names.keys():
            cand_ent_name = pick_best_name(question, index_names[mid])
            tfidf = calc_tf_idf(query_text, cand_ent_name, count_mid, num_entities_fbsubset, index_ent)
            C_tfidf_pruned.append((mid, tfidf))
    # print("C_tfidf_pruned[:10]: {}".format(C_tfidf_pruned[:10]))

    if len(C_tfidf_pruned) == 0:
        #print("WARNING: C_tfidf_pruned is empty.")
        notfound_c_lineids.append(lineid)
        notfound_c += 1
        continue

    C_tfidf_pruned.sort(key=lambda t: -t[1])
    pred_ent_mid = C_tfidf_pruned[0][0]  # get first entry's mid

    line_to_print = "{}\t{}\t{}".format(lineid, pred_ent_mid, pred_relation)
    id2pred_ent[lineid] = pred_ent_mid
    #print("PRED: " + line_to_print)
    
    gold_ent_mid, gold_rel, question = id2question[lineid]
    if not gold_ent_mid == pred_ent_mid:
        notfound_ent += 1
        notcorrect_ent_lineids.append(lineid)
        
    if (i+1) % 100 == 0:
        print("at line {}".format(i+1))
    if (i+1) % 1000 == 0:
        break
    # outfile.write(line_to_print + "\n")

print("notfound_ent : {}".format(notfound_ent))
print("notfound_c : {}".format(notfound_c))
print("notfound_c_lineids: {}".format(notfound_c_lineids))
print("notcorrect_ent_lineids: {}".format(notcorrect_ent_lineids))
#outfile.close()

at line 100
at line 200
at line 300
at line 400
at line 500
at line 600
at line 700
at line 800
at line 900
at line 1000
notfound_ent : 483
notfound_c : 7
notfound_c_lineids: ['test-40', 'test-146', 'test-312', 'test-414', 'test-578', 'test-742', 'test-848']
notcorrect_ent_lineids: ['test-2', 'test-6', 'test-7', 'test-8', 'test-11', 'test-12', 'test-15', 'test-18', 'test-19', 'test-20', 'test-22', 'test-23', 'test-24', 'test-25', 'test-26', 'test-27', 'test-29', 'test-30', 'test-31', 'test-32', 'test-39', 'test-41', 'test-43', 'test-45', 'test-47', 'test-48', 'test-49', 'test-50', 'test-52', 'test-61', 'test-65', 'test-66', 'test-67', 'test-68', 'test-69', 'test-71', 'test-73', 'test-75', 'test-76', 'test-77', 'test-78', 'test-79', 'test-81', 'test-82', 'test-85', 'test-87', 'test-88', 'test-91', 'test-92', 'test-93', 'test-94', 'test-95', 'test-96', 'test-97', 'test-98', 'test-100', 'test-104', 'test-105', 'test-110', 'test-112', 'test-113', 'test-118', 'test-125', 'test-127', 'test-1

In [201]:
notfound_c_lineids = ['test-40', 'test-146', 'test-312', 'test-414', 'test-578', 'test-742', 'test-848']
df = []
pred_rel_fault = 0
names_fault = 0
other_fault = 0
names = ['question', 'gold_entity_id', 'gold_entity_name', 'gold_relation', 'query_text', 'predicted_relation']
for lineid in notfound_c_lineids[:50]:
    ent, rel, question = id2question[lineid]
    pred_relation = www2fb(id2rel[lineid])
    query_text = id2query[lineid].lower()  # lowercase the query
#     pred_ent_mid = id2pred_ent[lineid]
    if not pred_relation == rel:
        pred_rel_fault += 1
    else:
        other_fault += 1
    if ent in index_names.keys():
        row = [question, ent, pick_best_name(question, index_names[ent]), rel, query_text, pred_relation]
    else:
        names_fault += 1
    df.append(row)

df = pd.DataFrame(df)
df.columns = names

print(pred_rel_fault)
print(names_fault)
print(other_fault)

3
0
4


In [202]:
df

Unnamed: 0,question,gold_entity_id,gold_entity_name,gold_relation,query_text,predicted_relation
0,Which label is somevelvetsidewalk signed to,fb:m.01pm4nb,some velvet sidewalk,fb:music.artist.label,somevelvetsidewalk,fb:music.artist.label
1,what is a short-lived British sitcom series,fb:m.0c4xc,situation comedy,fb:tv.tv_genre.programs,short-lived,fb:tv.tv_genre.programs
2,Who is the focus of uttar pradesh has more tha...,fb:m.0j2hj_0,uttar pradesh has more than one capital.,fb:base.uncommon.exception.focus,than,fb:base.culturalevent.event.entity_involved
3,what genre of music is locd out,fb:m.01rrs9n,loc 'd out,fb:music.album.genre,locd out,fb:music.album.genre
4,what is cassiesteele's gender?,fb:m.03_fby,cassie steele,fb:people.person.gender,cassiesteele,fb:people.person.gender
5,what is the genre of gusgofficial,fb:m.03f3bp7,kostas karamitroudis,fb:music.artist.genre,gusgofficial,fb:music.album.genre
6,What is the title of the netlix film in the ge...,fb:m.03_3d,land of the rising sun,fb:media_common.netflix_genre.titles,netlix,fb:film.film_genre.films_in_this_genre


In [208]:
notcorrect_ent_lineids = ['test-2', 'test-6', 'test-7', 'test-8', 'test-11', 'test-12', 'test-15', 'test-18', 'test-19', 'test-20', 'test-22', 'test-23', 'test-24', 'test-25', 'test-26', 'test-27', 'test-29', 'test-30', 'test-31', 'test-32', 'test-39', 'test-41', 'test-43', 'test-45', 'test-47', 'test-48', 'test-49', 'test-50', 'test-52', 'test-61', 'test-65', 'test-66', 'test-67', 'test-68', 'test-69', 'test-71', 'test-73', 'test-75', 'test-76', 'test-77', 'test-78', 'test-79', 'test-81', 'test-82', 'test-85', 'test-87', 'test-88', 'test-91', 'test-92', 'test-93', 'test-94', 'test-95', 'test-96', 'test-97', 'test-98', 'test-100', 'test-104', 'test-105', 'test-110', 'test-112', 'test-113', 'test-118', 'test-125', 'test-127', 'test-128']

df = []
pred_rel_fault = 0
names_fault = 0
other_fault = 0
names = ['question', 'gold_ent_name', 'query_text', 'predicted_ent_name']
for lineid in notcorrect_ent_lineids[:50]:
    ent, rel, question = id2question[lineid]
    pred_relation = www2fb(id2rel[lineid])
    query_text = id2query[lineid].lower()  # lowercase the query
    pred_ent_mid = id2pred_ent[lineid]
    if not pred_relation == rel:
        pred_rel_fault += 1
    else:
        other_fault += 1
    if ent in index_names.keys():
        row = [question, pick_best_name(question,index_names[ent]), query_text,  pick_best_name(question,index_names[pred_ent_mid])]
    else:
        names_fault += 1
    df.append(row)

df = pd.DataFrame(df)
df.columns = names

print(pred_rel_fault)
print(names_fault)
print(other_fault)

13
1
37


In [209]:
df

Unnamed: 0,question,gold_ent_name,query_text,predicted_ent_name
0,what format is fearless,fearless,fearless,fearless
1,what was the cause of death of yves klein,yves klein,yves klein,yves klein blue
2,Which equestrian was born in dublin?,"dublin , republic of ireland",dublin,dublin
3,What is a tv action show?,action,action show,junit in action
4,What's a song by jean grae,jean grae,jean grae,grae fruits : the jean grae compilation
5,What position does carlos gomez play?,carlos argelis gomez pena,carlos gomez,scott carlos gomez
6,What's a release on pretty in pink,pretty in pink,pretty in pink,pretty in pink
7,Who created the typeface chicago?,chicago,typeface chicago,typeface
8,what position does pee wee reese play in baseball,pee wee reese,pee wee reese,pee wee & jackie : pee wee reese & jackie robi...
9,which artist recorded one life to live,one life to live,one life to live,lady in the dark : one life to live


In [128]:
df.to_csv('incorrect_ents_without_rel_correction.csv')