In [2]:
from tqdm import tqdm
import pickle

# facts = pickle.load(open('wikidata_text_facts.txt', 'rb'))
# facts = pickle.load(open('text_facts_v2.pkl', 'rb'))

In [3]:
def getEntsFromFacts(facts):
    e = set()
    for f in facts:
        e.add(f[0])
        e.add(f[2])
    return e

def readFile(filename):
    f = open(filename, 'r')
    lines = []
    for line in f:
        line = line.strip()
        if line != '':
            lines.append(line)
    f.close()
    return lines

def getRelsFromFacts(facts):
    r = set()
    for f in facts:
        r.add(f[1])
    return r

def filterByRelation(facts, rel, max=50):
    filtered = []
    count = 0
    for f in facts:
        if f[1] == rel:
            filtered.append(f)
            count += 1
        if max < 0:
            continue
        if count >= max:
            break
    return filtered

def filterByHead(facts, head, max=50):
    filtered = []
    count = 0
    for f in facts:
        if f[0] == head:
            filtered.append(f)
            count += 1
        if max < 0:
            continue
        if count >= max:
            break
    return filtered

def filterByTail(facts, tail, max=50):
    filtered = []
    count = 0
    for f in facts:
        if f[2] == tail:
            filtered.append(f)
            count += 1
        if max < 0:
            continue
        if count >= max:
            break
    return filtered

def filterByEntity(facts, ent, max=50):
    filtered = []
    count = 0
    for f in facts:
        if f[0] == ent or f[2] == ent:
            filtered.append(f)
            count += 1
        if max < 0:
            continue
        if count >= max:
            break
    return filtered

def printFact(f):
    s = "{head}, {rel}, {tail}, {t1}, {t2}"
    head=f[0]
    rel = f[1]
    tail = f[2]
    t1 = f[3][1:5]
    t2 = f[4][1:5]
    print(s.format(head=head, rel=rel, tail=tail, t1=t1, t2=t2))
    
def printFacts(facts):
    for f in facts:
        printFact(f)
        
def isEntityInFact(e, fact):
    if fact[0] == e or fact[2] == e:
        return True
    else:
        return False
    
        
def writeFactsToFile(filename, facts):
    f = open(filename, 'w')
    for fact in facts:
        line = '\t'.join(fact)
        f.write(line + '\n')
    f.close()
    
def readFactsFromFile(filename):
    f = open(filename, 'r')
    facts = []
    for line in f:
        fact = line.strip().split('\t')
        facts.append(fact)
    f.close()
    return facts

def createWikidataIdentifierToTextDict(entities, relations):
    ent_rel_both = {}
    ent_rel_both['Q'] = entities
    ent_rel_both['P'] = relations
    wikidata_id2name = {}
    count = 0
    for prefix, lines in ent_rel_both.items():
        for line in lines:
            try:
                line = line.split('\t')
                id = prefix + line[1] #different from one above
                name = line[2]
                wikidata_id2name[id] = name
            except:
                id = prefix + line[1]
                name = ''
                wikidata_id2name[id] = name
    return wikidata_id2name

def convertFactToText(fact, wikidata_identifier_to_text):
    f = fact.copy()
    for i in range(3):
        f[i] = wikidata_identifier_to_text[fact[i]]
    return f

In [4]:
facts = readFactsFromFile('data/temporal_big/full.txt')

In [5]:
r = getRelsFromFacts(facts)
e = getEntsFromFacts(facts)

In [6]:
e_count_dict = {}
r_count_dict = {}
head_count = {}
tail_count = {}
for ent in e:
    e_count_dict[ent] = 0
    head_count[ent] = 0
    tail_count[ent] = 0
for rel in r:
    r_count_dict[rel] = 0
for f in facts:
    e1 = f[0]
    e2 = f[2]
    rel = f[1]
    e_count_dict[e1] += 1
    e_count_dict[e2] += 1
    r_count_dict[rel] += 1
    head_count[e1] += 1
    tail_count[e2] += 1

In [7]:
x = r_count_dict
sorted_r = {k: v for k, v in sorted(x.items(), key=lambda item: item[1], reverse=True)}

x = e_count_dict
sorted_e = {k: v for k, v in sorted(x.items(), key=lambda item: item[1], reverse=True)}

x = head_count
sorted_head = {k: v for k, v in sorted(x.items(), key=lambda item: item[1], reverse=True)}

x = tail_count
sorted_tail = {k: v for k, v in sorted(x.items(), key=lambda item: item[1], reverse=True)}

In [8]:
for key, value in sorted_r.items():
    print(key, '|', value, 'facts', round(value/len(facts)*100, 2), '%' )

P39 | 78380 facts 23.85 %
P166 | 75474 facts 22.97 %
P54 | 67007 facts 20.39 %
P1411 | 21983 facts 6.69 %
P1346 | 17841 facts 5.43 %
P793 | 9463 facts 2.88 %
P26 | 6034 facts 1.84 %
P108 | 5448 facts 1.66 %
P937 | 4795 facts 1.46 %
P69 | 4424 facts 1.35 %
P512 | 3821 facts 1.16 %
P17 | 2604 facts 0.79 %
P131 | 2327 facts 0.71 %
P106 | 1669 facts 0.51 %
P463 | 1605 facts 0.49 %
P551 | 1407 facts 0.43 %
P150 | 1355 facts 0.41 %
P6 | 1310 facts 0.4 %
P410 | 1306 facts 0.4 %
P102 | 1239 facts 0.38 %
P488 | 1154 facts 0.35 %
P241 | 1112 facts 0.34 %
P185 | 1045 facts 0.32 %
P27 | 872 facts 0.27 %
P527 | 867 facts 0.26 %
P286 | 866 facts 0.26 %
P31 | 676 facts 0.21 %
P5008 | 654 facts 0.2 %
P127 | 596 facts 0.18 %
P1308 | 562 facts 0.17 %
P184 | 544 facts 0.17 %
P451 | 519 facts 0.16 %
P1037 | 490 facts 0.15 %
P361 | 467 facts 0.14 %
P2079 | 466 facts 0.14 %
P195 | 382 facts 0.12 %
P35 | 377 facts 0.11 %
P1075 | 349 facts 0.11 %
P607 | 335 facts 0.1 %
P840 | 334 facts 0.1 %
P598 | 280 facts 

In [9]:
rel = 'P26'
spouseFacts = filterByRelation(facts, rel, -1)

In [11]:
spouse_dict={}
for f in facts:
    if f[1]==rel:
        if f[0] not in spouse_dict: spouse_dict[f[0]]=set()
#         if f[2] not in spouse_dict: spouse_dict[f[2]]=set()
        spouse_dict[f[0]].add(f[2])
count=0
for q in spouse_dict:
    for v in spouse_dict[q]:
        if q not in spouse_dict[v]:
            count+=1
print(count)
        
            
# return filtered


KeyError: 'Q316596'

In [None]:
def splitTimeNoNewTimeStamps(facts, timestamps):
    # don't introduce new timestamps!
    new_facts = []
    for f in facts:
        e1 = f[0]
        r = f[1]
        e2 = f[2]
        t1 = int(f[3])
        t2 = int(f[4])
        for t in range(t1, t2+1):
            if t in timestamps:
                nf = [e1, r, e2, t]
                new_facts.append(nf)
    return new_facts

def splitTime(facts):
    # don't introduce new timestamps!
    new_facts = []
    for f in facts:
        e1 = f[0]
        r = f[1]
        e2 = f[2]
        t1 = int(f[3])
        t2 = int(f[4])
        for t in range(t1, t2+1):
            nf = [e1, r, e2, t]
            new_facts.append(nf)
    return new_facts



In [None]:
len(spouseFacts)

In [None]:
spouseFacts[0]

In [None]:
x = splitTime(spouseFacts)

In [None]:
len(x)

In [None]:
x[:10]

In [None]:
f = x[0]

In [None]:
f

In [None]:
spouseFactsSingle = x

In [12]:
# ERROR: spouse is a symmetric relation but in KB this is not indicated. Though thisis not a problem because
# for our systems really do not interpret relations.


# simple questions, time answer
def genTimeSimple1(facts, base_fact):
    head = base_fact[0]
    tail = base_fact[2]
    template = "When was {head} the spouse of {tail}?"
    answers = set()
    for f in facts:
        if f[0] == head and f[2] == tail:
            answers.add(f[3])
    question = template.format(head=head, tail=tail)
    answer_type = 'time'
    entities = set([head, tail])
    times = set()
    relations = set(['26'])
    output = {'question': question,
             'answers': answers,
             'answer_type': answer_type,
             'template': template,
             'entities': entities,
             'times': times,
             'relations': relations}
    return output

# complex questions, time answer
# def genTimeComplex1(facts, head, tail, first=True):
#     template = "When did {head} play their {adj} game for {tail}?"
#     minTime = 9999
#     maxTime = -1
#     answers = set()
#     for f in facts:
#         if f[0] == head and f[2] == tail:
#             time = f[3]
#             if time < minTime:
#                 minTime = time
#             if time > maxTime:
#                 maxTime = time
#     if first==True:
#         question = template.format(head=head, tail=tail, adj='first')
#         answers.add(minTime)
#     else:
#         question = template.format(head=head, tail=tail, adj='last')
#         answers.add(maxTime)
#     answer_type = 'time'
#     entities = set([head, tail])
#     times = set()
#     relations = set(['P26'])
#     output = {'question': question,
#              'answers': answers,
#              'answer_type': answer_type,
#              'template': template,
#              'entities': entities,
#              'times': times,
#              'relations': relations}
#     return output
        
def genTimeComplex2(facts, head, first=True):
    first = True # only want first time that a person got married
    template = "When was the {adj} time that {head} got married?"
    minTime = 9999
    maxTime = -1
    answers = set()
    for f in facts:
        if f[0] == head:
            time = f[3]
            if time < minTime:
                minTime = time
            if time > maxTime:
                maxTime = time
    if first==True:
        question = template.format(head=head, adj='first')
        answers.add(minTime)
    else:
        question = template.format(head=head, adj='last')
        answers.add(maxTime)
    answer_type = 'time'
    entities = set([head])
    times = set()
    relations = set(['P26'])
    output = {'question': question,
             'answers': answers,
             'answer_type': answer_type,
             'template': template,
             'entities': entities,
             'times': times,
             'relations': relations}
    return output
        

# simple questions, entity answer
def genEntitySimple1(facts, head, time):
    template = "Who was {head} married to in {time}?"
    answers = set()
    for f in facts:
        if f[0] == head and f[3] == time: 
            answers.add(f[2])
    question = template.format(head=head, time=time)
    answer_type = 'entity'
    entities = set([head])
    times = set([time])
    relations = set(['P26'])
    output = {'question': question,
             'answers': answers,
             'answer_type': answer_type,
             'template': template,
             'entities': entities,
             'times': times,
             'relations': relations}
    return output
        

# complex question, entity answer
def genEntityComplex1(facts, head, first=True):
    # first/last
    template = "Who was the {adj} person that {head} was married to?"
    minTime = 9999
    minTeam = ""
    maxTime = -1
    maxTeam = ""
    answers = set()
    for f in facts:
        if f[0] == head:
            time = f[3]
            if first==True:
                if time < minTime:
                    minTime = time
            else:
                if time > maxTime:
                    maxTime = time
    for f in facts:
        if f[0] == head:
            time = f[3]
            if first==True:
                if time == minTime:
                    answers.add(f[2])
            else:
                if time == maxTime:
                    answers.add(f[2])
    if first==True:
        question = template.format(head=head, adj='first')
    else:
        question = template.format(head=head, adj='last')
    answer_type = 'entity'
    entities = set([head])
    times = set()
    relations = set(['P26'])
    if len(answers)==0:
        return None
    output = {'question': question,
             'answers': answers,
             'answer_type': answer_type,
             'template': template,
             'entities': entities,
             'times': times,
             'relations': relations}
    return output
        

def getFactWithMaximumTime(facts, head = '', tail = ''):
    maxTime = -1
    fact = facts[0]
    for f in facts:
        time = f[-1]
        if head != '':
            if f[0] != head:
                continue
        if tail != '':
            if f[2] != tail:
                continue
        if time > maxTime:
            maxTime = time
            fact = f
    return fact

def getFactWithMinimumTime(facts, head = '', tail = ''):
    minTime = 9999
    fact = facts[0]
    for f in facts:
        time = f[-1]
        if head != '':
            if f[0] != head:
                continue
        if tail != '':
            if f[2] != tail:
                continue
        if time < minTime:
            minTime = time
            fact = f
    return fact


def genEntityComplex2(facts, head, tail, after=True):
    # before/after
    template = "Who was {head} married to {type} {tail}?"
    # before: find argmin(T) (head, r, tail, T)
    # then, find argmax(T')(head, r, tail', T'), T' < T and tail' != tail
    # tail' is the answer
    answers = set()
    if after == False:
        base_fact = getFactWithMinimumTime(facts, head = head, tail = tail)
        time = base_fact[-1]
        maxTime = -1
        for f in facts:
            time2 = f[-1]
            if time2 >= time:
                continue
            if f[0] != head:
                continue
            if f[2] == tail:
                continue
            if time2 > maxTime:
                maxTime = time2
        answers = set()
        for f in facts:
            time2 = f[-1]
            if f[0] != head:
                continue
            if f[2] == tail:
                continue
            if time2 == maxTime:
                answers.add(f[2])
        question = template.format(head=head, type="before", tail=tail)
    else:
        #after: find argmax(T) (head, r, tail, T)
        # then, find argmin(T') (head, r, tail', T'), T' > T and tail' != tail
        # tail' is the answer
        base_fact = getFactWithMaximumTime(facts, head = head, tail = tail)
        time = base_fact[-1]
        minTime = 9999
        for f in facts:
            time2 = f[-1]
            if time2 <= time:
                continue
            if f[0] != head:
                continue
            if f[2] == tail:
                continue
            if time2 < minTime:
                minTime = time2
        answers = set()
        for f in facts:
            time2 = f[-1]
            if f[0] != head:
                continue
            if f[2] == tail:
                continue
            if time2 == minTime:
                answers.add(f[2])
        question = template.format(head=head, type="after", tail=tail)
    answer_type = 'entity'
    entities = set([head, tail])
    times = set()
    relations = set(['P26'])
    if len(answers)==0:
        return None
    output = {'question': question,
             'answers': answers,
             'answer_type': answer_type,
             'template': template,
             'entities': entities,
             'times': times,
             'relations': relations}
    return output
        
    
    
# def genEntityComplex3(facts, head, tail):
#     template = "Who played with {head} on the {tail}?"
#     # first get all time instances when head played for tail
#     # then get all heads, where tail' = tail and time' in T
#     valid_times = set()
#     for f in facts:
#         if f[0] == head and f[2] == tail:
#             valid_times.add(f[-1])
#     answers = set()
#     for f in facts:
#         if f[2] == tail and f[-1] in valid_times:
#             answers.add(f[0])
#     question = template.format(head=head, tail=tail)
#     answer_type = 'entity'
#     entities = set([head, tail])
#     times = set()
#     relations = set(['P54'])
#     output = {'question': question,
#              'answers': answers,
#              'answer_type': answer_type,
#              'template': template,
#              'entities': entities,
#              'times': times,
#              'relations': relations}
#     return output


# how to make event questions?
# during, before, after event?
# one example: take sports events eg. Olympics
# and making during questions

# def genEventQuestion(facts, events_fact, head):
#     event_head = events_fact[0]
#     template = "Which team did {head} play for during {event_head}?"
#     valid_times = set()
#     start_time = int(events_fact[-2])
#     end_time = int(events_fact[-1])
#     for i in range(start_time, end_time + 1):
#         valid_times.add(i)
#     answers = set()
#     for f in facts:
#         if f[0] == head and f[-1] in valid_times:
#             answers.add(f[2])
#     question = template.format(event_head=event_head, head=head)
#     answer_type = 'entity'
#     entities = set([head, event_head])
#     times = set()
#     relations = set(['P39'])
#     output = {'question': question,
#              'answers': answers,
#              'answer_type': answer_type,
#              'template': template,
#              'entities': entities,
#              'times': times,
#              'relations': relations}
#     return output

        

def areFactsSame(f1, f2):
    flag = True
    for i in range(3):
        if f1[i] != f2[i]:
            flag = False
            break
    return flag

In [None]:
id = 1
f = spouseFactsSingle[id]
genTimeSimple1(spouseFactsSingle, f)

In [None]:
genTimeComplex2(spouseFactsSingle, f[0], first=True)

In [None]:
# genTimeComplex1(spouseFactsSingle, f[0], f[2])

In [None]:
genEntitySimple1(spouseFactsSingle, f[0], f[-1])

In [None]:
genEntityComplex1(spouseFactsSingle, f[0], first=True)

In [None]:
for id in tqdm(range(500)):
    f = spouseFactsSingle[id]
    a = genEntityComplex2(spouseFactsSingle, f[0], f[2])
    if len(a['answers']) > 0:
        print(a)
        break

In [None]:
# kg_input_output_t5 = []
# for f in sportsFacts:
#     entry = {}
#     head = f[0]
#     relation = f[1]
#     tail = f[2]
#     start_time = f[3][1:5]
#     end_time = f[4][1:5]
#     input_format = 

In [14]:
def makeQuestions(spouseFactsSingle, f):
    questions = []
    questions.append(genTimeSimple1(spouseFactsSingle, f))
    questions.append(genTimeComplex2(spouseFactsSingle, f[0], first=random.choice([True, False])))
    questions.append(genEntitySimple1(spouseFactsSingle, f[0], f[-1]))
    tempques=genEntityComplex1(spouseFactsSingle, f[0], first=random.choice([True, False]))
    if tempques is not None:
        questions.append(tempques)
    tempques=genEntityComplex2(spouseFactsSingle, f[0], f[2], after=random.choice([True, False]))
    if tempques is not None:
        questions.append(tempques)
    return questions



In [None]:
import pickle
dataset_split = 'train'
split_entities = set(pickle.load(open(dataset_split + '_ents.pickle', 'rb')))
split_facts = []
my_facts = spouseFactsSingle
for f in my_facts:
    if f[0] in split_entities and f[2] in split_entities:
        split_facts.append(f)
len(split_facts)

In [None]:
# how to get number of questions?
# 1. set total questions to 300k
# 2. get number of facts with this relation
# 3. get how many questions u want of this relation as fraction
# 4. choose 10% of that number for test, 90% for train
max_dataset_questions = 300000
relation_name = my_facts[0][1]
num_relation_facts = len(filterByRelation(facts, relation_name, -1))
# fraction of questions need to be decided based on fraction of question relation facts
# not all facts!
question_relation_list = ['P39', 'P166', 'P108', 'P54', 'P26']
num_all_question_relation_facts = sum([len(filterByRelation(facts, x, -1)) for x in question_relation_list])
num_questions_for_this_relation = int(max_dataset_questions * num_relation_facts/num_all_question_relation_facts)
split_ratios = {'test': 0.1, 'train': 0.9}
num_questions = int(split_ratios[dataset_split] * num_questions_for_this_relation)
num_questions

In [None]:
import random
data = []
# num_questions = 100000
# events_facts_small = events_facts[:10]
questions_set = set()

pbar = tqdm(range(len(split_facts)))
random.shuffle(split_facts)
for i in range(len(split_facts)):
#     f = random.choice(positionFactsSingle)
    f = split_facts[i]
    pbar.set_description("Num questions %d, i %d" % (len(data), i))
    questions = makeQuestions(my_facts, f)
    for q in questions:
        if len(q['answers']) > 0 and q['question'] not in questions_set:
            data.append(q)
            questions_set.add(q['question'])
    if len(data) >= num_questions:
        break

In [None]:
def getQuestionTypeDistribution(data):
    type_dict = {}
    for d in data:
        template = d['template']
        if template not in type_dict:
            type_dict[template] = 1
        else:
            type_dict[template] += 1
    return type_dict
getQuestionTypeDistribution(data)

In [None]:
len(data)

In [None]:
import pickle
# filename = 'data/questions/questions_spouse_big.pickle'
filename = 'data/questions/{split}_questions_spouse_big.pickle'.format(
            split=dataset_split)

pickle.dump(data, open(filename, "wb"))

In [None]:
import random

for i in tqdm(range(10000)):
    id = random.randint(0, len(sportsFactsSingle))
    f = sportsFactsSingle[id]
    try:
        data += makeQuestions(sportsFactsSingle, f)
    except:
        continue

In [None]:
data[10:]

In [None]:
def writeQuestions(filename, data):
    f = open(filename, 'w')
    for d in data:
        answers_str = []
        for ans in d[1]:
            answers_str.append(str(ans))
        if answers_str == []:
            continue
        line = d[0] + '\t' + '|'.join(answers_str)
        f.write(line + '\n')
    f.close()

In [None]:
writeQuestions('questions_member_of_sports_team.txt', data)