In [4]:
from tqdm import tqdm
import pickle

# NOTE(review): readFactsFromFile is defined in the following cell (In [3]); on a
# fresh kernel this cell fails unless that cell has been run first.
# Earlier data sources, kept for reference:
# facts = pickle.load(open('wikidata_text_facts.txt', 'rb'))

# facts = pickle.load(open('text_facts_v2.pkl', 'rb'))
facts = readFactsFromFile('../data/temporal_big/full.txt')

In [3]:
def getEntsFromFacts(facts):
    """Return the set of all entities used as head (index 0) or tail (index 2).

    Args:
        facts: Iterable of indexable facts with at least three fields.

    Returns:
        set: Every distinct value found at index 0 or index 2 of any fact.
    """
    entities = set()
    for fact in facts:
        entities.update((fact[0], fact[2]))
    return entities

def readFile(filename):
    """Return the non-blank lines of a text file, stripped of surrounding whitespace.

    Args:
        filename: Path of the text file to read.

    Returns:
        list[str]: Stripped lines in file order; lines that are empty after
        stripping are dropped.
    """
    # The context manager closes the handle even if reading fails part-way through.
    with open(filename, 'r') as f:
        stripped_lines = (line.strip() for line in f)
        return [line for line in stripped_lines if line != '']

def getRelsFromFacts(facts):
    """Return the set of all relations (index 1) occurring in `facts`."""
    return {fact[1] for fact in facts}

def _collectMatching(facts, predicate, limit):
    """Return, in order, the facts accepted by `predicate`, stopping at `limit`.

    Args:
        facts: Iterable of facts.
        predicate: Callable taking a fact and returning a truthy value for a match.
        limit: Maximum number of matches to return; a negative value means
            "no limit" and 0 yields an empty list.

    Returns:
        list: The matching facts, at most `limit` of them when `limit >= 0`.
    """
    matched = []
    if limit == 0:
        # Previously one fact could still be returned when the very first fact matched.
        return matched
    for fact in facts:
        if predicate(fact):
            matched.append(fact)
            # Stop scanning as soon as the quota is filled (never for negative limits).
            if 0 < limit <= len(matched):
                break
    return matched


def filterByRelation(facts, rel, max=50):
    """Return up to `max` facts whose relation (index 1) equals `rel`.

    `max` shadows the builtin but is kept for caller compatibility; pass a
    negative value to return every match.
    """
    return _collectMatching(facts, lambda f: f[1] == rel, max)


def filterByHead(facts, head, max=50):
    """Return up to `max` facts whose head (index 0) equals `head`.

    Pass a negative `max` to return every match.
    """
    return _collectMatching(facts, lambda f: f[0] == head, max)


def filterByTail(facts, tail, max=50):
    """Return up to `max` facts whose tail (index 2) equals `tail`.

    Pass a negative `max` to return every match.
    """
    return _collectMatching(facts, lambda f: f[2] == tail, max)


def filterByEntity(facts, ent, max=50):
    """Return up to `max` facts having `ent` as head (index 0) or tail (index 2).

    Pass a negative `max` to return every match.
    """
    return _collectMatching(facts, lambda f: f[0] == ent or f[2] == ent, max)

def printFact(f):
    """Print one fact on a single line as "head, rel, tail, t1, t2".

    The two time fields are printed as the slice [1:5] of f[3] and f[4].
    NOTE(review): presumably written for signed timestamps such as '+1847-...';
    confirm against the data, since a bare 'YYYY' string loses its first digit
    and a non-sliceable (e.g. int) timestamp raises TypeError.
    """
    start, end = f[3][1:5], f[4][1:5]
    print("{head}, {rel}, {tail}, {t1}, {t2}".format(
        head=f[0], rel=f[1], tail=f[2], t1=start, t2=end))
    
def printFacts(facts):
    """Print every fact in `facts`, one per line, by delegating to printFact."""
    for fact in facts:
        printFact(fact)
        
def isEntityInFact(e, fact):
    """Return True when `e` is the head (index 0) or the tail (index 2) of `fact`."""
    return bool(fact[0] == e or fact[2] == e)
    
        
def writeFactsToFile(filename, facts):
    """Write facts to `filename`, one per line, with tab-separated fields.

    Args:
        filename: Destination path; an existing file is overwritten.
        facts: Iterable of facts, each an iterable of fields.

    Fields are converted with str() so that facts carrying integer timestamps
    (such as the output of splitTime) can be written too; string fields are
    written unchanged.
    """
    # The context manager guarantees the file is flushed and closed on error.
    with open(filename, 'w') as f:
        for fact in facts:
            f.write('\t'.join(str(field) for field in fact) + '\n')
    
def readFactsFromFile(filename):
    """Read tab-separated facts from `filename`, one fact per line.

    Args:
        filename: Path of the facts file.

    Returns:
        list[list[str]]: One list of string fields per non-blank line, in file
        order. Lines are stripped of surrounding whitespace before splitting.
    """
    facts = []
    # The context manager closes the handle even if reading raises.
    with open(filename, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                # A blank line would otherwise become the bogus fact [''] and
                # break every consumer that indexes f[1] or f[2].
                continue
            facts.append(line.split('\t'))
    return facts

def createWikidataIdentifierToTextDict(entities, relations):
    """Build a mapping from prefixed identifier to display name.

    Each input line is split on tabs. The field at index 1 is prefixed with
    'Q' (for `entities`) or 'P' (for `relations`) to form the key, and the
    field at index 2 is the name.

    Args:
        entities: Iterable of tab-separated entity lines.
        relations: Iterable of tab-separated relation lines.

    Returns:
        dict[str, str]: Identifier -> name. A line without a name field maps
        to the empty string. Lines with fewer than two fields carry no
        identifier and are skipped (they previously raised IndexError from
        inside the bare `except` handler).
    """
    wikidata_id2name = {}
    for prefix, lines in (('Q', entities), ('P', relations)):
        for line in lines:
            fields = line.split('\t')
            if len(fields) < 2:
                continue
            identifier = prefix + fields[1]
            wikidata_id2name[identifier] = fields[2] if len(fields) > 2 else ''
    return wikidata_id2name

def convertFactToText(fact, wikidata_identifier_to_text):
    """Return a copy of `fact` with head, relation and tail replaced by their names.

    Args:
        fact: Fact whose first three fields are keys of the mapping; it must
            provide a .copy() method (e.g. a list). The input is not modified.
        wikidata_identifier_to_text: Mapping from identifier to display text.

    Returns:
        The copied fact with indices 0-2 translated; remaining fields untouched.

    Raises:
        KeyError: If one of the first three fields is missing from the mapping.
    """
    text_fact = fact.copy()
    text_fact[0], text_fact[1], text_fact[2] = (
        wikidata_identifier_to_text[fact[pos]] for pos in range(3)
    )
    return text_fact

In [5]:
facts[0]

['Q25559009', 'P39', 'Q41582555', '1847', '1852']

In [6]:
# facts = readFactsFromFile('data/temporal_small/full.txt')
facts = readFactsFromFile('../data/temporal_big/full.txt')

In [7]:
r = getRelsFromFacts(facts)
e = getEntsFromFacts(facts)

In [8]:
len(facts)

328635

In [9]:
len(e)

125726

In [10]:
# Occurrence counters keyed by entity / relation id:
#   e_count_dict - times an entity appears as head or tail
#   r_count_dict - times a relation appears
#   head_count   - times an entity appears as head (index 0)
#   tail_count   - times an entity appears as tail (index 2)
e_count_dict = {}
r_count_dict = {}
head_count = {}
tail_count = {}
# Pre-seed every key with zero so the increments below never hit a missing key.
for ent in e:
    e_count_dict[ent] = 0
    head_count[ent] = 0
    tail_count[ent] = 0
for rel in r:
    r_count_dict[rel] = 0
# Single pass over the facts; a fact whose head equals its tail counts twice
# in e_count_dict.
for f in facts:
    e1 = f[0]
    e2 = f[2]
    rel = f[1]
    e_count_dict[e1] += 1
    e_count_dict[e2] += 1
    r_count_dict[rel] += 1
    head_count[e1] += 1
    tail_count[e2] += 1

In [11]:
# Re-build each counter as a dict ordered by descending count (dicts keep
# insertion order). `x` is only a scratch name and is reassigned for each counter.
x = r_count_dict
sorted_r = {k: v for k, v in sorted(x.items(), key=lambda item: item[1], reverse=True)}

x = e_count_dict
sorted_e = {k: v for k, v in sorted(x.items(), key=lambda item: item[1], reverse=True)}

x = head_count
sorted_head = {k: v for k, v in sorted(x.items(), key=lambda item: item[1], reverse=True)}

x = tail_count
sorted_tail = {k: v for k, v in sorted(x.items(), key=lambda item: item[1], reverse=True)}

In [12]:
for key, value in sorted_r.items():
    print(key, '|', value, 'facts', round(value/len(facts)*100, 2), '%' )

P39 | 78380 facts 23.85 %
P166 | 75474 facts 22.97 %
P54 | 67007 facts 20.39 %
P1411 | 21983 facts 6.69 %
P1346 | 17841 facts 5.43 %
P793 | 9463 facts 2.88 %
P26 | 6034 facts 1.84 %
P108 | 5448 facts 1.66 %
P937 | 4795 facts 1.46 %
P69 | 4424 facts 1.35 %
P512 | 3821 facts 1.16 %
P17 | 2604 facts 0.79 %
P131 | 2327 facts 0.71 %
P106 | 1669 facts 0.51 %
P463 | 1605 facts 0.49 %
P551 | 1407 facts 0.43 %
P150 | 1355 facts 0.41 %
P6 | 1310 facts 0.4 %
P410 | 1306 facts 0.4 %
P102 | 1239 facts 0.38 %
P488 | 1154 facts 0.35 %
P241 | 1112 facts 0.34 %
P185 | 1045 facts 0.32 %
P27 | 872 facts 0.27 %
P527 | 867 facts 0.26 %
P286 | 866 facts 0.26 %
P31 | 676 facts 0.21 %
P5008 | 654 facts 0.2 %
P127 | 596 facts 0.18 %
P1308 | 562 facts 0.17 %
P184 | 544 facts 0.17 %
P451 | 519 facts 0.16 %
P1037 | 490 facts 0.15 %
P361 | 467 facts 0.14 %
P2079 | 466 facts 0.14 %
P195 | 382 facts 0.12 %
P35 | 377 facts 0.11 %
P1075 | 349 facts 0.11 %
P607 | 335 facts 0.1 %
P840 | 334 facts 0.1 %
P598 | 280 facts 

In [13]:
#rel = 'position held'
rel = 'P39'
sportsFacts = filterByRelation(facts, rel, -1)

In [14]:
len(sportsFacts)

78380

In [15]:
# position_held = filterByRelation(facts, 'position held', -1)
position_held = filterByRelation(facts, 'P39', -1)
len(position_held)

78380

In [16]:
# Keep only facts whose tail contains at least one uppercase character.
# NOTE(review): with Wikidata ids such as 'Q41582555' the tail always differs
# from its lowercase form, so nothing is removed here (position_held and
# filtered_facts both have 78380 entries). Presumably this was written for
# text-name tails - confirm whether it is still needed.
filtered_facts = []
for f in position_held:
    tail = f[2]
    if tail != tail.lower():
        filtered_facts.append(f)

In [17]:
len(filtered_facts)

78380

In [18]:
def splitTimeNoNewTimeStamps(facts, timestamps):
    """Expand interval facts into single-time facts restricted to known timestamps.

    Each fact [head, rel, tail, t1, t2] yields one [head, rel, tail, t] for every
    integer t in the inclusive range int(t1)..int(t2) that is contained in
    `timestamps`; no timestamp outside that collection is introduced.

    Args:
        facts: Iterable of facts with start time at index 3 and end time at index 4.
        timestamps: Container of allowed integer timestamps.

    Returns:
        list[list]: Expanded facts with an int timestamp as fourth field.
    """
    expanded = []
    for fact in facts:
        head, rel, tail = fact[0], fact[1], fact[2]
        start, end = int(fact[3]), int(fact[4])
        expanded.extend(
            [head, rel, tail, year]
            for year in range(start, end + 1)
            if year in timestamps
        )
    return expanded

def splitTime(facts):
    """Expand interval facts into one fact per integer time step.

    Each fact [head, rel, tail, t1, t2] yields [head, rel, tail, t] for every
    integer t in the inclusive range int(t1)..int(t2). Unlike
    splitTimeNoNewTimeStamps, every intermediate value is emitted. A fact whose
    end precedes its start yields nothing.

    Args:
        facts: Iterable of facts with start time at index 3 and end time at index 4.

    Returns:
        list[list]: Expanded facts with an int timestamp as fourth field.
    """
    expanded = []
    for fact in facts:
        head, rel, tail = fact[0], fact[1], fact[2]
        start, end = int(fact[3]), int(fact[4])
        expanded.extend([head, rel, tail, year] for year in range(start, end + 1))
    return expanded



In [19]:
# Collect every distinct start (second-to-last field) and end (last field)
# time that occurs in the raw interval facts, as integers.
timestamps = set()
for f in facts:
    timestamps.add(int(f[-2]))
    timestamps.add(int(f[-1]))
print(len(timestamps))


1643


In [20]:
x = splitTime(filtered_facts)

In [21]:
len(x)

422446

In [22]:
len(filtered_facts)

78380

In [23]:
sportsFactsSingle = x

In [24]:
sportsFactsSingle[0]

['Q25559009', 'P39', 'Q41582555', 1847]

In [78]:
# simple questions, time answer
def genTimeSimple1(facts, base_fact):
    """Build the question "When did {head} hold the position of {tail}?".

    Args:
        facts: Single-time facts (head, rel, tail, time).
        base_fact: Fact supplying the head (index 0) and tail (index 2).

    Returns:
        dict: Question record whose 'answers' is the set of times (index 3) of
        all facts sharing the base fact's head and tail.
    """
    head, tail = base_fact[0], base_fact[2]
    template = "When did {head} hold the position of {tail}?"
    matching_times = {
        fact[3] for fact in facts if fact[0] == head and fact[2] == tail
    }
    return {
        'question': template.format(head=head, tail=tail),
        'answers': matching_times,
        'answer_type': 'time',
        'template': template,
        'entities': {head, tail},
        'times': set(),
        'relations': {'P39'},
    }

# complex questions, time answer
def genTimeComplex1(facts, head, tail, first=True):
    """Build "When was the first/last time that {head} was the {tail}?".

    Args:
        facts: Single-time facts (head, rel, tail, time).
        head: Entity holding the position.
        tail: The position.
        first: True asks for the earliest time, anything else for the latest.

    Returns:
        dict: Question record. 'answers' holds the single earliest/latest time
        at which (head, tail) occurs, or is empty when no such fact exists.
    """
    template = "When was the {adj} time that {head} was the {tail}?"
    held_times = [f[3] for f in facts if f[0] == head and f[2] == tail]
    # `== True` is kept so non-bool arguments pick the same branch as before.
    adj = 'first' if first == True else 'last'
    answers = set()
    # min()/max() over the real times replaces the old 9999 / -1 sentinels,
    # which leaked into the answers when nothing matched and mishandled
    # times outside that range (e.g. years before -1).
    if held_times:
        answers.add(min(held_times) if adj == 'first' else max(held_times))
    return {
        'question': template.format(head=head, tail=tail, adj=adj),
        'answers': answers,
        'answer_type': 'time',
        'template': template,
        'entities': set([head, tail]),
        'times': set(),
        'relations': set(['P39']),
    }
        
def genTimeComplex2(facts, tail, first=True):
    """Build "When did the first/last {tail} come to power?".

    Args:
        facts: Single-time facts (head, rel, tail, time).
        tail: The position.
        first: True asks about the earliest time, anything else the latest.

    Returns:
        dict: Question record. 'answers' holds the smallest (first) or largest
        (last) time over all facts with this tail, or is empty when none exist.
    """
    # NOTE(review): the wording is only semantically right for positions such
    # as president. For 'last' the answer is the latest time the position was
    # held, which is not necessarily when the last holder took office - confirm.
    template = "When did the {adj} {tail} come to power?"
    held_times = [f[3] for f in facts if f[2] == tail]
    # `== True` is kept so non-bool arguments pick the same branch as before.
    adj = 'first' if first == True else 'last'
    answers = set()
    # Real min()/max() replaces the old 9999 / -1 sentinels, which leaked into
    # the answers when nothing matched and mishandled out-of-range times.
    if held_times:
        answers.add(min(held_times) if adj == 'first' else max(held_times))
    return {
        'question': template.format(tail=tail, adj=adj),
        'answers': answers,
        'answer_type': 'time',
        'template': template,
        'entities': set([tail]),
        'times': set(),
        'relations': set(['P39']),
    }

# simple questions, entity answer
def genEntitySimple1(facts, tail, time):
    """Build the question "Who was the {tail} in {time}?".

    Args:
        facts: Single-time facts (head, rel, tail, time).
        tail: The position.
        time: The time to ask about; compared with index 3 of each fact.

    Returns:
        dict: Question record whose 'answers' is the set of heads holding
        `tail` at exactly `time`.
    """
    template = "Who was the {tail} in {time}?"
    holders = {fact[0] for fact in facts if fact[2] == tail and fact[3] == time}
    return {
        'question': template.format(tail=tail, time=time),
        'answers': holders,
        'answer_type': 'entity',
        'template': template,
        'entities': {tail},
        'times': {time},
        'relations': {'P39'},
    }

def genEntitySimple2(facts, tail, time1, time2):
    """Build the question "Who was the {tail} from {time1} to {time2}?".

    The previous body referenced the undefined names `head` and `time` (so it
    raised NameError), collected the tail instead of the head, and formatted
    the template with the wrong keys.

    Args:
        facts: Single-time facts (head, rel, tail, time).
        tail: The position.
        time1: Start of the interval (inclusive).
        time2: End of the interval (inclusive).

    Returns:
        dict: Question record whose 'answers' is the set of heads that held
        `tail` at some time t with time1 <= t <= time2.
    """
    # NOTE(review): "from .. to .." is read as "at any point in the interval";
    # confirm whether holding the position for the whole interval was intended.
    template = "Who was the {tail} from {time1} to {time2}?"
    answers = set()
    for f in facts:
        if f[2] == tail and time1 <= f[3] <= time2:
            answers.add(f[0])
    question = template.format(tail=tail, time1=time1, time2=time2)
    return {
        'question': question,
        'answers': answers,
        'answer_type': 'entity',
        'template': template,
        'entities': set([tail]),
        'times': set([time1, time2]),
        'relations': set(['P39']),
    }


# complex question, entity answer
def genEntityComplex1(facts, tail, first=True):
    """Build the question "Who was the first/last {tail}?".

    The boundary fact is looked up with getFactWithMinimumTime (first) or
    getFactWithMaximumTime (last); the answers are the heads of all facts that
    share the boundary fact's time (last field) and have `tail` as their
    second-to-last-but-time field (index -2).

    Args:
        facts: Single-time facts (head, rel, tail, time).
        tail: The position.
        first: True for the earliest holder(s), anything else for the latest.

    Returns:
        dict: Question record with 'answers' as a set of heads. The question
        text uses the tail of the boundary fact.
    """
    template = "Who was the {adj} {tail}?"
    if first == True:
        adj = 'first'
        boundary_fact = getFactWithMinimumTime(facts, head='', tail=tail)
    else:
        adj = 'last'
        boundary_fact = getFactWithMaximumTime(facts, head='', tail=tail)
    boundary_time = boundary_fact[-1]
    holders = {
        fact[0] for fact in facts
        if fact[-1] == boundary_time and fact[-2] == tail
    }
    return {
        'question': template.format(tail=boundary_fact[2], adj=adj),
        'answers': holders,
        'answer_type': 'entity',
        'template': template,
        'entities': {tail},
        'times': set(),
        'relations': {'P39'},
    }

def getFactWithMaximumTime(facts, head = '', tail = ''):
    """Return the fact with the largest time (last field) among matching facts.

    Args:
        facts: Non-empty sequence of facts; the time is read from index -1.
        head: If not '', only facts whose index 0 equals it are considered.
        tail: If not '', only facts whose index 2 equals it are considered.

    Returns:
        The first matching fact carrying the maximum time. When no fact
        matches the filters, facts[0] is returned (historical fallback that
        callers rely on), even though it does not match.

    Raises:
        IndexError: If `facts` is empty.
    """
    fact = facts[0]
    # None instead of the old -1 sentinel, so that times <= -1 are handled too.
    maxTime = None
    for f in facts:
        if head != '' and f[0] != head:
            continue
        if tail != '' and f[2] != tail:
            continue
        time = f[-1]
        if maxTime is None or time > maxTime:
            maxTime = time
            fact = f
    return fact

def getFactWithMinimumTime(facts, head = '', tail = ''):
    """Return the fact with the smallest time (last field) among matching facts.

    Args:
        facts: Non-empty sequence of facts; the time is read from index -1.
        head: If not '', only facts whose index 0 equals it are considered.
        tail: If not '', only facts whose index 2 equals it are considered.

    Returns:
        The first matching fact carrying the minimum time. When no fact
        matches the filters, facts[0] is returned (historical fallback that
        callers rely on), even though it does not match.

    Raises:
        IndexError: If `facts` is empty.
    """
    fact = facts[0]
    # None instead of the old 9999 sentinel, so that times >= 9999 are handled too.
    minTime = None
    for f in facts:
        if head != '' and f[0] != head:
            continue
        if tail != '' and f[2] != tail:
            continue
        time = f[-1]
        if minTime is None or time < minTime:
            minTime = time
            fact = f
    return fact


def genEntityComplex2(facts, head, tail, after=True):
    """Build the question "Who was the {tail} before/after {head}?".

    before: take the earliest time T at which `head` held `tail`, then the
            latest time T' < T at which someone else held `tail`; the answers
            are the other holders at T'.
    after:  take the latest time T at which `head` held `tail`, then the
            earliest time T' > T at which someone else held `tail`; the
            answers are the other holders at T'.

    Args:
        facts: Single-time facts (head, rel, tail, time); time read from index -1.
        head: The reference holder.
        tail: The position.
        after: False selects "before"; anything else selects "after".

    Returns:
        dict: Question record. 'answers' is empty when `head` never held
        `tail` or when there is no holder on the requested side.
    """
    template = "Who was the {tail} {type} {head}?"
    # Times of the reference holder are computed here directly, so a head that
    # never held the position yields no answers instead of an arbitrary fact.
    own_times = [f[-1] for f in facts if f[0] == head and f[2] == tail]
    other_holders = [f for f in facts if f[0] != head and f[2] == tail]

    target_time = None
    if after == False:
        relation_word = "before"
        if own_times:
            boundary = min(own_times)
            earlier = [f[-1] for f in other_holders if f[-1] < boundary]
            # max() over real candidates replaces the old -1 sentinel.
            if earlier:
                target_time = max(earlier)
    else:
        relation_word = "after"
        if own_times:
            boundary = max(own_times)
            later = [f[-1] for f in other_holders if f[-1] > boundary]
            # min() over real candidates replaces the old 9999 sentinel.
            if later:
                target_time = min(later)

    answers = set()
    if target_time is not None:
        answers = {f[0] for f in other_holders if f[-1] == target_time}

    question = template.format(head=head, type=relation_word, tail=tail)
    return {
        'question': question,
        'answers': answers,
        'answer_type': 'entity',
        'template': template,
        'entities': set([head, tail]),
        'times': set(),
        'relations': set(['P39']),
    }
    
# def genEntityComplex3(facts, head1, head2, tail):
#     template = "Which team did {head1} play for when {head2} was playing for "


def genEntityComplex3(facts, head, tail, tail2):
    """Build "Who held the position of {tail2} when {head} was the {tail}?".

    Args:
        facts: Single-time facts (head, rel, tail, time); time read from index -1.
        head: The reference holder.
        tail: The position held by `head`.
        tail2: The position being asked about.

    Returns:
        dict: Question record whose 'answers' is the set of heads holding
        `tail2` at any time at which `head` held `tail`.
    """
    template = "Who held the position of {tail2} when {head} was the {tail}?"
    # All times at which head held tail ...
    reference_times = {
        fact[-1] for fact in facts if fact[0] == head and fact[2] == tail
    }
    # ... and everyone holding tail2 at one of those times.
    holders = {
        fact[0] for fact in facts
        if fact[2] == tail2 and fact[-1] in reference_times
    }
    return {
        'question': template.format(head=head, tail=tail, tail2=tail2),
        'answers': holders,
        'answer_type': 'entity',
        'template': template,
        'entities': {head, tail, tail2},
        'times': set(),
        'relations': {'P39'},
    }
    
def genEventQuestion(facts, events_fact, tail):
    """Build "Who held the position of {tail} during {event_head}?".

    Args:
        facts: Single-time facts (head, rel, tail, time); time read from index -1.
        events_fact: Event fact; index 0 is the event, the last two fields are
            its start and end time (converted with int()).
        tail: The position.

    Returns:
        dict: Question record whose 'answers' is the set of heads holding
        `tail` at any integer time within the event's inclusive span.
    """
    event_head = events_fact[0]
    template = "Who held the position of {tail} during {event_head}?"
    first_year, last_year = int(events_fact[-2]), int(events_fact[-1])
    event_years = set(range(first_year, last_year + 1))
    holders = {
        fact[0] for fact in facts
        if fact[2] == tail and fact[-1] in event_years
    }
    return {
        'question': template.format(event_head=event_head, tail=tail),
        'answers': holders,
        'answer_type': 'entity',
        'template': template,
        'entities': {tail, event_head},
        'times': set(),
        'relations': {'P39'},
    }

def genEventQuestion2(facts, events_fact, tail, after=True):
    """Build "Who held the position of {tail} before/after {event_head}?".

    before: the latest time strictly below the event's start at which `tail`
            was held; the answers are the holders at that time.
    after:  the earliest time strictly above the event's end at which `tail`
            was held; the answers are the holders at that time.

    Args:
        facts: Single-time facts (head, rel, tail, time); time read from index -1.
        events_fact: Event fact; index 0 is the event, the last two fields are
            its start and end time (converted with int()).
        tail: The position.
        after: False selects "before"; anything else selects "after".

    Returns:
        dict: Question record. Heads equal to the event itself are excluded
        from 'answers'; 'answers' is empty when nobody held `tail` on the
        requested side of the event.
    """
    event_head = events_fact[0]
    template = "Who held the position of {tail} {type} {event_head}?"
    holder_facts = [f for f in facts if f[2] == tail]

    if after == False:
        relation_word = "before"
        upper_limit = int(events_fact[-2])
        candidates = [f[-1] for f in holder_facts if f[-1] < upper_limit]
        # max() over real candidates replaces the old -1 sentinel, which
        # ignored times <= -1.
        target_time = max(candidates) if candidates else None
    else:
        relation_word = "after"
        lower_limit = int(events_fact[-1])
        candidates = [f[-1] for f in holder_facts if f[-1] > lower_limit]
        # min() over real candidates replaces the old 99999 sentinel.
        target_time = min(candidates) if candidates else None

    answers = set()
    if target_time is not None:
        answers = {
            f[0] for f in holder_facts
            if f[-1] == target_time and f[0] != event_head
        }

    question = template.format(event_head=event_head, type=relation_word, tail=tail)
    return {
        'question': question,
        'answers': answers,
        'answer_type': 'entity',
        'template': template,
        'entities': set([tail, event_head]),
        'times': set(),
        'relations': set(['P39']),
    }

def areFactsSame(f1, f2):
    """Return True when two facts agree on head, relation and tail (indices 0-2).

    Any further fields (such as timestamps) are not compared.
    """
    return not any(f1[pos] != f2[pos] for pos in range(3))

def toQuestionJSON(question, answers, template, wikidata_identifier_to_text):
    # NOTE(review): unfinished stub. None of the parameters are used, nothing
    # is returned, and the bare lookup below raises KeyError because `output`
    # is still empty. Finish or remove before calling.
    output = {}
    output['']

In [29]:
entities = readFile('entities')
relations = readFile('predicates')

In [30]:
wikidata_identifier_to_text = createWikidataIdentifierToTextDict(entities, relations)
text_to_wikidata_identifier = {v: k for k, v in wikidata_identifier_to_text.items()}

In [31]:
# positionFactsSingle = [convertFactToText(f, wikidata_identifier_to_text) for f in sportsFactsSingle]
positionFactsSingle = sportsFactsSingle

In [48]:
id = 600
f = positionFactsSingle[id]
genTimeSimple1(positionFactsSingle, f)

{'question': 'When did Q324970 hold the position of Q2047357?',
 'answers': {1935,
  1936,
  1937,
  1938,
  1939,
  1940,
  1941,
  1942,
  1943,
  1944,
  1945,
  1946},
 'answer_type': 'time',
 'template': 'When did {head} hold the position of {tail}?',
 'entities': {'Q2047357', 'Q324970'},
 'times': set(),
 'relations': {'P39'}}

In [49]:
genTimeComplex2(positionFactsSingle, f[2], first=True)

{'question': 'When did the first Q2047357 come to power?',
 'answers': {1813},
 'answer_type': 'time',
 'template': 'When did the {adj} {tail} come to power?',
 'entities': {'Q2047357'},
 'times': set(),
 'relations': {'P39'}}

In [50]:
genTimeComplex1(positionFactsSingle, f[0], f[2], first=True)

{'question': 'When was the first time that Q324970 was the Q2047357?',
 'answers': {1935},
 'answer_type': 'time',
 'template': 'When was the {adj} time that {head} was the {tail}?',
 'entities': {'Q2047357', 'Q324970'},
 'times': set(),
 'relations': {'P39'}}

In [51]:
genEntitySimple1(positionFactsSingle, f[2], f[-1])

{'question': 'Who was the Q2047357 in 1946?',
 'answers': {'Q324970'},
 'answer_type': 'entity',
 'template': 'Who was the {tail} in {time}?',
 'entities': {'Q2047357'},
 'times': {1946},
 'relations': {'P39'}}

In [52]:
genEntityComplex1(positionFactsSingle, f[2], first=True)

{'question': 'Who was the first Q2047357?',
 'answers': {'Q26286363'},
 'answer_type': 'entity',
 'template': 'Who was the {adj} {tail}?',
 'entities': {'Q2047357'},
 'times': set(),
 'relations': {'P39'}}

In [53]:
genEntityComplex2(positionFactsSingle, f[0], f[2], after=True)

{'question': 'Who was the Q2047357 after Q324970?',
 'answers': {'Q1790063'},
 'answer_type': 'entity',
 'template': 'Who was the {tail} {type} {head}?',
 'entities': {'Q2047357', 'Q324970'},
 'times': set(),
 'relations': {'P39'}}

In [54]:
genEntityComplex3(positionFactsSingle, f[0], f[2], 'Prime minister of India')

In [55]:
# with (open("temporal_events_as_text_facts.pkl", "rb")) as openfile:
#     events_facts = pickle.load(openfile)
# events_facts = readFactsFromFile('facts_temporal_event.txt')

# events facts are already there in facts

In [56]:
def getEventsFactsFromFacts(facts):
    """Return the facts that mark an event.

    A fact qualifies when its relation (index 1) is 'P793' (significant event)
    and its tail (index 2) is 'Q1190554' (occurrence).
    """
    return [
        fact for fact in facts
        if fact[1] == 'P793' and fact[2] == 'Q1190554'
    ]
events_facts = getEventsFactsFromFacts(facts)


In [57]:
# len(events_facts)
import pickle as pkl
def get_facts_from_ent(entname):
    """Return (and print) the event facts that involve the entity named `entname`.

    Uses the notebook globals name2ent, ent2name, rel2name and events_facts:
    only events_facts is searched, not the full fact list.

    Args:
        entname: Display name of the entity; must be a key of name2ent.

    Returns:
        list[tuple]: One (head name, relation name, tail name, start, end)
        tuple per event fact having the entity as head or tail. Each tuple is
        also printed.
    """
    wanted_id = name2ent[entname]
    named_facts = []
    for raw in events_facts:
        if raw[0] != wanted_id and raw[2] != wanted_id:
            continue
        named = (ent2name[raw[0]], rel2name[raw[1]], ent2name[raw[2]], raw[3], raw[4])
        named_facts.append(named)
        print(named)
    return named_facts
def openFileAsDict(filename):
    """Read a two-column tab-separated file into a dict.

    Args:
        filename: Path of the file; each line is "<key>\\t<value>".

    Returns:
        dict[str, str]: Field 0 -> field 1 for every line. Later lines
        overwrite earlier ones with the same key.

    Raises:
        IndexError: If a line contains no tab.
    """
    out = {}
    # The context manager closes the handle even if a malformed line raises.
    with open(filename, 'r') as f:
        for line in f:
            # Only the newline is removed: strip() is not an option because a
            # name can consist of whitespace, and blindly dropping the last
            # character would truncate a final line that has no newline.
            fields = line.rstrip('\n').split('\t')
            out[fields[0]] = fields[1]
    return out
# Wikidata id -> display text, and the inverse lookups.
# NOTE(review): if two ids share the same text, the inverse dict keeps only one of them.
ent2name = openFileAsDict( '../data/wikidata_big/kg/wd_id2entity_text.txt')
rel2name = openFileAsDict( '../data/wikidata_big/kg/wd_id2relation_text.txt')
name2ent={i:j for (j,i) in ent2name.items()}
name2rel={i:j for (j,i) in rel2name.items()}

# Wikidata id -> integer index tables. The files are now opened in a context
# manager so the handles are closed. Unpickling executes arbitrary code, so
# only load files from a trusted source.
with open("../data/wikidata_big/kg/tkbc_processed_data/wikidata_big/ent_id","rb") as ent_id_file:
    ent2id=pkl.load(ent_id_file)
with open("../data/wikidata_big/kg/tkbc_processed_data/wikidata_big/rel_id","rb") as rel_id_file:
    rel2id=pkl.load(rel_id_file)
id2ent={i:j for (j,i) in ent2id.items()}
id2rel={i:j for (j,i) in rel2id.items()}
# Integer index -> display text.
id2relname={i:rel2name[id2rel[i]] for i in id2rel}
id2entname={i:ent2name[id2ent[i]] for i in id2ent}


In [58]:
name2ent["Member of the Victorian Legislative Assembly"]

'Q18534408'

In [59]:
facts_reqd=get_facts_from_ent("2012 Summer Olympics")

('2012 Summer Olympics', 'significant event', 'occurrence', '2012', '2012')


In [63]:
genEventQuestion(positionFactsSingle, events_facts[0], 'Q11696')

{'question': 'Who held the position of Q11696 during Q362?',
 'answers': {'Q11613', 'Q8007'},
 'answer_type': 'entity',
 'template': 'Who held the position of {tail} during {event_head}?',
 'entities': {'Q11696', 'Q362'},
 'times': set(),
 'relations': {'P39'}}

In [61]:
print(facts_reqd)
genEventQuestion2(positionFactsSingle, facts_reqd[0], 'Q18534408', after=False)

[('2012 Summer Olympics', 'significant event', 'occurrence', '2012', '2012')]


{'question': 'Who held the position of Q18534408 before 2012 Summer Olympics?',
 'answers': {'Q3108047', 'Q6317847', 'Q6939485'},
 'answer_type': 'entity',
 'template': 'Who held the position of {tail} {type} {event_head}?',
 'entities': {'2012 Summer Olympics', 'Q18534408'},
 'times': set(),
 'relations': {'P39'}}

In [62]:
events_facts[0]

['Q362', 'P793', 'Q1190554', '1939', '1945']

In [38]:
wikidata_identifier_to_text['Q11696']

'President of the United States'

In [71]:
id = 200
f = positionFactsSingle[id]
# print(genTimeSimple1(positionFactsSingle, f))
# print(genTimeComplex2(positionFactsSingle, f[2], first=True))
# print(genTimeComplex1(positionFactsSingle, f[0], f[2], first=True))
# print(genEntitySimple1(positionFactsSingle, f[2], f[-1]))
# print(genEntityComplex1(positionFactsSingle, f[2], first=True))
# print(genEntityComplex2(positionFactsSingle, f[0], f[2], after=True))
# print(genEntityComplex3(positionFactsSingle, f[0], f[2], 'Prime minister of India'))
print(genEventQuestion(positionFactsSingle, events_facts[0], 'Q18534408'))
print(genEventQuestion2(positionFactsSingle, events_facts[0], 'Q18534408'))

{'question': 'Who held the position of Q18534408 during Q362?', 'answers': {'Q8011040', 'Q4909082', 'Q7792143', 'Q8001954', 'Q3525207', 'Q8017875', 'Q5645040', 'Q444481'}, 'answer_type': 'entity', 'template': 'Who held the position of {tail} during {event_head}?', 'entities': {'Q18534408', 'Q362'}, 'times': set(), 'relations': {'P39'}}
{'question': 'Who held the position of Q18534408 after Q362?', 'answers': {'Q4909082', 'Q7792143', 'Q8001954', 'Q3525207', 'Q8017875', 'Q444481'}, 'answer_type': 'entity', 'template': 'Who held the position of {tail} {type} {event_head}?', 'entities': {'Q18534408', 'Q362'}, 'times': set(), 'relations': {'P39'}}


In [72]:
import random
def makeQuestions(positionFactsSingle, events_facts, split_facts, f):
    """Generate one question of every template type from the seed fact `f`.

    Args:
        positionFactsSingle: Single-time facts searched for the answers.
        events_facts: Event facts; only the first ten are sampled from.
        split_facts: Facts of the current split; a random one supplies the
            second position for genEntityComplex3.
        f: Seed fact (head, rel, tail, time).

    Returns:
        list[dict]: Nine question records, in a fixed template order. The
        first/last and before/after variants are chosen at random.
    """
    head, tail, year = f[0], f[2], f[-1]
    return [
        genTimeSimple1(positionFactsSingle, f),
        genTimeComplex2(positionFactsSingle, tail, first=random.choice([True, False])),
        genTimeComplex1(positionFactsSingle, head, tail, first=random.choice([True, False])),
        genEntitySimple1(positionFactsSingle, tail, year),
        genEntityComplex1(positionFactsSingle, tail, first=random.choice([True, False])),
        genEntityComplex2(positionFactsSingle, head, tail, after=random.choice([True, False])),
        genEntityComplex3(positionFactsSingle, head, tail, random.choice(split_facts)[2]),
        # Events are sampled from the first ten event facts only, so that
        # World War II (the first event fact) is among the candidates.
        genEventQuestion(positionFactsSingle, random.choice(events_facts[:10]), tail),
        genEventQuestion2(positionFactsSingle, random.choice(events_facts[:10]), tail,
                          after=random.choice([True, False])),
    ]


In [73]:
train_ents = set(pickle.load(open('train_ents.pickle', 'rb')))
test_ents = set(pickle.load(open('test_ents.pickle', 'rb')))

len(train_ents.intersection(test_ents))

0

In [85]:
import pickle
dataset_split = 'test'
# The context manager closes the pickle file; the previous bare open() never did.
# Unpickling executes arbitrary code, so only load trusted files.
with open(dataset_split + '_ents.pickle', 'rb') as ents_file:
    split_entities = set(pickle.load(ents_file))
split_facts = []
my_facts = positionFactsSingle
# Keep only facts whose head AND tail both belong to the chosen split.
for f in my_facts:
    if f[0] in split_entities and f[2] in split_entities:
        split_facts.append(f)

len(split_facts)

25968

In [86]:
# How the number of questions for this relation and split is derived:
# 1. fix the total number of questions for the whole dataset (300k)
# 2. count the facts that have this relation
# 3. give this relation a share of the total proportional to its fact count
# 4. take 10% of that share for test, 90% for train
max_dataset_questions = 300000
# The relation is read from the first fact of my_facts; the count itself is
# taken over the raw interval facts in `facts`.
relation_name = my_facts[0][1]
num_relation_facts = len(filterByRelation(facts, relation_name, -1))
# The share is relative to the facts of the question relations listed below,
# not to all facts.
question_relation_list = ['P39', 'P166', 'P108', 'P54', 'P26']
num_all_question_relation_facts = sum([len(filterByRelation(facts, x, -1)) for x in question_relation_list])
num_questions_for_this_relation = int(max_dataset_questions * num_relation_facts/num_all_question_relation_facts)
split_ratios = {'test': 0.1, 'train': 0.9}
num_questions = int(split_ratios[dataset_split] * num_questions_for_this_relation)
num_questions

10120

In [87]:
num_all_question_relation_facts

232343

In [88]:
import random
data = []
# num_questions has been calculated above
# num_questions = 100000

events_facts_small = events_facts[:10]
questions_set = set()

# NOTE(review): no random seed is set, so the shuffle and the generated
# questions differ from run to run.
random.shuffle(split_facts)
# Iterate over the tqdm object itself; the previous loop used a separate
# range(), so the bar never advanced and stayed at 0%.
pbar = tqdm(range(len(split_facts)))
for i in pbar:
#     f = random.choice(positionFactsSingle)
    f = split_facts[i]
    pbar.set_description("Num questions %d, i %d" % (len(data), i))
    questions = makeQuestions(my_facts, events_facts_small, split_facts, f)
    for q in questions:
        # Keep only answerable questions whose text has not been seen before.
        if len(q['answers']) > 0 and q['question'] not in questions_set:
            data.append(q)
            questions_set.add(q['question'])
    # Checked once per seed fact, so the total can exceed num_questions slightly.
    if len(data) >= num_questions:
        break
pbar.close()

Num questions 10119, i 2565:   0%|          | 0/25968 [20:44<?, ?it/s]

In [89]:
len(data)

10123

In [90]:
def getQuestionTypeDistribution(data):
    """Count how many question records use each template.

    Args:
        data: Iterable of question dicts, each with a 'template' key.

    Returns:
        dict: Template string -> number of records, in order of first appearance.
    """
    counts = {}
    for record in data:
        key = record['template']
        counts[key] = counts.get(key, 0) + 1
    return counts

In [91]:
getQuestionTypeDistribution(data)

{'When did {head} hold the position of {tail}?': 1764,
 'When did the {adj} {tail} come to power?': 475,
 'When was the {adj} time that {head} was the {tail}?': 2059,
 'Who was the {tail} in {time}?': 1554,
 'Who was the {adj} {tail}?': 484,
 'Who held the position of {tail2} when {head} was the {tail}?': 1180,
 'Who was the {tail} {type} {head}?': 1149,
 'Who held the position of {tail} {type} {event_head}?': 993,
 'Who held the position of {tail} during {event_head}?': 465}

In [92]:
import pickle
# filename = 'data/questions/questions_position_held_big.pickle'
filename = 'data/questions/{split}_questions_position_held_big.pickle'.format(
            split=dataset_split)
# Close the file deterministically so the pickle is completely flushed to disk;
# the previous bare open() left that to garbage collection.
with open(filename, "wb") as out_file:
    pickle.dump(data, out_file)

In [None]:
# y = readQuestions('questions_position_held.txt')
# y[50:100]

In [None]:
data[50:100]

In [50]:
len(data)

91082

In [None]:
x = set()
for d in data:
    x.add(d['question'])
len(x)