In [1]:
from tqdm import tqdm
import pickle

# facts = pickle.load(open('wikidata_text_facts.txt', 'rb'))
# facts = pickle.load(open('text_facts_v2.pkl', 'rb'))

In [2]:
def getEntsFromFacts(facts):
    e = set()
    for f in facts:
        e.add(f[0])
        e.add(f[2])
    return e

def readFile(filename):
    f = open(filename, 'r')
    lines = []
    for line in f:
        line = line.strip()
        if line != '':
            lines.append(line)
    f.close()
    return lines

def getRelsFromFacts(facts):
    r = set()
    for f in facts:
        r.add(f[1])
    return r

def filterByRelation(facts, rel, max=50):
    filtered = []
    count = 0
    for f in facts:
        if f[1] == rel:
            filtered.append(f)
            count += 1
        if max < 0:
            continue
        if count >= max:
            break
    return filtered

def filterByHead(facts, head, max=50):
    filtered = []
    count = 0
    for f in facts:
        if f[0] == head:
            filtered.append(f)
            count += 1
        if max < 0:
            continue
        if count >= max:
            break
    return filtered

def filterByTail(facts, tail, max=50):
    filtered = []
    count = 0
    for f in facts:
        if f[2] == tail:
            filtered.append(f)
            count += 1
        if max < 0:
            continue
        if count >= max:
            break
    return filtered

def filterByEntity(facts, ent, max=50):
    filtered = []
    count = 0
    for f in facts:
        if f[0] == ent or f[2] == ent:
            filtered.append(f)
            count += 1
        if max < 0:
            continue
        if count >= max:
            break
    return filtered

def printFact(f):
    s = "{head}, {rel}, {tail}, {t1}, {t2}"
    head=f[0]
    rel = f[1]
    tail = f[2]
    t1 = f[3][1:5]
    t2 = f[4][1:5]
    print(s.format(head=head, rel=rel, tail=tail, t1=t1, t2=t2))
    
def printFacts(facts):
    for f in facts:
        printFact(f)
        
def isEntityInFact(e, fact):
    if fact[0] == e or fact[2] == e:
        return True
    else:
        return False
    
        
def writeFactsToFile(filename, facts):
    f = open(filename, 'w')
    for fact in facts:
        line = '\t'.join(fact)
        f.write(line + '\n')
    f.close()
    
def readFactsFromFile(filename):
    f = open(filename, 'r')
    facts = []
    for line in f:
        fact = line.strip().split('\t')
        facts.append(fact)
    f.close()
    return facts

def createWikidataIdentifierToTextDict(entities, relations):
    ent_rel_both = {}
    ent_rel_both['Q'] = entities
    ent_rel_both['P'] = relations
    wikidata_id2name = {}
    count = 0
    for prefix, lines in ent_rel_both.items():
        for line in lines:
            try:
                line = line.split('\t')
                id = prefix + line[1] #different from one above
                name = line[2]
                wikidata_id2name[id] = name
            except:
                id = prefix + line[1]
                name = ''
                wikidata_id2name[id] = name
    return wikidata_id2name

def convertFactToText(fact, wikidata_identifier_to_text):
    f = fact.copy()
    for i in range(3):
        f[i] = wikidata_identifier_to_text[fact[i]]
    return f

In [5]:
facts = readFactsFromFile('../data/temporal_big/full.txt')

In [6]:
r = getRelsFromFacts(facts)
e = getEntsFromFacts(facts)

In [7]:
e_count_dict = {}
r_count_dict = {}
head_count = {}
tail_count = {}
for ent in e:
    e_count_dict[ent] = 0
    head_count[ent] = 0
    tail_count[ent] = 0
for rel in r:
    r_count_dict[rel] = 0
for f in facts:
    e1 = f[0]
    e2 = f[2]
    rel = f[1]
    e_count_dict[e1] += 1
    e_count_dict[e2] += 1
    r_count_dict[rel] += 1
    head_count[e1] += 1
    tail_count[e2] += 1

In [8]:
x = r_count_dict
sorted_r = {k: v for k, v in sorted(x.items(), key=lambda item: item[1], reverse=True)}

x = e_count_dict
sorted_e = {k: v for k, v in sorted(x.items(), key=lambda item: item[1], reverse=True)}

x = head_count
sorted_head = {k: v for k, v in sorted(x.items(), key=lambda item: item[1], reverse=True)}

x = tail_count
sorted_tail = {k: v for k, v in sorted(x.items(), key=lambda item: item[1], reverse=True)}

In [9]:
for key, value in sorted_r.items():
    print(key, '|', value, 'facts', round(value/len(facts)*100, 2), '%' )

P39 | 78380 facts 23.85 %
P166 | 75474 facts 22.97 %
P54 | 67007 facts 20.39 %
P1411 | 21983 facts 6.69 %
P1346 | 17841 facts 5.43 %
P793 | 9463 facts 2.88 %
P26 | 6034 facts 1.84 %
P108 | 5448 facts 1.66 %
P937 | 4795 facts 1.46 %
P69 | 4424 facts 1.35 %
P512 | 3821 facts 1.16 %
P17 | 2604 facts 0.79 %
P131 | 2327 facts 0.71 %
P106 | 1669 facts 0.51 %
P463 | 1605 facts 0.49 %
P551 | 1407 facts 0.43 %
P150 | 1355 facts 0.41 %
P6 | 1310 facts 0.4 %
P410 | 1306 facts 0.4 %
P102 | 1239 facts 0.38 %
P488 | 1154 facts 0.35 %
P241 | 1112 facts 0.34 %
P185 | 1045 facts 0.32 %
P27 | 872 facts 0.27 %
P527 | 867 facts 0.26 %
P286 | 866 facts 0.26 %
P31 | 676 facts 0.21 %
P5008 | 654 facts 0.2 %
P127 | 596 facts 0.18 %
P1308 | 562 facts 0.17 %
P184 | 544 facts 0.17 %
P451 | 519 facts 0.16 %
P1037 | 490 facts 0.15 %
P361 | 467 facts 0.14 %
P2079 | 466 facts 0.14 %
P195 | 382 facts 0.12 %
P35 | 377 facts 0.11 %
P1075 | 349 facts 0.11 %
P607 | 335 facts 0.1 %
P840 | 334 facts 0.1 %
P598 | 280 facts 

In [10]:
rel = 'P54'
sportsFacts = filterByRelation(facts, rel, -1)

In [11]:
def splitTimeNoNewTimeStamps(facts, timestamps):
    # don't introduce new timestamps!
    new_facts = []
    for f in facts:
        e1 = f[0]
        r = f[1]
        e2 = f[2]
        t1 = int(f[3])
        t2 = int(f[4])
        for t in range(t1, t2+1):
            if t in timestamps:
                nf = [e1, r, e2, t]
                new_facts.append(nf)
    return new_facts

def splitTime(facts):
    # don't introduce new timestamps!
    new_facts = []
    for f in facts:
        e1 = f[0]
        r = f[1]
        e2 = f[2]
        t1 = int(f[3])
        t2 = int(f[4])
        for t in range(t1, t2+1):
            nf = [e1, r, e2, t]
            new_facts.append(nf)
    return new_facts



In [12]:
len(sportsFacts)

67007

In [13]:
sportsFacts[0]

['Q5224251', 'P54', 'Q18515', '2010', '2011']

In [14]:
x = splitTime(sportsFacts)

In [15]:
len(x)

214545

In [16]:
x[:10]

[['Q5224251', 'P54', 'Q18515', 2010],
 ['Q5224251', 'P54', 'Q18515', 2011],
 ['Q232789', 'P54', 'Q1422', 1998],
 ['Q232789', 'P54', 'Q1422', 1999],
 ['Q1460919', 'P54', 'Q2798', 1971],
 ['Q1460919', 'P54', 'Q2798', 1972],
 ['Q1486409', 'P54', 'Q13391', 1994],
 ['Q1486409', 'P54', 'Q13391', 1995],
 ['Q19585927', 'P54', 'Q34044', 1931],
 ['Q5585836', 'P54', 'Q19453', 1980]]

In [17]:
f = x[0]

In [18]:
f

['Q5224251', 'P54', 'Q18515', 2010]

In [17]:
sportsFactsSingle = x

In [18]:
# len(events_facts)
import pickle as pkl
def openFileAsDict(filename):
    f = open(filename, 'r')
    out = {}
    for line in f:
        line = line[:-1].split('\t') # can't strip() since name can be whitespace
        out[line[0]] = line[1]
    return out
ent2name = openFileAsDict( '../data/wikidata_big/kg/wd_id2entity_text.txt')
rel2name = openFileAsDict( '../data/wikidata_big/kg/wd_id2relation_text.txt')
name2ent={i:j for (j,i) in ent2name.items()}
name2rel={i:j for (j,i) in rel2name.items()}

ent2id=pkl.load(open("../data/wikidata_big/kg/tkbc_processed_data/wikidata_big/ent_id","rb"))
rel2id=pkl.load(open("../data/wikidata_big/kg/tkbc_processed_data/wikidata_big/rel_id","rb"))
id2ent={i:j for (j,i) in ent2id.items()}
id2rel={i:j for (j,i) in rel2id.items()}
id2relname={i:rel2name[id2rel[i]] for i in id2rel}
id2entname={i:ent2name[id2ent[i]] for i in id2ent}
def get_facts_from_entname(entname):
    idx=name2ent[entname]
    facts_=[]
    for fact in sportsFacts:
        if (fact[0]==idx or fact[2]==idx):
            fact=(ent2name[fact[0]],rel2name[fact[1]],ent2name[fact[2]],fact[3],fact[4])
            facts_.append(fact)
            print(fact)
    return facts_
def get_facts_from_entid(idx):
    facts_=[]
    for fact in sportsFacts:
        if (fact[0]==idx or fact[2]==idx):
            fact=(ent2name[fact[0]],rel2name[fact[1]],ent2name[fact[2]],fact[3],fact[4])
            facts_.append(fact)
#             print(fact)
    return facts_


In [19]:
# simple questions, time answer
def genTimeSimple1(facts, base_fact):
    head = base_fact[0]
    tail = base_fact[2]
    template = "When did {head} play in {tail}?"
    answers = set()
    for f in facts:
        if f[0] == head and f[2] == tail:
            answers.add(f[3])
    question = template.format(head=head, tail=tail)
    answer_type = 'time'
    entities = set([head, tail])
    times = set()
    relations = set(['P54'])
    output = {'question': question,
             'answers': answers,
             'answer_type': answer_type,
             'template': template,
             'entities': entities,
             'times': times,
             'relations': relations}
    return output

# complex questions, time answer
def genTimeComplex1(facts, head, tail, first=True):
    template = "When did {head} play their {adj} game for {tail}?"
    minTime = 9999
    maxTime = -1
    answers = set()
    for f in facts:
        if f[0] == head and f[2] == tail:
            time = f[3]
            if time < minTime:
                minTime = time
            if time > maxTime:
                maxTime = time
    if first==True:
        question = template.format(head=head, tail=tail, adj='first')
        answers.add(minTime)
    else:
        question = template.format(head=head, tail=tail, adj='last')
        answers.add(maxTime)
    answer_type = 'time'
    entities = set([head, tail])
    times = set()
    relations = set(['P54'])
    output = {'question': question,
             'answers': answers,
             'answer_type': answer_type,
             'template': template,
             'entities': entities,
             'times': times,
             'relations': relations}
    return output
        
def genTimeComplex2(facts, head, first=True):
    template = "When did {head} play their {adj} game?"
    minTime = 9999
    maxTime = -1
    answers = set()
    for f in facts:
        if f[0] == head:
            time = f[3]
            if time < minTime:
                minTime = time
            if time > maxTime:
                maxTime = time
    if first==True:
        question = template.format(head=head, adj='first')
        answers.add(minTime)
    else:
        question = template.format(head=head, adj='last')
        answers.add(maxTime)
    answer_type = 'time'
    entities = set([head])
    times = set()
    relations = set(['P54'])
    output = {'question': question,
             'answers': answers,
             'answer_type': answer_type,
             'template': template,
             'entities': entities,
             'times': times,
             'relations': relations}
    return output
        

# simple questions, entity answer
def genEntitySimple1(facts, head, time):
    template = "Which team did {head} play for in {time}?"
    answers = set()
    for f in facts:
        if f[0] == head and f[3] == time:
            answers.add(f[2])
    question = template.format(head=head, time=time)
    answer_type = 'entity'
    entities = set([head])
    times = set([time])
    relations = set(['P54'])
    output = {'question': question,
             'answers': answers,
             'answer_type': answer_type,
             'template': template,
             'entities': entities,
             'times': times,
             'relations': relations}
    return output
        

# complex question, entity answer
def genEntityComplex1(facts, head, first=True):
    # first/last
    template = "Which was the {adj} team that {head} played in?"
    minTime = 9999
    minTeam = ""
    maxTime = -1
    maxTeam = ""
    answers = set()
    for f in facts:
        if f[0] == head:
            time = f[3]
            if time < minTime:
                minTime = time
            if time > maxTime:
                maxTime = time
        
    for f in facts:
        if f[0] == head:
            time = f[3]
        if first==True:
            if time == minTime:
                answers.add(f[2])
        else:
            if time == maxTime:
                answers.add(f[2])
    if first==True:
        question = template.format(head=head, adj='first')
    else:
        question = template.format(head=head, adj='last')
    answer_type = 'entity'
    entities = set([head])
    times = set()
    relations = set(['P54'])
    output = {'question': question,
             'answers': answers,
             'answer_type': answer_type,
             'template': template,
             'entities': entities,
             'times': times,
             'relations': relations}
    return output
        

def getFactWithMaximumTime(facts, head = '', tail = ''):
    maxTime = -1
    fact = facts[0]
    for f in facts:
        time = f[-1]
        if head != '':
            if f[0] != head:
                continue
        if tail != '':
            if f[2] != tail:
                continue
        if time > maxTime:
            maxTime = time
            fact = f
    return fact

def getFactWithMinimumTime(facts, head = '', tail = ''):
    minTime = 9999
    fact = facts[0]
    for f in facts:
        time = f[-1]
        if head != '':
            if f[0] != head:
                continue
        if tail != '':
            if f[2] != tail:
                continue
        if time < minTime:
            minTime = time
            fact = f
    return fact


def genEntityComplex2(facts, head, tail, after=True):
    # before/after
    template = "Which team did {head} play for {type} {tail}?"
    # before: find argmin(T) (head, r, tail, T)
    # then, find argmax(T')(head, r, tail', T'), T' < T and tail' != tail
    # tail' is the answer
    answers = set()
    if after == False:
        base_fact = getFactWithMinimumTime(facts, head = head, tail = tail)
        time = base_fact[-1]
        maxTime = -1
        for f in facts:
            time2 = f[-1]
            if time2 >= time:
                continue
            if f[0] != head:
                continue
            if f[2] == tail:
                continue
            if time2 > maxTime:
                maxTime = time2
        answers = set()
        for f in facts:
            time2 = f[-1]
            if f[0] != head:
                continue
            if f[2] == tail:
                continue
            if time2 == maxTime:
                answers.add(f[2])
        question = template.format(head=head, type="before", tail=tail)
    else:
        #after: find argmax(T) (head, r, tail, T)
        # then, find argmin(T') (head, r, tail', T'), T' > T and tail' != tail
        # tail' is the answer
        base_fact = getFactWithMaximumTime(facts, head = head, tail = tail)
        time = base_fact[-1]
        minTime = 9999
        minTimeFact = []
        for f in facts:
            time2 = f[-1]
            if time2 <= time:
                continue
            if f[0] != head:
                continue
            if f[2] == tail:
                continue
            if time2 < minTime:
                minTime = time2
        answers = set()
        for f in facts:
            time2 = f[-1]
            if f[0] != head:
                continue
            if f[2] == tail:
                continue
            if time2 == minTime:
                answers.add(f[2])
        question = template.format(head=head, type="after", tail=tail)
    answer_type = 'entity'
    entities = set([head, tail])
    times = set()
    relations = set(['P54'])
    output = {'question': question,
             'answers': answers,
             'answer_type': answer_type,
             'template': template,
             'entities': entities,
             'times': times,
             'relations': relations}
    return output
        
    
    
def genEntityComplex3(facts, head, tail):
    template = "Who played with {head} on the {tail}?"
    # first get all time instances when head played for tail
    # then get all heads, where tail' = tail and time' in T
    valid_times = set()
    for f in facts:
        if f[0] == head and f[2] == tail:
            valid_times.add(f[-1])
    answers = set()
    for f in facts:
        if f[2] == tail and f[-1] in valid_times:
            answers.add(f[0])
    question = template.format(head=head, tail=tail)
    answer_type = 'entity'
    entities = set([head, tail])
    times = set()
    relations = set(['P54'])
    output = {'question': question,
             'answers': answers,
             'answer_type': answer_type,
             'template': template,
             'entities': entities,
             'times': times,
             'relations': relations}
    return output


# how to make event questions?
# during, before, after event?
# one example: take sports events eg. Olympics
# and making during questions

def genEventQuestion(facts, events_fact, head):
    event_head = events_fact[0]
    template = "Which team did {head} play for during {event_head}?"
    valid_times = set()
    start_time = int(events_fact[-2])
    end_time = int(events_fact[-1])
    for i in range(start_time, end_time + 1):
        valid_times.add(i)
    answers = set()
    for f in facts:
        if f[0] == head and f[-1] in valid_times:
            answers.add(f[2])
    question = template.format(event_head=event_head, head=head)
    answer_type = 'entity'
    entities = set([head, event_head])
    times = set()
    relations = set(['P39'])
    output = {'question': question,
             'answers': answers,
             'answer_type': answer_type,
             'template': template,
             'entities': entities,
             'times': times,
             'relations': relations}
    return output  

def areFactsSame(f1, f2):
    flag = True
    for i in range(3):
        if f1[i] != f2[i]:
            flag = False
            break
    return flag

In [20]:
def getEventsFactsFromFacts(facts):
    events_facts = []
    for f in facts:
        rel = f[1]
        tail = f[2]
        if rel == 'P793' and tail == 'Q1190554': # significant event and occurrence
            events_facts.append(f)
    return events_facts

events_facts = getEventsFactsFromFacts(facts)
len(events_facts)

5640

In [27]:
# also adding 'cup' events
def getOlympicsFacts(events_facts):
    entities = getEntsFromFacts(events_facts)
    filtered_facts = []
    for f in events_facts:
        x = ent2name[f[0]]
        if 'olympic' in x.lower() or 'cup' in x.lower().split():
            print(x.lower())
            filtered_facts.append(f)
    return filtered_facts

In [28]:
olympics_facts = getOlympicsFacts(events_facts)
len(olympics_facts)

1992 summer olympics
2004 summer olympics
2012 summer olympics
1956 winter olympics
1986–87 cypriot cup
1972-73 cypriot cup
2006–07 cypriot cup
1994-95 cypriot cup
1968-69 cypriot cup
1982–83 cypriot cup
1994 fifa world cup
1970–71 european cup
1991–92 european cup
2022 fifa world cup
1996 afc asian cup
1969–70 european cup
1956 afc asian cup
1976–77 european cup
1964–65 european cup
1963–64 european cup
1978 family circle cup
2011 cup of china
1981 family circle cup
1975–76 european cup
2010 family circle cup
1999 fifa confederations cup
1982 thomas cup
1999–2000 uefa cup
1975–76 uefa cup
1985–86 uefa cup
2001–02 uefa cup
1999 rugby world cup
2002 family circle cup
2006 family circle cup
1993 sudirman cup
2002–03 uefa women's cup
1992 uber cup
1999 nokia cup
2009 u.s. open cup
2008 cup of russia
2012–13 russian cup
2010 cup of china
1985 edgbaston cup
1983 family circle cup
1999 family circle cup
2013 cup of china
2013 u.s. open cup
1996 "m" electronika cup
1999 cup of russia
2002 tho

418

In [29]:
id = 1
f = sportsFactsSingle[id]
genTimeSimple1(sportsFactsSingle, f)

{'question': 'When did Q5224251 play in Q18515?',
 'answers': {2010, 2011},
 'answer_type': 'time',
 'template': 'When did {head} play in {tail}?',
 'entities': {'Q18515', 'Q5224251'},
 'times': set(),
 'relations': {'P54'}}

In [30]:
genTimeComplex2(sportsFactsSingle, f[0], first=True)

{'question': 'When did Q5224251 play their first game?',
 'answers': {2003},
 'answer_type': 'time',
 'template': 'When did {head} play their {adj} game?',
 'entities': {'Q5224251'},
 'times': set(),
 'relations': {'P54'}}

In [31]:
genTimeComplex1(sportsFactsSingle, f[0], f[2])

{'question': 'When did Q5224251 play their first game for Q18515?',
 'answers': {2010},
 'answer_type': 'time',
 'template': 'When did {head} play their {adj} game for {tail}?',
 'entities': {'Q18515', 'Q5224251'},
 'times': set(),
 'relations': {'P54'}}

In [32]:
genEntitySimple1(sportsFactsSingle, f[0], f[-1])

{'question': 'Which team did Q5224251 play for in 2011?',
 'answers': {'Q18515'},
 'answer_type': 'entity',
 'template': 'Which team did {head} play for in {time}?',
 'entities': {'Q5224251'},
 'times': {2011},
 'relations': {'P54'}}

In [33]:
genEntityComplex1(sportsFactsSingle, f[0], first=True)

{'question': 'Which was the first team that Q5224251 played in?',
 'answers': {'Q19612'},
 'answer_type': 'entity',
 'template': 'Which was the {adj} team that {head} played in?',
 'entities': {'Q5224251'},
 'times': set(),
 'relations': {'P54'}}

In [34]:
id = 10
f = sportsFactsSingle[id]
genEntityComplex2(sportsFactsSingle, f[0], f[2],)

{'question': 'Which team did Q5585836 play for after Q19453?',
 'answers': {'Q50602'},
 'answer_type': 'entity',
 'template': 'Which team did {head} play for {type} {tail}?',
 'entities': {'Q19453', 'Q5585836'},
 'times': set(),
 'relations': {'P54'}}

In [35]:
get_facts_from_entid("Q5585836"),
# get_facts_from_entid("Q19453")

([('Gordon Smith',
   'member of sports team',
   'Brighton & Hove Albion F.C.',
   '1980',
   '1983'),
  ('Gordon Smith',
   'member of sports team',
   'Manchester City F.C.',
   '1983',
   '1985'),
  ('Gordon Smith',
   'member of sports team',
   'Oldham Athletic A.F.C.',
   '1985',
   '1986')],)

In [36]:
id = 10
head = f[0]
tail = f[2]
# head = "Michael Jordan"
# tail = "Chicago Bulls"
genEntityComplex3(sportsFactsSingle, head, tail)

{'question': 'Who played with Q5585836 on the Q19453?',
 'answers': {'Q1179072',
  'Q1494812',
  'Q16194288',
  'Q1636884',
  'Q1689331',
  'Q180319',
  'Q18707490',
  'Q18737088',
  'Q1929070',
  'Q1931375',
  'Q1952697',
  'Q2223616',
  'Q2441925',
  'Q263951',
  'Q2700218',
  'Q3048435',
  'Q4761233',
  'Q4964101',
  'Q5107822',
  'Q5293503',
  'Q5585836',
  'Q5593087',
  'Q5593136',
  'Q577010',
  'Q6118487',
  'Q6397401',
  'Q6405452',
  'Q6916105',
  'Q6989006',
  'Q6989321',
  'Q7149853',
  'Q7176193',
  'Q7176791',
  'Q7177173',
  'Q719563',
  'Q7822482',
  'Q921740',
  'Q923700',
  'Q942124',
  'Q948885',
  'Q967706'},
 'answer_type': 'entity',
 'template': 'Who played with {head} on the {tail}?',
 'entities': {'Q19453', 'Q5585836'},
 'times': set(),
 'relations': {'P54'}}

In [37]:
id = 610
f = sportsFactsSingle[id]
head = f[0]
tail = f[2]
print(olympics_facts[20],head)
genEventQuestion(sportsFactsSingle, olympics_facts[20], head)

['Q550268', 'P793', 'Q1190554', '1978', '1978'] Q3474399


{'question': 'Which team did Q3474399 play for during Q550268?',
 'answers': set(),
 'answer_type': 'entity',
 'template': 'Which team did {head} play for during {event_head}?',
 'entities': {'Q3474399', 'Q550268'},
 'times': set(),
 'relations': {'P39'}}

In [38]:
def makeQuestions(sportsFactsSingle, olympics_facts, f):
    questions = []
    questions.append(genTimeSimple1(sportsFactsSingle, f))
    questions.append(genTimeComplex2(sportsFactsSingle, f[0], first=random.choice([True, False])))
    questions.append(genTimeComplex1(sportsFactsSingle, f[0], f[2], first=random.choice([True, False])))
    questions.append(genEntitySimple1(sportsFactsSingle, f[0], f[-1]))
    questions.append(genEntityComplex1(sportsFactsSingle, f[0], first=random.choice([True, False])))
    tempques=genEntityComplex2(sportsFactsSingle, f[0], f[2], after=random.choice([True, False]))
    if tempques is not None:
        questions.append(tempques)
    tempques=genEntityComplex3(sportsFactsSingle, f[0], f[2])
    if tempques is not None:
        questions.append(tempques)
    
    # for making olympics fact, cannot randomly sample an olympic event
    # since its duration is only 1 year, makes it rare to have a join
    for olympic_fact in olympics_facts:
        gen_question = genEventQuestion(sportsFactsSingle, olympic_fact, f[0])
        if gen_question is not None:
            questions.append(gen_question)
            break
    return questions




In [50]:
import pickle
dataset_split = 'test'
split_entities = set(pickle.load(open(dataset_split + '_ents.pickle', 'rb')))
split_facts = []
my_facts = sportsFactsSingle
for f in my_facts:
    if f[0] in split_entities and f[2] in split_entities:
        split_facts.append(f)
len(split_facts)

18661

In [51]:
# how to get number of questions?
# 1. set total questions to 300k
# 2. get number of facts with this relation
# 3. get how many questions u want of this relation as fraction
# 4. choose 10% of that number for test, 90% for train
max_dataset_questions = 300000
relation_name = my_facts[0][1]
num_relation_facts = len(filterByRelation(facts, relation_name, -1))
# fraction of questions need to be decided based on fraction of question relation facts
# not all facts!
question_relation_list = ['P39', 'P166', 'P108', 'P54', 'P26']
num_all_question_relation_facts = sum([len(filterByRelation(facts, x, -1)) for x in question_relation_list])
num_questions_for_this_relation = int(max_dataset_questions * num_relation_facts/num_all_question_relation_facts)
split_ratios = {'test': 0.1, 'train': 0.9}
num_questions = int(split_ratios[dataset_split] * num_questions_for_this_relation)
num_questions

8651

In [52]:
import random
data = []
# num_questions = 100000
# events_facts_small = events_facts[:10]
questions_set = set()

pbar = tqdm(range(len(split_facts)))
random.shuffle(split_facts)
for i in range(len(split_facts)):
#     f = random.choice(positionFactsSingle)
    f = split_facts[i]
    pbar.set_description("Num questions %d, i %d" % (len(data), i))

    questions = makeQuestions(my_facts, olympics_facts, f)
    for q in questions:
        if len(q['answers']) > 0 and q['question'] not in questions_set:
            data.append(q)
            questions_set.add(q['question'])
    if len(data) >= num_questions:
        break

Num questions 8649, i 1455:   0%|          | 0/18661 [03:54<?, ?it/s]

In [53]:
def getQuestionTypeDistribution(data):
    type_dict = {}
    for d in data:
        template = d['template']
        if template not in type_dict:
            type_dict[template] = 1
        else:
            type_dict[template] += 1
    return type_dict
getQuestionTypeDistribution(data)

{'When did {head} play in {tail}?': 1231,
 'When did {head} play their {adj} game?': 1275,
 'When did {head} play their {adj} game for {tail}?': 1334,
 'Which team did {head} play for in {time}?': 1450,
 'Which was the {adj} team that {head} played in?': 1272,
 'Which team did {head} play for {type} {tail}?': 686,
 'Who played with {head} on the {tail}?': 1231,
 'Which team did {head} play for during {event_head}?': 174}

In [54]:
import pickle
# filename = 'data/questions/questions_member_of_sports_team_big.pickle'
filename = 'data/questions/{split}_questions_member_of_sports_team_big.pickle'.format(
            split=dataset_split)

pickle.dump(data, open(filename, "wb"))

In [None]:
import random

for i in tqdm(range(10000)):
    id = random.randint(0, len(sportsFactsSingle))
    f = sportsFactsSingle[id]
    try:
        data += makeQuestions(sportsFactsSingle, f)
    except:
        continue

 71%|███████   | 7098/10000 [2:30:52<1:02:43,  1.30s/it]

In [44]:
len(data)

77869

In [None]:
def writeQuestions(filename, data):
    f = open(filename, 'w')
    for d in data:
        answers_str = []
        for ans in d[1]:
            answers_str.append(str(ans))
        if answers_str == []:
            continue
        line = d[0] + '\t' + '|'.join(answers_str)
        f.write(line + '\n')
    f.close()

In [None]:
writeQuestions('questions_member_of_sports_team.txt', data)