In [3]:
import argparse
import sys
import os
import json
import torch
import pprint
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re
import csv
from collections import Counter

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
BASE_DIR = "/Users/katemarg/PycharmProjects/temporal_robustness_evaluation"
DATA_DIR = os.path.join(BASE_DIR, 'data')
CACHE_DIR = os.path.join(BASE_DIR, 'cached')

TEMPLAMA_ORIG_DIR = os.path.join("data", "templama", "test.json")
templama_docker = os.path.join(BASE_DIR, "templama_docker")
TEMPLAMA_NEW_DIR = os.path.join(DATA_DIR, "dynamic-templama",
                                "dataset_from_2019-1-1_to_2022-6-31_per_quarter", 
                                "test.jsonl")

In [5]:
def load_file(filename):
    """
    :param filename:
    :return:
    """
    data = []
    with open(filename, "r") as f:
        for line in f.readlines():
            data.append(json.loads(line))
    return data

## This notebook analyses the dynamic quarterly TempLAMA dataset that we created.

In [6]:
test_set = load_file(TEMPLAMA_NEW_DIR)

In [7]:
test_set[0]

{'query': 'Alex Morgan plays for _X_.',
 'answer': [{'wikidata_id': 'Q334526',
   'name': "United States women's national soccer team"},
  {'wikidata_id': 'Q21066986', 'name': 'Orlando Pride'}],
 'date': '2019-Q1',
 'id': 'Q233510_P54_2019-Q1',
 'relation': 'P54'}

In [8]:
facts,labels, num_labels,quarters, relations = [], [], [], [], []
for i,example in enumerate(test_set):
    facts.append(example['query'])
    quarters.append(example['date'])
    relations.append(example['relation'])
#     for answer in example['answer']:
#         print(answer)
    _label_list = [a['name'] for a in example['answer']]
    labels.append(_label_list)
    num_labels.append(len(_label_list))

In [9]:
full_dataset = pd.DataFrame(data={'fact': facts, 'label':labels, 'num_labels': num_labels,'quarter':quarters, 'relation':relations})

In [53]:
full_dataset

Unnamed: 0,fact,label,num_labels,quarter,relation
0,Alex Morgan plays for _X_.,"[United States women's national soccer team, O...",2,2019-Q1,P54
1,Alex Morgan plays for _X_.,"[United States women's national soccer team, O...",2,2019-Q2,P54
2,Alex Morgan plays for _X_.,"[United States women's national soccer team, O...",2,2019-Q3,P54
3,Alex Morgan plays for _X_.,"[United States women's national soccer team, O...",2,2019-Q4,P54
4,Alex Morgan plays for _X_.,"[United States women's national soccer team, O...",2,2020-Q1,P54
...,...,...,...,...,...
161487,International Bobsleigh and Skeleton Federatio...,[German],1,2021-Q2,P1412
161488,International Bobsleigh and Skeleton Federatio...,[German],1,2021-Q3,P1412
161489,International Bobsleigh and Skeleton Federatio...,[German],1,2021-Q4,P1412
161490,International Bobsleigh and Skeleton Federatio...,[German],1,2022-Q1,P1412


In [10]:
full_dataset[full_dataset['fact']=='Pau Gasol plays for _X_.'].iloc[0]['label']

["Spain men's national basketball team",
 'San Antonio Spurs',
 'Milwaukee Bucks']

In [11]:
full_dataset[full_dataset['fact']=='Pau Gasol plays for _X_.'].iloc[1]['label']

["Spain men's national basketball team", 'Milwaukee Bucks']

## Relations & Templates

In [14]:
dct = Counter(relations)
template_file = os.path.join(templama_docker, "my_templates.csv")
# print("Reading templates from %s", template_file)
templates_df = pd.read_csv(template_file)
templates_df['num_examples'] = templates_df['Wikidata ID'].apply(lambda x: dct[x])

In [15]:
templates_df

Unnamed: 0,Wikidata ID,Relation,Template,num_examples
0,P54,member of sports team,<subject> plays for <object>.,50558
1,P39,position held,<subject> holds the position of <object>.,34835
2,P108,employer,<subject> works for <object>.,20531
3,P102,political party,<subject> is a member of the <object>.,14232
4,P286,head coach,<object> is the head coach of <subject>.,11935
5,P69,educated at,<subject> attended <object>.,2420
6,P488,chairperson,<object> is the chair of <subject>.,8468
7,P6,head of government,<object> is the head of the government of <sub...,7815
8,P569,date of birth,The date <subject> was born is <object>.,0
9,P19,place of birth,The place <subject> was born is <object>.,0


In [70]:
# P54: member of sports team
# set(full_dataset[full_dataset['relation']=='P54']['fact'].tolist())

# P39: position held
# set(full_dataset[full_dataset['relation']=='P39']['fact'].tolist())

# P108: employer
# full_dataset[full_dataset['relation']=='P108']['label'].tolist()

# P102: political party
# set(full_dataset[full_dataset['relation']=='P102']['fact'].tolist())

# P286: head coach
# set(full_dataset[full_dataset['relation']=='P286']['fact'].tolist())

# P69: educated
# full_dataset[full_dataset['relation']=='P69']['label'].tolist()

# P488: chairperson
# full_dataset[full_dataset['relation']=='P488']['fact'].tolist()

# P6: head of government
# full_dataset[full_dataset['relation']=='P6']['fact'].tolist()

# P279: subclass
# set(full_dataset[full_dataset['relation']=='P279']['fact'].tolist())

# P127: owned by
# full_dataset[full_dataset['relation']=='P127']['label'].tolist()
# full_dataset[full_dataset['fact']=='3 is owned by _X_.']

# P1001: legal term
# full_dataset[full_dataset['relation']=='P1001']['fact'].tolist()

# P106: profession
# full_dataset[full_dataset['relation']=='P106']['fact'].tolist()

# P27: citizen
# full_dataset[full_dataset['relation']=='P27']['label'].tolist()

# P176: produced by
# full_dataset[full_dataset['relation']=='P176']['label'].tolist()

# P138: named after
# full_dataset[full_dataset['relation']=='P138'][['fact', 'label']][:100]

# P1412: language
# full_dataset[full_dataset['relation']=='P1412']['label'].tolist()

# P937: work location
full_dataset[full_dataset['relation']=='P937']['label'].tolist()

[['Washington, D.C.'],
 ['Washington, D.C.'],
 ['Washington, D.C.'],
 ['Washington, D.C.'],
 ['Washington, D.C.'],
 ['Washington, D.C.'],
 ['Washington, D.C.'],
 ['Washington, D.C.'],
 ['Washington, D.C.'],
 ['Washington, D.C.'],
 ['Washington, D.C.'],
 ['Washington, D.C.'],
 ['Washington, D.C.'],
 ['Washington, D.C.'],
 ['Washington, D.C.'],
 ['Washington, D.C.'],
 ['Washington, D.C.'],
 ['Washington, D.C.'],
 ['Washington, D.C.'],
 ['Washington, D.C.'],
 ['Washington, D.C.'],
 ['Washington, D.C.'],
 ['Washington, D.C.'],
 ['Munich'],
 ['Munich'],
 ['Munich'],
 ['Munich'],
 ['Munich'],
 ['Munich'],
 ['Munich'],
 ['Munich'],
 ['Munich'],
 ['Munich'],
 ['Munich'],
 ['Munich'],
 ['Munich'],
 ['Munich'],
 ["'s-Hertogenbosch"],
 ["'s-Hertogenbosch"],
 ["'s-Hertogenbosch"],
 ["'s-Hertogenbosch"],
 ["'s-Hertogenbosch"],
 ["'s-Hertogenbosch"],
 ["'s-Hertogenbosch"],
 ["'s-Hertogenbosch"],
 ["'s-Hertogenbosch"],
 ["'s-Hertogenbosch"],
 ["'s-Hertogenbosch"],
 ["'s-Hertogenbosch"],
 ["'s-Hertoge

## Fine-grained splits

In [74]:
dataset_filepath_single_token=CACHE_DIR+'/timelms_dynamic-templama_2019-1-1_to_2022-6-31_per_quarter_single_token.pt'
data_dict_single_token = torch.load(dataset_filepath_single_token)

In [74]:
dataset_filepath_multi_token=CACHE_DIR+'/timelms_dynamic-templama_2019-1-1_to_2022-6-31_per_quarter_multi_token.pt'
data_dict_multi_token = torch.load(dataset_filepath_multi_token)

In [75]:
pd.DataFrame(data=data_dict_multi_token['2019-Q1'])

Unnamed: 0,text,labels,labels_ids,relation,num_answers,num_masks
0,Alex Morgan plays for <mask>.,[[ Orlando Pride]],"[[5854, 13170]]",P54,1,[2]
1,Lionel Messi plays for <mask>.,[[ FC Barcelona]],"[[5429, 4612]]",P54,1,[2]
2,Cristiano Ronaldo plays for <mask>.,[[ Juventus F.C.]],"[[9573, 274, 4, 347, 4]]",P54,1,[5]
3,LeBron James plays for <mask>.,[[ Los Angeles Lakers]],"[[1287, 1422, 6772]]",P54,1,[3]
4,Puck Moonen plays for <mask>.,[[ Lotto Belisol Ladies]],"[[226, 15089, 4231, 36508, 17560]]",P54,1,[5]
...,...,...,...,...,...,...
8663,Mercedes-Benz A-Class is a subclass of <mask>.,[[ compact car]],"[[12549, 512]]",P279,1,[2]
8664,Honda Brio is a subclass of <mask>.,[[ subcompact car]],"[[2849, 11828, 7257, 512]]",P279,1,[4]
8665,Bishop of Wakefield is a subclass of <mask>.,[[ suffragan bishop]],"[[15544, 338, 7060, 23766]]",P279,1,[4]
8666,Bishop of Bradford is a subclass of <mask>.,[[ suffragan bishop]],"[[15544, 338, 7060, 23766]]",P279,1,[4]


In [76]:
pd.DataFrame(data=data_dict_multi_token['2019-Q2'])

Unnamed: 0,text,labels,labels_ids,relation,num_answers,num_masks
0,Alex Morgan plays for <mask>.,[[ Orlando Pride]],"[[5854, 13170]]",P54,1,[2]
1,Lionel Messi plays for <mask>.,[[ FC Barcelona]],"[[5429, 4612]]",P54,1,[2]
2,Cristiano Ronaldo plays for <mask>.,[[ Juventus F.C.]],"[[9573, 274, 4, 347, 4]]",P54,1,[5]
3,LeBron James plays for <mask>.,[[ Los Angeles Lakers]],"[[1287, 1422, 6772]]",P54,1,[3]
4,Puck Moonen plays for <mask>.,[[ Lotto Belisol Ladies]],"[[226, 15089, 4231, 36508, 17560]]",P54,1,[5]
...,...,...,...,...,...,...
8661,Mercedes-Benz A-Class is a subclass of <mask>.,[[ compact car]],"[[12549, 512]]",P279,1,[2]
8662,Honda Brio is a subclass of <mask>.,[[ subcompact car]],"[[2849, 11828, 7257, 512]]",P279,1,[4]
8663,Bishop of Wakefield is a subclass of <mask>.,[[ suffragan bishop]],"[[15544, 338, 7060, 23766]]",P279,1,[4]
8664,Bishop of Bradford is a subclass of <mask>.,[[ suffragan bishop]],"[[15544, 338, 7060, 23766]]",P279,1,[4]


In [72]:
def split_dataset(data):
    """
    Split temporal dataset Dt to D_unchanged, D_new and D_updated compared to D_(t-1) for all t.
    Specifically:
    - D_unchanged: data where text_t = text_(t-1) & label_t = label_(t-1)
    - D_updated: data where text_t = text_(t-1) & label_t != label_(t-1)
    - D_new: data where text_t not in D_(t-1)
    - D_deleted: data that exist in D_(t-1) but not in D_t

    Args:
        data: a dictionary with keys the time (year/quarter/month) and values dictionaries
        data = {
                '2019-Q1':
                    {
                    'text': [list of text],
                    'labels': [list of labels],
                    'labels_ids': [list of label token ids -- for a given model/tokenizer],
                    'relations' [list of Wikidata relations]
                    },
                '2019-Q2': {...}
                }

    Returns:
        D_unchanged, D_new, D_updated, D_deleted
    """
    unchanged_t, new_t, updated_t, deleted_t = {}, {}, {}, {}

    quarters = list(data.keys())
    t_0 = quarters[0]  # t=t0
    t_1 = quarters[0]  # t-1

    for t in quarters[1:]:
        print(t)
        if t in ['2022-Q3', '2022-Q4']:
            continue # skip last two quarters of 2022
        data_t = data[t]      # D_t
        data_t_1 = data[t_1]  # D_(t-1)

        unchanged_t[t] = {key: [] for key in data_t.keys()}
        new_t[t] = {key: [] for key in data_t.keys()}
        updated_t[t] = {key: [] for key in data_t.keys()}
        deleted_t[t] = {key: [] for key in data_t.keys()}

        for i in range(0, len(data_t['text'])):  # for fact in D_t
            text_t = data_t['text'][i]  # string
            labels_ids_t = data_t['labels_ids'][i]  # list of lists
            if text_t in data_t_1['text']:
                t_1_index = data_t_1['text'].index(text_t)
                labels_inds_t_1 = data_t_1['labels_ids'][t_1_index]  # list of lists
                # because we have multiple correct answers (labels) we check each one separately
                """
                labels_ids_t: labels in timestep t
                labels_ids_t_1: labels in timestep t-1
                """
                for label_id, label_t in enumerate(labels_ids_t):
                    if label_t in labels_inds_t_1:
                        #######################
                        ###### UNCHANGED ######
                        #######################
                        # text_t = text_t-1 & label_t = label_t-1
                        # add to D_unchanged
                        for key in data_t.keys():
                            if key in ['labels', 'labels_ids','num_masks']:
                                unchanged_t[t][key].append(data_t[key][i][label_id])
                            else:
                                unchanged_t[t][key].append(data_t[key][i])
                    else:
                        #######################
                        ####### UPDATED #######
                        #######################
                        # text_t = text_(t-1) & label_t != label_(t-1)
                        # add to D_updated
                        for key in data_t.keys():
                            if key in ['labels', 'labels_ids','num_masks']:
                                updated_t[t][key].append(data_t[key][i][label_id])
                            else:
                                updated_t[t][key].append(data_t[key][i])
            else:
                #######################
                ######### NEW #########
                #######################
                # text_t not in D_(t-1) texts
                # add to D_new
                for key in data_t.keys():
                    for label_id, label_t in enumerate(labels_ids_t):
                        if key in ['labels', 'labels_ids','num_masks']:
                            new_t[t][key].append(data_t[key][i][label_id])
                        else:
                            new_t[t][key].append(data_t[key][i])

        for j in range(0, len(data_t_1['text'])):  # for fact in D_t-1
            text_t_1 = data_t_1['text'][j]
            if text_t_1 not in data_t['text']:
                #######################
                ####### DELETED #######
                #######################
                # text_(t+1) not in D_t
                # add to D_deleted
                for key in data_t_1.keys():
                    deleted_t[t][key].append(data_t_1[key][j])
        t_1 = t


        print(
            't={}: From total {} samples in D_t, {} are unchanged, {} are updated, {} are deleted and {} are new, compared to D_(t-1).'.format(
                t,
                len(data_t['text']),
                len(unchanged_t[t]['text']),
                len(updated_t[t]['text']),
                len(deleted_t[t]['text']),
                len(new_t[t]['text'])),
        )
#         assert len(data_t['text']) == len(unchanged_t[t]['text']) + len(updated_t[t]['text']) + len(new_t[t]['text'])
    return unchanged_t, new_t, updated_t, deleted_t, data[t_0]

In [78]:
# Split dataset
unchanged_t, new_t, updated_t, deleted_t, orig = split_dataset(data_dict_multi_token)

2019-Q2
t=2019-Q2: From total 8666 samples in D_t, 10854 are unchanged, 174 are updated, 124 are deleted and 131 are new, compared to D_(t-1).
2019-Q3
t=2019-Q3: From total 8438 samples in D_t, 10208 are unchanged, 259 are updated, 430 are deleted and 216 are new, compared to D_(t-1).
2019-Q4
t=2019-Q4: From total 8418 samples in D_t, 10372 are unchanged, 157 are updated, 140 are deleted and 131 are new, compared to D_(t-1).
2020-Q1
t=2020-Q1: From total 8564 samples in D_t, 10292 are unchanged, 311 are updated, 126 are deleted and 292 are new, compared to D_(t-1).
2020-Q2
t=2020-Q2: From total 8528 samples in D_t, 10675 are unchanged, 94 are updated, 95 are deleted and 64 are new, compared to D_(t-1).
2020-Q3
t=2020-Q3: From total 8423 samples in D_t, 10333 are unchanged, 185 are updated, 238 are deleted and 141 are new, compared to D_(t-1).
2020-Q4
t=2020-Q4: From total 8409 samples in D_t, 10395 are unchanged, 128 are updated, 111 are deleted and 105 are new, compared to D_(t-1).
20

In [75]:
# Split dataset
_unchanged_t, _new_t, _updated_t, _deleted_t, _orig = split_dataset(data_dict_single_token)

2019-Q2
t=2019-Q2: From total 488 samples in D_t, 525 are unchanged, 1 are updated, 7 are deleted and 9 are new, compared to D_(t-1).
2019-Q3
t=2019-Q3: From total 457 samples in D_t, 487 are unchanged, 3 are updated, 36 are deleted and 7 are new, compared to D_(t-1).
2019-Q4
t=2019-Q4: From total 466 samples in D_t, 491 are unchanged, 0 are updated, 3 are deleted and 12 are new, compared to D_(t-1).
2020-Q1
t=2020-Q1: From total 472 samples in D_t, 492 are unchanged, 3 are updated, 9 are deleted and 15 are new, compared to D_(t-1).
2020-Q2
t=2020-Q2: From total 472 samples in D_t, 508 are unchanged, 0 are updated, 2 are deleted and 2 are new, compared to D_(t-1).
2020-Q3
t=2020-Q3: From total 456 samples in D_t, 481 are unchanged, 2 are updated, 26 are deleted and 10 are new, compared to D_(t-1).
2020-Q4
t=2020-Q4: From total 457 samples in D_t, 486 are unchanged, 2 are updated, 4 are deleted and 5 are new, compared to D_(t-1).
2021-Q1
t=2021-Q1: From total 467 samples in D_t, 489 are

In [65]:
new_df = pd.DataFrame(data=new_t['2019-Q2'])
new_df

Unnamed: 0,text,labels,labels_ids,relation,num_answers,num_masks
0,Chavit Singson holds the position of <mask>.,[ mayor],[3647],P39,1,2
1,Lucía Sosa holds the position of <mask>.,[ mayor],[3647],P39,1,1
2,Laurent Berger holds the position of <mask>.,[ president],[394],P39,1,2
3,Brandon Scott holds the position of <mask>.,[ president],[394],P39,1,2
4,Loriano Valentini holds the position of <mask>.,[ director],[736],P39,1,1
5,Catherine Gotani Hara holds the position of <m...,[ speaker],[5385],P39,1,1
6,Mauro Carlesse is a member of the <mask>.,[ Democrats],[1574],P102,1,3
7,Natasha Bertrand works for <mask>.,[ Politico],[20773],P108,1,2
8,Susan Polgar is <mask> citizen.,[ Hungary],[11279],P27,1,4


In [66]:
orig_df = pd.DataFrame(data=orig)
orig_df

Unnamed: 0,text,labels,labels_ids,relation,num_answers,num_masks
0,Lewis Hamilton plays for <mask>.,[[ Mercedes]],[[7016]],P54,1,[1]
1,David Andersen plays for <mask>.,"[[ Hawks], [ Hawks]]","[[10506], [10506]]",P54,2,"[1, 3, 1]"
2,Sonny Bill Williams plays for <mask>.,[[ Chiefs]],[[6535]],P54,1,[1]
3,André Lotterer plays for <mask>.,[[ Porsche]],[[15091]],P54,1,[1]
4,Carlos Sainz Jr plays for <mask>.,[[ McLaren]],[[15081]],P54,1,[1]
...,...,...,...,...,...,...
481,Follo District Court is a legal term in <mask>.,[[ Ski]],[[21175]],P1001,1,"[4, 1]"
482,Senja District Court is a legal term in <mask>.,[[ Berg]],[[15303]],P1001,1,"[1, 2, 3, 4]"
483,Nedre Romerike District Court is a legal term ...,[[ Fet]],[[39099]],P1001,1,"[1, 3, 3]"
484,Canada–Korea Free Trade Agreement is a legal t...,[[ Canada]],[[896]],P1001,1,"[1, 2]"


## Facts over time

In [112]:
def facts_over_time_split(data):
    _quarters = list(data.keys())
    quarters = [q for q in _quarters if q not in ['2022-Q3', '2022-Q4']]
    # t=0
    keys_for_dct = ['facts', 'relation'] + ['labels_{}'.format(q) for q in quarters] + ['labels_ids_{}'.format(q) for q in quarters]
    orig_facts = data[quarters[0]]['text']
    facts_over_time_dct = {k:[None]*len(orig_facts) for k in keys_for_dct}
    facts_over_time_dct['facts'] = orig_facts
    facts_over_time_dct['relation'] = orig_rel

    for fact_index, fact in enumerate(orig_facts):
#         print(fact_index)
        for t in quarters:
    #         print(facts_over_time)
            facts_t = data[t]['text']
            labels_t = data[t]['labels']
            labels_ids_t = data[t]['labels_ids']
            relation_t = data[t]['relation']

            # if intersection
            if fact in facts_t:
                index_t = facts_t.index(fact)
                label_t = labels_t[index_t]
                label_ids_t = labels_ids_t[index_t]
#                 print('t: {}, fact: {}, label: {}'.format(t, facts_over_time['facts'][fact_index], label_t))
                facts_over_time_dct['labels_{}'.format(t)][fact_index] = labels_t[index_t]
                facts_over_time_dct['labels_ids_{}'.format(t)][fact_index] = labels_t[index_t]

    return facts_over_time_dct

### (1) Single-token

In [113]:
fot_dct_single_token = facts_over_time_split(data_dict_single_token)
fot_single_token = pd.DataFrame(data=fot_dct_single_token)
fot_single_token

Unnamed: 0,facts,relation,labels_2019-Q1,labels_2019-Q2,labels_2019-Q3,labels_2019-Q4,labels_2020-Q1,labels_2020-Q2,labels_2020-Q3,labels_2020-Q4,...,labels_ids_2020-Q1,labels_ids_2020-Q2,labels_ids_2020-Q3,labels_ids_2020-Q4,labels_ids_2021-Q1,labels_ids_2021-Q2,labels_ids_2021-Q3,labels_ids_2021-Q4,labels_ids_2022-Q1,labels_ids_2022-Q2
0,Lewis Hamilton plays for <mask>.,P54,[[ Mercedes]],[[ Mercedes]],[[ Mercedes]],[[ Mercedes]],[[ Mercedes]],[[ Mercedes]],[[ Mercedes]],[[ Mercedes]],...,[[ Mercedes]],[[ Mercedes]],[[ Mercedes]],[[ Mercedes]],[[ Mercedes]],[[ Mercedes]],[[ Mercedes]],[[ Mercedes]],[[ Mercedes]],[[ Mercedes]]
1,David Andersen plays for <mask>.,P54,"[[ Hawks], [ Hawks]]","[[ Hawks], [ Hawks]]",[[ Hawks]],[[ Hawks]],[[ Hawks]],[[ Hawks]],,,...,[[ Hawks]],[[ Hawks]],,,,,,,,
2,Sonny Bill Williams plays for <mask>.,P54,[[ Chiefs]],[[ Chiefs]],[[ Chiefs]],[[ Chiefs]],[[ Chiefs]],[[ Chiefs]],[[ Chiefs]],[[ Chiefs]],...,[[ Chiefs]],[[ Chiefs]],[[ Chiefs]],[[ Chiefs]],[[ Chiefs]],[[ Chiefs]],[[ Chiefs]],[[ Chiefs]],[[ Chiefs]],[[ Chiefs]]
3,André Lotterer plays for <mask>.,P54,[[ Porsche]],[[ Porsche]],[[ Porsche]],[[ Porsche]],[[ Porsche]],[[ Porsche]],[[ Porsche]],[[ Porsche]],...,[[ Porsche]],[[ Porsche]],[[ Porsche]],[[ Porsche]],[[ Porsche]],[[ Porsche]],[[ Porsche]],[[ Porsche]],[[ Porsche]],[[ Porsche]]
4,Carlos Sainz Jr plays for <mask>.,P54,[[ McLaren]],[[ McLaren]],[[ McLaren]],[[ McLaren]],[[ McLaren]],[[ McLaren]],,,...,[[ McLaren]],[[ McLaren]],,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
481,Follo District Court is a legal term in <mask>.,P1001,[[ Ski]],[[ Ski]],[[ Ski]],[[ Ski]],,,,,...,,,,,,,,,,
482,Senja District Court is a legal term in <mask>.,P1001,[[ Berg]],[[ Berg]],[[ Berg]],[[ Berg]],,,,,...,,,,,,,,,,
483,Nedre Romerike District Court is a legal term ...,P1001,[[ Fet]],[[ Fet]],[[ Fet]],[[ Fet]],,,,,...,,,,,,,,,,
484,Canada–Korea Free Trade Agreement is a legal t...,P1001,[[ Canada]],[[ Canada]],[[ Canada]],[[ Canada]],[[ Canada]],[[ Canada]],[[ Canada]],[[ Canada]],...,[[ Canada]],[[ Canada]],[[ Canada]],[[ Canada]],[[ Canada]],[[ Canada]],[[ Canada]],[[ Canada]],[[ Canada]],[[ Canada]]


In [114]:
fot_single_token.dropna()

Unnamed: 0,facts,relation,labels_2019-Q1,labels_2019-Q2,labels_2019-Q3,labels_2019-Q4,labels_2020-Q1,labels_2020-Q2,labels_2020-Q3,labels_2020-Q4,...,labels_ids_2020-Q1,labels_ids_2020-Q2,labels_ids_2020-Q3,labels_ids_2020-Q4,labels_ids_2021-Q1,labels_ids_2021-Q2,labels_ids_2021-Q3,labels_ids_2021-Q4,labels_ids_2022-Q1,labels_ids_2022-Q2
0,Lewis Hamilton plays for <mask>.,P54,[[ Mercedes]],[[ Mercedes]],[[ Mercedes]],[[ Mercedes]],[[ Mercedes]],[[ Mercedes]],[[ Mercedes]],[[ Mercedes]],...,[[ Mercedes]],[[ Mercedes]],[[ Mercedes]],[[ Mercedes]],[[ Mercedes]],[[ Mercedes]],[[ Mercedes]],[[ Mercedes]],[[ Mercedes]],[[ Mercedes]]
2,Sonny Bill Williams plays for <mask>.,P54,[[ Chiefs]],[[ Chiefs]],[[ Chiefs]],[[ Chiefs]],[[ Chiefs]],[[ Chiefs]],[[ Chiefs]],[[ Chiefs]],...,[[ Chiefs]],[[ Chiefs]],[[ Chiefs]],[[ Chiefs]],[[ Chiefs]],[[ Chiefs]],[[ Chiefs]],[[ Chiefs]],[[ Chiefs]],[[ Chiefs]]
3,André Lotterer plays for <mask>.,P54,[[ Porsche]],[[ Porsche]],[[ Porsche]],[[ Porsche]],[[ Porsche]],[[ Porsche]],[[ Porsche]],[[ Porsche]],...,[[ Porsche]],[[ Porsche]],[[ Porsche]],[[ Porsche]],[[ Porsche]],[[ Porsche]],[[ Porsche]],[[ Porsche]],[[ Porsche]],[[ Porsche]]
6,Matthew Goss plays for <mask>.,P54,[[ ONE]],[[ ONE]],[[ ONE]],[[ ONE]],[[ ONE]],[[ ONE]],[[ ONE]],[[ ONE]],...,[[ ONE]],[[ ONE]],[[ ONE]],[[ ONE]],[[ ONE]],[[ ONE]],[[ ONE]],[[ ONE]],[[ ONE]],[[ ONE]]
7,Francis holds the position of <mask>.,P39,[[ pope]],[[ pope]],[[ pope]],[[ pope]],[[ pope]],[[ pope]],[[ pope]],[[ pope]],...,[[ pope]],[[ pope]],[[ pope]],[[ pope]],[[ pope]],[[ pope]],[[ pope]],[[ pope]],[[ pope]],[[ pope]]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
474,Sogn og Fjordane District Court is a legal ter...,P1001,[[ Eid]],[[ Eid]],[[ Eid]],[[ Eid]],[[ Kinn]],[[ Kinn]],[[ Kinn]],[[ Kinn]],...,[[ Kinn]],[[ Kinn]],[[ Kinn]],[[ Kinn]],[[ Kinn]],[[ Kinn]],[[ Kinn]],[[ Kinn]],[[ Kinn]],[[ Kinn]]
476,Prespa agreement is a legal term in <mask>.,P1001,[[ Greece]],[[ Greece]],[[ Greece]],[[ Greece]],[[ Greece]],[[ Greece]],[[ Greece]],[[ Greece]],...,[[ Greece]],[[ Greece]],[[ Greece]],[[ Greece]],[[ Greece]],[[ Greece]],[[ Greece]],[[ Greece]],[[ Greece]],[[ Greece]]
480,Commonwealth of Independent States Free Trade ...,P1001,"[[ Ukraine], [ Russia], [ Belarus], [ Armenia]...","[[ Ukraine], [ Russia], [ Belarus], [ Armenia]...","[[ Ukraine], [ Russia], [ Belarus], [ Armenia]...","[[ Ukraine], [ Russia], [ Belarus], [ Armenia]...","[[ Ukraine], [ Russia], [ Belarus], [ Armenia]...","[[ Ukraine], [ Russia], [ Belarus], [ Armenia]...","[[ Ukraine], [ Russia], [ Belarus], [ Armenia]...","[[ Ukraine], [ Russia], [ Belarus], [ Armenia]...",...,"[[ Ukraine], [ Russia], [ Belarus], [ Armenia]...","[[ Ukraine], [ Russia], [ Belarus], [ Armenia]...","[[ Ukraine], [ Russia], [ Belarus], [ Armenia]...","[[ Ukraine], [ Russia], [ Belarus], [ Armenia]...","[[ Ukraine], [ Russia], [ Belarus], [ Armenia]...","[[ Ukraine], [ Russia], [ Belarus], [ Armenia]...","[[ Ukraine], [ Russia], [ Belarus], [ Armenia]...","[[ Ukraine], [ Russia], [ Belarus], [ Armenia]...","[[ Ukraine], [ Russia], [ Belarus], [ Armenia]...","[[ Ukraine], [ Russia], [ Belarus], [ Armenia]..."
484,Canada–Korea Free Trade Agreement is a legal t...,P1001,[[ Canada]],[[ Canada]],[[ Canada]],[[ Canada]],[[ Canada]],[[ Canada]],[[ Canada]],[[ Canada]],...,[[ Canada]],[[ Canada]],[[ Canada]],[[ Canada]],[[ Canada]],[[ Canada]],[[ Canada]],[[ Canada]],[[ Canada]],[[ Canada]]


### Multi-token

In [104]:
fot_dct_multi_token = facts_over_time_split(data_dict_multi_token)
fot_multi_token = pd.DataFrame(data=fot_dct_multi_token)
fot_multi_token

Unnamed: 0,facts,relation,labels_2019-Q1,labels_2019-Q2,labels_2019-Q3,labels_2019-Q4,labels_2020-Q1,labels_2020-Q2,labels_2020-Q3,labels_2020-Q4,...,labels_ids_2020-Q1,labels_ids_2020-Q2,labels_ids_2020-Q3,labels_ids_2020-Q4,labels_ids_2021-Q1,labels_ids_2021-Q2,labels_ids_2021-Q3,labels_ids_2021-Q4,labels_ids_2022-Q1,labels_ids_2022-Q2
0,Alex Morgan plays for <mask>.,P54,[[ Orlando Pride]],[[ Orlando Pride]],[[ Orlando Pride]],[[ Orlando Pride]],[[ Orlando Pride]],[[ Orlando Pride]],[[ Orlando Pride]],[[ Orlando Pride]],...,[[ Orlando Pride]],[[ Orlando Pride]],[[ Orlando Pride]],[[ Orlando Pride]],[[ Orlando Pride]],[[ Orlando Pride]],[[ Orlando Pride]],[[ Orlando Pride]],"[[ Orlando Pride], [ San Diego Wave FC]]","[[ Orlando Pride], [ San Diego Wave FC]]"
1,Lionel Messi plays for <mask>.,P54,[[ FC Barcelona]],[[ FC Barcelona]],[[ FC Barcelona]],[[ FC Barcelona]],[[ FC Barcelona]],[[ FC Barcelona]],[[ FC Barcelona]],[[ FC Barcelona]],...,[[ FC Barcelona]],[[ FC Barcelona]],[[ FC Barcelona]],[[ FC Barcelona]],[[ FC Barcelona]],[[ FC Barcelona]],,,,
2,Cristiano Ronaldo plays for <mask>.,P54,[[ Juventus F.C.]],[[ Juventus F.C.]],[[ Juventus F.C.]],[[ Juventus F.C.]],[[ Juventus F.C.]],[[ Juventus F.C.]],[[ Juventus F.C.]],[[ Juventus F.C.]],...,[[ Juventus F.C.]],[[ Juventus F.C.]],[[ Juventus F.C.]],[[ Juventus F.C.]],[[ Juventus F.C.]],[[ Juventus F.C.]],[[ Juventus F.C.]],,,
3,LeBron James plays for <mask>.,P54,[[ Los Angeles Lakers]],[[ Los Angeles Lakers]],[[ Los Angeles Lakers]],[[ Los Angeles Lakers]],[[ Los Angeles Lakers]],[[ Los Angeles Lakers]],[[ Los Angeles Lakers]],[[ Los Angeles Lakers]],...,[[ Los Angeles Lakers]],[[ Los Angeles Lakers]],[[ Los Angeles Lakers]],[[ Los Angeles Lakers]],[[ Los Angeles Lakers]],[[ Los Angeles Lakers]],[[ Los Angeles Lakers]],[[ Los Angeles Lakers]],[[ Los Angeles Lakers]],[[ Los Angeles Lakers]]
4,Puck Moonen plays for <mask>.,P54,[[ Lotto Belisol Ladies]],[[ Lotto Belisol Ladies]],,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8663,Mercedes-Benz A-Class is a subclass of <mask>.,P279,[[ compact car]],[[ compact car]],[[ compact car]],[[ compact car]],[[ compact car]],[[ compact car]],[[ compact car]],[[ compact car]],...,[[ compact car]],[[ compact car]],[[ compact car]],[[ compact car]],[[ compact car]],[[ compact car]],[[ compact car]],[[ compact car]],[[ compact car]],[[ compact car]]
8664,Honda Brio is a subclass of <mask>.,P279,[[ subcompact car]],[[ subcompact car]],[[ subcompact car]],[[ subcompact car]],[[ subcompact car]],[[ subcompact car]],[[ subcompact car]],[[ subcompact car]],...,[[ subcompact car]],[[ subcompact car]],[[ subcompact car]],[[ subcompact car]],[[ subcompact car]],[[ subcompact car]],[[ subcompact car]],[[ subcompact car]],[[ subcompact car]],[[ subcompact car]]
8665,Bishop of Wakefield is a subclass of <mask>.,P279,[[ suffragan bishop]],[[ suffragan bishop]],[[ suffragan bishop]],[[ suffragan bishop]],[[ suffragan bishop]],[[ suffragan bishop]],[[ suffragan bishop]],[[ suffragan bishop]],...,[[ suffragan bishop]],[[ suffragan bishop]],[[ suffragan bishop]],[[ suffragan bishop]],[[ suffragan bishop]],[[ suffragan bishop]],[[ suffragan bishop]],[[ suffragan bishop]],[[ suffragan bishop]],[[ suffragan bishop]]
8666,Bishop of Bradford is a subclass of <mask>.,P279,[[ suffragan bishop]],[[ suffragan bishop]],[[ suffragan bishop]],[[ suffragan bishop]],[[ suffragan bishop]],[[ suffragan bishop]],[[ suffragan bishop]],[[ suffragan bishop]],...,[[ suffragan bishop]],[[ suffragan bishop]],[[ suffragan bishop]],[[ suffragan bishop]],[[ suffragan bishop]],[[ suffragan bishop]],[[ suffragan bishop]],[[ suffragan bishop]],[[ suffragan bishop]],[[ suffragan bishop]]


In [105]:
fot_multi_token.dropna()

Unnamed: 0,facts,relation,labels_2019-Q1,labels_2019-Q2,labels_2019-Q3,labels_2019-Q4,labels_2020-Q1,labels_2020-Q2,labels_2020-Q3,labels_2020-Q4,...,labels_ids_2020-Q1,labels_ids_2020-Q2,labels_ids_2020-Q3,labels_ids_2020-Q4,labels_ids_2021-Q1,labels_ids_2021-Q2,labels_ids_2021-Q3,labels_ids_2021-Q4,labels_ids_2022-Q1,labels_ids_2022-Q2
0,Alex Morgan plays for <mask>.,P54,[[ Orlando Pride]],[[ Orlando Pride]],[[ Orlando Pride]],[[ Orlando Pride]],[[ Orlando Pride]],[[ Orlando Pride]],[[ Orlando Pride]],[[ Orlando Pride]],...,[[ Orlando Pride]],[[ Orlando Pride]],[[ Orlando Pride]],[[ Orlando Pride]],[[ Orlando Pride]],[[ Orlando Pride]],[[ Orlando Pride]],[[ Orlando Pride]],"[[ Orlando Pride], [ San Diego Wave FC]]","[[ Orlando Pride], [ San Diego Wave FC]]"
3,LeBron James plays for <mask>.,P54,[[ Los Angeles Lakers]],[[ Los Angeles Lakers]],[[ Los Angeles Lakers]],[[ Los Angeles Lakers]],[[ Los Angeles Lakers]],[[ Los Angeles Lakers]],[[ Los Angeles Lakers]],[[ Los Angeles Lakers]],...,[[ Los Angeles Lakers]],[[ Los Angeles Lakers]],[[ Los Angeles Lakers]],[[ Los Angeles Lakers]],[[ Los Angeles Lakers]],[[ Los Angeles Lakers]],[[ Los Angeles Lakers]],[[ Los Angeles Lakers]],[[ Los Angeles Lakers]],[[ Los Angeles Lakers]]
5,Zlatan Ibrahimović plays for <mask>.,P54,"[[ LA Galaxy], [ A.C. Milan]]","[[ LA Galaxy], [ A.C. Milan]]",[[ A.C. Milan]],[[ A.C. Milan]],[[ A.C. Milan]],[[ A.C. Milan]],[[ A.C. Milan]],[[ A.C. Milan]],...,[[ A.C. Milan]],[[ A.C. Milan]],[[ A.C. Milan]],[[ A.C. Milan]],[[ A.C. Milan]],[[ A.C. Milan]],[[ A.C. Milan]],[[ A.C. Milan]],[[ A.C. Milan]],[[ A.C. Milan]]
6,Neymar plays for <mask>.,P54,[[ Brazil national football team]],[[ Brazil national football team]],[[ Brazil national football team]],[[ Brazil national football team]],[[ Brazil national football team]],[[ Brazil national football team]],[[ Brazil national football team]],[[ Brazil national football team]],...,[[ Brazil national football team]],[[ Brazil national football team]],[[ Brazil national football team]],[[ Brazil national football team]],[[ Brazil national football team]],[[ Brazil national football team]],[[ Brazil national football team]],[[ Brazil national football team]],[[ Brazil national football team]],[[ Brazil national football team]]
8,Megan Rapinoe plays for <mask>.,P54,[[ OL Reign]],[[ OL Reign]],[[ OL Reign]],[[ OL Reign]],[[ OL Reign]],[[ OL Reign]],[[ OL Reign]],[[ OL Reign]],...,[[ OL Reign]],[[ OL Reign]],[[ OL Reign]],[[ OL Reign]],[[ OL Reign]],[[ OL Reign]],[[ OL Reign]],[[ OL Reign]],[[ OL Reign]],[[ OL Reign]]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8663,Mercedes-Benz A-Class is a subclass of <mask>.,P279,[[ compact car]],[[ compact car]],[[ compact car]],[[ compact car]],[[ compact car]],[[ compact car]],[[ compact car]],[[ compact car]],...,[[ compact car]],[[ compact car]],[[ compact car]],[[ compact car]],[[ compact car]],[[ compact car]],[[ compact car]],[[ compact car]],[[ compact car]],[[ compact car]]
8664,Honda Brio is a subclass of <mask>.,P279,[[ subcompact car]],[[ subcompact car]],[[ subcompact car]],[[ subcompact car]],[[ subcompact car]],[[ subcompact car]],[[ subcompact car]],[[ subcompact car]],...,[[ subcompact car]],[[ subcompact car]],[[ subcompact car]],[[ subcompact car]],[[ subcompact car]],[[ subcompact car]],[[ subcompact car]],[[ subcompact car]],[[ subcompact car]],[[ subcompact car]]
8665,Bishop of Wakefield is a subclass of <mask>.,P279,[[ suffragan bishop]],[[ suffragan bishop]],[[ suffragan bishop]],[[ suffragan bishop]],[[ suffragan bishop]],[[ suffragan bishop]],[[ suffragan bishop]],[[ suffragan bishop]],...,[[ suffragan bishop]],[[ suffragan bishop]],[[ suffragan bishop]],[[ suffragan bishop]],[[ suffragan bishop]],[[ suffragan bishop]],[[ suffragan bishop]],[[ suffragan bishop]],[[ suffragan bishop]],[[ suffragan bishop]]
8666,Bishop of Bradford is a subclass of <mask>.,P279,[[ suffragan bishop]],[[ suffragan bishop]],[[ suffragan bishop]],[[ suffragan bishop]],[[ suffragan bishop]],[[ suffragan bishop]],[[ suffragan bishop]],[[ suffragan bishop]],...,[[ suffragan bishop]],[[ suffragan bishop]],[[ suffragan bishop]],[[ suffragan bishop]],[[ suffragan bishop]],[[ suffragan bishop]],[[ suffragan bishop]],[[ suffragan bishop]],[[ suffragan bishop]],[[ suffragan bishop]]


In [107]:
dct = fot_multi_token.to_dict()

In [108]:
dct.keys()

dict_keys(['facts', 'relation', 'labels_2019-Q1', 'labels_2019-Q2', 'labels_2019-Q3', 'labels_2019-Q4', 'labels_2020-Q1', 'labels_2020-Q2', 'labels_2020-Q3', 'labels_2020-Q4', 'labels_2021-Q1', 'labels_2021-Q2', 'labels_2021-Q3', 'labels_2021-Q4', 'labels_2022-Q1', 'labels_2022-Q2', 'labels_ids_2019-Q1', 'labels_ids_2019-Q2', 'labels_ids_2019-Q3', 'labels_ids_2019-Q4', 'labels_ids_2020-Q1', 'labels_ids_2020-Q2', 'labels_ids_2020-Q3', 'labels_ids_2020-Q4', 'labels_ids_2021-Q1', 'labels_ids_2021-Q2', 'labels_ids_2021-Q3', 'labels_ids_2021-Q4', 'labels_ids_2022-Q1', 'labels_ids_2022-Q2'])