In [2]:
import os

from fairseq.data import Dictionary
from nltk.tokenize import ToktokTokenizer
import numpy as np
import torch

#from ipywidgets import interact_manual, widgets

from salmon.fairseq_ext.modules.contextual_embeddings import QUBITEmbedder
from salmon.fairseq_ext.tokenizer import TOKENIZER

In [3]:
mypath= "/home/jason/language_model_michele/"
DICTIONARY_PATH = mypath + "dict_big.txt"
CHECKPOINT_PATH = mypath + "checkpoint_big.pt"
tokenizer = ToktokTokenizer()

In [4]:
dictionary = Dictionary.load(str(DICTIONARY_PATH))
embedder = QUBITEmbedder(str(CHECKPOINT_PATH), build_fn=QUBITEmbedder.BI_TRANSFORMER, extra_args={"dictionary": dictionary}).eval()

In [178]:
text = """
. The department is advising residents to plant Sagos , if they must , in the X
"""
tokens = tokenizer.tokenize(text)
X_pos = tokens.index("X")
print(X_pos)

tokens_index = [dictionary.index(w) for w in tokens]
data1 = torch.tensor(tokens_index).view(1, -1)
#data2 = torch.tensor(tokens + [dictionary.pad()] * 40).view(1, -1)
logits = embedder.get_logits(data1)
distribution = torch.nn.functional.softmax(logits, dim=-1)

16


In [181]:
embeddings = embedder(data1)
embeddings['inner_states'][-1].shape

torch.Size([1, 17, 512])

In [173]:
candidates = ["front yard", "back yard", "department", "residents","exports"]
cand_indx = [dictionary.index(w) for w in candidates]
print(cand_indx)

distr_i = distribution.numpy()[0][X_pos]

scores_top = [distr_i[c] for c in cand_indx]
cand_dict = dict(zip(candidates, scores_top))
print(cand_dict)

from operator import itemgetter
top_cand = sorted(cand_dict.items(), key=itemgetter(1),reverse=True)
print(top_cand)


   #["front yard", "back yard"
#distr_top_i= np_topn(distr_i, n=top)

#print(token, "\t"+" ".join(top_word_i))    

[3, 3, 755, 1656, 4873]
{'front yard': 0.0053262454, 'back yard': 0.0053262454, 'department': 1.3126844e-06, 'residents': 1.9462184e-06, 'exports': 0.18882811}
[('exports', 0.18882811), ('front yard', 0.0053262454), ('back yard', 0.0053262454), ('residents', 1.9462184e-06), ('department', 1.3126844e-06)]


In [134]:
def np_topn(arr, n=10):
    #return arr.argsort()[::-1][:n]
    #return np.argsort(-arr)[:n]
    return arr.argsort()[-n:][::-1]

top = 5
#firstb = True
#firstb = False

for i, token in enumerate(tokens):
    distr_i = distribution.numpy()[0][i]
    distr_top_i= np_topn(distr_i, n=top)
    top_word_i = [dictionary.__getitem__(x) for x in distr_top_i]
    print(token, "\t"+" ".join(top_word_i))    
    

The 	, and or of heads
department 	<unk> the not , only
is 	, and or them is
advising 	and of , or to
residents 	, and <unk> the or
to 	of to and , in
plant 	be the do , and
Sagos 	and or for in <unk>
, 	they and , not in
if 	and <unk> be or not
they 	it have not the they
must 	have are know like want
, 	, be have <unk> and
in 	<unk> and the but or
the 	<unk> the , and X
X 	<unk> back home end of
back 	, <unk> and & yard
yard 	of to yard , and


bar 2830
fourseasons 3
restaurant 4984


In [10]:
log_probs = embedder.get_log_prob(data1)
distribution2 = torch.exp(log_probs)

(distribution - distribution2).abs().sum()

tensor(20.3534)

In [62]:
def np_topn(arr, n=10):
    #return arr.argsort()[::-1][:n]
    #return np.argsort(-arr)[:n]
    return arr.argsort()[-n:][::-1]

top = 5
#firstb = True
#firstb = False

for i, token in enumerate(tokens):
    distr_i = logits.numpy()[0][i]
    distr_top_i= np_topn(distr_i, n=top)
    top_word_i = [dictionary.__getitem__(x) for x in distr_top_i]
    print(token, "\t"+" ".join(top_word_i))    

As 	instead use version out form
of 	a for with in an
2017 	today now <unk> course this
, 	, 's and : --
text 	these e-mail the email such
messages 	messages messaging communications exchanges calls
are 	are were , become is
used 	used sent delivered received exchanged
by 	by between among for with
youth 	children students teens adults adolescents
and 	and as , or to
adults 	adults others users colleagues peers
for 	for to with serving in
personal 	personal communication home school individual
, 	, and or for <unk>
family 	educational academic professional commercial personal
, 	, and or for ;
business 	educational personal economic academic cultural
and 	and or , and/or &
social 	other educational communication recreational commercial
purposes. 	purposes. activities. services. interaction. interactions.
Governmental 	Public Individuals Community Government Governments
and 	and or , and/or but
non-governmental 	non-profit community non-governmental nonprofit community-based
organizatio

In [7]:
with torch.no_grad():
    out1 = embedder(data1)
    out2 = embedder(data2)
for o1, o2 in zip(out1['inner_states'], out2['inner_states']):
    res = o1[:,:5] - o2[:,:5]
    print(res.abs().sum())

tensor(0.0006)
tensor(0.0008)
tensor(0.0009)
tensor(0.0011)
tensor(0.0036)
tensor(0.0020)


In [8]:
def forward(tokens):
    with torch.no_grad():
        tensor = torch.LongTensor([dictionary.index(w) for w in tokens]).view(1,-1)
        out = embedder.get_logits(tensor, batch_major=True)
        dist = torch.nn.functional.log_softmax(out, dim=-1)
    return dist.view(-1, dist.size(-1)).detach().numpy()

def display(sentence, only_marked=True, max_rank=5, other_words=""):
    tokens = tokenizer.tokenize(sentence.replace("\n", " ").strip())
    print(len(tokens))
    if only_marked:
        tokens_ = tokens
        tokens = []
        to_print = set()
        for i, t in enumerate(tokens_):
            if t.startswith('**') and t.endswith('**'):
                t = t[2:-2]
                to_print.add(i)
            tokens.append(t)
    dist = forward(tokens)
    if other_words:
        other_words = [w.split(",") for w in other_words.split(" ")]
        assert len(other_words) == len(to_print)
        other_words_n = 0
    entropies = []
    for i, t in enumerate(tokens):
        if only_marked and not i in to_print:
            continue
        dist_red = np.exp(dist[i])
        entropy = -(np.log(dist_red) * dist_red).sum()
        entropies.append(entropy)
        ranks = np.argsort(-dist_red, axis=0)
        best_k = list(ranks[:max_rank])
        score, rank = dist_red[dictionary.index(t)], np.where(ranks == dictionary.index(t))[0][0]
        pred = [(dictionary.symbols[p],dist_red[p]) for p in best_k]
        print(f"{i}) '{t}' rank: {rank} score: {score} entropy: {entropy}")
        if other_words:
            for w in other_words[other_words_n]:
                w_i = dictionary.index(w)
                score, rank = dist_red[w_i], np.where(ranks == w_i)[0][0]
                print(f"== '{w}' rank: {rank} score: {score}")
            other_words_n += 1
        print(f"best {max_rank}:  " + ", ".join([w + "--" + str(p) for w, p in pred]))
    print(f"### avg_entropy {np.mean(entropies)}")

In [23]:
sentence = """PFC Sully walked to the humvees they were taking out for a drive across the scorching desert of a newly liberated Iraq. The city portion would be downright deadly. Sully was shaking with nerves, and that was status quo. It was enough that nothing had happened, yet. No one knew, but Sully didn’t show it either. He belonged there, and no one could say otherwise."""

In [28]:
tokens = sentence.split(" ")
tensor = torch.tensor([dictionary.index(w) for w in tokens]).view(1, -1)
if True:
    probs = torch.nn.functional.softmax(embedder.get_logits(tensor), dim=-1)[0].numpy()
else:
    probs = embedder.get_log_prob(tensor)[0].numpy()
for i, token in enumerate(tokens):
    top5 = probs[i].argsort()[-5:][::-1].tolist()
    top5words = [dictionary.symbols[j] for j in top5]
    print(token + '\t' + ' '.join(top5words))

PFC	. <unk> , and the
Sully	and , <unk> people who
walked	<unk> , said and up
to	through into out up on
the	the <unk> a their his
humvees	<unk> place ground city left
they	They <unk> We and they
were	were was are had started
taking	looking pulling running going turning
out	<unk> up off , over
for	, of on with in
a	a the their to his
drive	week trip ride walk month
across	in from on through into
the	the a an that this
scorching	<unk> Iraqi entire northern eastern
desert	area part side border plains
of	, of to with in
a	the a <unk> their this
newly	<unk> newly country city little
liberated	discovered created formed liberated invaded
Iraq.	<unk> city. town. country. state.
The	The A This That No
city	<unk> other second remaining whole
portion	itself <unk> also plan government
would	would could can should might
be	be look feel seem become
downright	very more <unk> so not
deadly.	<unk> . high. dangerous. low.
Sully	It <unk> I He There
was	was <unk> is were started
shaking	<unk> left not fil

In [11]:
DEFAULT = """I hold a  BA in Classics and Linguistics from Aristotle University, an MSc in Computational Linguistics and Formal Grammar (awarded with distinction) and a PhD in Theoretical Linguistics from King's College, London."""  
text_widget = widgets.Textarea(    
    value=DEFAULT,
    placeholder='Type something',
    description='String:',
    width=700,
)
interact_manual(display, sentence=text_widget, only_marked=False);

interactive(children=(Textarea(value="I hold a  BA in Classics and Linguistics from Aristotle University, an M…

In [12]:
display(sentence=DEFAULT)

39
### avg_entropy nan


In [8]:
tensor = torch.LongTensor([dictionary.index(w) for w in 'I like big butts'.split()]).view(1,-1)
embedder.get_log_prob(tensor).exp()

tensor([[0, 0, 0, 0]], dtype=torch.uint8)
tensor([[0, 0, 0, 0]], dtype=torch.uint8)
x_fw on bw
tensor(0)
x_bw on fw
tensor(0)


tensor([[[3.9171e-07, 6.6136e-07, 8.4783e-07,  ..., 2.6295e-08,
          2.5992e-08, 2.7906e-08],
         [1.7687e-06, 1.7757e-07, 6.4713e-08,  ..., 3.3499e-08,
          3.1600e-08, 3.1753e-08],
         [1.8520e-08, 4.7056e-08, 3.7177e-08,  ..., 3.2958e-08,
          3.0389e-08, 3.6542e-08],
         [2.1696e-06, 6.0407e-07, 6.2011e-07,  ..., 2.0233e-08,
          1.9849e-08, 1.9829e-08]]], grad_fn=<ExpBackward>)

In [239]:
import pandas as pd
from operator import itemgetter

url = '/home/jason/Documents/bridging_files/bridgings_2_lastsent.csv'
df = pd.read_csv(url)
df.head(20)

Unnamed: 0,reference_id,context,word (#1),word,reference_id (#1),label_referent,head (#1),antecedent,discard
0,1004_markable_85,IBM already participates in one industrywide e...,today#$ 500 million#costs#the mid-1970s#$ 40 m...,the technology,1004_markable_84#1004_markable_83#1004_markabl...,0#0#0#0#0#0#1#0#0#0#0#0#0#0,today#million#costs#mid-1970s#million#plant#ma...,1004_markable_78,ibm already participates in one industrywide e...
1,1004_markable_79,IBM already participates in one industrywide e...,semiconductor manufacturing#industrywide effor...,A state - of - the - art plant,1004_markable_78#1004_markable_77#1004_markabl...,1#0#0#0#0#0#0#0,manufacturing#efforts#it#IBM#effort#techniques...,1004_markable_78,ibm already participates in one industrywide e...
2,1004_markable_77,IBM also said it expects to benefit from the e...,it#IBM#one industrywide effort to improve semi...,industrywide efforts,1004_markable_76#1004_markable_75#1004_markabl...,0#0#0#0#1#0#0#0#0#0#0#0#0#0,it#IBM#effort#techniques#improve#IBM#expertise...,1004_markable_155,ibm also said it expects to benefit from the e...
3,1004_markable_56,"While IBM , Armonk , N.Y. , makes the bulk of ...",enough memory chips here to keep U.S. equipmen...,U.S. equipment makers,1004_markable_156#1004_markable_54#1004_markab...,0#0#0#0#0#0#0#1#0#0#0#0#0#0#0#0,chips#makers#produce#companies#market#Japanese...,1004_markable_47,"while ibm , armonk , n.y. , makes the bulk of ..."
4,1004_markable_54,"IBM , which said a year ago it was inviting co...",produce#U.S. semiconductor companies#that equi...,U.S. equipment makers,1004_markable_158#1004_markable_52#1004_markab...,0#0#0#0#0#1#0#0#0#0#0#0#0#0#0#0#0#0#0#0#0#0#0#...,produce#companies#market#Japanese#IBM#equipmen...,1004_markable_47,"ibm , which said a year ago it was inviting co..."
5,1004_markable_51,"IBM , which said a year ago it was inviting co...",the Japanese#IBM#the equipment needed to produ...,that equipment market,1004_markable_50#1004_markable_49#1004_markabl...,0#0#1#0#0#0#0#0#0#0#0#0#0#0#0#0#0#0#0#0#0#0#0,Japanese#IBM#equipment#chips#it#bulk#DRAMs#it#...,1004_markable_47,"ibm , which said a year ago it was inviting co..."
6,1017_markable_358,The current ceiling on home loans insured by t...,community development funds#a House - Senate c...,often influential members,1017_markable_356#1017_markable_354#1017_marka...,0#0#1#0#0#0#0#0#0#0#0#0#0#0#0#0#0#0#0#0#0#0#0#...,funds#conference#Senate#lawmakers#call#250#170...,1017_markable_355,the current ceiling on home loans insured by t...
7,1017_markable_234,"By comparison , Republicans have held closer t...",his friend Mr. Bush#his#Rep. Silvio Conte -LRB...,a veto,1017_markable_232#1017_markable_231#1017_marka...,0#0#0#0#0#0#0#0#0#0#0#0#0#1#0#0#0#0#0#0#0#0#0#0,friend#his#Conte#member#Committee#R.#Mass.#mee...,1017_markable_219,"by comparison , republicans have held closer t..."
8,1017_markable_196,The standoff over abortion is certain to contr...,party terms#the debate#the abortion issue#the ...,the override,1017_markable_194#1017_markable_193#1017_marka...,0#0#0#0#0#0#0#0#0#0#0#0#0#0#0#0#0#0#0,terms#debate#issue#leadership#partisan#years#v...,,the standoff over abortion is certain to contr...
9,1017_markable_155,"In direct cash assistance , $ 1 billion is pro...",the bill#a state 's normal allocation of annua...,federal aid,1017_markable_152#1017_markable_149#1017_marka...,0#0#0#1#0#0#0#0#0#0#0#0#0#0#0#0#0#0#0#0#0#0#0#...,bill#allocation#funds#state#assistance#credit#...,1017_markable_150,"in direct cash assistance , $ 1 billion is pro..."


In [247]:
relev = []
relev_sc = []
relev_indx = []
for index, row in df.iterrows():
    ref_id = row["reference_id"]
    bridg = row["word"]
    antecedent = str(row["antecedent"])
    context = row["discard"]
    candidates_id = row["reference_id (#1)"].split("#")
    #candidates = row["word (#1)"].split("#")
    candidates = row["head (#1)"].split("#")    
    label_ref = row["label_referent"].split("#")
    
    cand_indx = [dictionary.index(w) for w in candidates]
    
    tokens = tokenizer.tokenize(context)
    X_pos = tokens.index("#")

    tokens_index = [dictionary.index(w) for w in tokens]
    data = torch.tensor(tokens_index).view(1, -1)    
    logits = embedder.get_logits(data)
    distribution = torch.nn.functional.softmax(logits, dim=-1)    
    #print(cand_indx)

    distr_i = distribution.numpy()[0][X_pos]
    scores_top = [distr_i[c] for c in cand_indx]
    
    # print sorted by score if conditions    
    cand_dict_word = dict(zip(candidates_id, candidates))
    cand_dict_score = dict(zip(candidates_id, scores_top))    
    cand_dict_label = dict(zip(candidates_id, label_ref))
    cand_dict_indx = dict(zip(candidates_id, cand_indx))
    top_cand = sorted(cand_dict_score.items(), key=itemgetter(1),reverse=True)

    print("---------------", index,ref_id, bridg)    
    for cand_id, score in top_cand:
        cand = cand_dict_word[cand_id]
        lbl = cand_dict_label[cand_id]
        indx = cand_dict_indx[cand_id]
        
        if cand != "nan" and score > 0.1 and indx != 0.003:            
            toprint= [ref_id, cand_id, cand, lbl, indx, score]
            print(*toprint, sep='\t')            

    
"""            
    #ant_unk = cand_dict_indx[antecedent] <=3
    if antecedent != "nan":
        relev.append(antecedent)
        relev_sc.append(cand_dict_score[antecedent])
        relev_indx.append(cand_dict_indx[antecedent])
    print("\n---------------", index)
"""


--------------- 0 1004_markable_85 the technology
1004_markable_85	1004_markable_76	it	0	23	0.90174806
--------------- 1 1004_markable_79 A state - of - the - art plant
--------------- 2 1004_markable_77 industrywide efforts
--------------- 3 1004_markable_56 U.S. equipment makers
1004_markable_56	1004_markable_46	it	0	23	0.35671738
1004_markable_56	1004_markable_45	it	0	23	0.35671738
--------------- 4 1004_markable_54 U.S. equipment makers
1004_markable_54	1004_markable_46	it	0	23	0.32071218
1004_markable_54	1004_markable_45	it	0	23	0.32071218
1004_markable_54	1004_markable_33	it	0	23	0.32071218
1004_markable_54	1004_markable_28	it	0	23	0.32071218
--------------- 5 1004_markable_51 that equipment market
--------------- 6 1017_markable_358 often influential members
--------------- 7 1017_markable_234 a veto
--------------- 8 1017_markable_196 the override
1017_markable_196	1017_markable_174	abortion	0	4222	0.2125172
--------------- 9 1017_markable_155 federal aid
--------------- 10 101

--------------- 96 1121_markable_308 None
--------------- 97 1121_markable_304 A food caterer
--------------- 98 1121_markable_294 One man
--------------- 99 1121_markable_292 employees
--------------- 100 1121_markable_261 the sand swept away by the men wielding shovels and brushes -- the ignominiously named `` bedrock sweepers '' who toil in the wake of the excavators
--------------- 101 1121_markable_260 the diamonds
--------------- 102 1121_markable_257 a very good advert
--------------- 103 1121_markable_248 the waves
1121_markable_248	1121_markable_244	them	0	89	0.45559642
--------------- 104 1121_markable_237 A companion jetty that helps hold back the sea
--------------- 105 1121_markable_221 the screening plants
--------------- 106 1121_markable_209 the sand
--------------- 107 1121_markable_201 the current estimate of 10
--------------- 108 1121_markable_182 the flood
--------------- 109 1121_markable_130 the streets
--------------- 110 1121_markable_126 the mine headquarters


--------------- 218 1163_markable_135 performances that are unduly mannered
--------------- 219 1163_markable_109 the production
--------------- 220 1163_markable_69 the text
--------------- 221 1172_markable_160 The resulting # 1.9 billion merchandise trade deficit
--------------- 222 1172_markable_140 a result
--------------- 223 1172_markable_101 a year
--------------- 224 1172_markable_97 The latest government figures
--------------- 225 1172_markable_78 any loosening this year
1172_markable_78	1172_markable_72	inflation	0	4266	0.10154194
--------------- 226 1172_markable_77 Officials
--------------- 227 1172_markable_67 Chancellor of the Exchequer
--------------- 228 1172_markable_59 Prime Minister
--------------- 229 1172_markable_58 a result
--------------- 230 1172_markable_49 government data
--------------- 231 1172_markable_46 only 28 %
--------------- 232 1172_markable_43 31 %
--------------- 233 1172_markable_41 1,224 companies surveyed
--------------- 234 1172_markable_31 

--------------- 334 1313_markable_47 a studio audience
--------------- 335 1315_markable_190 discipline
--------------- 336 1315_markable_157 each school
--------------- 337 1315_markable_146 The situation
--------------- 338 1315_markable_145 the good ones
--------------- 339 1315_markable_144 incompetent principals and administrators
--------------- 340 1315_markable_141 the dropout rate
--------------- 341 1315_markable_130 the parents
1315_markable_130	1315_markable_126	him	0	176	0.29914197
--------------- 342 1315_markable_129 the status quo
--------------- 343 1315_markable_127 the building
--------------- 344 1315_markable_122 previous challenge
--------------- 345 1315_markable_119 the principals '
--------------- 346 1315_markable_105 the building
--------------- 347 1315_markable_103 incompetent principals
--------------- 348 1315_markable_89 control
--------------- 349 1315_markable_83 the state
--------------- 350 1315_markable_71 good discipline
--------------- 351 1315_ma

--------------- 453 1388_markable_189 The second incident
--------------- 454 1388_markable_38 the right choice
--------------- 455 1397_markable_334 the first week 's
--------------- 456 1397_markable_322 a script that 's already overdosing on pizzazz
--------------- 457 1397_markable_320 a bizarre and totally inappropriate reaction
--------------- 458 1397_markable_318 walls
--------------- 459 1397_markable_317 drapes
--------------- 460 1397_markable_297 a show
--------------- 461 1397_markable_280 each show
--------------- 462 1397_markable_277 The scars
1397_markable_277	1397_markable_273	You	0	193	0.289461
1397_markable_277	1397_markable_270	You	0	193	0.289461
--------------- 463 1397_markable_217 an episode
--------------- 464 1397_markable_374 writer ⁄ producers
1397_markable_374	1397_markable_204	Bleckner	0	3	0.25194466
1397_markable_374	1397_markable_202	Sohmer	0	3	0.25194466
--------------- 465 1397_markable_373 Executive Producers
--------------- 466 1397_markable_158 two


--------------- 562 1436_markable_99 the construction phase
--------------- 563 1436_markable_93 the exodus of capital and investment
--------------- 564 1436_markable_92 outsiders
--------------- 565 1436_markable_91 the community image
--------------- 566 1436_markable_83 the residents
--------------- 567 1436_markable_73 an exodus of the jobs that the major chains used to provide to community residents
--------------- 568 1436_markable_71 the exodus of shopping opportunities
--------------- 569 1436_markable_64 insurability
--------------- 570 1436_markable_62 the customer base
--------------- 571 1448_markable_260 a statewide referendum
--------------- 572 1448_markable_259 the voters
--------------- 573 1448_markable_251 a temporary state gasoline tax to raise money for earthquake relief
--------------- 574 1448_markable_250 the legislature
1448_markable_250	1448_markable_229	Congress	0	758	0.93636984
--------------- 575 1448_markable_227 relief efforts
--------------- 576 1448_ma

'            \n    #ant_unk = cand_dict_indx[antecedent] <=3\n    if antecedent != "nan":\n        relev.append(antecedent)\n        relev_sc.append(cand_dict_score[antecedent])\n        relev_indx.append(cand_dict_indx[antecedent])\n    print("\n---------------", index)\n'

In [241]:
ant_dict_score = dict(zip(relev, relev_sc))
ant_dict_indx = dict(zip(relev, relev_indx))
top_ant = sorted(ant_dict_score.items(), key=itemgetter(1),reverse=True)
for cand_id, score in top_ant:
    print (cand_id,score,ant_dict_indx[cand_id])

1146_markable_671 0.6364102 3
1066_markable_10 0.3001371 3
1215_markable_45 0.24070586 3
1215_markable_21 0.23921622 43
1367_markable_310 0.21791208 3
1423_markable_169 0.11905956 4222
1327_markable_87 0.099363364 369
1327_markable_296 0.06327033 369
1041_markable_39 0.06212356 747
1353_markable_143 0.0610176 1559
1146_markable_744 0.058304727 3
1367_markable_387 0.057245404 105
1094_markable_299 0.042255305 12914
1174_markable_37 0.038418636 1622
1367_markable_194 0.037891503 3
1387_markable_144 0.03709779 414
1094_markable_134 0.03416328 23
1450_markable_323 0.028939247 249
1174_markable_12 0.02483027 4384
1066_markable_23 0.01914808 3
1174_markable_113 0.018501438 374
1172_markable_41 0.017059103 451
1450_markable_219 0.016479535 6427
1284_markable_248 0.015426145 1290
1146_markable_439 0.0146783935 5444
1137_markable_79 0.014666683 712
1160_markable_51 0.013672607 984
1455_markable_367 0.012906818 238
1121_markable_125 0.012642045 3
1315_markable_63 0.012312577 198
1121_markable_23

In [234]:
antecedents = df["antecedent"]
ant_indx = [dictionary.index(w) for w in antecedents]
antecedents

0       1004_markable_47
1       1004_markable_47
2       1004_markable_47
3      1004_markable_155
4       1004_markable_78
5       1004_markable_78
6      1017_markable_111
7      1017_markable_111
8      1017_markable_136
9      1017_markable_150
10                   NaN
11      1017_markable_19
12     1017_markable_219
13     1017_markable_355
14      1017_markable_81
15      1041_markable_91
16      1041_markable_95
17     1041_markable_128
18                   NaN
19     1041_markable_142
20                   NaN
21     1041_markable_142
22     1041_markable_166
23     1041_markable_263
24     1041_markable_240
25      1041_markable_19
26      1041_markable_39
27      1041_markable_44
28     1041_markable_262
29      1041_markable_62
             ...        
630                  NaN
631    1455_markable_328
632    1455_markable_345
633    1455_markable_345
634    1455_markable_362
635    1455_markable_448
636    1455_markable_367
637      1455_markable_1
638    1455_markable_426


{'<Lua heritage>': 0,
 '<pad>': 1,
 '</s>': 2,
 '<unk>': 3,
 'the': 4,
 ',': 5,
 'of': 6,
 'and': 7,
 'to': 8,
 'a': 9,
 'in': 10,
 '.': 11,
 'that': 12,
 'is': 13,
 'for': 14,
 'The': 15,
 'be': 16,
 'with': 17,
 'on': 18,
 'as': 19,
 'are': 20,
 'or': 21,
 'by': 22,
 'it': 23,
 "'s": 24,
 'not': 25,
 'have': 26,
 'was': 27,
 ')': 28,
 '(': 29,
 'from': 30,
 "''": 31,
 '``': 32,
 'I': 33,
 'an': 34,
 'this': 35,
 'at': 36,
 'will': 37,
 'you': 38,
 'their': 39,
 'which': 40,
 'has': 41,
 'can': 42,
 'they': 43,
 'we': 44,
 'all': 45,
 'but': 46,
 'more': 47,
 'were': 48,
 ';': 49,
 'other': 50,
 'he': 51,
 'one': 52,
 'his': 53,
 'who': 54,
 'would': 55,
 'been': 56,
 'In': 57,
 'also': 58,
 'about': 59,
 ':': 60,
 'its': 61,
 'had': 62,
 'may': 63,
 'This': 64,
 'It': 65,
 'than': 66,
 'our': 67,
 'your': 68,
 'do': 69,
 'these': 70,
 'such': 71,
 'if': 72,
 'some': 73,
 'there': 74,
 'when': 75,
 'any': 76,
 'new': 77,
 'into': 78,
 'time': 79,
 'should': 80,
 'so': 81,
 'only': 82,