In [1]:
from transformers import T5Tokenizer
from pprint import pprint

In [2]:
tokenizer = T5Tokenizer.from_pretrained("t5-large")

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-large automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
sent = tokenizer("hello I am not here")

In [4]:
word = tokenizer("not")

In [5]:
print(sent)

{'input_ids': [21820, 27, 183, 59, 270, 1], 'attention_mask': [1, 1, 1, 1, 1, 1]}


In [6]:
print(word)

{'input_ids': [59, 1], 'attention_mask': [1, 1]}


In [7]:
tokenizer.decode(word['input_ids'])

'not</s>'

In [8]:
# Encoder-side example: 21 token ids (the last one is id 1) followed by
# 59 trailing zeros, for 80 entries in total.
input_ids = [
    7142, 10, 96, 2735, 3, 10, 328, 33, 16, 7, 15, 1893,
    179, 30, 443, 2564, 396, 3, 5, 96, 1,
] + [0] * 59

In [9]:
# Decoder-side example: a leading 0, 25 further ids ending with id 1, then
# 24 trailing zeros, for 50 entries in total.
decoder_input = [
    0, 11153, 1528, 834, 3870, 2026, 6821, 10, 105, 2735,
    3, 10, 328, 3, 22, 60, 544, 237, 16, 8,
    443, 3, 5, 3, 153, 1,
] + [0] * 24

In [10]:
tokenizer.decode(input_ids)

'sentence: " 2010 : They are inseparable on car ride too. "</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>'

In [11]:
tokenizer.decode(decoder_input)

'<pad> affirmative_interpretation: “ 2010 : They ’re together even in the car. ”</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>'

In [30]:
tokenizer.decode([7142, 10, 3, 7371, 3, 6, 4623, 405, 59, 241, 3, 9, 4889, 13, 11897, 3, 6, 84, 228, 991, 17873, 12, 582, 2127, 441, 8, 15094, 13, 8, 1215, 18, 3063, 6857, 3, 6, 78, 34, 19, 72, 952, 34, 19, 3945, 12, 483, 8, 6688, 30, 8, 1591, 78, 12, 36, 3, 179, 12, 17405, 45, 3, 9, 1102, 13, 2793, 3, 5, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

'sentence: Moreover, Russia does not want a division of Ukraine, which could lead NATO to become established within the borders of the ex-USSR, so it is more likely it is seeking to change the facts on the ground so to be able to negotiate from a position of strength.</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>'

In [13]:
# Hand-copied example instance, used below to inspect individual fields with the tokenizer.
# NOTE(review): the comments only describe what the literal values show; the
# intended meaning of each field should be confirmed against the dataset code.
inst = {'cls_on_input': [ 0,  0,  0,  0,  0,  0,  0,  0,  0, 48, 49, 45,  0],  # 13 entries; non-zero at positions 9, 10, 11
 'concept_cls': [48, 49, 45],  # same three values as the non-zero entries of 'cls_on_input'
 'concept_set': [3806, 3, 9, 7142, 28, 175, 6085, 3, 10, 6112, 22750, 2561, 1],  # 13 token ids, ending with id 1
 'copy_mention_flag': [[0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.]],  # 2 x 5, all zeros
 # 5 rows x 13 columns; rows 0-2 are one-hot at positions 9, 10 and 11, rows 3-4 are all zeros
 'copy_pos': [[0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
 'decoder_mention_flag': [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],  # 1 x 13, all zeros
 'gen': [1],
 'gt': 'An old plane is sitting on a runway.',
 'gt_concepts': ['plane', 'runway', 'sit']}

In [14]:
tokenizer.decode(inst["concept_cls"])

'thiser from'

In [15]:
tokenizer.decode(inst["concept_set"])

'generate a sentence with these concepts : plane runway sit</s>'

In [16]:
# NOTE(review): 'copy_pos' rows contain 0./1. floats rather than token ids, so
# this call decodes the mask values themselves as ids 0 and 1. Presumably only
# a quick probe; confirm whether decoding a position mask was intended.
tokenizer.decode(inst["copy_pos"][1])

'<pad><pad><pad><pad><pad><pad><pad><pad><pad><pad></s><pad><pad>'

In [17]:
import pickle

# Read the pickled object stored in ./negations.pkl into `negations`.
# pickle.load can execute arbitrary code while unpickling, so this must only
# ever be pointed at a trusted file.
with open("./negations.pkl", "rb") as pkl_file:
    negations = pickle.load(pkl_file)

In [18]:
negations

['indirectly',
 'inequality',
 'reject',
 'fail',
 'in the absence of',
 'uncommon',
 'informal',
 'inaction',
 'unpopular',
 'uninhabited',
 'unknown',
 'impartial',
 'inexperienced',
 'irregular',
 'infrequent',
 'unfaithful',
 'undefeated',
 'unofficial',
 'unlock',
 'inadvertently',
 'loss',
 'unpaid',
 'no longer',
 'unshaven',
 'indistinguishable',
 'uncarved',
 'uneducated',
 'refused',
 'unconscious',
 'unrealistic',
 'unglazed',
 'unpaved',
 'lack',
 'without',
 'instead of',
 'inconclusive',
 'halt',
 'inorganic',
 "won't",
 'incorrectly',
 'invisibility',
 'inaccurate',
 'unlimited',
 'dissimilar',
 'disrespectful',
 'none',
 'unstable',
 'refuse',
 'never',
 'few',
 'resist',
 'unfortunate',
 'unnecessary',
 'no',
 'unconventional',
 'rather',
 'invalid',
 'unclear',
 'nonprofessional',
 'uneven',
 'nobody',
 'unhappy',
 'nonhuman',
 'unfavorable',
 'unassisted',
 'absent',
 'inhospitable',
 'irregularly',
 'insensitive',
 'unaware',
 'impossible',
 'a lack of',
 'inability

In [19]:
len(negations)

219

In [20]:
tokenized = tokenizer(negations)

In [21]:
tokenized['input_ids']

[[25509, 1],
 [25608, 1],
 [15092, 1],
 [5124, 1],
 [16, 8, 8605, 13, 1],
 [21141, 1],
 [15347, 1],
 [16, 4787, 1],
 [73, 27302, 1],
 [73, 30732, 1],
 [7752, 1],
 [30280, 1],
 [16, 22602, 26, 1],
 [22085, 1],
 [16, 30740, 1],
 [73, 10699, 107, 1329, 1],
 [3550, 89, 15, 920, 1],
 [73, 20884, 1],
 [12502, 1],
 [16, 9, 26, 3027, 295, 120, 1],
 [1453, 1],
 [73, 12760, 1],
 [150, 1200, 1],
 [1149, 7965, 29, 1],
 [16, 19694, 1744, 1273, 179, 1],
 [73, 19619, 1],
 [245, 4817, 920, 1],
 [12191, 1],
 [25429, 1],
 [31820, 1],
 [73, 23700, 1],
 [73, 24433, 1],
 [2136, 1],
 [406, 1],
 [1446, 13, 1],
 [16, 1018, 10562, 7, 757, 1],
 [3, 5019, 1],
 [16, 11127, 447, 1],
 [751, 31, 17, 1],
 [12153, 120, 1],
 [16, 3466, 11102, 1],
 [27801, 1],
 [11875, 1],
 [1028, 26714, 1],
 [31973, 1329, 1],
 [5839, 1],
 [27644, 1],
 [9460, 1],
 [470, 1],
 [360, 1],
 [8891, 1],
 [20343, 1],
 [12592, 1],
 [150, 1],
 [30903, 1],
 [1066, 1],
 [17070, 1],
 [19363, 1],
 [529, 24318, 1],
 [24616, 1],
 [12638, 1],
 [24357, 1

In [22]:
# Total number of ids over every tokenized entry (each per-entry id list is
# counted in full, including its final id).
all_len = sum(len(ids) for ids in tokenized['input_ids'])
print(all_len)

705


In [23]:
tokenizer("neg cues:")

{'input_ids': [14261, 123, 15, 7, 10, 1], 'attention_mask': [1, 1, 1, 1, 1, 1]}

In [None]:
tokenizer.decode([150])

In [25]:
# Print what each of the ids 0..99 decodes to, one decoded string per line.
for token_id in range(100):
    decoded_piece = tokenizer.decode(token_id)
    print(decoded_piece)

<pad>
</s>
<unk>

X
.
,
s
the
a
:
and
to
of
fill
e
in
t
-
is
de
for
’
i
that
you
d
I
with
n
on
'
o
are
it
en
be
The
as
your
l
(
or
have
at
from
an
was
this
er
la
m
r
ing
can
!
will
by
?
not
re
)
we
y
und
has
all
die
but
our
their
A
more
un
der
c
u
in
so
they
one
about
my
ul
which
à
In
/
he
f
le
out
also
des
It
up
"
time
ă
if


In [26]:
from mf_cal import mention_flag #, pretty_mf_printer
def _as_plain_list(values):
    """Return ``values`` as a plain Python list.

    Objects exposing ``tolist()`` (e.g. torch tensors, numpy arrays) are
    converted through it; any other iterable is materialised with ``list``.
    """
    if hasattr(values, "tolist"):
        return values.tolist()
    return list(values)


def pretty_mf_printer(input_ids, decoder_id, mention_flag_matrix):
    """Print a mention-flag matrix as a labelled table.

    The table has one row per input position and one column per decoder
    position: the first column (``input_ids``) holds the input labels, and the
    column for decoder position ``i`` holds ``mention_flag_matrix[i]``.

    Args:
        input_ids: Labels for the input positions (one per table row).
        decoder_id: Labels for the decoder positions (one per table column).
            Labels may repeat; every decoder position still gets its own
            column instead of later duplicates overwriting earlier ones.
        mention_flag_matrix: Matrix laid out as ``[decoder_len, input_len]``.
            May be a tensor/array exposing ``tolist()`` or a nested sequence.

    Returns:
        None. The table is written to stdout with ``print``; how much of a
        wide table is shown is governed by the active pandas display options.

    Raises:
        IndexError: If ``mention_flag_matrix`` has fewer rows than
            ``decoder_id`` has labels.
        ValueError: Raised by pandas when a matrix row and ``input_ids``
            differ in length.
    """
    from pandas import DataFrame

    input_labels = _as_plain_list(input_ids)
    decoder_labels = _as_plain_list(decoder_id)
    flag_rows = _as_plain_list(mention_flag_matrix)

    # Key the columns by position while building the frame. Keying by the
    # decoder label itself would collapse repeated tokens into a single
    # column, silently dropping part of the matrix.
    columns = {0: input_labels}
    for position in range(len(decoder_labels)):
        columns[position + 1] = flag_rows[position]

    table = DataFrame(columns)
    # Only now attach the human-readable (possibly repeated) headers.
    table.columns = ["input_ids", *decoder_labels]
    print(table)

    return None

# Toy source/target pair for eyeballing the mention-flag matrix.
input_sentence = "Hello I am disagreeable here with this thing. neg cues: unwidely, nothing, undamaged,"
output_sentence = "<pad> Hi I am unwidely here, talking to you, and disagreeable here"

# Encode both sentences with return_tensors="pt" and keep only the id tensors.
input_id = tokenizer(input_sentence, return_tensors="pt")['input_ids']
print(type(input_id))
decoder_id = tokenizer(output_sentence, return_tensors="pt")['input_ids']

# Ids of the cue word on its own, with the final id sliced off.
orig_cue = tokenizer("disagreeable")['input_ids'][:-1]

mentionflag = mention_flag(input_id, decoder_id, orig_cue)


def _labelled_ids(id_row):
    """Render each id in ``id_row`` as "<id> (<decoded text>)" for table headers."""
    return [f"{token_id} ({tokenizer.decode([token_id])})" for token_id in id_row]


# Replace the id tensors with readable labels for the first (only) sequence,
# then print the first matrix of the mention-flag result against them.
input_id = _labelled_ids(input_id[0])
decoder_id = _labelled_ids(decoder_id[0])
pretty_mf_printer(input_id, decoder_id, mentionflag[0])

ModuleNotFoundError: No module named 'mf_cal'

In [None]:
import torch
this_ = torch.zeros(2, 2)

In [None]:
# Show the tensor and its shape, then rebuild it from a nested list wrapped in
# one extra list level, which adds a leading dimension to the new tensor.
print(this_)
print(this_.shape)
nthis_ = torch.tensor([this_.tolist()])
print(nthis_.shape)

In [None]:
# All-zero placeholder tensor of shape (1, 50, 80).
# NOTE(review): presumably (batch, decoder_len, input_len), matching the
# 50-entry decoder_input and 80-entry input_ids lists defined earlier — confirm.
mention_flag_matrix = torch.zeros((1, 50, 80))

print(mention_flag_matrix.shape)