In [2]:
# Copyright (c) 2019-present, Facebook, Inc.
# All rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

# Code to generate phrase representations from a pre-trained model, according to their positions.
# This can be used to initialize a cross-lingual classifier, for instance 

## Reload a pretrained model

In [3]:
# this notebook should be under XLM-master to use the classes in src/ (downloaded from github)
import os
import subprocess

import torch

from src.utils import AttrDict
from src.data.dictionary import Dictionary, BOS_WORD, EOS_WORD, PAD_WORD, UNK_WORD, MASK_WORD
from src.model.transformer import TransformerModel

FAISS library was not found.
FAISS not available. Switching to standard nearest neighbors search implementation.


In [4]:
# first, manually download corresponding models
# pretraining: MLM + TLM, preprocessing: tokenize + lowercase + no accent + BPE
model_path = 'models/mlm_tlm_xnli15_1024.pth'

# map_location='cpu' remaps every stored tensor onto the CPU while loading, so the
# checkpoint also loads on a machine without CUDA. The model built from it in this
# notebook is never moved to a GPU, so CPU tensors are what the later cells expect.
# NOTE(review): torch.load unpickles the file -- only load checkpoints from a
# trusted source.
reloaded = torch.load(model_path, map_location='cpu')

# The checkpoint stores the training parameters under 'params'; AttrDict is used so
# they can be read as attributes (params.lang2id, ...).
params = AttrDict(reloaded['params'])
print("Supported languages: %s" % ", ".join(params.lang2id.keys()))

Supported languages: ar, bg, de, el, en, es, fr, hi, ru, sw, th, tr, ur, vi, zh


## Build dictionary / update parameters / build model

In [5]:
# Rebuild the vocabulary object from the three tables stored in the checkpoint.
dico = Dictionary(
    reloaded['dico_id2word'],
    reloaded['dico_word2id'],
    reloaded['dico_counts'],
)

# Refresh the vocabulary-dependent parameters: the vocabulary size first, then the
# index of every special symbol (trailing numbers are the values noted by the author).
params.n_words = len(dico)
special_symbols = (
    ('bos_index', BOS_WORD),    # 0
    ('eos_index', EOS_WORD),    # 1
    ('pad_index', PAD_WORD),    # 2
    ('unk_index', UNK_WORD),    # 3
    ('mask_index', MASK_WORD),  # 5
)
for attr_name, symbol in special_symbols:
    setattr(params, attr_name, dico.index(symbol))

print(params.n_words)

# Instantiate the network, switch it to evaluation mode and restore the weights.
# TransformerModel : def __init__(self, params, dico, is_encoder, with_output):
model = TransformerModel(params, dico, is_encoder=True, with_output=True)
model.eval()
# Kept as the last expression so the notebook still displays the key-matching report.
model.load_state_dict(reloaded['model'])

95000


<All keys matched successfully>

## Get phrase/word representations 

Sentences have to be in the BPE format, i.e. tokenized sentences on which you applied fastBPE.

Track the indices of the tokens that BPE split into several sub-word pieces.

In [6]:
# Below is one way to bpe-ize sentences
codes = "./models/codes_xnli_15.txt"
fastbpe = os.path.join(os.getcwd(), 'tools/fastBPE/fast')


def to_bpe(sentences, tmp_dir='./tmp'):
    """Apply the fastBPE codes to a list of sentences.

    The sentences are written one per line to ``<tmp_dir>/sentences``, the
    ``fastbpe`` binary is run with the ``codes`` file, and the result is read
    back from ``<tmp_dir>/sentences.bpe``.

    Args:
        sentences: iterable of strings, one sentence each.
        tmp_dir: directory for the two scratch files; created when missing.

    Returns:
        A list with one BPE-encoded string per input sentence, with trailing
        whitespace removed.

    Raises:
        FileNotFoundError: if the fastBPE binary does not exist.
        subprocess.CalledProcessError: if fastBPE exits with a non-zero status.
    """
    os.makedirs(tmp_dir, exist_ok=True)
    raw_path = os.path.join(tmp_dir, 'sentences')
    bpe_path = raw_path + '.bpe'

    # Explicit UTF-8: the sentences may contain non-ASCII text and must not
    # depend on the locale's default encoding.
    with open(raw_path, 'w', encoding='utf-8') as fwrite:
        for sent in sentences:
            fwrite.write(sent + '\n')

    # applybpe output input codes [vocab]  => apply BPE codes to a text file
    # An argument list (no shell) keeps paths with spaces intact, and check=True
    # turns a failed run into an exception instead of silently reading stale output.
    subprocess.run([fastbpe, 'applybpe', bpe_path, raw_path, codes], check=True)

    with open(bpe_path, encoding='utf-8') as f:
        return [line.rstrip() for line in f]

In [7]:
#sentences = [
#    'once he had worn trendy italian leather shoes and jeans from paris that had cost three hundred euros .', # en
#    'Le français est la seule langue étrangère proposée dans le système éducatif .', # fr
#    'El cadmio produce efectos tóxicos en los organismos vivos , aun en concentraciones muy pequeñas .', # es
#    'Nach dem Zweiten Weltkrieg verbreitete sich Bonsai als Hobby in der ganzen Welt .', # de
#    'وقد فاز في الانتخابات في الجولة الثانية من التصويت من قبل سيدي ولد الشيخ عبد الله ، مع أحمد ولد داداه في المرتبة الثانية .', # ar
#    '羅伯特 · 皮爾 斯 生於 1863年 , 在 英國 曼徹斯特 學習 而 成為 一 位 工程師 . 1933年 , 皮爾斯 在 直布羅陀去世 .', # zh
#]
#sentences = to_bpe(sentences)
#print(sentences)
#print('\n\n'.join(sentences))

In [8]:
# Example sentence: plain lowercased text, its BPE-encoded form, and a second
# BPE-encoded sentence that is currently left out of the batch.
uncased = "for some time i have been interested in the placebo effect , which might seem like an odd thing for a magician to be interested in , unless you think of it in the terms that i do , which is , `` something fake is believed in enough by somebody that it becomes something real . ''"
bpe = "for some time i have been interested in the pl@@ ace@@ bo effect , which might seem like an odd thing for a mag@@ ic@@ ian to be interested in , unless you think of it in the ter@@ ms that i do , which is , `@@ ` something f@@ ake is believed in enough by somebody that it be@@ comes something real . '@@ '"
bpe1 = "in other words , sugar pil@@ ls have a me@@ as@@ ur@@ able effect in certain kin@@ ds of studies , the pl@@ ace@@ bo effect , just because the person thinks that what 's happening to them is a pharmac@@ eut@@ ical or some sort of a -- for pain management , for ex@@ ample , if they believe it enough there is a me@@ as@@ ur@@ able effect in the body called the pl@@ ace@@ bo effect ."

# Only the first sentence goes into the batch; add bpe1 to the list to encode both.
sentences = [bpe]

# Vocabulary coverage: split every sentence once, then count the tokens that are
# missing from the dictionary.
all_tokens = ' '.join(sentences).split()
n_w = len(all_tokens)
n_oov = sum(1 for token in all_tokens if token not in dico.word2id)
print('Number of out-of-vocab words: %s/%s' % (n_oov, n_w))

# Turn each sentence into a list of BPE tokens framed by the '</s>' delimiter,
# which marks both the beginning and the end of a sentence.
sentences = [['</s>'] + text.strip().split() + ['</s>'] for text in sentences]
print(sentences)

Number of out-of-vocab words: 0/67
[['</s>', 'for', 'some', 'time', 'i', 'have', 'been', 'interested', 'in', 'the', 'pl@@', 'ace@@', 'bo', 'effect', ',', 'which', 'might', 'seem', 'like', 'an', 'odd', 'thing', 'for', 'a', 'mag@@', 'ic@@', 'ian', 'to', 'be', 'interested', 'in', ',', 'unless', 'you', 'think', 'of', 'it', 'in', 'the', 'ter@@', 'ms', 'that', 'i', 'do', ',', 'which', 'is', ',', '`@@', '`', 'something', 'f@@', 'ake', 'is', 'believed', 'in', 'enough', 'by', 'somebody', 'that', 'it', 'be@@', 'comes', 'something', 'real', '.', "'@@", "'", '</s>']]


In [9]:
# Token positions of the sentence before the '</s>' delimiters were added: a plain
# int is a word kept whole, a nested list groups the BPE pieces of one word.
bpe_indices = [0, 1, 2, 3, 4, 5, 6, 7, 8, [9, 10, 11], 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, [23, 24, 25], 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, [38, 39], 40, 41, 42, 43, 44, 45, 46, [47, 48], 49, [50, 51], 52, 53, 54, 55, 56, 57, 58, 59, [60, 61], 62, 63, 64, [65, 66]]

# update indices after doing this: '</s> sent </s>'
# The leading '</s>' pushes every position one step to the right.
new = [
    elem + 1 if isinstance(elem, int) else [x + 1 for x in elem]
    for elem in bpe_indices
]

# Position 0 is the leading '</s>'; the trailing '</s>' sits right after the last
# shifted position (the last piece when the final word was split).
last_element = new[-1]
closing_position = (last_element if isinstance(last_element, int) else last_element[-1]) + 1
final = [0, *new, closing_position]

print(final)
# word alignment in tedannnote should also be updated! +1

# Pick one entry of the shifted alignment and show the token(s) it points to.
index = 10
bpe_index = final[index]

print(bpe_index)

sent = sentences[0]
pieces = [bpe_index] if isinstance(bpe_index, int) else bpe_index
print(' '.join(sent[i] for i in pieces))
        

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, [10, 11, 12], 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, [24, 25, 26], 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, [39, 40], 41, 42, 43, 44, 45, 46, 47, [48, 49], 50, [51, 52], 53, 54, 55, 56, 57, 58, 59, 60, [61, 62], 63, 64, 65, [66, 67], 68]
[10, 11, 12]
pl@@ ace@@ bo


## Create batch

In [14]:
# Batch geometry: one column per sentence, one row per token position.
lengths = torch.LongTensor([len(tokens) for tokens in sentences])
bs = len(sentences)
slen = max(map(len, sentences))

# Start from a (slen, bs) matrix filled entirely with the padding index ...
word_ids = torch.LongTensor(slen, bs)
word_ids.fill_(params.pad_index)

# ... then write each sentence's vocabulary ids down its own column. Only the
# first len(sent) rows are overwritten, so shorter sentences stay padded below.
for i, tokens in enumerate(sentences):
    sent = torch.LongTensor([dico.index(w) for w in tokens])
    word_ids[:sent.size(0), i] = sent

# NOTE: No more language id (removed it in a later version)
# langs = torch.LongTensor([params.lang2id[lang] for _, lang in sentences]).unsqueeze(0).expand(slen, bs) if params.n_langs > 1 else None
langs = None

## Forward

In [11]:
# Inference only: torch.no_grad() stops autograd from recording the forward pass,
# so the extracted representations carry no graph.
with torch.no_grad():
    tensor = model('fwd', x=word_ids, lengths=lengths, langs=langs, causal=False).contiguous()
# tensor.size() is torch.Size([69, 1, 1024]) for the example batch:
#   69: max sentence length
#   1: number of sentences, i.e. batch size
#   1024 hidden states of the model: each word representation has 1024 dimensions
#   i.e. one big tensor holding 69 matrices, each with 1 row and 1024 columns (dimensions)
#print(tensor.size())
#print(tensor)

print(bpe_index)

# bpe_index is a list of positions for a word split by BPE and a plain int for a
# word kept whole; treat the int as a one-element list so both cases are pooled
# the same way (an int used to leave `a` empty and make torch.stack fail).
positions = bpe_index if isinstance(bpe_index, list) else [bpe_index]

a = []
for x in positions:
    print(tensor[x])          # hidden states at position x for every sentence in the batch
    a.append(tensor[x])

stacked = torch.stack(a)      # turn a list of tensors to one tensor
print(stacked)

# max-pooling of input tensors: element-wise maximum over the stacked positions
print(torch.max(stacked, 0).values)

#### until here, I managed to get representation of one source phrase after max-pooling

#print(tensor[0][0].size())  # first hidden state of the first sentence
#print(tensor[1].size())  # second hidden state of all the sentences in the batch

[10, 11, 12]
tensor([[ -4.2908,  -0.0421,  -1.9604,  ...,   2.3874,  -1.3416, -13.1386]],
       grad_fn=<SelectBackward>)
tensor([[ 2.1268,  2.2024,  0.5680,  ...,  0.5634, -0.5559, -4.5375]],
       grad_fn=<SelectBackward>)
tensor([[-0.4283, -1.1916,  1.2353,  ...,  0.7101, -2.6170, -9.7668]],
       grad_fn=<SelectBackward>)
tensor([[[ -4.2908,  -0.0421,  -1.9604,  ...,   2.3874,  -1.3416, -13.1386]],

        [[  2.1268,   2.2024,   0.5680,  ...,   0.5634,  -0.5559,  -4.5375]],

        [[ -0.4283,  -1.1916,   1.2353,  ...,   0.7101,  -2.6170,  -9.7668]]],
       grad_fn=<StackBackward>)
tensor([[ 2.1268,  2.2024,  1.2353,  ...,  2.3874, -0.5559, -4.5375]],
       grad_fn=<MaxBackward0>)


The variable tensor is of shape (max_sequence_length, batch_size, model_dimension).

tensor[0] is a tensor of shape (batch_size, model_dimension) that corresponds to the first hidden state of the last layer of each sentence.

This is the vector that we use to finetune on the GLUE and XNLI tasks.

In [12]:
#word_id=dico.index('cat')
#print(word_id)
#emb = model.embeddings.weight[word_id]
#print(emb.size())

In [13]:
#tensor = model.embeddings(word_ids)
#print(tensor.size())