In [1]:
# Copyright (c) 2019-present, Facebook, Inc.
# All rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

# Code to generate sentence representations from a pre-trained model.
# This can be used to initialize a cross-lingual classifier, for instance.

# Note: this notebook has the same content as generate-embeddings.ipynb,
# with many additional explanatory comments.

## Reload a pretrained model

In [2]:
# This notebook must be located in the XLM-master directory (the XLM repository downloaded from GitHub) so that the classes in src/ can be imported.
import os
import subprocess

import torch

from src.utils import AttrDict
from src.data.dictionary import Dictionary, BOS_WORD, EOS_WORD, PAD_WORD, UNK_WORD, MASK_WORD
from src.model.transformer import TransformerModel

FAISS library was not found.
FAISS not available. Switching to standard nearest neighbors search implementation.


In [3]:
# First, manually download the pretrained checkpoint into models/.
# Pretraining: MLM + TLM; preprocessing: tokenize + lowercase + no accent + BPE.
model_path = 'models/mlm_tlm_xnli15_1024.pth'
# map_location='cpu' remaps every stored tensor to CPU memory, so the checkpoint
# also loads on machines without CUDA (needed if it was saved from a GPU).
# The model built later in this notebook is only ever used on CPU.
# NOTE: torch.load unpickles the file -- only load checkpoints from a trusted source.
reloaded = torch.load(model_path, map_location='cpu')
# Expose the saved hyper-parameters through attribute access (params.lang2id, ...).
params = AttrDict(reloaded['params'])
print("Supported languages: %s" % ", ".join(params.lang2id.keys()))

Supported languages: ar, bg, de, el, en, es, fr, hi, ru, sw, th, tr, ur, vi, zh


## Build dictionary / update parameters / build model

In [4]:
# Rebuild the vocabulary from the three dictionary tables stored in the checkpoint.
dico = Dictionary(reloaded['dico_id2word'], reloaded['dico_word2id'], reloaded['dico_counts'])

# Refresh the vocabulary-dependent parameters: vocabulary size plus the index of
# every special token, looked up in the dictionary that was just rebuilt.
params.n_words = len(dico)
for attr_name, special_word in (
    ('bos_index', BOS_WORD),
    ('eos_index', EOS_WORD),
    ('pad_index', PAD_WORD),
    ('unk_index', UNK_WORD),
    ('mask_index', MASK_WORD),
):
    setattr(params, attr_name, dico.index(special_word))

print(params.n_words)

# Build the model, then reload the pretrained weights.
# Constructor signature: TransformerModel.__init__(self, params, dico, is_encoder, with_output)
model = TransformerModel(params, dico, is_encoder=True, with_output=True)
# eval() switches the module to evaluation mode (e.g. dropout is turned off), which
# is what we want when extracting representations rather than training.
model.eval()
# Left as the last expression so the notebook displays the key-matching report.
model.load_state_dict(reloaded['model'])

95000


<All keys matched successfully>

## Get sentence representations

Sentences have to be in the BPE format, i.e. tokenized sentences on which you applied fastBPE.

Below you can see an example for English, French, Spanish, German, Arabic and Chinese sentences.

In [5]:
# Below is one way to BPE-encode sentences, using the fastBPE command-line tool.
codes = "./models/codes_xnli_15.txt"    # path to the BPE codes of the model
fastbpe = os.path.join(os.getcwd(), 'tools/fastBPE/fast')  # fastBPE executable

def to_bpe(sentences):
    """Apply the model's BPE codes to a list of tokenized sentences.

    The sentences are written one per line to ./tmp/sentences, the fastBPE
    binary is run on that file, and the encoded lines are read back from
    ./tmp/sentences.bpe.

    Args:
        sentences: iterable of str, one sentence each.

    Returns:
        list of str: the BPE-encoded sentences, in input order, with trailing
        whitespace removed.

    Raises:
        FileNotFoundError: if the fastBPE executable does not exist.
        subprocess.CalledProcessError: if fastBPE exits with a non-zero status.
    """
    # The scratch directory may not exist on a fresh checkout.
    os.makedirs('./tmp', exist_ok=True)
    # Explicit UTF-8: the inputs contain non-ASCII text and must not depend on
    # the platform's default locale encoding.
    with open('./tmp/sentences', 'w', encoding='utf-8') as fwrite:
        for sent in sentences:
            fwrite.write(sent + '\n')
    # Usage: applybpe output input codes [vocab]  => apply BPE codes to a text file.
    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters intact, and check=True raises on failure instead of
    # silently reading a stale ./tmp/sentences.bpe left by a previous run.
    subprocess.run(
        [fastbpe, 'applybpe', './tmp/sentences.bpe', './tmp/sentences', codes],
        check=True,
    )
    with open('./tmp/sentences.bpe', encoding='utf-8') as f:
        return [line.rstrip() for line in f]

In [6]:
# One already-tokenized example sentence per language, as (language code, text) pairs.
# NOTE(review): the checkpoint is described as trained on lowercased, accent-stripped
# text, while these inputs keep case and accents -- possibly why some BPE pieces
# end up out-of-vocabulary; confirm before relying on these representations.
examples = [
    ('en', 'once he had worn trendy italian leather shoes and jeans from paris that had cost three hundred euros .'),
    ('fr', 'Le français est la seule langue étrangère proposée dans le système éducatif .'),
    ('es', 'El cadmio produce efectos tóxicos en los organismos vivos , aun en concentraciones muy pequeñas .'),
    ('de', 'Nach dem Zweiten Weltkrieg verbreitete sich Bonsai als Hobby in der ganzen Welt .'),
    ('ar', 'وقد فاز في الانتخابات في الجولة الثانية من التصويت من قبل سيدي ولد الشيخ عبد الله ، مع أحمد ولد داداه في المرتبة الثانية .'),
    ('zh', '羅伯特 · 皮爾 斯 生於 1863年 , 在 英國 曼徹斯特 學習 而 成為 一 位 工程師 . 1933年 , 皮爾斯 在 直布羅陀去世 .'),
]
# From here on `sentences` holds the BPE-encoded strings, one per example.
sentences = to_bpe([text for _lang, text in examples])
print('\n\n'.join(sentences))

once he had worn tren@@ dy itali@@ an leather shoes and jeans from paris that had cost three hundred euros .

L@@ e fran@@ ç@@ ais est la seule langue é@@ tr@@ ang@@ è@@ re pro@@ pos@@ é@@ e dans le syst@@ è@@ me é@@ du@@ cat@@ if .

E@@ l cad@@ mio produce efectos t@@ ó@@ x@@ icos en los organismos viv@@ os , aun en concentr@@ aciones muy pe@@ que@@ ñ@@ as .

N@@ ach dem Z@@ weiten W@@ el@@ t@@ krieg verbreit@@ ete sich B@@ on@@ sai als H@@ ob@@ by in der ganzen W@@ elt .

وقد فاز في الانتخابات في الج@@ ولة الثانية من التصويت من قبل سيدي ولد الشيخ عبد الله ، مع أ@@ حمد ولد د@@ اد@@ اه في المرتبة الثانية .

羅@@ 伯特 · 皮@@ 爾 斯 生於 186@@ 3年 , 在 英國 曼@@ 徹@@ 斯特 學習 而 成為 一 位 工程@@ 師 . 1933年 , 皮@@ 爾@@ 斯 在 直@@ 布@@ 羅@@ 陀@@ 去世 .


In [7]:
# Count how many BPE tokens are missing from the model vocabulary (OOV).
bpe_tokens = [tok for sent in sentences for tok in sent.split()]
n_w = len(bpe_tokens)
n_oov = sum(tok not in dico.word2id for tok in bpe_tokens)
print('Number of out-of-vocab words: %s/%s' % (n_oov, n_w))

# Surround every sentence with the </s> delimiter (used at both the start and the
# end) and split it into tokens, so `sentences` becomes a list of token lists.
# NOTE: this rebinds `sentences` from strings to lists, so the cell can only be
# run once per BPE-encoding step.
sentences = [['</s>'] + sent.split() + ['</s>'] for sent in sentences]
b = sentences  # alias of the same list, used for the preview below
print(b[0])

Number of out-of-vocab words: 17/164
['</s>', 'once', 'he', 'had', 'worn', 'tren@@', 'dy', 'itali@@', 'an', 'leather', 'shoes', 'and', 'jeans', 'from', 'paris', 'that', 'had', 'cost', 'three', 'hundred', 'euros', '.', '</s>']


## Create batch

In [8]:
# Batch layout: one column per sentence, one row per token position.
bs = len(sentences)                               # batch size (6 sentences in this example)
sent_lens = [len(sent) for sent in sentences]     # token count of each sentence
slen = max(sent_lens)                             # longest sentence (37 tokens in this example)

# Special-token ids as noted by the original author (not checked in this cell):
#   params.bos_index  -> 0  <s>
#   params.eos_index  -> 1  </s>   (</s> is also used as the sentence start)
#   params.pad_index  -> 2  <pad>
#   params.unk_index  -> 3  <unk>  (4 is <special0>)
#   params.mask_index -> 5  <special1>

# torch.LongTensor(slen, bs) is uninitialized memory, so fill it with the padding
# id before use. Resulting shape: (slen, bs).
word_ids = torch.LongTensor(slen, bs).fill_(params.pad_index)

# Fill the matrix column by column: the first len(ids) rows of a column receive
# the sentence's token ids, the remaining rows keep the padding id.
for col, tokens in enumerate(sentences):
    ids = torch.LongTensor([dico.index(tok) for tok in tokens])
    word_ids[:len(ids), col] = ids

# Unpadded length of every sentence, in batch order.
lengths = torch.LongTensor(sent_lens)
#print(lengths)


In [9]:
# NOTE: language ids are no longer built here (removed in a later version), so the
# forward pass receives langs=None.
# Earlier implementation, kept for reference only -- it unpacks (sentence, lang)
# pairs, which does not match the plain token lists now stored in `sentences`:
# langs = torch.LongTensor([params.lang2id[lang] for _, lang in sentences]).unsqueeze(0).expand(slen, bs) if params.n_langs > 1 else None
langs = None

## Forward

In [10]:
# Run the encoder once over the padded batch. This is pure inference, so autograd
# is disabled: no computation graph is recorded (the result carries no grad_fn),
# which saves memory and time.
with torch.no_grad():
    tensor = model('fwd', x=word_ids, lengths=lengths, langs=langs, causal=False).contiguous()

# For the example batch the result has size (37, 6, 1024):
#   37   -> max sentence length (token positions)
#   6    -> number of sentences, i.e. the batch size
#   1024 -> hidden size: every token representation has 1024 dimensions
# In other words, one 3-D tensor holding 37 2-D matrices, each with 6 rows
# (sentences) and 1024 columns (dimensions).
print(tensor.size())
print(tensor[0])            # hidden states at position 0, one row per sentence

print(tensor[0][0].size())  # hidden state at position 0 of the first sentence
print(tensor[1].size())     # hidden states at position 1 for all sentences in the batch

torch.Size([37, 6, 1024])
tensor([[-0.4258,  3.7078, -1.2446,  ...,  1.6236,  1.8969, -7.4575],
        [-1.4627,  5.1548, -4.7037,  ..., -0.6723,  3.7204,  0.4129],
        [-0.4615,  8.3762, -3.5408,  ..., -0.7099,  3.9102, -2.8770],
        [-1.0137,  2.1509, -1.8360,  ..., -1.2961,  2.6104,  2.4603],
        [ 2.5150,  2.6423, -1.1369,  ..., -0.3289,  3.2460, -7.8119],
        [ 1.1674, -0.5507, -2.0623,  ..., -0.8173,  2.9915,  1.9788]],
       grad_fn=<SelectBackward>)
torch.Size([1024])
torch.Size([6, 1024])


The variable tensor is of shape (max_sequence_length, batch_size, model_dimension).

tensor[0] is a tensor of shape (batch_size, model_dimension) that corresponds to the first hidden state of the last layer of each sentence.

This is the vector that we use to finetune on the GLUE and XNLI tasks.

In [11]:
# Look up the vocabulary id of a single word, then read its row from the model's
# embedding matrix.
word_id = dico.index('cat')
emb = model.embeddings.weight[word_id]

print(word_id)
print(emb.shape)  # one vector of the embedding dimension

9671
torch.Size([1024])


In [12]:
# Embed the whole padded batch with the model's embedding layer only (no encoder
# pass): one vector per token position and sentence.
# NOTE: this rebinds `tensor`, replacing the encoder output computed in the
# forward cell above.
tensor = model.embeddings(word_ids)
print(tensor.shape)

torch.Size([37, 6, 1024])
