This notebook uses the PyTorch implementation of Facebook AI Research's XLM (cross-lingual language model). See the [XLM GitHub repository](https://github.com/facebookresearch/XLM/blob/master/README.md#ii-cross-lingual-language-model-pretraining-xlm) for details.

This model is also available in Hugging Face's `pytorch-transformers` library (since renamed `transformers`).

In [1]:
import os
import subprocess
import tempfile

import torch

from XLM.src.utils import AttrDict
from XLM.src.data.dictionary import Dictionary, BOS_WORD, EOS_WORD, PAD_WORD, UNK_WORD, MASK_WORD
from XLM.src.model.transformer import TransformerModel

FAISS library was not found.
FAISS not available. Switching to standard nearest neighbors search implementation.


In [2]:
# Load the pretrained checkpoint onto the CPU. The loaded object is indexed
# below by 'params', 'dico_id2word', 'dico_word2id', 'dico_counts' and 'model'.
# NOTE(review): torch.load unpickles the file by default in older PyTorch
# releases — only load checkpoints from a trusted source.
model_path = 'XLM/models/mlm_100_1280.pth'
reloaded = torch.load(model_path, map_location=torch.device('cpu'))
# Wrap the stored parameters so they can be read and extended via attributes.
params = AttrDict(reloaded['params'])

# build dictionary / update parameters
# The vocabulary size and the special-token indices are taken from the
# checkpoint's own dictionary rather than from whatever `params` stored.
dico = Dictionary(reloaded['dico_id2word'], reloaded['dico_word2id'], reloaded['dico_counts'])
params.n_words = len(dico)
params.bos_index = dico.index(BOS_WORD)
params.eos_index = dico.index(EOS_WORD)
params.pad_index = dico.index(PAD_WORD)
params.unk_index = dico.index(UNK_WORD)
params.mask_index = dico.index(MASK_WORD)

# build model / reload weights
# NOTE(review): the two positional booleans are presumably
# is_encoder=True and with_output=True — confirm against
# XLM.src.model.transformer.TransformerModel.
model = TransformerModel(params, dico, True, True)
# Put the module in evaluation mode before the weights are restored.
model.eval()
# As the last expression of the cell, the load_state_dict result is displayed.
model.load_state_dict(reloaded['model'])

<All keys matched successfully>

In [3]:
# Display the module tree of the loaded network (bare last expression of the cell).
model

TransformerModel(
  (position_embeddings): Embedding(512, 1280)
  (embeddings): Embedding(200000, 1280, padding_idx=2)
  (layer_norm_emb): LayerNorm((1280,), eps=1e-12, elementwise_affine=True)
  (attentions): ModuleList(
    (0): MultiHeadAttention(
      (q_lin): Linear(in_features=1280, out_features=1280, bias=True)
      (k_lin): Linear(in_features=1280, out_features=1280, bias=True)
      (v_lin): Linear(in_features=1280, out_features=1280, bias=True)
      (out_lin): Linear(in_features=1280, out_features=1280, bias=True)
    )
    (1): MultiHeadAttention(
      (q_lin): Linear(in_features=1280, out_features=1280, bias=True)
      (k_lin): Linear(in_features=1280, out_features=1280, bias=True)
      (v_lin): Linear(in_features=1280, out_features=1280, bias=True)
      (out_lin): Linear(in_features=1280, out_features=1280, bias=True)
    )
    (2): MultiHeadAttention(
      (q_lin): Linear(in_features=1280, out_features=1280, bias=True)
      (k_lin): Linear(in_features=1280, out_f

In [17]:
# Locations of the BPE merge codes and of the fastBPE executable, resolved
# against the directory the kernel is currently running in.
codes = os.path.join(os.getcwd(), 'XLM/codes_xnli_100.txt')
fastbpe = os.path.join(os.getcwd(), 'XLM/tools/fastBPE/fast')


def to_bpe(sentences):
    """Apply fastBPE to ``sentences`` and return the BPE-encoded lines.

    The sentences are written one per line to a scratch file, the external
    ``fastbpe`` executable is run on it with the merge codes in ``codes``,
    and the resulting file is read back.

    Args:
        sentences: iterable of ``str``, one sentence per item. A sentence
            that itself contains a newline yields more than one output line.

    Returns:
        list of ``str``: the lines fastBPE produced, with trailing
        whitespace removed.

    Raises:
        FileNotFoundError: if the ``fastbpe`` executable does not exist.
        subprocess.CalledProcessError: if fastBPE exits with a non-zero
            status (previously this was ignored and a stale output file from
            an earlier call could be returned instead).
    """
    # A private scratch directory per call: no collisions between concurrent
    # kernels and no chance of reading a previous call's output.
    with tempfile.TemporaryDirectory() as workdir:
        raw_path = os.path.join(workdir, 'sentences')
        bpe_path = os.path.join(workdir, 'sentences.bpe')

        # write sentences to tmp file (explicit UTF-8 so non-ASCII text does
        # not depend on the locale's default encoding)
        with open(raw_path, 'w', encoding='utf-8') as fwrite:
            for sent in sentences:
                fwrite.write(sent + '\n')

        # apply bpe to tmp file; an argument list avoids shell quoting issues
        # with paths containing spaces, and check=True surfaces failures
        subprocess.run([fastbpe, 'applybpe', bpe_path, raw_path, codes], check=True)

        # load bpe-ized sentences
        with open(bpe_path, encoding='utf-8') as f:
            return [line.rstrip() for line in f]

In [26]:
def load_example_sentences(path, max_lines=350):
    """Return the leading lines of the text file at ``path``.

    Args:
        path: location of a UTF-8 encoded text file.
        max_lines: maximum number of lines to return (default 350, the
            value that used to be hard-coded). Pass ``None`` to read the
            whole file.

    Returns:
        list of ``str``: the raw lines, each still carrying its trailing
        newline (exactly as ``readlines`` would give them).
    """
    # Explicit encoding: the notebook reads non-ASCII (Russian) text, which
    # must not depend on the platform's default locale.
    with open(path, 'r', encoding='utf-8') as fread:
        if max_lines is None:
            return fread.readlines()
        # Stop after `max_lines` lines instead of loading the whole file and
        # slicing afterwards; `range` is listed first so no extra line is
        # consumed from the file.
        return [line for _, line in zip(range(max_lines), fread)]
        
# Bind the raw lines to `sents_raw`: the statement below reads that name, and
# it previously existed only as a local inside load_example_sentences, so the
# cell raised NameError on a fresh kernel.
sents_raw = load_example_sentences('../data/en/CONLL2003/train.txt')

# Remove POS tags and other unnecessary formatting: keep only the first
# space-separated field of every non-blank line. The first line is skipped
# (presumably the CoNLL -DOCSTART- header — confirm against the data file).
words = [s.split(' ')[0] for s in sents_raw[1:] if s != '\n']

# Re-join the words into running text, cut it at every '.', and put the
# full stop back on each trimmed fragment.
sents = [chunk.strip() + '.' for chunk in ' '.join(words).split('.')]

In [20]:
# BPE-encode the prepared English sentences; the returned list is displayed as the cell output.
to_bpe(sents)

['EU re@@ jects German call to bo@@ yc@@ ott British lam@@ b.',
 'Peter Blackburn BR@@ US@@ SE@@ LS 1996-@@ 0@@ 8-@@ 22 The European Commission said on Thur@@ sday it dis@@ agreed with German ad@@ vice to consum@@ ers to sh@@ un British lam@@ b until scientists deter@@ mine whether mad cow disease can be transmit@@ ted to she@@ ep@@ .',
 "Germany 's represent@@ ative to the European Union 's veterin@@ ary committee Werner Zw@@ ing@@ mann said on Wed@@ nes@@ day consum@@ ers should buy she@@ ep@@ meat from countries other than Britain until the scientific ad@@ vice was cle@@ ar@@ er@@ .",
 '" We do n@@ \'t support any such recommen@@ d@@ ation because we do n@@ \'t see any gro@@ unds for it , " the Commission \'s chief spo@@ kes@@ man Nikolaus van der Pas told a news brief@@ ing@@ .',
 'He said further scientific study was required and if it was found that action was needed it should be taken by the European Union@@ .',
 'He said a propos@@ al last month by EU Farm Com@@ missioner Franz

In [40]:
ru_sents_raw = load_example_sentences('../data/ru/raw/ru/brexit_ru.txt_file_1032.txt')

# Split the last loaded line at every '.', then restore the full stop.
# Fragments are stripped (as in the English example) so that they do not
# start with a leading space and a trailing newline-only fragment does not
# become a spurious '.' sentence.
to_bpe([s.strip() + '.' for s in ru_sents_raw[-1].split('.') if s.strip()])

['Но@@ вым мини@@ стром по дел@@ ам Bre@@ xit стал До@@ ми@@ ник Ра@@ аб@@ , который замени@@ л на этом по@@ сту уш@@ ед@@ шего на@@ кан@@ у@@ не ве@@ че@@ ром в отстав@@ ку Дэ@@ вида Дэ@@ ви@@ са@@ .',
 ' Ран@@ ее Ра@@ аб возглав@@ лял министер@@ ство по дел@@ ам жи@@ ли@@ щ@@ ного строительства Великобрита@@ ни@@ и@@ , а также был замест@@ ителем министра ю@@ сти@@ ци@@ и@@ .',
 ' Се@@ год@@ ня же стало извест@@ но@@ , что коро@@ лева Великобритании одоб@@ ри@@ ла его на@@ значение на новую долж@@ ност@@ ь@@ .',
 ' На данный момент это самы@@ й ответ@@ ственный пост в правитель@@ стве@@ , вед@@ ь со@@ всем скоро стране пред@@ стоит вы@@ ход из Евро@@ со@@ ю@@ за@@ .',
 ' Э@@ кс@@ пер@@ ты уже окрест@@ или на@@ значение До@@ ми@@ ника Ра@@ а@@ ба на новую должность прояв@@ лением от@@ ча@@ я@@ ния со стороны премьер-@@ министра@@ .',
 ' От@@ мет@@ им@@ , что тот же Дэ@@ вис покин@@ ул свой пост из-за не@@ прия@@ тия под@@ хода Те@@ рез@@ ы М@@ эй в перегово@@ рах с Е@@ С.',
 ' По его 