In [120]:
import torch

from transformers import CamembertForMaskedLM,CamembertTokenizer



tokenizer = CamembertTokenizer.from_pretrained("camembert-base")
model = CamembertForMaskedLM.from_pretrained("camembert-base")
model.eval()

CamembertForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(32005, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerN

In [1]:
import torch
from transformers import pipeline

In [6]:
camembert_fill_mask  = pipeline("fill-mask", model="camembert-base", tokenizer="camembert-base")
results = camembert_fill_mask("Le camembert est <mask> :)")

In [8]:
camembert_embed = pipeline("feature-extraction",model="camembert-base",tokenizer="camembert-base")

Some weights of the model checkpoint at camembert-base were not used when initializing CamembertModel: ['lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing CamembertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
results

[{'score': 0.4909106194972992,
  'token': 7200,
  'token_str': 'délicieux',
  'sequence': 'Le camembert est délicieux :)'},
 {'score': 0.10556956380605698,
  'token': 2183,
  'token_str': 'excellent',
  'sequence': 'Le camembert est excellent :)'},
 {'score': 0.034533172845840454,
  'token': 26202,
  'token_str': 'succulent',
  'sequence': 'Le camembert est succulent :)'},
 {'score': 0.033031292259693146,
  'token': 528,
  'token_str': 'meilleur',
  'sequence': 'Le camembert est meilleur :)'},
 {'score': 0.03007633611559868,
  'token': 1654,
  'token_str': 'parfait',
  'sequence': 'Le camembert est parfait :)'}]

In [10]:

def fill_mask(masked_input, model, tokenizer, topk=5):
    # Adapted from https://github.com/pytorch/fairseq/blob/master/fairseq/models/roberta/hub_interface.py
    assert masked_input.count("<mask>") == 1
    input_ids = torch.tensor(tokenizer.encode(masked_input, add_special_tokens=True)).unsqueeze(0)  # Batch size 1
    logits = model(input_ids)[0]  # The last hidden-state is the first element of the output tuple
    masked_index = (input_ids.squeeze() == tokenizer.mask_token_id).nonzero().item()
    logits = logits[0, masked_index, :]
    prob = logits.softmax(dim=0)
    values, indices = prob.topk(k=topk, dim=0)
    topk_predicted_token_bpe = " ".join(
        [tokenizer.convert_ids_to_tokens(indices[i].item()) for i in range(len(indices))]
    )
    masked_token = tokenizer.mask_token
    topk_filled_outputs = []
    for index, predicted_token_bpe in enumerate(topk_predicted_token_bpe.split(" ")):
        predicted_token = predicted_token_bpe.replace("\u2581", " ")
        if " {0}".format(masked_token) in masked_input:
            topk_filled_outputs.append(
                (
                    masked_input.replace(" {0}".format(masked_token), predicted_token),
                    values[index].item(),
                    predicted_token,
                )
            )
        else:
            topk_filled_outputs.append(
                (masked_input.replace(masked_token, predicted_token), values[index].item(), predicted_token,)
            )
    return topk_filled_outputs


tokenizer = CamembertTokenizer.from_pretrained("camembert-base")
model = CamembertModel.from_pretrained("camembert-base")

model.eval()

masked_input = "Dans la douche, il se lave la peau avec du  <mask>"
#print(fill_mask(masked_input, model, tokenizer, topk=3))

Some weights of the model checkpoint at camembert-base were not used when initializing CamembertModel: ['lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing CamembertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [121]:
masked_line = 'Le camembert est <mask> :)'
fill_mask(masked_line,model,tokenizer, topk=3)

[('Le camembert est délicieux :)', 0.4909106194972992, ' délicieux'),
 ('Le camembert est excellent :)', 0.10556956380605698, ' excellent'),
 ('Le camembert est succulent :)', 0.034533172845840454, ' succulent')]

In [139]:
vocab=tokenizer.get_vocab()

In [148]:
vocab["▁jambon"]

15898

In [28]:
ids = torch.tensor(tokenizer.encode(masked_line)).unsqueeze(0)

In [29]:
acts = model(ids)

In [46]:
acts.hidden_states[-1]

tensor([[[ 0.0203,  0.0350,  0.0881,  ..., -0.1152,  0.0047, -0.0551],
         [-0.0346, -0.1090,  0.0404,  ..., -0.0560, -0.0389,  0.0104],
         [-0.0358, -0.0795,  0.1193,  ..., -0.0601, -0.1193,  0.0900],
         ...,
         [-0.0528,  0.0296,  0.0976,  ...,  0.0215, -0.0267, -0.1271],
         [ 0.0712, -0.1675, -0.6567,  ...,  0.0786, -0.0880,  0.1441],
         [-0.0380,  0.0742,  0.0933,  ..., -0.0585, -0.0465, -0.0675]]],
       grad_fn=<NativeLayerNormBackward0>)

In [165]:
s = "Le camembert est <mask> :)"
ids = torch.tensor(tokenizer.encode(s,add_special_tokens=True)).unsqueeze(0)
acts_mask = model(ids)[0]
acts_mask = acts_mask[0,-3,:]
prob = acts_mask.softmax(dim=0)
max_prob =max(prob)
ids[0,-3]

tensor(32004)

tensor([2.3250e-08, 9.4328e-09, 5.1482e-07,  ..., 4.9397e-10, 1.0783e-08,
        2.3439e-09], grad_fn=<SoftmaxBackward0>)

In [72]:
acts_mask.shape

torch.Size([768])

In [166]:
s1 = "Le camembert est délicieux :)"
ids = torch.tensor(tokenizer.encode(s1,add_special_tokens=True)).unsqueeze(0)
acts1_mask = model(ids)[-0]
acts1_mask = acts1_mask[0,-3,:]
word_index=ids[0,-3].item()

print(prob[word_index])
print(prob[word_index]/max_prob)



tensor(0.4909, grad_fn=<SelectBackward0>)
tensor(1., grad_fn=<DivBackward0>)


In [168]:
s2 = "Le camembert est mouton :)"
ids = torch.tensor(tokenizer.encode(s2,add_special_tokens=True)).unsqueeze(0)
acts2_mask = model(ids)[0]
acts2_mask = acts2_mask[0,-3,:]
word_index=ids[0,-3].item()

print(prob[word_index])
print(prob[word_index]/max_prob)


tensor(9.7666e-07, grad_fn=<SelectBackward0>)
tensor(1.9895e-06, grad_fn=<DivBackward0>)


In [118]:
print(cos(acts_mask,acts1_mask))

tensor(0.0904, grad_fn=<DivBackward0>)


In [119]:
cos(acts_mask,acts2_mask)

tensor(0.0539, grad_fn=<DivBackward0>)

IndexError: too many indices for tensor of dimension 1

In [169]:
validation_n400_file = open("../data/validation1.n400.txt")
cong = []
incong = []
for line in validation_n400_file:
    print(line)
    mots = line.split()
    mots_incong = mots[0:(len(mots)-1)]
    print(mots_incong)
    incong.append(mots_incong)
    mots_cong = mots[0:(len(mots)-2)]+ [mots[-1] ]
    cong.append(mots_cong)
    print(mots_cong)

Pour accrocher le cadre, Nicholas avait besoin d’un marteau et d’un navire clou 

['Pour', 'accrocher', 'le', 'cadre,', 'Nicholas', 'avait', 'besoin', 'd’un', 'marteau', 'et', 'd’un', 'navire']
['Pour', 'accrocher', 'le', 'cadre,', 'Nicholas', 'avait', 'besoin', 'd’un', 'marteau', 'et', 'd’un', 'clou']
Dans la douche, il se lave la peau avec du teste savon 

['Dans', 'la', 'douche,', 'il', 'se', 'lave', 'la', 'peau', 'avec', 'du', 'teste']
['Dans', 'la', 'douche,', 'il', 'se', 'lave', 'la', 'peau', 'avec', 'du', 'savon']
Elle est allée à la boulangerie pour acheter une miche de veste pain 

['Elle', 'est', 'allée', 'à', 'la', 'boulangerie', 'pour', 'acheter', 'une', 'miche', 'de', 'veste']
['Elle', 'est', 'allée', 'à', 'la', 'boulangerie', 'pour', 'acheter', 'une', 'miche', 'de', 'pain']
Katie a mis le bouquet de fleurs dans un poisson vase 

['Katie', 'a', 'mis', 'le', 'bouquet', 'de', 'fleurs', 'dans', 'un', 'poisson']
['Katie', 'a', 'mis', 'le', 'bouquet', 'de', 'fleurs', 'dans', 'u

In [64]:
cos = torch.nn.CosineSimilarity(dim=0)