In [1]:
import torch
from transformers import XLNetTokenizer, XLNetLMHeadModel

In [2]:
# Load the pretrained XLNet LM-head model and its matching tokenizer from one checkpoint name.
MODEL_NAME = "xlnet-base-cased"
tokenizer = XLNetTokenizer.from_pretrained(MODEL_NAME)
model = XLNetLMHeadModel.from_pretrained(MODEL_NAME)
# NOTE(review): `device` is selected here, but none of the visible cells move the
# model or the input tensors onto it.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")



In [3]:
# Original sentence, and a variant in which most words are replaced by <mask>.
origStr = "I mean, when you go to a movie and it’s set to start at a certain time, would you not be upset if 7 hours later said movie has not started?"
testStr = "<mask> <mask> <mask> <mask> <mask> <mask> a <mask> and <mask> <mask> <mask> <mask> <mask> <mask> <mask> <mask> <mask> <mask> <mask> <mask> <mask> <mask> <mask> <mask> <mask> <mask> <mask> <mask> <mask> <mask>?"
encoded_str = tokenizer.encode(testStr)
tokens_tensor = torch.tensor([encoded_str])
# Keep the decoded tokens in a variable so the position chosen below can be checked by eye.
encoded_tokens = tokenizer.convert_ids_to_tokens(encoded_str)

# 0-based position in the encoded sequence whose token the model is asked to predict.
TARGET_INDEX = 8
seq_len = tokens_tensor.shape[1]

# perm_mask[b, i, j] = 1 means token i may not attend to token j,
# so this hides the target position from every token in the sequence.
perm_mask = torch.zeros((1, seq_len, seq_len), dtype=torch.float)
perm_mask[:, :, TARGET_INDEX] = 1.0
# target_mapping has shape [1, num_predictions, seq_len]; a single row means a
# single prediction, made at TARGET_INDEX.
target_mapping = torch.zeros((1, 1, seq_len), dtype=torch.float)
target_mapping[0, 0, TARGET_INDEX] = 1.0


In [7]:
# Display the module tree of the loaded model.
model

XLNetLMHeadModel(
  (transformer): XLNetModel(
    (word_embedding): Embedding(32000, 768)
    (layer): ModuleList(
      (0): XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ff): XLNetFeedForward(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (layer_1): Linear(in_features=768, out_features=3072, bias=True)
          (layer_2): Linear(in_features=3072, out_features=768, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (1): XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ff): XLNetFeedForward(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwi

In [9]:
# Inspect the input word-embedding weight matrix.
model.transformer.word_embedding.weight

Parameter containing:
tensor([[-4.8919e-03,  6.5530e-02, -1.5061e-02,  ..., -4.5812e-02,
         -6.1461e-03,  3.4621e-02],
        [ 3.8088e-02,  1.9711e-02,  2.6418e-02,  ..., -1.9814e-04,
         -3.4959e-02,  2.6332e-02],
        [ 2.7695e-02,  1.7981e-02,  1.9903e-02,  ..., -1.8557e-03,
         -3.7725e-02,  3.1554e-02],
        ...,
        [ 4.6112e-02,  1.1896e-01,  1.3977e-02,  ...,  6.2643e-02,
          3.9860e-02, -4.7146e-02],
        [ 6.4509e-02,  1.2249e-01, -2.3139e-02,  ..., -6.4272e-02,
          4.0406e-02,  2.5219e-01],
        [ 4.0150e-03, -4.8572e-02, -2.2838e-02,  ..., -2.1984e-02,
          4.1460e-02,  1.1334e-01]], requires_grad=True)

In [12]:
# Identity check: are the LM head weight and the word-embedding weight the very same
# Parameter object? If so, an in-place edit of one also changes the other.
model.lm_loss.weight is model.transformer.word_embedding.weight

True

In [13]:
# Short alias for the word-embedding Parameter; later cells modify it in place.
embeddings = model.transformer.word_embedding.weight
embeddings

Parameter containing:
tensor([[-4.8919e-03,  6.5530e-02, -1.5061e-02,  ..., -4.5812e-02,
         -6.1461e-03,  3.4621e-02],
        [ 3.8088e-02,  1.9711e-02,  2.6418e-02,  ..., -1.9814e-04,
         -3.4959e-02,  2.6332e-02],
        [ 2.7695e-02,  1.7981e-02,  1.9903e-02,  ..., -1.8557e-03,
         -3.7725e-02,  3.1554e-02],
        ...,
        [ 4.6112e-02,  1.1896e-01,  1.3977e-02,  ...,  6.2643e-02,
          3.9860e-02, -4.7146e-02],
        [ 6.4509e-02,  1.2249e-01, -2.3139e-02,  ..., -6.4272e-02,
          4.0406e-02,  2.5219e-01],
        [ 4.0150e-03, -4.8572e-02, -2.2838e-02,  ..., -2.1984e-02,
          4.1460e-02,  1.1334e-01]], requires_grad=True)

In [16]:
# Snapshot the weights as a NumPy array. `.numpy()` shares memory with the tensor,
# so the `.copy()` is what keeps this snapshot unaffected by later in-place edits.
orig_embeddings = embeddings.detach().numpy().copy()

In [18]:
# Dimensions of the embedding matrix.
embeddings.shape

torch.Size([32000, 768])

In [31]:
# The Parameter tracks gradients, which is why the in-place edits in other cells
# are wrapped in torch.no_grad().
embeddings.requires_grad

True

In [40]:
# Reset the live embedding weights to the saved snapshot (in place, no autograd tracking).
snapshot = torch.from_numpy(orig_embeddings)
with torch.no_grad():
    embeddings.copy_(snapshot)

In [27]:
# Standard deviation over every entry of the snapshot; the noise cell scales by it.
orig_std = orig_embeddings.std()
orig_std

0.05505202

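If $\mathrm{Var}[X] = v$, then $\mathrm{Var}[aX] = a^2 v$.
So multiplying standard-normal noise ($v = 1$) by a factor $a$ gives noise with standard deviation $|a|$.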

In [55]:
# Gaussian noise with the same shape as the embedding matrix, scaled to a fixed
# fraction of the original embeddings' standard deviation.
NOISE_SEED = 0      # fixed seed so the same noise is drawn on every run
NOISE_SCALE = 0.01  # noise std as a fraction of orig_std

torch.manual_seed(NOISE_SEED)
noise = torch.randn_like(embeddings)
# Scaling unit-variance samples by a factor gives them that factor as their std.
noise *= float(orig_std) * NOISE_SCALE
noise.std()

tensor(0.0006)

In [56]:
# Set the weights to snapshot + noise instead of `+=` on the live tensor, so
# re-running this cell never stacks noise on top of noise.
with torch.no_grad():
    embeddings.copy_(torch.from_numpy(orig_embeddings) + noise)

In [57]:
with torch.no_grad():
    outputs = model(tokens_tensor, perm_mask=perm_mask, target_mapping=target_mapping)
    next_token_logits = outputs[0][0, 0, :]

In [42]:
# Shape of the first model output (the logits tensor).
print(outputs[0].shape)

torch.Size([1, 1, 32000])


In [6]:
# Baseline: top-10 predicted tokens using the ORIGINAL (snapshot) embeddings.
# The weights are swapped in only for this one forward pass and then put back,
# so the result does not depend on the order in which the cells were run.
with torch.no_grad():
    current_weights = embeddings.detach().clone()
    embeddings.copy_(torch.from_numpy(orig_embeddings))
    baseline_logits = model(tokens_tensor, perm_mask=perm_mask, target_mapping=target_mapping)[0][0, 0, :]
    embeddings.copy_(current_weights)
print(tokenizer.convert_ids_to_tokens(baseline_logits.topk(10).indices.tolist()))

['d', '?', 's', 'sion', '▁Note', 'c', 'p', '▁Formation', 'ions', 'sis']


In [58]:
# Ten highest-scoring vocabulary entries in next_token_logits, decoded to token strings.
top_ids = next_token_logits.topk(10).indices.tolist()
print(tokenizer.convert_ids_to_tokens(top_ids))

['d', '?', 's', 'c', 'p', '▁Note', 'sion', '▁Formation', 'nch', 'm']
