In [1]:
#!pip install transformers
!pip install -U adapter-transformers
!pip install torchdata
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
from random import shuffle
import torch.nn.functional as F
import numpy as np
import pandas as pd
import spacy
import torch
import torch.nn.functional as F
from tqdm import tqdm
from transformers import AutoConfig, AutoModelForMaskedLM, AutoTokenizer
from transformers.adapters.configuration import AdapterConfig
import transformers.adapters.composition as ac



Seet the hooks for the different modules, should work with or without adapters

In [3]:
# Below I state the naming convention
layer_dict = {"layer_input_":"input into each layer, output of the previous layer (redundancy)", 
              "attention_output_": "output of the attention layer, before the upward and downward project using FFN", 
              "ffn_output_": "The feedforward downward projection before layer normalization", 
              "layer_output_": "this is the layer output which is the same as the hidden state", 
              "adapter_input_": "adapter layer input, this should be the output of ffn_output_ after layer normalization ",
              "adapter_output_": "adapter layer output" 
              }
for k, v in layer_dict.items():
  print(k,':', v)

layer_input_ : input into each layer, output of the previous layer (redundancy)
attention_output_ : output of the attention layer, before the upward and downward project using FFN
ffn_output_ : The feedforward downward projection before layer normalization
layer_output_ : this is the layer output which is the same as the hidden state
adapter_input_ : adapter layer input, this should be the output of ffn_output_ after layer normalization 
adapter_output_ : adapter layer output


In [4]:
def set_hooks_roberta(model, lang=None):
    """
    Only works on Roberta from HF
    model: model object
    lang: is the name of the adapter
    """
    final_layer = model.config.num_hidden_layers

    for attr in ["activations_"]:
        if not hasattr(model, attr):
            setattr(model, attr, {})

    def get_activation(name):
        def hook(module, input, output):
            if "input_embedding" in name or "layer_input" in name:
              num_tokens = list(input[0].size())[1]  # (batch, sequence, hidden_state)
              model.activations_[name] = input[0].detach() #input[0].detach() #[:, num_tokens - 1].detach()
            elif "attention_output" in name:
              num_tokens = list(input[0].size())[1]  # (batch, sequence, hidden_state)
              model.activations_[name] = input[0]
            elif "ffn_output" in name or "layer_output" in name or "ffn_output_norm" in name:
              num_tokens = list(output[0].size())[1]  # (batch, sequence, hidden_state)
              model.activations_[name] = output[0].unsqueeze(dim=0)
            elif "adapter_input" in name:
              num_tokens = list(input[0].size())[1]  # (batch, sequence, hidden_state)
              model.activations_[name] = input[0]
            elif "adapter_output" in name:
              num_tokens = list(output[0].size())[1]  # (batch, sequence, hidden_state)
              model.activations_[name] = output[0].unsqueeze(dim=0)
            elif "adapter_output2" in name:
              num_tokens = list(output[0].size())[1]  # (batch, sequence, hidden_state)
              model.activations_[name] = output[0].unsqueeze(dim=0)

        return hook
    
    model.roberta.encoder.layer[0].attention.register_forward_hook(get_activation("input_embedding"))
    for i in range(final_layer):
      model.roberta.encoder.layer[i].attention.register_forward_hook(get_activation("layer_input_" + str(i))) # get the input to each attention module
      model.roberta.encoder.layer[i].intermediate.register_forward_hook(get_activation("attention_output_" + str(i))) # get the attention module output     
      model.roberta.encoder.layer[i].output.dense.register_forward_hook(get_activation("ffn_output_" + str(i))) # get output of the encoder FFN module before Layer normalization
      model.roberta.encoder.layer[i].output.LayerNorm.register_forward_hook(get_activation("ffn_output_norm_" + str(i))) # we need this input
      model.roberta.encoder.layer[i].output.register_forward_hook(get_activation("layer_output_" + str(i))) # redunancy because it is the hidden state also
      if lang!=None:
        # if we have a language adapter, get the adapter input and output
        model.roberta.encoder.layer[i].output.adapters[lang].adapter_down.register_forward_hook(get_activation("adapter_input_" + str(i)))
        model.roberta.encoder.layer[i].output.adapters[lang].adapter_up.register_forward_hook(get_activation("adapter_output_" + str(i)))
        #model.get_adapter(lang)[i]['output_adapter'].register_forward_hook(get_activation("adapter_output2"))
        #model.get_adapter(lang)[i]['output_adapter'].register_forward_hook(get_activation("adapter_input2"))
        #model.get_adapter(lang)[i]['output_adapter'].adapter_up.register_forward_hook(get_activation("adapter_output3"))
    

In [5]:
def get_hidden_states(model, tokenizer, tokens, TOP_K=1):
    """
    get the hidden states for all the tokens and the unknown tokens that were masked from all the modules set in the hook above
    input
      model: model 
      tokenizer: tokenizer
      tokens: input sentence already tokenized

    returns
      sent_to_hidden_states: intermediate hidden states gotten from hook
      unknown_hiddens: hidden states for only the masked tokens gotten from hook (subset of sent_to_hidden_states)
      hidden_states: the original encoder hidden states (each layer hidden states)

    """
    HIDDEN_SIZE = model.config.hidden_size
    masked_position = (tokens['input_ids'].squeeze() == tokenizer.mask_token_id).nonzero()
    masked_position = masked_position.squeeze()
    #print("Masked positions = ", masked_position)
    layer_residual_preds = []
    intermed_residual_preds = []
    output = model(**tokens, output_hidden_states=True)
    hidden_states = output['hidden_states']
    #print(hidden[-1])
    sent_to_hidden_states = {}
    unknow_hiddens = {}
    for layer in model.activations_.keys():
      if 'input_embedding' in layer or "layer_input" in layer or "attention_output" in layer or "ffn_output" in layer or "layer_output" in layer or "adapter_input" in layer or "adapter_output" in layer or "ffn_output_norm" in layer:
        sent_to_hidden_states[layer] = model.activations_[layer]
        unknow_hiddens[layer] = model.activations_[layer][:, masked_position]
    return sent_to_hidden_states, unknow_hiddens, hidden_states

Load the model and language adapter

In [6]:
def loadmodel(name="xlm-roberta-base", lang="en"):
  tokenizer = AutoTokenizer.from_pretrained(name) 
  config = AutoConfig.from_pretrained(name)
  model = AutoModelForMaskedLM.from_pretrained(name, config=config) 
  lang_adapter_config = AdapterConfig.load("pfeiffer", reduction_factor=8)
  model.load_adapter(lang, config=lang_adapter_config, model_name = 'xlm-roberta-base')
  model.set_active_adapters(ac.Stack(lang))
  return model, tokenizer

Test with a sample input sentence in English

In [7]:
modelname = "xlm-roberta-base"
lang = "en"
model, tokenizer = loadmodel(name=modelname, lang=lang)
set_hooks_roberta(model,lang=lang)
sampleinput =  "Tom has fully recovered from the illness."
sentence = "Tom has fully ___ from the illness."
sentence = sentence.replace("___","<mask>")

# key = (sentence, idx)
tokens = tokenizer(sentence, return_tensors="pt")
tokens_to_sent = tokenizer.tokenize(sentence)
sent_to_hidden_states, unknow_hiddens, encoderhidden = get_hidden_states(model,tokenizer, tokens, TOP_K=30)

## The issue I am talking about is below, using the first encoder layer as an example, below I print out the hidden state from the first encoder layer, the output of the transformer hidden state (which should be the input to the adapter module), and the adapter module output,

In [8]:
# the output of the first encoder layer , encoderhidden[0] is the embedding layer
print(encoderhidden[1])

tensor([[[-0.0481, -0.0368, -0.0395,  ..., -0.0945, -0.0946, -0.2492],
         [-0.2660, -0.2258,  0.4876,  ...,  0.2622,  0.0763,  1.1179],
         [-0.2593,  0.1817,  0.2864,  ...,  1.0401, -0.8326,  0.0481],
         ...,
         [ 0.2902,  0.1266,  0.1916,  ...,  0.4514,  0.4580, -0.2023],
         [ 0.1633,  0.1977,  0.2310,  ..., -0.3881,  0.0034,  0.1870],
         [ 0.0400,  0.3386,  0.2164,  ...,  0.5326, -0.1517,  0.3164]]],
       grad_fn=<NativeLayerNormBackward0>)


In [9]:
# the output of the transformer layer, this is suppose to be the input into the language adapter
print(sent_to_hidden_states['ffn_output_norm_0']) 

tensor([[[-0.0481, -0.0368, -0.0395,  ..., -0.0945, -0.0946, -0.2492],
         [-0.2660, -0.2258,  0.4876,  ...,  0.2622,  0.0763,  1.1179],
         [-0.2593,  0.1817,  0.2864,  ...,  1.0401, -0.8326,  0.0481],
         ...,
         [ 0.2902,  0.1266,  0.1916,  ...,  0.4514,  0.4580, -0.2023],
         [ 0.1633,  0.1977,  0.2310,  ..., -0.3881,  0.0034,  0.1870],
         [ 0.0400,  0.3386,  0.2164,  ...,  0.5326, -0.1517,  0.3164]]],
       grad_fn=<UnsqueezeBackward0>)


In [10]:
print("The adapter output")
print(sent_to_hidden_states['adapter_output_0'])

The adapter output
tensor([[[ 2.9568e-01, -2.0675e-01,  1.4939e-01,  ..., -1.6568e-02,
           1.5798e-01,  6.3218e-02],
         [-1.1226e-02, -2.7732e-02,  1.1889e-03,  ...,  2.3751e-03,
           1.1180e-02,  1.5625e-03],
         [ 6.1715e-02,  1.1988e-02, -1.4756e-01,  ...,  1.2963e-01,
          -5.8567e-02, -1.4035e-02],
         ...,
         [ 4.5658e-03,  8.2757e-02, -4.9524e-02,  ...,  4.1237e-02,
          -8.3173e-03, -1.3748e-04],
         [ 4.4017e-02,  6.8796e-02, -3.6980e-02,  ..., -1.0728e-01,
           6.8577e-02,  5.9167e-02],
         [-3.5689e-03,  5.2253e-02, -1.4614e-01,  ...,  5.5502e-02,
          -4.6489e-02,  5.3878e-02]]], grad_fn=<UnsqueezeBackward0>)


Take logit over the output hidden state, to the top-k output

In [None]:
logits = model.lm_head(unknow_hiddens['layer_output_11'])
probs = F.softmax(logits)
probs_ = []
TOP_K = 10
for index, prob in enumerate(probs.squeeze()):
      probs_.append((index, prob))
top_k = sorted(probs_, key=lambda x: x[1], reverse=True)[:TOP_K]
print(top_k)
top_k = [(t[1].item(), t[0]) for t in top_k]
print([tokenizer.decode(y).strip() for x, y in top_k])

  probs = F.softmax(logits)
