In [1]:
from torch.nn import Module
from transformers import EncoderDecoderModel, AutoTokenizer
model = EncoderDecoderModel.from_pretrained("patrickvonplaten/bert2bert-cnn_dailymail-fp16", pad_token_id=0)
orig_model = EncoderDecoderModel.from_pretrained("patrickvonplaten/bert2bert-cnn_dailymail-fp16", pad_token_id=0)
tokenizer = AutoTokenizer.from_pretrained("patrickvonplaten/bert2bert-cnn_dailymail-fp16", pad_token_id=0)

Some weights of the model checkpoint at patrickvonplaten/bert2bert-cnn_dailymail-fp16 were not used when initializing EncoderDecoderModel: ['decoder.bert.pooler.dense.bias', 'decoder.bert.pooler.dense.weight']
- This IS expected if you are initializing EncoderDecoderModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing EncoderDecoderModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [2]:
tokens = tokenizer.batch_encode_plus(["This is a sentence"], padding=True, return_tensors="pt")
print(tokens)

{'input_ids': tensor([[ 101, 2023, 2003, 1037, 6251,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}


In [3]:
from transformers.modeling_outputs import BaseModelOutputWithPoolingAndCrossAttentions
from transformers.modeling_utils import PreTrainedModel
from torch.nn.utils.rnn import pad_sequence
from transformers.models.bert.modeling_bert import BertPooler
import torch

class Config():
    pass

class CustomEncoder(Module):
    def __init__(self, encoder, *args, **kwargs):    
        super().__init__(*args, **kwargs)
        self.sub_encoder = encoder
        pooler_config = Config()
        pooler_config.hidden_size = 768
        self.bert_pooler = BertPooler(pooler_config)

    def forward(self, input_ids, **kwargs):

        attention_masks = kwargs.pop("attention_mask")

        print(attention_masks)

        hidden_state_outputs = []
        print(len(input_ids))
        for i in range(len(input_ids)):
            inputs = torch.unsqueeze(input_ids[i],0)
            attn_mask = torch.ones_like(inputs)
            encoder_outputs = self.sub_encoder.forward(inputs, attention_mask=attn_mask, **kwargs)
            hidden_state_outputs.append(encoder_outputs["pooler_output"])
        hidden_state_outputs = pad_sequence(hidden_state_outputs).contiguous()
       
        
        pooler_output = self.bert_pooler(hidden_state_outputs).contiguous()

        output = BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=hidden_state_outputs,
                                                              pooler_output = pooler_output)

        return output


#class CustomDecoder(Module):
#    def __init__(self, decoder, *args, **kwargs):
#        super().__init__(*args, **kwargs)
#        self.decoder = decoder

#    def forward(self, input_ids, **kwargs):
#        return self.decoder.forward(input_ids, **kwargs)[:,0]
    
#    def prepare_inputs_for_generation(self, *args, **kwargs):
#        print(args)
#        print(kwargs)
#        return self.decoder.prepare_inputs_for_generation(*args, **kwargs)




dialogue = ["this is a sentence", "what is the sentence about", "yes it is"]
tokens = tokenizer.batch_encode_plus(dialogue, padding=True, return_tensors="pt")


cust_enc = CustomEncoder(model.encoder)
#cust_dec = CustomDecoder(model.decoder)

model.encoder = cust_enc
#model.decoder = cust_dec

tokens.pop("attention_mask")
tokens["attention_mask"] = torch.Tensor([[1,1,1]])
tokens.pop("token_type_ids")
output = model.generate(**tokens)

tensor([[1., 1., 1.]])
3


In [4]:

labels = tokenizer("This is the corresponding summary", return_tensors="pt").input_ids
labels = labels.expand(3,-1)
tokens["labels"] = labels.contiguous()
print(tokens)
orig_tokens = tokenizer.batch_encode_plus(dialogue, padding=True, return_tensors="pt")


{'input_ids': tensor([[ 101, 2023, 2003, 1037, 6251,  102,    0],
        [ 101, 2054, 2003, 1996, 6251, 2055,  102],
        [ 101, 2748, 2009, 2003,  102,    0,    0]]), 'attention_mask': tensor([[1., 1., 1.]]), 'labels': tensor([[  101,  2023,  2003,  1996,  7978, 12654,   102],
        [  101,  2023,  2003,  1996,  7978, 12654,   102],
        [  101,  2023,  2003,  1996,  7978, 12654,   102]])}


In [5]:
orig_gen = orig_model.generate(**orig_tokens)
print(tokenizer.batch_decode(**orig_gen))

tensor([[1., 1., 1.]])
3




Seq2SeqLMOutput(loss=tensor(8.9051, grad_fn=<NllLossBackward>), logits=tensor([[[-10.0957,  -9.9681, -10.1470,  ..., -10.9902, -11.0085, -11.2155],
         [-10.0951,  -9.9675, -10.1465,  ..., -10.9896, -11.0080, -11.2148],
         [ -9.9053, -10.0673,  -9.8953,  ..., -10.0824,  -9.3429,  -9.6537],
         ...,
         [ -7.8221,  -7.7496,  -7.6399,  ...,  -8.5666,  -7.6333,  -8.9055],
         [ -8.3329,  -8.3815,  -8.4302,  ...,  -8.6184,  -8.4290, -10.3535],
         [ -9.7433,  -9.9469,  -9.8623,  ...,  -9.8260, -10.4160, -11.8519]],

        [[-10.0957,  -9.9681, -10.1470,  ..., -10.9902, -11.0085, -11.2155],
         [-10.0951,  -9.9675, -10.1465,  ..., -10.9896, -11.0080, -11.2148],
         [ -9.9053, -10.0673,  -9.8953,  ..., -10.0824,  -9.3429,  -9.6537],
         ...,
         [ -7.8221,  -7.7496,  -7.6399,  ...,  -8.5666,  -7.6333,  -8.9055],
         [ -8.3329,  -8.3815,  -8.4302,  ...,  -8.6184,  -8.4290, -10.3535],
         [ -9.7433,  -9.9469,  -9.8623,  ...,  -9.82

In [6]:
tokenizer.batch_decode(output)

['[CLS] the incident took place in the town of kew, near the capital of south africa. the town is a popular destination for the local population. the city has a number of famous actors and actresses. the actor is now a celebrity in the uk. he is the latest in a series of celebrity celebrity - themed films. [SEP]',
 '[CLS] the incident took place in the town of kew, near the capital of south africa. the town is a popular destination for the local population. the city has a number of famous actors and actresses. the actor is now a celebrity in the uk. he is the latest in a series of celebrity celebrity - themed films. [SEP]',
 '[CLS] the incident took place in the town of kew, near the capital of south africa. the town is a popular destination for the local population. the city has a number of famous actors and actresses. the actor is now a celebrity in the uk. he is the latest in a series of celebrity celebrity - themed films. [SEP]']

In [7]:
asdasdsad

NameError: name 'asdasdsad' is not defined

In [None]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('bert-base-nli-mean-tokens')

#Our sentences we like to encode
sentences = ['This framework generates embeddings for each input sentence',
    'Sentences are passed as a list of string.',
    'The quick brown fox jumps over the lazy dog.']

#Sentences are encoded by calling model.encode()
embeddings = model.encode(sentences)


Downloading: 100%|██████████| 391/391 [00:00<00:00, 95.5kB/s]
Downloading: 100%|██████████| 190/190 [00:00<00:00, 105kB/s]
Downloading: 100%|██████████| 3.95k/3.95k [00:00<00:00, 1.37MB/s]
Downloading: 100%|██████████| 2.00/2.00 [00:00<00:00, 885B/s]
Downloading: 100%|██████████| 625/625 [00:00<00:00, 221kB/s]
Downloading: 100%|██████████| 122/122 [00:00<00:00, 34.0kB/s]
Downloading: 100%|██████████| 229/229 [00:00<00:00, 70.3kB/s]
Downloading: 100%|██████████| 438M/438M [01:37<00:00, 4.49MB/s] 
Downloading: 100%|██████████| 53.0/53.0 [00:00<00:00, 25.8kB/s]
Downloading: 100%|██████████| 112/112 [00:00<00:00, 55.3kB/s]
Downloading: 100%|██████████| 466k/466k [00:00<00:00, 986kB/s] 
Downloading: 100%|██████████| 399/399 [00:00<00:00, 231kB/s]
Downloading: 100%|██████████| 232k/232k [00:00<00:00, 542kB/s] 


In [None]:
model.modules

<bound method Module.modules of SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)>