In [1]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, AutoModel, AutoTokenizer
from torch.nn import Module
from pprint import pprint
import copy
import torch

##st_tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
#st_model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")



class inspect_func(Module):
    def __call__(self, *args, **kwargs):
        pprint(args)
        pprint(kwargs)
        print(kwargs["encoder_hidden_states"].shape)


In [2]:
#Sentences we want sentence embeddings for
sentences = ['This framework generates embeddings for each input sentence',
             'Sentences are passed as a list of string.',
             'The quick brown fox jumps over the lazy dog.']

#Load AutoModel from huggingface model repository
st_tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/msmarco-bert-base-dot-v5") #model_args={"embedding_size":512})
st_model = AutoModel.from_pretrained("sentence-transformers/msmarco-bert-base-dot-v5")#embedding_size=512)

#Tokenize sentences
encoded_input = st_tokenizer(sentences, padding=True, truncation=True, max_length=128, return_tensors='pt')

#Compute token embeddings
with torch.no_grad():
    model_output = st_model(**encoded_input)

print(model_output)

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[ 0.3171, -0.0024,  0.6244,  ...,  0.3232, -0.1514, -0.2152],
         [ 0.1937,  0.0800,  0.5210,  ...,  0.5329,  0.1537,  0.0716],
         [ 0.9578,  0.6989,  0.2488,  ...,  0.3102,  0.1141, -0.0484],
         ...,
         [-0.2485,  0.2961,  0.3823,  ...,  0.3827,  0.2004, -0.4977],
         [ 0.3643, -0.0698,  0.0736,  ...,  0.5198,  0.1275, -0.3185],
         [ 0.1206,  0.1841,  0.2435,  ...,  0.3209,  0.1520, -0.1109]],

        [[ 0.1973,  0.0898,  0.0544,  ...,  0.5683, -0.3024,  0.1424],
         [ 0.6744,  0.4258, -0.0365,  ...,  0.5554, -0.0285, -0.0414],
         [-0.0486, -0.1381,  0.0093,  ...,  0.6076,  0.0589,  0.1301],
         ...,
         [ 0.0136,  0.1186, -0.0105,  ...,  0.5485, -0.1306,  0.0441],
         [ 0.2659,  0.0137,  0.0820,  ...,  0.6097, -0.2816,  0.1907],
         [ 0.1904,  0.0573,  0.0702,  ...,  0.6156, -0.3170,  0.2227]],

        [[-0.4076,  0.0368,  0.1900,  ...,  0.0402,  

In [3]:
print(model_output["pooler_output"].shape)

torch.Size([3, 768])


In [27]:
t5_tokenizer = T5Tokenizer.from_pretrained("t5-base")
from transformers.modeling_outputs import BaseModelOutputWithPastAndCrossAttentions
from torch.nn.utils.rnn import pad_sequence


def concat_dicts(dict_1, dict_2):
    for k in dict_1:
        dict_1[k] = torch.cat((dict_1[k], dict_2[k]),dim=1)
    return dict_1



class STOnlyEncoder(Module):
    def __init__(self, encoder, tokenizer, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.st_encoder = encoder
        self.tokenizer = tokenizer
        self.extra_token_id = self.tokenizer.encode("<extra_id_0>")[1]



    def forward(self, input_ids, **kwargs):


        batches = []

        for b in input_ids:
            text = self.tokenizer.decode(b, ignore_special_tokens=True)[5:-5]
            text = text.strip(" ")
            sents = text.split("<extra_id_0>")
            sents_tokenized = self.tokenizer.batch_encode_plus(sents, return_tensors="pt", padding=True)
            
            encoded = self.st_encoder.forward(**sents_tokenized)
            pooler_output = encoded["pooler_output"]
            print(sents)
            print(pooler_output)
            print(pooler_output.shape)
            batches.append(pooler_output)
        
        embeddings = pad_sequence(batches, batch_first=True)
        print(embeddings)
        print(embeddings.size())
        output = BaseModelOutputWithPastAndCrossAttentions(embeddings, )
        pprint(kwargs)
 #       encoded = self.st_encoder.forward(input_ids, **kwargs)
#        pooler_output = encoded["pooler_output"]

        return output
        

class CustomEncoder(Module):
    def __init__(self, encoder, *args, **kwargs):    
        super().__init__(*args, **kwargs)
        self.t5_encoder = encoder

    def forward(self, input_ids, **kwargs):
        print("kwargs")
        print(kwargs)

        input_ids = input_ids[:input_ids.shape[0],:input_ids.shape[1]//2]
        kwargs["attention_mask"] = kwargs["attention_mask"][:input_ids.shape[0], :input_ids.shape[1]]
        encoder_1_output = self.t5_encoder.forward(input_ids, **kwargs)
        encoder_2_output = self.t5_encoder.forward(input_ids, **kwargs)
        print("input ids shape",input_ids.shape)
        print("attention mask shape", kwargs["attention_mask"].shape)
        print(encoder_1_output["last_hidden_state"].shape)

        encoder_output = concat_dicts(encoder_1_output, encoder_2_output)
        print(dir(encoder_1_output))
        #print(encoder_output)
        print(encoder_output["last_hidden_state"].shape)

        print(encoder_output)

        pprint(kwargs)
        return(encoder_output)


t5_model = T5ForConditionalGeneration.from_pretrained("t5-base")
import re


def replace_newlines(text):
    text = re.sub("[\n\W]*\n[\n\W]*", "<extra_id_0>", text)
    return text



example_dialogue = "\nhello\nhello\n\nsentence 3"




st_tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/msmarco-bert-base-dot-v5", extra_ids=1)

st_tokenizer.add_tokens(['<extra_id_0>'], special_tokens=True) ##This line is updated
st_model.resize_token_embeddings(len(tokenizer))
st_only_encoder = STOnlyEncoder(st_model,st_tokenizer)

t5_model.encoder = st_only_encoder

#t5_model.encoder = CustomEncoder(t5_model.encoder)



st_tokens = st_tokenizer.encode(replace_newlines(example_dialogue), return_tensors="pt")

    
print(st_tokens)
t5_model.generate(st_tokens, attention_mask=torch.Tensor([[1]*4]))



tensor([[  101, 30522,  7592, 30522,  7592, 30522,  6251,  1017,   102]])
['', ' hello ', ' hello ', ' sentence 3']
tensor([[ 0.3442, -0.1094,  0.0149,  ..., -0.0259,  0.1215,  0.0161],
        [ 0.3086, -0.1528, -0.0263,  ...,  0.0507,  0.0554, -0.0689],
        [ 0.3086, -0.1528, -0.0263,  ...,  0.0507,  0.0554, -0.0689],
        [ 0.4801, -0.0923,  0.2168,  ...,  0.0564,  0.0845, -0.2599]])
torch.Size([4, 768])
tensor([[[ 0.3442, -0.1094,  0.0149,  ..., -0.0259,  0.1215,  0.0161],
         [ 0.3086, -0.1528, -0.0263,  ...,  0.0507,  0.0554, -0.0689],
         [ 0.3086, -0.1528, -0.0263,  ...,  0.0507,  0.0554, -0.0689],
         [ 0.4801, -0.0923,  0.2168,  ...,  0.0564,  0.0845, -0.2599]]])
torch.Size([1, 4, 768])
{'attention_mask': tensor([[1., 1., 1., 1.]]),
 'output_attentions': False,
 'output_hidden_states': False,
 'return_dict': True}


tensor([[0, 3, 2, 3, 2, 3, 2, 3, 7, 3, 9, 3, 9, 3, 9, 3, 9, 3, 9, 3]])

In [28]:
t5_tokenizer.decode(torch.Tensor([0, 3, 2, 3, 2, 3, 2, 3, 7, 3, 9, 3, 9, 3, 9, 3, 9, 3, 9, 3]))

'<pad> <unk> <unk> <unk> s a a a a a'

In [12]:
words = "here \n are \n some \n sentences"
st_tokens = st_tokenizer.encode(replace_newlines(words), return_tensors="pt")

t5_model.generate(st_tokens)

['here ', ' are ', ' some ', ' sentences']
tensor([[ 0.3270,  0.2870, -0.1443,  ...,  0.0487,  0.0666, -0.0639],
        [ 0.1175,  0.0292, -0.1062,  ...,  0.0180,  0.1735,  0.0575],
        [ 0.1774, -0.0870, -0.2180,  ...,  0.0631,  0.2605,  0.1801],
        [ 0.1744, -0.1681,  0.0668,  ...,  0.0326,  0.0542, -0.1686]])
torch.Size([4, 768])


ValueError: Make sure that `model_kwargs` include `encoder_outputs` of type `ModelOutput`.

In [None]:
tokens = tokenizer.encode("Translate to German: This is an unreasonably long sentence", return_tensors="pt")
print(tokens)
tokens = torch.cat((tokens, tokens), dim=1)
generated = model.generate(tokens)
tokenizer.decode(generated[0], skip_special_tokens=True)

tensor([[30355,    15,    12,  2968,    10,   100,    19,    46,    73,   864,
           739,  3834,   307,  7142,     1]])
kwargs
{'return_dict': True, 'output_attentions': False, 'output_hidden_states': False, 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1]])}
input ids shape torch.Size([1, 15])
attention mask shape torch.Size([1, 15])
torch.Size([1, 15, 768])
['__annotations__', '__class__', '__contains__', '__dataclass_fields__', '__dataclass_params__', '__delattr__', '__delitem__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__post_init__', '__reduce__', '__reduce_ex__', '__repr__', '__reversed__', '__setattr__', '__setitem__', '__sizeof__', '__str__', '__subclasshook__', 'attentions', 'clear', 'copy', 'cross_atte

'<extra_id_-99>: This is an unreasonably long sentence.'

In [None]:
tokens

tensor([[30355,    15,    12,  2968,    10,   100,    19,    46,    73,   864,
           739,  3834,   307,  7142,     1, 30355,    15,    12,  2968,    10,
           100,    19,    46,    73,   864,   739,  3834,   307,  7142,     1]])

In [None]:
model.forward(tokens)

kwargs
{'attention_mask': None, 'inputs_embeds': None, 'head_mask': None, 'output_attentions': None, 'output_hidden_states': None, 'return_dict': True}


TypeError: 'NoneType' object is not subscriptable

In [None]:
t5_generated = t5_model.generate(tokens)
tokenizer.decode(t5_generated[0], skip_special_tokens=True)

In [None]:
generated = model.generate(tokens)
tokenizer.decode(generated[0], skip_special_tokens=True)

In [None]:
t5_model.forward(tokens)