In [None]:
import torch
import numpy as np
import torch.nn.functional as f
# Run on the first CUDA GPU when PyTorch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
import matplotlib.pyplot as plt


In [None]:

from transformers import BertTokenizer, BertModel

# Load the tokenizer and the matching cased BERT-large weights. The model is
# asked to also return the hidden states of every layer.
tokenizer = BertTokenizer.from_pretrained("bert-large-cased")
model = BertModel.from_pretrained("bert-large-cased", output_hidden_states=True).eval()
# Inference only: eval() above switches the module to evaluation mode; now move
# it to the selected device (the returned module is shown as the cell output).
model.to(device)

In [None]:
# Four short sentences: two about a dog, two about oranges.
texts = [
    "the dog is good",
    "a good dog",
    "Oranges are my favorite fruit",
    "my favorite fruits are oranges",
]

# Tokenize the whole batch at once. padding=True pads every text to the
# longest one so all rows have the same length; return_tensors="pt" yields
# PyTorch tensors instead of Python lists.
encodings = tokenizer(texts, padding=True, return_tensors="pt")

In [None]:
# The inputs must live on the same device as the model.
encodings = encodings.to(device)

# Forward pass without gradient tracking; only element 0 of the model output
# is kept (the per-token embeddings pooled over dim=1 in the next cell).
with torch.no_grad():
    embeds = model(**encodings)[0]

In [None]:
# Build one vector per sentence by max-pooling the token embeddings over the
# sequence axis (dim=1). The batch was padded to a common length, so padded
# positions are first pushed to the lowest representable value: otherwise the
# activations of [PAD] tokens could win the max for the shorter sentences.
pad_mask = encodings["attention_mask"].unsqueeze(-1).bool()
MAXS, _ = embeds.masked_fill(~pad_mask, torch.finfo(embeds.dtype).min).max(dim=1)

# L2-normalize the pooled sentence vectors so that a plain dot product between
# two of them equals their cosine similarity.
normalized = f.normalize(MAXS, p=2, dim=1)

# Pairwise cosine similarity, converted to cosine distance (1 - similarity).
cls_dist = 1 - normalized.matmul(normalized.T)

# Move to host memory as a NumPy array for display / plotting.
cls_dist = cls_dist.cpu().numpy()

In [None]:
# Show the pairwise cosine-distance matrix (NumPy array) computed in the previous cell.
cls_dist

In [None]:
# Heat-map of the pairwise cosine distances. Axes are indexed by sentence
# position in the batch; the colorbar makes the distance scale readable.
fig, ax = plt.subplots()
im = ax.imshow(cls_dist)
ax.set_xticks(range(cls_dist.shape[1]))
ax.set_yticks(range(cls_dist.shape[0]))
ax.set_xlabel("sentence index")
ax.set_ylabel("sentence index")
ax.set_title("Pairwise cosine distance between sentence embeddings")
fig.colorbar(im, ax=ax, label="cosine distance");
  

#### T5 Encoding

In [4]:
import torch
from transformers import T5Tokenizer, T5EncoderModel, AutoTokenizer, AutoModelForSeq2SeqLM

In [5]:

# Supported T5 checkpoints. Each value is [loader family, hidden size]:
#   - loader family selects the classes used in get_encoded_text
#     ('t5'   -> T5EncoderModel / T5Tokenizer,
#      'auto' -> AutoModelForSeq2SeqLM / AutoTokenizer);
#   - hidden size is the width of the encoder output (512 / 768 / 1024 for the
#     small / base / large variants).
T5_CONFIGS = {
    't5-small': ['t5', 512],
    't5-base': ['t5', 768],
    't5-large': ['t5', 1024],
    'google/t5-v1_1-small': ['auto', 512],
    'google/t5-v1_1-base': ['auto', 768],
    'google/t5-v1_1-large': ['auto', 1024],
}

# Maximum number of tokens per text; the tokenizer truncates longer inputs.
MAX_LENGTH = 256

In [6]:
# Quick check of the lookup used by get_encoded_text: a dict supports `in`
# directly on its keys.
't5-small' not in T5_CONFIGS

False

In [10]:
def get_encoded_text(texts, model_name='t5-small'):
    """Encode a batch of texts with the encoder of a pretrained T5 checkpoint.

    The model and tokenizer are loaded from ``model_name`` on every call, moved
    to the GPU when one is available, and run in eval mode without gradient
    tracking.

    Args:
        texts: A list of strings. A single string is accepted and treated as a
            batch of one.
        model_name: A key of ``T5_CONFIGS`` selecting the checkpoint to load.

    Returns:
        A tuple ``(encoded_text, attn_mask)``:
        ``encoded_text`` is the encoder's last hidden state for the padded
        batch, and ``attn_mask`` is the tokenizer's attention mask converted to
        bool. Both live on the device the model was placed on.

    Raises:
        KeyError: If ``model_name`` is not a key of ``T5_CONFIGS``.
        ValueError: If the config entry names an unknown loader family.
    """
    # A bare string would otherwise not be treated as one text of a batch.
    if isinstance(texts, str):
        texts = [texts]

    # Fail with an explicit message instead of printing and then hitting an
    # uninformative KeyError on the lookup below.
    if model_name not in T5_CONFIGS:
        raise KeyError(
            f'model name {model_name!r} is not found in config; '
            f'expected one of {sorted(T5_CONFIGS)}'
        )

    source = T5_CONFIGS[model_name][0]
    if source == 't5':
        t5_class, tokenizer_class = T5EncoderModel, T5Tokenizer
    elif source == 'auto':
        t5_class, tokenizer_class = AutoModelForSeq2SeqLM, AutoTokenizer
    else:
        raise ValueError(f'unknown source {source}')

    t5 = t5_class.from_pretrained(model_name)
    tokenizer = tokenizer_class.from_pretrained(model_name)

    if torch.cuda.is_available():
        t5 = t5.cuda()

    # Use whatever device the weights ended up on for the inputs as well.
    device = next(t5.parameters()).device

    # Pad to the longest text in the batch and truncate at MAX_LENGTH tokens.
    encoded = tokenizer(
        texts,
        return_tensors="pt",
        padding='longest',
        max_length=MAX_LENGTH,
        truncation=True,
    )
    input_ids = encoded.input_ids.to(device)
    attn_mask = encoded.attention_mask.to(device)

    t5.eval()

    # The 'auto' family loads a full encoder-decoder model; only its encoder
    # stack is needed here, so call that directly rather than running the
    # decoder on a dummy input. The 't5' family is already encoder-only.
    encoder = t5.get_encoder() if source == 'auto' else t5

    with torch.no_grad():
        output = encoder(input_ids=input_ids, attention_mask=attn_mask)
        encoded_text = output.last_hidden_state

    return encoded_text, attn_mask.bool()
    

In [11]:
# Single-sentence batch used to smoke-test the T5 encoder helper.
texts = [
    "I love rock and roll",
]

In [12]:
# Encode the sample batch with the smallest checkpoint; the returned
# (hidden states, boolean attention mask) tuple is shown as the cell output.
get_encoded_text(
    texts,
    model_name='t5-small',
)

Some weights of the model checkpoint at t5-small were not used when initializing T5EncoderModel: ['decoder.block.2.layer.1.EncDecAttention.k.weight', 'decoder.block.0.layer.0.layer_norm.weight', 'decoder.block.2.layer.2.DenseReluDense.wo.weight', 'decoder.block.0.layer.0.SelfAttention.o.weight', 'decoder.block.2.layer.1.layer_norm.weight', 'decoder.block.1.layer.0.SelfAttention.v.weight', 'decoder.block.0.layer.2.DenseReluDense.wo.weight', 'decoder.block.5.layer.0.SelfAttention.v.weight', 'decoder.block.1.layer.2.layer_norm.weight', 'decoder.block.3.layer.0.layer_norm.weight', 'decoder.block.3.layer.1.EncDecAttention.k.weight', 'decoder.block.4.layer.1.EncDecAttention.o.weight', 'decoder.block.5.layer.1.EncDecAttention.k.weight', 'decoder.block.5.layer.1.layer_norm.weight', 'decoder.block.4.layer.2.DenseReluDense.wo.weight', 'decoder.block.5.layer.0.SelfAttention.q.weight', 'decoder.block.3.layer.2.DenseReluDense.wo.weight', 'decoder.block.0.layer.2.DenseReluDense.wi.weight', 'decoder.

(tensor([[[ 0.1478, -0.1649,  0.0895,  ...,  0.0626, -0.1512, -0.0810],
          [-0.1527,  0.1281, -0.1035,  ..., -0.2610, -0.0079, -0.0893],
          [ 0.0857, -0.2130, -0.0017,  ...,  0.2185,  0.1569, -0.1649],
          [ 0.1391, -0.0811, -0.0594,  ...,  0.3500,  0.0728, -0.0599],
          [-0.2597,  0.0151, -0.1621,  ...,  0.1388,  0.2020, -0.2338],
          [ 0.0348,  0.0081,  0.0171,  ..., -0.0599,  0.0984,  0.0388]]],
        device='cuda:0'),
 tensor([[True, True, True, True, True, True]], device='cuda:0'))

In [16]:
import T5

In [17]:
# Call the `test` entry point of the local `T5` module imported in the previous cell.
# NOTE(review): the module is not visible in this notebook; judging by the warning
# printed below it loads `t5-small` through T5EncoderModel — confirm against T5.py.
T5.test()

Some weights of the model checkpoint at t5-small were not used when initializing T5EncoderModel: ['decoder.block.2.layer.1.EncDecAttention.k.weight', 'decoder.block.0.layer.0.layer_norm.weight', 'decoder.block.2.layer.2.DenseReluDense.wo.weight', 'decoder.block.0.layer.0.SelfAttention.o.weight', 'decoder.block.2.layer.1.layer_norm.weight', 'decoder.block.1.layer.0.SelfAttention.v.weight', 'decoder.block.0.layer.2.DenseReluDense.wo.weight', 'decoder.block.5.layer.0.SelfAttention.v.weight', 'decoder.block.1.layer.2.layer_norm.weight', 'decoder.block.3.layer.0.layer_norm.weight', 'decoder.block.3.layer.1.EncDecAttention.k.weight', 'decoder.block.4.layer.1.EncDecAttention.o.weight', 'decoder.block.5.layer.1.EncDecAttention.k.weight', 'decoder.block.5.layer.1.layer_norm.weight', 'decoder.block.4.layer.2.DenseReluDense.wo.weight', 'decoder.block.5.layer.0.SelfAttention.q.weight', 'decoder.block.3.layer.2.DenseReluDense.wo.weight', 'decoder.block.0.layer.2.DenseReluDense.wi.weight', 'decoder.