# Encoder Tests

In [1]:
from Encoders import *
from transformers import ElectraTokenizer, ElectraModel, DebertaV2Tokenizer, DebertaV2Model

  from tqdm.autonotebook import tqdm, trange


## Test 1: encoding without a target dimension (`target_dim=None`)

In [2]:
# Smoke test: wrap every available encoder in a TextEncoder with
# target_dim=None, encode one short string, and print the resulting shape.
# NOTE(review): the from_pretrained(...) calls presumably need network access
# or a warm local model cache — confirm before running on a fresh machine.
test_text = "Hello World!"

models_different_size = [
    TextEncoder(NormalEncoder((20,), mean=0, std=1), target_dim=None),
    TextEncoder(DummyEncoder((20, )), target_dim=None),
    TextEncoder(SentenceTransformerEncoder("sentence-transformers/all-MiniLM-L6-v2"), target_dim=None),
    TextEncoder(SentenceTransformerEncoder("sentence-transformers/all-MiniLM-L12-v2"), target_dim=None),
    TextEncoder(SentenceTransformerEncoder("sentence-transformers/all-mpnet-base-v2"), target_dim=None),
    TextEncoder(SentenceTransformerEncoder("sentence-transformers/paraphrase-xlm-r-multilingual-v1"), target_dim=None),
    TextEncoder(SentenceTransformerEncoder("sentence-transformers/distilbert-base-nli-stsb-mean-tokens"), target_dim=None),
    TextEncoder(SentenceTransformerEncoder("sentence-transformers/bert-base-nli-mean-tokens"), target_dim=None),
    TextEncoder(SentenceTransformerEncoder("sentence-transformers/roberta-base-nli-mean-tokens"), target_dim=None),
    TextEncoder(SentenceTransformerEncoder("sentence-transformers/xlm-r-bert-base-nli-stsb-mean-tokens"), target_dim=None),
    TextEncoder(SentenceTransformerEncoder("sentence-transformers/distiluse-base-multilingual-cased-v2"), target_dim=None),
    TextEncoder(GPT_2Encoder(), target_dim=None),
    TextEncoder(OpenAIEncoder("text-embedding-3-small"), target_dim=None),
    TextEncoder(T5Encoder(), target_dim=None),
    TextEncoder(LongformBase4096(), target_dim=None),
    TextEncoder(HiddenStateTransformer(DebertaV2Tokenizer.from_pretrained("microsoft/deberta-v3-large"), DebertaV2Model.from_pretrained("microsoft/deberta-v3-large")), target_dim=None),
    TextEncoder(HiddenStateTransformer(ElectraTokenizer.from_pretrained("google/electra-base-discriminator"), ElectraModel.from_pretrained("google/electra-base-discriminator")), target_dim=None),
]

for model in models_different_size:
    encoding = model.encode(test_text)
    # Check for None *before* reading .shape; otherwise a None result would
    # raise AttributeError on the print and the assertion would never run.
    assert encoding is not None, f"{model!r} returned None for {test_text!r}"
    print(encoding.shape)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


(20,)
(20,)
(384,)
(384,)
(768,)
(768,)
(768,)
(768,)
(768,)
(768,)
(512,)
(768,)


Input ids are automatically padded to be a multiple of `config.attention_window`: 512


(1536,)
(512,)
(768,)
(1024,)
(768,)


## Test 2: encoding to a common target dimension (`target_dim=1024`)

In [3]:
# Smoke test: wrap every available encoder in a TextEncoder that is asked for
# a common output size, encode one short string, and verify the length.
# NOTE(review): the from_pretrained(...) calls presumably need network access
# or a warm local model cache — confirm before running on a fresh machine.
test_text = "Hello World!"

# Single source of truth for the requested output size, shared by every
# constructor below and by the length assertion in the loop.
TARGET_DIM = 1024

models_same_size = [
    TextEncoder(NormalEncoder((20,), mean=0, std=1), target_dim=TARGET_DIM),
    TextEncoder(DummyEncoder((20, )), target_dim=TARGET_DIM),
    TextEncoder(SentenceTransformerEncoder("sentence-transformers/all-MiniLM-L6-v2"), target_dim=TARGET_DIM),
    TextEncoder(SentenceTransformerEncoder("sentence-transformers/all-MiniLM-L12-v2"), target_dim=TARGET_DIM),
    TextEncoder(SentenceTransformerEncoder("sentence-transformers/all-mpnet-base-v2"), target_dim=TARGET_DIM),
    TextEncoder(SentenceTransformerEncoder("sentence-transformers/paraphrase-xlm-r-multilingual-v1"), target_dim=TARGET_DIM),
    TextEncoder(SentenceTransformerEncoder("sentence-transformers/distilbert-base-nli-stsb-mean-tokens"), target_dim=TARGET_DIM),
    TextEncoder(SentenceTransformerEncoder("sentence-transformers/bert-base-nli-mean-tokens"), target_dim=TARGET_DIM),
    TextEncoder(SentenceTransformerEncoder("sentence-transformers/roberta-base-nli-mean-tokens"), target_dim=TARGET_DIM),
    TextEncoder(SentenceTransformerEncoder("sentence-transformers/xlm-r-bert-base-nli-stsb-mean-tokens"), target_dim=TARGET_DIM),
    TextEncoder(SentenceTransformerEncoder("sentence-transformers/distiluse-base-multilingual-cased-v2"), target_dim=TARGET_DIM),
    TextEncoder(GPT_2Encoder(), target_dim=TARGET_DIM),
    TextEncoder(OpenAIEncoder("text-embedding-3-small"), target_dim=TARGET_DIM),
    TextEncoder(T5Encoder(), target_dim=TARGET_DIM),
    TextEncoder(LongformBase4096(), target_dim=TARGET_DIM),
    TextEncoder(HiddenStateTransformer(DebertaV2Tokenizer.from_pretrained("microsoft/deberta-v3-large"), DebertaV2Model.from_pretrained("microsoft/deberta-v3-large")), target_dim=TARGET_DIM),
    TextEncoder(HiddenStateTransformer(ElectraTokenizer.from_pretrained("google/electra-base-discriminator"), ElectraModel.from_pretrained("google/electra-base-discriminator")), target_dim=TARGET_DIM),
]

for model in models_same_size:
    encoding = model.encode(test_text)
    # Check for None *before* reading .shape; otherwise a None result would
    # raise AttributeError on the print and the assertions would never run.
    assert encoding is not None, f"{model!r} returned None for {test_text!r}"
    assert len(encoding) == TARGET_DIM, (
        f"{model!r} produced length {len(encoding)}, expected {TARGET_DIM}"
    )
    print(encoding.shape)

(1024,)
(1024,)
(1024,)
(1024,)
(1024,)
(1024,)
(1024,)
(1024,)
(1024,)
(1024,)
(1024,)
(1024,)
(1024,)
(1024,)
(1024,)
(1024,)
(1024,)
