# Test ESM models from hugging face

## Imports

In [1]:
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer

### Define the model name for the ESM model you wish to use

In [2]:
# model_name = "facebook/esm2_t48_15B_UR50D" # 48 layers, 15B parameters, 67 GB
# model_name = "facebook/esm2_t36_3B_UR50D" # 36 layers, 3B parameters, 18 GB
# model_name = "facebook/esm2_t33_650M_UR50D" # 33 layers, 650M parameters, 2.5 GB
# model_name = "facebook/esm2_t30_150M_UR50D" # 30 layers, 150M parameters
# model_name = "facebook/esm2_t12_35M_UR50D" # 12 layers, 35M parameters
model_name = "facebook/esm2_t6_8M_UR50D" # 6 layers, 8M parameters

## Loading Pretrained Model and Tokenizer from Hugging Face's Transformers Library

In [3]:
# Load the pretrained model and tokenizer from Hugging Face's transformers library
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForMaskedLM.from_pretrained(model_name, output_hidden_states=True)

tokenizer_config.json:   0%|          | 0.00/95.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/93.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/775 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/31.4M [00:00<?, ?B/s]

In [5]:
# Get the model's layers
layers = model.config.num_hidden_layers

# Print the layers
print("Number of layers:", layers)

Number of layers: 6


In [6]:
# Get information about each layer
for i, layer_module in enumerate(model.base_model.encoder.layer):
    print(f"Layer {i + 1}:")
    print("Number of parameters:", sum(p.numel() for p in layer_module.parameters() if p.requires_grad))
    print("Layer type:", layer_module.__class__.__name__)
    print()


Layer 1:
Number of parameters: 1232960
Layer type: EsmLayer

Layer 2:
Number of parameters: 1232960
Layer type: EsmLayer

Layer 3:
Number of parameters: 1232960
Layer type: EsmLayer

Layer 4:
Number of parameters: 1232960
Layer type: EsmLayer

Layer 5:
Number of parameters: 1232960
Layer type: EsmLayer

Layer 6:
Number of parameters: 1232960
Layer type: EsmLayer



## Input Data

In [7]:
# Define your protein sequences here. Replace '...' with your actual protein sequences.
# For demonstration, these are placeholders and should be replaced with real sequences.
sequences = [
    "VMHLTPEEKSAVTALWGKVNVDEVGGEALGRLLVVYPWTQRFFESFGDLSTPDAVMGNPKVKAHGKKVLGAFSDGLAHLDNLKGTFATLSELHCDKLHVDPENFRLLGNVLVCVLAHHFGKEFTPPVQAAYQKVVAGVANALAHKYH",
    "MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLSPDDIEQWFTEDPGPDEAPRMPEAAPPVAPAPAAPTPAAPAPAPSWPLSSSVPSQKTYQGSYGFRLGFLHSGTAKSVTCTYSPALNKMFCQLAKTCPVQLWVDSTPPPGTRVRAMAIYKQSQHMTEVVRRCPHGRSCSDG",
    "MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLTYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGGVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITLGMDELYK"
]

## Tokenize input and run model

In [8]:
# Tokenize the protein sequences. The tokenizer converts sequences into a format the model can understand.
inputs = tokenizer(sequences, return_tensors="pt", padding=True)

# We do not need to calculate gradients (useful for inference), hence torch.no_grad()
with torch.no_grad():
    # Pass the tokenized sequences through the model.
    outputs = model(**inputs)

## Output

In [10]:
print("==== Model Output ====")
# output shape : (batch_size, sequence_length, num_labels)
print(outputs.logits.shape)
print("==== Model Output ====")

# Check if 'hidden_states' is part of the output
if 'hidden_states' in outputs:
    # The hidden states are typically a tuple with each element being the states of a layer
    # The last element ([-1]) of this tuple will give you the last layer's hidden states, often used as the embeddings
    embeddings = outputs['hidden_states'][-1]
    print("Embeddings shape:", embeddings.shape)
else:
    # If there are no hidden_states, this model might not have been configured to output them
    # You might need to check your model configuration or use 'logits' for different purposes
    print("No hidden states available in the model output.")

==== Model Output ====
torch.Size([3, 240, 33])
==== Model Output ====
Embeddings shape: torch.Size([3, 240, 320])


In [11]:
print(dir(outputs))

['__annotations__', '__class__', '__class_getitem__', '__contains__', '__dataclass_fields__', '__dataclass_params__', '__delattr__', '__delitem__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__ior__', '__iter__', '__le__', '__len__', '__lt__', '__match_args__', '__module__', '__ne__', '__new__', '__or__', '__post_init__', '__reduce__', '__reduce_ex__', '__repr__', '__reversed__', '__ror__', '__setattr__', '__setitem__', '__sizeof__', '__str__', '__subclasshook__', 'attentions', 'clear', 'copy', 'fromkeys', 'get', 'hidden_states', 'items', 'keys', 'logits', 'loss', 'move_to_end', 'pop', 'popitem', 'setdefault', 'to_tuple', 'update', 'values']


In [65]:
print(inputs.input_ids.shape)
print(outputs.hidden_states[-1].shape)

torch.Size([3, 240])
torch.Size([3, 240, 1280])


In [66]:
# Get the output before the last 2 layers
output_before_last_2_layers = outputs.hidden_states[-1][:, :-2, :]
print(output_before_last_2_layers.shape)

torch.Size([3, 238, 1280])


In [30]:
print(outputs.values)

<built-in method values of MaskedLMOutput object at 0x000002A7192BF940>


# ESM model for masked language modeling

In [12]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForMaskedLM

In [13]:
# model_name = "facebook/esm2_t48_15B_UR50D" # 48 layers, 15B parameters, 67 GB
# model_name = "facebook/esm2_t36_3B_UR50D" # 36 layers, 3B parameters, 18 GB
# model_name = "facebook/esm2_t33_650M_UR50D" # 33 layers, 650M parameters, 2.5 GB
# model_name = "facebook/esm2_t30_150M_UR50D" # 30 layers, 150M parameters
# model_name = "facebook/esm2_t12_35M_UR50D" # 12 layers, 35M parameters
model_name = "facebook/esm2_t6_8M_UR50D" # 6 layers, 8M parameters

In [14]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForMaskedLM.from_pretrained(model_name)

In [15]:
# Define your protein sequences here. Replace '...' with your actual protein sequences.
# For demonstration, these are placeholders and should be replaced with real sequences.
sequences = [
    "VMHLTPEEKSAVTALWGKVNVDEVGGEALGRLLVVYPWTQRFFESFGDLSTPDAVMGNPKVKAHGKKVLGAFSDGLAHLDNLKGTFATLSELHCDKLHVDPENFRLLGNVLVCVLAHHFGKEFTPPVQAAYQKVVAGVANALAHKYH",
    "MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLSPDDIEQWFTEDPGPDEAPRMPEAAPPVAPAPAAPTPAAPAPAPSWPLSSSVPSQKTYQGSYGFRLGFLHSGTAKSVTCTYSPALNKMFCQLAKTCPVQLWVDSTPPPGTRVRAMAIYKQSQHMTEVVRRCPHGRSCSDG",
    "MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLTYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGGVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITLGMDELYK"
]

In [16]:
# Tokenize the protein sequences. The tokenizer converts sequences into a format the model can understand.
inputs = tokenizer(sequences, return_tensors="pt", padding=True)

# We do not need to calculate gradients (useful for inference), hence torch.no_grad()
with torch.no_grad():
    # Pass the tokenized sequences through the model.
    outputs = model(**inputs)

In [17]:
print("==== Model Output ====")
# output shape : (batch_size, sequence_length, num_labels)
print(outputs.logits.shape)
print("==== Model Output ====")

==== Model Output ====
torch.Size([3, 240, 33])
==== Model Output ====


In [41]:
import torch

# Encoder hyperparamerters
model_name = "facebookresearch/esm"

# select the model
# model_type = "esm2_t48_15B_UR50D" # 48 layers, 15B parameters, 67 GB
# model_type = "esm2_t36_3B_UR50D" # 36 layers, 3B parameters, 18 GB
# model_type = "esm2_t33_650M_UR50D" # 33 layers, 650M parameters, 2.5 GB
# model_type = "esm2_t30_150M_UR50D" # 30 layers, 150M parameters
# model_type = "esm2_t12_35M_UR50D" # 12 layers, 35M parameters
model_type = "esm2_t6_8M_UR50D" # 6 layers, 8M parameters

In [42]:
encoder, alphabet = torch.hub.load(model_name, model_type)
batch_converter = alphabet.get_batch_converter()

Using cache found in C:\Users\angad/.cache\torch\hub\facebookresearch_esm_main
Downloading: "https://dl.fbaipublicfiles.com/fair-esm/models/esm2_t6_8M_UR50D.pt" to C:\Users\angad/.cache\torch\hub\checkpoints\esm2_t6_8M_UR50D.pt
Downloading: "https://dl.fbaipublicfiles.com/fair-esm/regression/esm2_t6_8M_UR50D-contact-regression.pt" to C:\Users\angad/.cache\torch\hub\checkpoints\esm2_t6_8M_UR50D-contact-regression.pt


In [43]:
print(alphabet)

# 1. List all attributes and methods using dir()
all_attrs_methods = dir(alphabet)
print("All attributes and methods:", all_attrs_methods)

<esm.data.Alphabet object at 0x0000020162177E10>
All attributes and methods: ['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_tokenize', 'all_special_tokens', 'all_toks', 'append_eos', 'append_toks', 'cls_idx', 'encode', 'eos_idx', 'from_architecture', 'get_batch_converter', 'get_idx', 'get_tok', 'mask_idx', 'padding_idx', 'prepend_bos', 'prepend_toks', 'standard_toks', 'to_dict', 'tok_to_idx', 'tokenize', 'unique_no_split_tokens', 'unk_idx', 'use_msa']


In [52]:
print(dir(encoder))
print(encoder.embed_dim)
print(encoder.alphabet_size)

['T_destination', '__annotations__', '__call__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattr__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_apply', '_backward_hooks', '_backward_pre_hooks', '_buffers', '_call_impl', '_compiled_call_impl', '_forward_hooks', '_forward_hooks_always_called', '_forward_hooks_with_kwargs', '_forward_pre_hooks', '_forward_pre_hooks_with_kwargs', '_get_backward_hooks', '_get_backward_pre_hooks', '_get_name', '_init_submodules', '_is_full_backward_hook', '_load_from_state_dict', '_load_state_dict_post_hooks', '_load_state_dict_pre_hooks', '_maybe_warn_non_full_backward_hook', '_modules', '_named_members', '_non_persistent_buffers_set', '_parameters', '_register_loa

In [6]:
len(alphabet.all_toks)

33

In [44]:
labels, seqs, inputs = batch_converter([
    ("abc", "KRKRTRFTPEQLEILEAIFKQNPYPSREEREELAKELGLSEKQVKVWFQNRRAKERK"),
    ("bc", "KRKRTRFTPEQLEILEAIFKQNPYPSREEREELAKELGLSEKQVKVWFQNRRAKERK"),
])

In [46]:
esm_out = encoder(inputs, repr_layers=[6], return_contacts=True)
emb = esm_out["representations"][6]         # emb shape : [2, 74, 1280]

In [47]:
print(f"shape of inputs: {inputs.shape}:")
print(f"shape of encoder output: {emb.shape}")

shape of inputs: torch.Size([2, 59]):
shape of encoder output: torch.Size([2, 59, 320])
