In [None]:
import torch

from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
# from transformers import TrainerCallback
from transformers import EsmModel

In [None]:
# Checkpoint identifier passed to both from_pretrained() calls below
model_name = "facebook/esm2_t6_8M_UR50D"

# Instantiate the bare ESM encoder and the tokenizer from the same checkpoint
model = EsmModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Put the module into evaluation mode; as the cell's final expression,
# the returned module is also echoed as the cell output
model.eval()

In [None]:
# Example protein sequences used as a two-item batch
sequences = [
    "MEEPQSDPSV",
    "GVPINVSCTG",
]

# Tokenize the batch, requesting padding and PyTorch tensors ("pt")
inputs = tokenizer(sequences, padding=True, return_tensors="pt")

In [None]:
# Use CUDA when torch reports it as available; otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Relocate the model, then rebuild the inputs as a plain dict whose
# values have each been moved to the same device
model = model.to(device)
inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

In [None]:
# Run the model on the prepared inputs with gradient tracking disabled,
# and keep the last hidden state as the embeddings
with torch.no_grad():
    outputs = model(**inputs)
    embeddings = outputs.last_hidden_state

# Report the embedding tensor's shape (batch_size, seq_len, hidden_dim)
print("Embedding shape:", embeddings.shape)