In [6]:
# Install the packages this notebook relies on; `-q` keeps pip's output short.
# NOTE(review): no versions are pinned here, so a later re-run may resolve to
# different releases than the ones that produced the recorded outputs.
%pip install -q pandas numpy biopython matplotlib seaborn tqdm goatools plotly umap-learn scikit-learn torch transformers


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m39.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.8/15.8 MB[0m [31m134.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m175.3/175.3 kB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [7]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Bioinformatics
from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from google.colab import files
# Upload the FASTA file through the Colab upload helper.
uploaded = files.upload()

# An empty `uploaded` mapping would make the `[0]` lookup below fail with an
# opaque IndexError, so stop here with an actionable message instead.
if not uploaded:
    raise ValueError("No file was uploaded; re-run this cell and select a FASTA file.")

# Only the first uploaded file is parsed.
filename = list(uploaded.keys())[0]
print("Uploaded:", filename)


def extract_entry_id(header):
    """Return the entry ID for a FASTA record identifier.

    Args:
        header: The record identifier string (``record.id``).

    Returns:
        The second ``|``-separated field when the identifier contains pipes
        (e.g. ``sp|A0A0C5B5G6|MOTSC_HUMAN`` -> ``A0A0C5B5G6``); otherwise the
        whitespace-stripped identifier unchanged (e.g. ``A0A024QZJ9``).
    """
    header = header.strip()
    if "|" in header:
        return header.split("|")[1]
    return header


# Parallel lists: entry_ids[i] is the ID of sequences[i].
entry_ids = []
sequences = []

for record in SeqIO.parse(filename, "fasta"):
    entry_ids.append(extract_entry_id(record.id))
    sequences.append(str(record.seq))

# Displayed as the cell result: record count and a peek at the first IDs.
len(entry_ids), entry_ids[:5]


Saving testsuperset.fasta to testsuperset.fasta
Uploaded: testsuperset.fasta


(224309, ['A0A0C5B5G6', 'A0A1B0GTW7', 'A0JNW5', 'A0JP26', 'A0PK11'])

In [8]:
from transformers import AutoTokenizer, AutoModel
import torch

# Name of the pretrained checkpoint handed to `from_pretrained` below.
MODEL_NAME = "facebook/esm2_t33_650M_UR50D"

# Use the GPU when PyTorch reports one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Tokenizer matching the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, do_lower_case=False)

# Load the weights, place them on the selected device, and switch to eval mode.
model = AutoModel.from_pretrained(MODEL_NAME).to(device)
model.eval()

print("Loaded ESM model:", MODEL_NAME, "on", device)


Some weights of EsmModel were not initialized from the model checkpoint at facebook/esm2_t33_650M_UR50D and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loaded ESM model: facebook/esm2_t33_650M_UR50D on cuda


In [None]:
import numpy as np
from tqdm import tqdm

def embed_batch(seqs, tokenizer, model, device):
    """Embed a batch of sequences by mask-weighted mean pooling.

    The sequences are tokenized together (padded, truncated at 4096 tokens),
    run through ``model`` without gradient tracking, and the model's
    ``last_hidden_state`` is averaged along dim 1 using the attention mask as
    weights, so positions with mask 0 do not contribute.

    Args:
        seqs: Sequences to embed, passed to ``tokenizer`` as-is.
        tokenizer: Callable returning a mapping of tensors that includes
            ``"attention_mask"``.
        model: Callable accepting the tokenizer's tensors as keyword
            arguments and returning an object with ``last_hidden_state``.
        device: Device the input tensors are moved to before the forward pass.

    Returns:
        A NumPy array holding one pooled vector per input sequence.
    """
    encoded = tokenizer(
        seqs,
        max_length=4096,
        truncation=True,
        padding=True,
        return_tensors="pt",
    )

    # Move every input tensor onto the target device.
    inputs = {}
    for name, tensor in encoded.items():
        inputs[name] = tensor.to(device)

    with torch.no_grad():
        hidden = model(**inputs).last_hidden_state

    # Add a trailing axis so the mask broadcasts over the feature dimension.
    weights = inputs["attention_mask"].unsqueeze(-1)
    totals = torch.sum(hidden * weights, dim=1)
    n_positions = torch.sum(weights, dim=1)
    pooled = totals / n_positions

    return pooled.cpu().numpy()
# Number of sequences sent through the model per forward pass.
BATCH_SIZE = 16

# Output files; the same names are used for saving, reporting and download.
EMBEDDINGS_PATH = "esm2_embeddings.npy"
ENTRY_IDS_PATH = "esm2_entry_ids.npy"

# Collect one array per batch (as returned by embed_batch) and stack once at
# the end. A separate name keeps `all_embeddings` from being a list first and
# an ndarray later.
batch_embeddings = []

for i in tqdm(range(0, len(sequences), BATCH_SIZE)):
    batch = sequences[i : i + BATCH_SIZE]
    emb = embed_batch(batch, tokenizer, model, device)
    batch_embeddings.append(emb)

all_embeddings = np.vstack(batch_embeddings)
print("Final embeddings shape:", all_embeddings.shape)

# Row i of the matrix must belong to entry_ids[i]; catch a stale or
# mismatched `entry_ids` before writing files that silently disagree.
assert all_embeddings.shape[0] == len(entry_ids), (
    f"{all_embeddings.shape[0]} embeddings for {len(entry_ids)} entry IDs"
)

np.save(EMBEDDINGS_PATH, all_embeddings)
np.save(ENTRY_IDS_PATH, np.array(entry_ids))

# Report the files that were actually written (the previous message named
# esm2_test.npy / esm2_test_ids.npy, which this cell never creates).
print(f"Saved {EMBEDDINGS_PATH} and {ENTRY_IDS_PATH}")
from google.colab import files

files.download(EMBEDDINGS_PATH)
files.download(ENTRY_IDS_PATH)

 74%|███████▍  | 10407/14020 [6:07:20<1:18:55,  1.31s/it]