In [None]:
# Make sure you're using an A100 GPU.

### **Load Packages**

In [None]:
!pip install fair-esm # our embedding model

Collecting fair-esm
  Downloading fair_esm-2.0.0-py3-none-any.whl.metadata (37 kB)
Downloading fair_esm-2.0.0-py3-none-any.whl (93 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/93.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m93.1/93.1 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fair-esm
Successfully installed fair-esm-2.0.0


In [None]:
import pandas as pd
import numpy as np
import torch
import esm
import time

### **Load Dataset**

In [None]:
# Attach Google Drive to the Colab filesystem; the paths used below are under this mount point.
from google.colab import drive

mount_point = "/content/drive"
drive.mount(mount_point)

Mounted at /content/drive


In [None]:
# Read the cleaned dataset (parquet) into a DataFrame.
# Point `parquet_path` at the location of the file in your own Google Drive.
parquet_path = "/content/drive/MyDrive/DeepLearningProject/base_cleaned_df.parquet"
df = pd.read_parquet(parquet_path)

In [None]:
# Quick look at the first two rows to confirm the load worked.
df.head(n=2)

Unnamed: 0,reactant_set_id,drug_smiles,drug_name,protein_name,ic50,protein_sequence,ic50_numeric
0,143,Cc1nc(CN2CCN(CC2)c2c(Cl)cnc3[nH]c(nc23)-c2cn(C...,"US9447092, 3",Cytochrome P450 3A4,>50000,MALIPDLAMETWLLLAVSLVLLYLYGTHSHGLFKKLGIPGPTPLPF...,50000.0
1,145,O=C1CCCC2=C1C1(CCS(=O)(=O)C1)N=C(Nc1nc3ccccc3o...,"US9447087, 24::2-(benzo[d]oxazol-2-ylamino)-4'...",Galactokinase,6676.9,MAALRQPQVAELLAEARRAFREEFGAEPELAVSAPGRVNLIGEHTD...,6676.9


### **Run ESM2 Model in Batches**
- subset 1  | : 250,000
- subset 2  | 250,000 : 500,000
- subset 3  | 500,000 : 750,000
- subset 4  | 750,000 : 1,000,000
- subset 5  | 1,000,000 : 1,100,000
- subset 6  | 1,100,000 : 1,250,000
- subset 7  | 1,250,000 : 1,500,000
- subset 8  | 1,500,000 : 1,750,000
- subset 9  | 1,750,000 : 2,000,000
- subset 10 | 2,000,000 :

In [None]:
# Embed one subset of protein sequences with the ESM-2 `esm2_t6_8M_UR50D` checkpoint.
# For every sequence, the layer-6 token representations are mean-pooled over the
# residue positions only, and the resulting vector is appended to `embedding`
# in the same order as `sequences_subset`.
start = time.time()

model, alphabet = esm.pretrained.esm2_t6_8M_UR50D()
model = model.half()  # cast weights to fp16
model = model.cuda()
batch_converter = alphabet.get_batch_converter()
model.eval()

batch_size = 8
sequences_subset = df['protein_sequence'].str.upper()[:250000]  # adjust slice (e.g., [250000:500000]) to process a different subset
embedding = []

for i in range(0, len(sequences_subset), batch_size):
    batch = sequences_subset.iloc[i:i+batch_size]

    # The batch converter takes (label, sequence) pairs; the row index is used as the label.
    data = [(str(idx), seq) for idx, seq in batch.items()]
    batch_labels, batch_strs, batch_tokens = batch_converter(data)

    # Count the real (non-padding) tokens in each row. Sequences shorter than the
    # longest one in the batch are padded, and those padded positions must not be
    # averaged into the embedding.
    batch_lens = (batch_tokens != alphabet.padding_idx).sum(1).tolist()
    batch_tokens = batch_tokens.cuda()

    with torch.no_grad():
        out = model(batch_tokens, repr_layers=[6])
        reps = out["representations"][6]

    for j, tokens_len in enumerate(batch_lens):
        # Position 0 is the prepended start token and position tokens_len - 1 is the
        # appended end token, so the residues occupy 1 .. tokens_len - 2.
        mean_rep = reps[j, 1:tokens_len - 1].mean(0)
        embedding.append(mean_rep.cpu().numpy())

end = time.time()
print("Total time:", end - start, "seconds")

Downloading: "https://dl.fbaipublicfiles.com/fair-esm/models/esm2_t6_8M_UR50D.pt" to /root/.cache/torch/hub/checkpoints/esm2_t6_8M_UR50D.pt
Downloading: "https://dl.fbaipublicfiles.com/fair-esm/regression/esm2_t6_8M_UR50D-contact-regression.pt" to /root/.cache/torch/hub/checkpoints/esm2_t6_8M_UR50D-contact-regression.pt
Total time: 1190.689934015274 seconds


### **Save Embedding as .npy File**

In [None]:
# Stack the per-sequence embedding vectors row-wise and write them to Drive as a .npy file.
# For each subset, change the output path (and the variable name, if the previous cell used a different one).
embedding_matrix = np.vstack(embedding)
np.save("/content/drive/MyDrive/DeepLearningProject/embeddings250K.npy", embedding_matrix)