In [None]:
# Install the fair-esm package (source of the `esm` module imported further down).
# `%pip` is used instead of `!pip` so the install targets the environment of the running kernel.
%pip install -q fair-esm

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/93.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m93.1/93.1 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Install Biopython (source of the `Bio` imports used by the PDB-parsing cells).
# `%pip` is used instead of `!pip` so the install targets the environment of the running kernel.
%pip install Biopython

Collecting Biopython
  Downloading biopython-1.81-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m32.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Biopython
Successfully installed Biopython-1.81


In [None]:
import pathlib
import torch

from esm import FastaBatchedDataset, pretrained



def extract_embeddings(model_name, fasta_file, output_dir, tokens_per_batch=4096, seq_length=1022, repr_layers=None):
    """Compute mean per-sequence ESM representations and save one ``.pt`` file per FASTA record.

    Args:
        model_name: Name (or path) handed to ``pretrained.load_model_and_alphabet``.
        fasta_file: FASTA file read through ``FastaBatchedDataset.from_file``.
        output_dir: Directory that receives the ``<entry_id>.pt`` files; created
            (including parents) if missing. May be a ``str`` or a ``pathlib.Path``.
        tokens_per_batch: Token budget passed to ``dataset.get_batch_indices``.
        seq_length: Passed to the alphabet's batch converter and used as the upper
            bound on how many residues are averaged per sequence.
        repr_layers: Layer indices requested from the model. ``None`` (default)
            means ``[33]``.

    Returns:
        None. Each saved file holds a dict with keys ``"entry_id"`` and
        ``"mean_representations"`` (``{layer: tensor}``).

    Notes:
        The entry id is the first whitespace-delimited token of the FASTA label,
        so records sharing that token are written to the same file and the later
        one overwrites the earlier one.
    """
    # A None sentinel replaces the former mutable list default; the effective default is unchanged.
    if repr_layers is None:
        repr_layers = [33]

    # Accept plain strings as well as Path objects.
    output_dir = pathlib.Path(output_dir)

    model, alphabet = pretrained.load_model_and_alphabet(model_name)
    model.eval()

    # Query CUDA availability once instead of on every batch.
    use_cuda = torch.cuda.is_available()
    if use_cuda:
        model = model.cuda()

    dataset = FastaBatchedDataset.from_file(fasta_file)
    batches = dataset.get_batch_indices(tokens_per_batch, extra_toks_per_seq=1)

    data_loader = torch.utils.data.DataLoader(
        dataset,
        collate_fn=alphabet.get_batch_converter(seq_length),
        batch_sampler=batches,
    )

    output_dir.mkdir(parents=True, exist_ok=True)

    with torch.no_grad():
        for batch_idx, (labels, strs, toks) in enumerate(data_loader):

            print(f'Processing batch {batch_idx + 1} of {len(batches)}')

            if use_cuda:
                toks = toks.to(device="cuda", non_blocking=True)

            out = model(toks, repr_layers=repr_layers, return_contacts=False)

            # Only the hidden representations are needed; the logits are never
            # used, so they are no longer copied back to the CPU.
            representations = {layer: t.to(device="cpu") for layer, t in out["representations"].items()}

            for i, label in enumerate(labels):
                entry_id = label.split()[0]

                filename = output_dir / f"{entry_id}.pt"
                truncate_len = min(seq_length, len(strs[i]))

                # Average over positions 1..truncate_len; index 0 is skipped
                # (presumably the start token prepended by the alphabet -- confirm).
                result = {"entry_id": entry_id}
                result["mean_representations"] = {
                    layer: t[i, 1 : truncate_len + 1].mean(0).clone()
                    for layer, t in representations.items()
                }

                torch.save(result, filename)


In [None]:
# Standard-library helpers used by the cells below (the duplicate `import glob` was dropped).
import glob
import os
from pathlib import Path

# Colab-only dependency: makes Google Drive available under /content/drive.
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from Bio.PDB import PDBParser
from Bio.PDB.Polypeptide import PPBuilder
from Bio import SeqIO

In [None]:
"""
p = PDBParser(QUIET=True)
ppb = PPBuilder()
seqrecords = []
"""

'\np = PDBParser(QUIET=True)\nppb = PPBuilder()\nseqrecords = []\n'

In [None]:
"""
pdbdir = Path('/content/drive/MyDrive/ConfDiff_Test/pdbs')
for folder in os.listdir(pdbdir):
  #Gets path of each cluster folder
  folderpath = Path(str(pdbdir) + f'/{folder}')
  #print(folderpath)
  if not os.path.isdir(str(folderpath)):
    continue
  #Gets path of first protein in cluster
  for pdbfile in folderpath.glob('*.pdb'):
    try:
      proteinpath = Path(str(pdbfile))
      #print(proteinpath)
      #print(os.path.isfile(str(proteinpath)))
      #Gets structure & FASTA sequence of each protein in folder
      pstructure = p.get_structure(folder, str(proteinpath))
      #print(list(pstructure.get_residues()))
      for pp in ppb.build_peptides(pstructure):
        pp
      pseq = pp.get_sequence()
      pseqr = SeqIO.SeqRecord(pseq, id=folder, name='Cluster_fasta_sequence')
      seqrecords.append(pseqr)
      print(f'{folder} completed')
      break
    except:
      continue
  """


"\npdbdir = Path('/content/drive/MyDrive/ConfDiff_Test/pdbs')\nfor folder in os.listdir(pdbdir):\n  #Gets path of each cluster folder\n  folderpath = Path(str(pdbdir) + f'/{folder}')\n  #print(folderpath)\n  if not os.path.isdir(str(folderpath)):\n    continue\n  #Gets path of first protein in cluster\n  for pdbfile in folderpath.glob('*.pdb'):\n    try:\n      proteinpath = Path(str(pdbfile))\n      #print(proteinpath)\n      #print(os.path.isfile(str(proteinpath)))\n      #Gets structure & FASTA sequence of each protein in folder\n      pstructure = p.get_structure(folder, str(proteinpath))\n      #print(list(pstructure.get_residues()))\n      for pp in ppb.build_peptides(pstructure):\n        pp\n      pseq = pp.get_sequence()\n      pseqr = SeqIO.SeqRecord(pseq, id=folder, name='Cluster_fasta_sequence')\n      seqrecords.append(pseqr)\n      print(f'{folder} completed')\n      break\n    except:\n      continue\n  "

In [None]:
# Drive folder whose `*.fasta` files are concatenated into all.fasta by the `cat` cell.
fastadir = Path(
    '/content/drive/MyDrive/ConfDiff/codnas_data/cluster_seqs'
)

In [None]:
# ESM checkpoint to load (could also be: esm1v_t33_650M_UR90S_1).
model_name = 'esm1v_t33_650M_UR90S_5'

# Input FASTA with unique headers. It is written by the header de-duplication
# cell further down in this notebook, so that cell has to run before the
# embeddings are extracted.
file_name = Path(
    '/content/drive/MyDrive/ConfDiff/codnas_data/all.unique.fasta'
)

# Destination folder for the per-sequence `.pt` embedding files.
output_dir = Path(
    '/content/drive/MyDrive/ConfDiff/New_ESM1_Embeddings'
)

In [None]:
# Run the extraction with the configured checkpoint, input FASTA and output folder.
# The input FASTA must already exist on Drive, otherwise this raises FileNotFoundError.
extract_embeddings(
    model_name=model_name,
    fasta_file=file_name,
    output_dir=output_dir,
)

Downloading: "https://dl.fbaipublicfiles.com/fair-esm/models/esm1v_t33_650M_UR90S_5.pt" to /root/.cache/torch/hub/checkpoints/esm1v_t33_650M_UR90S_5.pt


FileNotFoundError: ignored

In [None]:
# Concatenate every `*.fasta` file in cluster_seqs into a single all.fasta (overwriting it).
# NOTE(review): `cat` joins files verbatim, so an input without a trailing newline would
# fuse its last line with the next file's header -- confirm the inputs end with a newline.
!cat /content/drive/MyDrive/ConfDiff/codnas_data/cluster_seqs/*.fasta > /content/drive/MyDrive/ConfDiff/codnas_data/all.fasta

In [None]:
# Replace every space in all.fasta with an underscore, editing the file in place.
# Presumably done so the whole header survives as the entry id, since
# extract_embeddings keeps only the first whitespace-delimited token of each label.
!sed -i 's/ /_/g' /content/drive/MyDrive/ConfDiff/codnas_data/all.fasta

In [None]:
# Sanity check: print the first lines of the merged FASTA file.
!head /content/drive/MyDrive/ConfDiff/codnas_data/all.fasta

In [None]:
# Write all.unique.fasta: a copy of all.fasta in which every header line gets a
# running, zero-based record index appended ("_<n>") so that no two headers collide.
# Headers are recognised by their leading '>' rather than by line position, so
# sequences wrapped over several lines, blank lines, or a truncated final record
# no longer corrupt the output or raise StopIteration.
all_seqs = Path("/content/drive/MyDrive/ConfDiff/codnas_data/all.fasta")
unique_seqs = all_seqs.with_suffix('.unique.fasta')

with all_seqs.open() as old, unique_seqs.open('w') as new:
    record_idx = 0
    for line in old:
        line = line.rstrip()
        if not line:
            # Skip blank lines instead of copying them through.
            continue
        if line.startswith('>'):
            print(f"{line}_{record_idx}", file=new)
            record_idx += 1
        else:
            # Sequence line: copied as-is (without adding an extra blank line).
            print(line, file=new)

In [None]:
# Show the current working directory, i.e. where the relative
# ./all_embeddings.tar.gz path used by the tar cell resolves.
!pwd

In [None]:
# Pack the embeddings folder into a gzip-compressed tarball in the working directory.
# `f` must be the last flag in the cluster because it takes the archive name as its
# argument; with `-cfz` tar would write an archive literally named "z".
!tar -czf ./all_embeddings.tar.gz /content/drive/MyDrive/ConfDiff/New_ESM1_Embeddings