In [None]:
# delete this cell if working on Pycharm
!pip install Bio
!pip install torch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from Bio.PDB import *
import numpy as np
import os
from tqdm import tqdm

In [None]:
# Tunable model constants (change here).
# Number of rows every per-residue matrix is padded to.
NB_MAX_LENGTH = 140
# One-hot column of each residue letter: the 20 amino acids, then "X" (unknown residue)
# and "-" (padding). The column index is the letter's position in this alphabet string.
AA_DICT = {letter: column for column, letter in enumerate("ACDEFGHIKLMNPQRSTWYVX-")}
FEATURE_NUM = len(AA_DICT)
# Atoms stored per residue; each contributes an (x, y, z) triple to a label row.
BACKBONE_ATOMS = ["N", "CA", "C", "O", "CB"]
OUTPUT_SIZE = 3 * len(BACKBONE_ATOMS)
# Chain identifier used for the nanobody chain.
NB_CHAIN_ID = "H"


In [None]:
def check_valid_protein_sequence(seq):
    """
    Checks that every character of seq is one of the 20 standard amino-acid letters
    (case-insensitive).
    :param seq: protein sequence (String)
    :return: "Valid protein sequence!" if all characters are valid (also for an empty sequence),
             otherwise "Invalid amino acid: <char>" naming the first offending character
    """
    standard_letters = frozenset("ACDEFGHIKLMNPQRSTVWY")
    nothing_found = object()

    # Lazily scan for the first character outside the standard alphabet.
    offenders = (letter for letter in seq if letter.upper() not in standard_letters)
    first_bad = next(offenders, nothing_found)

    if first_bad is nothing_found:
        return "Valid protein sequence!"
    return f"Invalid amino acid: {first_bad}"

In [None]:
def get_seq_aa(pdb_file, chain_id): #Stays the same
    """
    returns the sequence (String) and a list of all the aa residue objects of the given protein chain.
    Only residues that Bio.PDB recognises as amino acids and that have a CA atom are kept, so the
    sequence and the residue list always have the same length and order.
    Unknown ("UNK") and non-standard residues are written as "X" in the sequence.
    :param pdb_file: path to a pdb file
    :param chain_id: chain letter (char)
    :return: sequence, [aa objects]
    """
    # Polypeptide.three_to_one is deprecated (and missing from newer Biopython releases) and it
    # raises KeyError for modified residues; seq1 is the maintained equivalent.
    from Bio.SeqUtils import seq1

    standard_letters = set("ACDEFGHIKLMNPQRSTVWY")

    # load model
    chain = PDBParser(QUIET=True).get_structure(pdb_file, pdb_file)[0][chain_id]

    aa_residues = []
    letters = []

    for residue in chain.get_residues():
        resname = residue.get_resname()
        if not is_aa(resname) or not residue.has_id('CA'):  # Not an amino acid / no CA atom
            continue
        letter = seq1(resname)
        # anything outside the 20 standard letters (UNK, modified residues, ...) becomes "X"
        letters.append(letter if letter in standard_letters else "X")
        aa_residues.append(residue)

    # Return the residue objects (not CA atoms): generate_label indexes them by atom name.
    return "".join(letters), aa_residues

In [None]:
from IPython.core.display import struct
def get_seq_aa_by_id(pdb_id, chain_id):
    """
    Downloads a PDB entry and returns the sequence and the CA atoms of one of its chains.
    The downloaded file is stored in the current directory and the parsed structure is also
    written to 'test.pdb'.
    Only residues that Bio.PDB recognises as amino acids and that have a CA atom are kept, so the
    sequence and the atom list always have the same length and order.
    Unknown ("UNK") and non-standard residues are written as "X" in the sequence.
    :param pdb_id: PDB identifier (String, case-insensitive)
    :param chain_id: chain letter (char)
    :return: sequence, [CA atom objects]
    """
    from Bio.PDB.PDBIO import PDBIO
    # Polypeptide.three_to_one is deprecated (and missing from newer Biopython releases) and it
    # raises KeyError for modified residues; seq1 is the maintained equivalent.
    from Bio.SeqUtils import seq1

    standard_letters = set("ACDEFGHIKLMNPQRSTVWY")

    pdb_file = PDBList().retrieve_pdb_file(pdb_id.upper(), pdir=".", file_format="pdb")
    structure = PDBParser(QUIET=True).get_structure(pdb_id, pdb_file)
    chain = structure[0][chain_id]

    # set_structure() returns None, so it cannot be chained with save().
    writer = PDBIO()
    writer.set_structure(structure)
    writer.save('test.pdb')

    letters = []
    atoms = []
    for residue in chain.get_residues():
        resname = residue.get_resname()
        if not is_aa(resname) or not residue.has_id('CA'):  # Not an amino acid / no CA atom
            continue
        letter = seq1(resname)
        # anything outside the 20 standard letters (UNK, modified residues, ...) becomes "X"
        letters.append(letter if letter in standard_letters else "X")
        # collect the CA atom in the same pass so atoms[i] belongs to letters[i]
        atoms.append(residue['CA'])

    return "".join(letters), atoms

In [None]:
def generate_input_one_hot(seq):
    """
    receives an amino acid sequence and returns it as a one-hot encoding matrix (each row is an aa in the
    sequence, and each column represents a different aa out of the 20 aa + 2 special columns).
    Rows past the end of the sequence are marked in the padding ("-") column.
    :param seq: amino acid sequence (String) using the letters of AA_DICT, at most NB_MAX_LENGTH long
    :return: numpy array of shape (NB_MAX_LENGTH, FEATURE_NUM)
    :raises KeyError: if seq contains a character that is not a key of AA_DICT
    :raises IndexError: if seq is longer than NB_MAX_LENGTH
    """
    # create one-hot encoding matrix
    one_hot = np.zeros((NB_MAX_LENGTH, FEATURE_NUM))

    # fill the matrix
    for row, aa in enumerate(seq):
        one_hot[row, AA_DICT[aa]] = 1

    # pad the matrix
    one_hot[len(seq):, AA_DICT["-"]] = 1

    # return the whole matrix (the previous `one_hot[0]` returned only the first row)
    return one_hot


In [None]:
# Pretrained ESM-2 model name for each supported embedding dimension.
embedding_dim_to_model = dict([
    (1280, "esm2_t33_650M_UR50D"),
    (640, "esm2_t30_150M_UR50D"),
    (480, "esm2_t12_35M_UR50D"),
    (320, "esm2_t6_8M_UR50D"),
])
# Set of the embedding dimensions that can be requested (the keys of the table above).
possible_embedding_dims = set(embedding_dim_to_model)

# https://github.com/facebookresearch/esm
# Load ESM-2 model
import torch
def set_esm_modle(dim):
    """
    loads the pretrained ESM-2 model that produces embeddings of the given dimension.
    The model is fetched through torch.hub from the "facebookresearch/esm" repository on every call.
    :param dim: the dimension of the embedding, one of possible_embedding_dims
    :return: (alphabet, model, last_layer), where last_layer is the number of the model's final
             layer (the value to pass in repr_layers to get the final representations)
    :raises ValueError: if dim is not a supported embedding dimension
    """
    # check if dim is valid
    if dim not in possible_embedding_dims:
        raise ValueError(f"embedding dimension must be one of {possible_embedding_dims}")

    # Number of layers of each model (the "tNN" part of its name). The 640-dim model has
    # 30 layers; asking it for layer 34 yields no representation at all.
    layers_by_dim = {1280: 33, 640: 30, 480: 12, 320: 6}

    # load pretrained model, according to the embedding dimension
    model, alphabet = torch.hub.load("facebookresearch/esm", embedding_dim_to_model[dim])
    return alphabet, model, layers_by_dim[dim]

False


Using cache found in /root/.cache/torch/hub/facebookresearch_esm_main


(<esm.data.Alphabet at 0x7f67a45d3f10>,
 ESM2(
   (embed_tokens): Embedding(33, 320, padding_idx=1)
   (layers): ModuleList(
     (0-5): 6 x TransformerLayer(
       (self_attn): MultiheadAttention(
         (k_proj): Linear(in_features=320, out_features=320, bias=True)
         (v_proj): Linear(in_features=320, out_features=320, bias=True)
         (q_proj): Linear(in_features=320, out_features=320, bias=True)
         (out_proj): Linear(in_features=320, out_features=320, bias=True)
         (rot_emb): RotaryEmbedding()
       )
       (self_attn_layer_norm): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
       (fc1): Linear(in_features=320, out_features=1280, bias=True)
       (fc2): Linear(in_features=1280, out_features=320, bias=True)
       (final_layer_norm): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
     )
   )
   (contact_head): ContactPredictionHead(
     (regression): Linear(in_features=120, out_features=1, bias=True)
     (activation): Sigmoid()
   )
   (

In [None]:
def get_esm_embedding_for_protein_sequence(sequence, dim):
    """
    Receives a protein sequence (String) and returns the ESM representation of the sequence (numpy array).
    A list of sequences is accepted as well and is embedded as one batch.
    Note that set_esm_modle is called (and therefore the model is loaded) on every call.
    :param sequence: protein sequence (String) or a list of protein sequences, each at most
                     NB_MAX_LENGTH residues long
    :param dim: embedding dimension, passed on to set_esm_modle
    :return: numpy array of shape (number of sequences, NB_MAX_LENGTH, dim) - i.e. (1, NB_MAX_LENGTH, dim)
             for a single String - holding the per-residue embeddings, zero padded after the last residue
    :raises ValueError: if no sequence is given, a sequence is longer than NB_MAX_LENGTH,
                        or dim is not supported
    """
    sequences = [sequence] if isinstance(sequence, str) else list(sequence)
    if not sequences:
        raise ValueError("at least one sequence is required")
    longest = max(len(seq) for seq in sequences)
    if longest > NB_MAX_LENGTH:
        # fail before the (expensive) model is loaded and run
        raise ValueError(f"sequence of length {longest} is longer than NB_MAX_LENGTH ({NB_MAX_LENGTH})")

    alphabet, model, last_layer = set_esm_modle(dim)
    model.eval()  # disables dropout for deterministic results
    batch_converter = alphabet.get_batch_converter()

    # Prepare data (protein, seq_1), (protein, seq_2), ... - exactly one entry per sequence.
    # (Iterating over the characters of a single String here would run the model on
    # len(sequence) identical copies of it.)
    data = [("protein", seq) for seq in sequences]
    _, _, batch_tokens = batch_converter(data)
    # number of non-padding tokens of every batch entry
    batch_lens = (batch_tokens != alphabet.padding_idx).sum(1).tolist()

    # Extract per-residue embeddings (on CPU)
    with torch.no_grad():
        results = model(batch_tokens, repr_layers=[last_layer], return_contacts=False)
    token_embeddings = results["representations"][last_layer]

    # NOTE: token 0 is always a beginning-of-sequence token, so the first residue is token 1;
    # the last non-padding token is dropped as well.
    emb = np.zeros((len(sequences), NB_MAX_LENGTH, dim))
    for idx, tokens_len in enumerate(batch_lens):
        residues = token_embeddings[idx, 1:tokens_len - 1, :]
        emb[idx, :residues.shape[0], :] = residues.detach().cpu().numpy()

    return emb



In [None]:
def matrix_to_pdb(seq, coord_matrix, pdb_name):
    """
    Receives a sequence (String) and the output matrix of the neural network (coord_matrix, numpy array)
    and creates from them a PDB file named pdb_name.pdb.
    One ATOM record is written per residue and per atom of BACKBONE_ATOMS (CB is skipped for glycine),
    all in chain "H"; atom serial numbers start at 1 and residue numbers are the 0-based position in seq.
    NOTE: this notebook defines matrix_to_pdb in two cells; the later definition is the one in effect.
    :param seq: protein sequence (String), with no padding; "X" is written as residue "UNK"
    :param coord_matrix: output np array of the nanobody neural network, shape = (NB_MAX_LENGTH, OUTPUT_SIZE)
    :param pdb_name: name of the output PDB file (String), without the ".pdb" extension
    :raises KeyError: if seq contains a letter that is neither a standard amino acid nor "X"
    """
    # Local one-letter -> three-letter table: Polypeptide.one_to_three is deprecated (and missing
    # from newer Biopython releases) and has no entry for the "X" that get_seq_aa can produce.
    three_letter = {"A": "ALA", "C": "CYS", "D": "ASP", "E": "GLU", "F": "PHE", "G": "GLY", "H": "HIS",
                    "I": "ILE", "K": "LYS", "L": "LEU", "M": "MET", "N": "ASN", "P": "PRO", "Q": "GLN",
                    "R": "ARG", "S": "SER", "T": "THR", "V": "VAL", "W": "TRP", "Y": "TYR", "X": "UNK"}
    b_factor = 0.00
    serial = 1
    with open(f"{pdb_name}.pdb", "w") as pdb_file:
        for i, aa in enumerate(seq):
            res_name = three_letter[aa]
            row = coord_matrix[i]
            for j, atom in enumerate(BACKBONE_ATOMS):
                if aa == "G" and atom == "CB":  # GLY lacks CB atom
                    continue
                x, y, z = float(row[3 * j]), float(row[3 * j + 1]), float(row[3 * j + 2])
                # Fixed-width fields: serial right-aligned to column 11, atom name left-aligned in
                # 4 columns, residue number right-aligned in 4, x/y/z with 3 decimals (12/8/8 wide),
                # occupancy 1.00, B-factor, then the element symbol (first letter of the atom name).
                pdb_file.write(
                    f"ATOM{serial:>7}  {atom:<4}{res_name} H{i:>4}"
                    f"{x:>12.3f}{y:>8.3f}{z:>8.3f}  1.00{b_factor:>6.2f}           {atom[0]}\n"
                )
                serial += 1

        pdb_file.write("END\n")
    return

In [None]:
def generate_label(pdb_file):  # Stays the same
    """
    Builds the label matrix of a nanobody: the coordinates of its backbone + CB atoms.
    Row i belongs to the i-th residue returned by get_seq_aa for chain NB_CHAIN_ID, and every atom of
    BACKBONE_ATOMS owns three consecutive columns (x, y, z). Atoms a residue does not have, and rows
    past the last residue, stay zero.
    :param pdb_file: path to a pdb file (nanobody, heavy chain has id 'H') already aligned to a reference nanobody.
    :return: numpy array of shape (NB_MAX_LENGTH, OUTPUT_SIZE).
    """
    # only the residue objects are needed, the sequence itself is ignored
    _, residues = get_seq_aa(pdb_file, NB_CHAIN_ID)

    coords = np.zeros((NB_MAX_LENGTH, OUTPUT_SIZE))

    for row, res in enumerate(residues):
        for slot, atom_name in enumerate(BACKBONE_ATOMS):
            if res.has_id(atom_name):
                first_col = 3 * slot
                coords[row, first_col:first_col + 3] = res[atom_name].get_coord()

    return coords


In [None]:
def matrix_to_pdb(seq, coord_matrix, pdb_name):
    """
    Receives a sequence (String) and the output matrix of the neural network (coord_matrix, numpy array)
    and creates from them a PDB file named pdb_name.pdb.
    One ATOM record is written per residue and per atom of BACKBONE_ATOMS (CB is skipped for glycine),
    all in chain "H"; atom serial numbers start at 1 and residue numbers are the 0-based position in seq.
    NOTE: this notebook defines matrix_to_pdb in two cells; the later definition is the one in effect.
    :param seq: protein sequence (String), with no padding; "X" is written as residue "UNK"
    :param coord_matrix: output np array of the nanobody neural network, shape = (NB_MAX_LENGTH, OUTPUT_SIZE)
    :param pdb_name: name of the output PDB file (String), without the ".pdb" extension
    :raises KeyError: if seq contains a letter that is neither a standard amino acid nor "X"
    """
    # Local one-letter -> three-letter table: Polypeptide.one_to_three is deprecated (and missing
    # from newer Biopython releases) and has no entry for the "X" that get_seq_aa can produce.
    three_letter = {"A": "ALA", "C": "CYS", "D": "ASP", "E": "GLU", "F": "PHE", "G": "GLY", "H": "HIS",
                    "I": "ILE", "K": "LYS", "L": "LEU", "M": "MET", "N": "ASN", "P": "PRO", "Q": "GLN",
                    "R": "ARG", "S": "SER", "T": "THR", "V": "VAL", "W": "TRP", "Y": "TYR", "X": "UNK"}
    b_factor = 0.00
    serial = 1
    with open(f"{pdb_name}.pdb", "w") as pdb_file:
        for i, aa in enumerate(seq):
            res_name = three_letter[aa]
            row = coord_matrix[i]
            for j, atom in enumerate(BACKBONE_ATOMS):
                if aa == "G" and atom == "CB":  # GLY lacks CB atom
                    continue
                x, y, z = float(row[3 * j]), float(row[3 * j + 1]), float(row[3 * j + 2])
                # Fixed-width fields: serial right-aligned to column 11, atom name left-aligned in
                # 4 columns, residue number right-aligned in 4, x/y/z with 3 decimals (12/8/8 wide),
                # occupancy 1.00, B-factor, then the element symbol (first letter of the atom name).
                pdb_file.write(
                    f"ATOM{serial:>7}  {atom:<4}{res_name} H{i:>4}"
                    f"{x:>12.3f}{y:>8.3f}{z:>8.3f}  1.00{b_factor:>6.2f}           {atom[0]}\n"
                )
                serial += 1

        pdb_file.write("END\n")
    return