# ProteinMPNN for nanobody sequence design conditioned on the nanobody-target antigen backbone structure (i.e., the RFdiffusion output)
Adapted from the Google Colaboratory notebook hosted at this [site](https://colab.research.google.com/github/sokrypton/ColabDesign/blob/v1.1.0/mpnn/examples/proteinmpnn_in_jax.ipynb)
---


---


In [4]:
#@title Install colabdesign
import os
import numpy as np
import jax
import jax.numpy as jnp
import matplotlib.pyplot as plt
from IPython.display import HTML
import pandas as pd
import tqdm.notebook
from google.colab import files
from google.colab import data_table

# Restore the `np.int` alias for compatibility with older libraries such as
# colabdesign. The attribute was removed in NumPy 1.24.
np.int = int

# Install ColabDesign only when it is genuinely missing. Catching ImportError
# (instead of a bare `except:`) lets any other failure raised while importing
# the package surface instead of silently triggering a reinstall.
try:
  import colabdesign
except ImportError:
  os.system("pip -q install git+https://github.com/sokrypton/ColabDesign.git@v1.1.0")
  # NOTE(review): this symlink points at one specific user's conda environment
  # and will dangle on any other machine -- confirm whether it is still needed.
  os.system("ln -s /home/marco/miniconda3/envs/ProteinMPNN/lib/python3.10/site-packages/ colabdesign")

from colabdesign.mpnn import mk_mpnn_model, clear_mem
from colabdesign.shared.protein import pdb_to_string

# Bar-format template (named for tqdm); not referenced by the cells visible here.
TQDM_BAR_FORMAT = '{l_bar}{bar}| {n_fmt}/{total_fmt} [elapsed: {elapsed} remaining: {remaining}]'
#data_table.enable_dataframe_formatter()

def get_pdb(pdb_code=""):
  """Resolve `pdb_code` to the path of a PDB file on disk.

  - `None` or an empty string: prompt for an upload via `files.upload()` and
    store the first uploaded file as "tmp.pdb".
  - a path to an existing file: returned unchanged.
  - a 4-character code: fetched from files.rcsb.org with `wget`.
  - anything else: treated as an AlphaFold DB accession and the
    `AF-<code>-F1-model_v3.pdb` model is fetched with `wget`.

  The download itself is not verified; the expected file name is returned
  regardless of whether `wget` succeeded.
  """
  # Nothing supplied: fall back to an interactive upload.
  if pdb_code in (None, ""):
    uploaded = files.upload()
    first_key = list(uploaded)[0]
    with open("tmp.pdb","wb") as handle:
      handle.write(uploaded[first_key])
    return "tmp.pdb"

  # Already a local file: nothing to fetch.
  if os.path.isfile(pdb_code):
    return pdb_code

  # Four characters: fetch from the RCSB.
  if len(pdb_code) == 4:
    os.system(f"wget -qnc https://files.rcsb.org/view/{pdb_code}.pdb")
    return f"{pdb_code}.pdb"

  # Otherwise: fetch the AlphaFold DB model.
  os.system(f"wget -qnc https://alphafold.ebi.ac.uk/files/AF-{pdb_code}-F1-model_v3.pdb")
  return f"AF-{pdb_code}-F1-model_v3.pdb"

In [None]:
%%time
#@title Run ProteinMPNN

import warnings, os, re
warnings.simplefilter(action='ignore', category=FutureWarning)

# Create the results directory without shelling out.
os.makedirs("output", exist_ok=True)

# Check if running in Google Colab
try:
    from google.colab import files
    from google.colab import data_table
    IN_COLAB = True
except ImportError:
    IN_COLAB = False

# USER OPTIONS
#@markdown #### ProteinMPNN options
model_name = "v_48_020" #@param ["v_48_002", "v_48_010", "v_48_020", "v_48_030"]
#@markdown #### Input Options
pdb=''
##@param {type:"string"}
##@markdown - leave blank to get an upload prompt
#@markdown chains must be the chain letters used in the PDB given as input (and output) of RFDiffusionAntibody (by default, "H" for the nanobody, "T" for the target; i.e., H,T)
chains = "H,T" #@param {type:"string"}
homooligomer = False #@param {type:"boolean"}
#@markdown #### Design constraints: in our case we will design only interfacial (CDRs or FWs) + non interfacial CDRs residues of the created nanobody binder
fix_pos = "H28-38,50-64,98-108" #@param {type:"string"}
#@markdown - specify which positions to keep fixed in the sequence (example: `1,2-10`)
#@markdown - you can also specify chain specific constraints (example: `A1-10,B1-20`)
#@markdown - you can also specify to fix entire chain(s) (example: `A`)
inverse = True #@param {type:"boolean"}
#@markdown - inverse the `fix_pos` selection (define position to "free" [or design] instead of "fix")
rm_aa = "C" #@param {type:"string"}
#@markdown - specify amino acid(s) to exclude (example: `C,A,T`)

#@markdown #### Design Options
num_seqs = 16 #@param ["2","8","16","32", "64", "128", "256", "512", "1024"] {type:"raw"}
sampling_temp = 0.1 #@param ["0.0001", "0.1", "0.15", "0.2", "0.25", "0.3", "0.5", "1.0"] {type:"raw"}
#@markdown - Sampling temperature for amino acids, T=0.0 means taking argmax, T>>1.0 means sample randomly.

#@markdown Note: designed sequences are saved to `design.fasta`

# cleaning user options
# Collapse every run of non-letters into a single comma, then drop the stray
# commas that leading/trailing separators (e.g. " H,T ") would leave behind.
chains = re.sub("[^A-Za-z]+",",", chains).strip(",")
if fix_pos == "": fix_pos = None
rm_aa = ",".join(list(re.sub("[^A-Z]+","",rm_aa.upper())))
if rm_aa == "": rm_aa = None

if IN_COLAB and pdb == '':
  print("Please upload a PDB file:")
  uploaded = files.upload()
  if not uploaded:
    # A cancelled upload returns an empty dict; fail with a clear message.
    raise ValueError("No PDB file was uploaded.")
  pdb = next(iter(uploaded))
  print(f"Using '{pdb}' as input.")

# Outside Colab there is no upload prompt, so the path must be set explicitly.
if not pdb:
  raise ValueError("Please set 'pdb' to the path of the input PDB file.")

if chains == "":
  raise ValueError("Please provide a valid chain ID in the 'chains' parameter.")

if num_seqs < 1:
  raise ValueError("'num_seqs' must be at least 1.")

# The model is built once and reused on re-runs of this cell.
# NOTE(review): a later change of `model_name` is therefore ignored until
# `mpnn_model` is deleted -- confirm this caching is intended.
if "mpnn_model" not in dir():
  mpnn_model = mk_mpnn_model(model_name)

# Sample in batches of at most 32. Small requests use a single batch of exactly
# `num_seqs` instead of sampling 32 sequences and discarding the surplus;
# requests that are not a multiple of the batch size are rounded up.
batch = min(32, num_seqs)
num = -(-num_seqs // batch)  # ceiling division

mpnn_model.prep_inputs(pdb_filename=pdb,
                       chain=chains, homooligomer=homooligomer,
                       fix_pos=fix_pos, inverse=inverse,
                       rm_aa=rm_aa, verbose=True)
out = mpnn_model.sample(num=num, batch=batch,
                        temperature=sampling_temp,
                        rescore=homooligomer)

# One FASTA record per requested design; score and seqid go in the header.
with open("design.fasta","w") as fasta:
  for n in range(num_seqs):
    line = f'>score:{out["score"][n]:.3f}_seqid:{out["seqid"][n]:.3f}' + '\n' + out["seq"][n]
    fasta.write(line+"\n")

labels = ["score","seqid","seq"]
data = [[out[k][n] for k in labels] for n in range(num_seqs)]

df = pd.DataFrame(data, columns=labels)
df.to_csv('output/mpnn_results.csv')

# read and sort results
# `index_col=0` restores the saved row index instead of showing it as an
# extra "Unnamed: 0" column.
results_df = pd.read_csv('output/mpnn_results.csv', index_col=0)
# ProteinMPNN scores are negative log-likelihoods (lower is better), so sort
# ascending to list the best designs first, matching the download cell.
sorted_results_df = results_df.sort_values(by='score', ascending=True)
if IN_COLAB:
  from google.colab import data_table
  from IPython.display import display
  # An expression nested inside `if` is not auto-rendered by the notebook,
  # so the table has to be displayed explicitly.
  display(data_table.DataTable(sorted_results_df.round(3)))
else:
  print(sorted_results_df.round(3))



In [None]:
#@title ### Get amino acid probabilities from ProteinMPNN (optional)
mode = "conditional_fix_pos" #@param ["unconditional", "conditional", "conditional_fix_pos"]
#@markdown - `unconditional` - P(sequence | structure)
#@markdown - `conditional` - P(sequence | structure, sequence)
#@markdown - `conditional_fix_pos` - P(sequence[not_fixed] | structure, sequence[fix_pos])
show = "all"
import plotly.express as px
from scipy.special import softmax
from colabdesign.mpnn.model import residue_constants

# Total number of residues across all prepared chains.
L = sum(mpnn_model._lengths)
# Use a separate name for the fixed indices so the user-facing `fix_pos`
# option string set in the run cell is not overwritten.
fixed_idx = np.asarray(mpnn_model._inputs.get("fix_pos",[]), dtype=int)
free_pos = np.delete(np.arange(L),fixed_idx)

# Exactly one branch must run. With two independent `if` statements the
# "conditional" logits were immediately overwritten by the trailing `else`
# (unconditional) branch.
if mode == "conditional":
  # Every position is scored with all other positions visible.
  ar_mask = 1-np.eye(L)
  logits = mpnn_model.score(ar_mask=ar_mask)["logits"]
  pdb_labels = None
elif mode == "conditional_fix_pos":
  if "fix_pos" not in mpnn_model._inputs:
    raise ValueError("no positions fixed")
  ar_mask = 1-np.eye(L)
  # Zero the free-to-free entries of the mask, then keep only the rows of
  # the free positions for plotting.
  ar_mask[free_pos[:,None],free_pos[None,:]] = 0
  logits = mpnn_model.score(ar_mask=ar_mask)["logits"]
  logits = logits[free_pos]
  pdb_labels = np.array([f"{i}_{c}" for c,i in zip(mpnn_model.pdb["idx"]["chain"], mpnn_model.pdb["idx"]["residue"])])
  pdb_labels = pdb_labels[free_pos]
else:
  # "unconditional": all-zero mask.
  ar_mask = np.zeros((L,L))
  logits = mpnn_model.score(ar_mask=ar_mask)["logits"]
  pdb_labels = None

# Per-position amino-acid probabilities.
pssm = softmax(logits,-1)
# Make sure the output directory exists even if the run cell was skipped.
os.makedirs("output", exist_ok=True)
np.savetxt("output/pssm.txt",pssm)

fig = px.imshow(np.array(pssm).T,
               labels=dict(x="positions", y="amino acids", color="probability"),
               y=residue_constants.restypes + ["X"],
               x=pdb_labels,
               zmin=0,
               zmax=1,
               template="simple_white",
              )
fig.update_xaxes(side="top")
fig.show()

In [None]:
#@title Select and Download Your Chosen ProteinMPNN Design

import pandas as pd
import os
import ipywidgets as widgets
from google.colab import files
from IPython.display import display, clear_output

# --- Using predefined variables 'pdb' and 'H' ---
# `pdb` is the input path set by the "Run ProteinMPNN" cell; that cell must
# have been executed first.
pdb_filename = pdb
# Chain whose residue names are rewritten with the designed sequence.
nanobody_chain = "H"

# --- Helper Functions (already perfected) ---

def aa_to_three_letter(aa):
    """Return the three-letter residue name for a one-letter amino-acid code.

    The lookup is case-insensitive. Anything that is not one of the 20
    standard one-letter codes yields 'UNK'.
    """
    one_letter_codes = "ARNDCQEGHILKMFPSTWYV"
    three_letter_codes = (
        'ALA', 'ARG', 'ASN', 'ASP', 'CYS',
        'GLN', 'GLU', 'GLY', 'HIS', 'ILE',
        'LEU', 'LYS', 'MET', 'PHE', 'PRO',
        'SER', 'THR', 'TRP', 'TYR', 'VAL',
    )
    lookup = dict(zip(one_letter_codes, three_letter_codes))

    code = aa.upper()
    if code in lookup:
        return lookup[code]
    return 'UNK'

def generate_new_pdb_content(original_pdb_lines, new_sequence, design_info, chain=None):
    """Rewrite the residue names of one chain and rebuild the PDB text.

    Args:
        original_pdb_lines: Lines of the input PDB file, newlines included.
        new_sequence: One-letter sequence applied, residue by residue, to the
            ATOM records of the target chain in file order.
        design_info: Row describing the design; `.name` and `['score']` are
            written into the REMARK header (e.g. a pandas Series).
        chain: Chain ID to rewrite. Defaults to the module-level
            `nanobody_chain`.

    Returns:
        The new PDB content as a single string: four REMARK lines followed by
        the kept records.

    Only ATOM, TER and END records are kept; every other record (including
    the original REMARK lines) is dropped. ATOM records of the target chain
    that lie beyond the end of `new_sequence` are dropped as well. Only the
    residue-name field is changed; atoms are kept as they are.
    """
    if chain is None:
        chain = nanobody_chain

    processed_lines = []
    residue_index = -1
    last_residue_id = None

    for line in original_pdb_lines:
        if line.startswith("ATOM"):
            if line[21] != chain:
                # Other chains are copied through unchanged.
                processed_lines.append(line)
                continue

            # Residue identity is the sequence number (columns 23-26) plus
            # the insertion code (column 27). Comparing the number alone
            # would merge residues such as 100 and 100A and shift the rest
            # of the sequence.
            residue_id = line[22:27]
            if residue_id != last_residue_id:
                residue_index += 1
                last_residue_id = residue_id

            if residue_index < len(new_sequence):
                new_resname = aa_to_three_letter(new_sequence[residue_index])
                # Columns 18-20 hold the residue name.
                processed_lines.append(line[:17] + f"{new_resname:>3}" + line[20:])
        elif line.startswith(("TER", "END")):
            processed_lines.append(line)

    header = [
        "REMARK   1 GENERATED FROM PROTEINMPNN DESIGN\n",
        f"REMARK   2 DESIGN INDEX: {design_info.name}\n",
        f"REMARK   2 DESIGN SCORE: {design_info['score']:.4f}\n",
        f"REMARK   3 DESIGN SEQUENCE: {new_sequence}\n",
    ]
    return "".join(header + processed_lines)

# --- Main Execution ---

# Build the selection UI: read the saved designs, list them best-first as
# radio buttons, and attach a button that writes and downloads the PDB for
# the selected design.
try:
    # 1. Load and process ProteinMPNN results
    results_df = pd.read_csv('output/mpnn_results.csv')
    # Keep only the text before the first '/' of each designed sequence.
    # NOTE(review): this assumes the nanobody chain is listed first in
    # `chains` -- confirm against the run cell.
    results_df['nanobody_seq'] = results_df['seq'].apply(lambda s: s.split('/')[0])
    sorted_df = results_df.sort_values('score', ascending=True)

    # 2. Create interactive widgets

    # Format the options for the radio buttons: (label, row index) pairs, so
    # `selector.value` is the DataFrame index of the chosen design.
    design_options = [
        (f"#{idx:<3} | Score: {row['score']:.4f} | Sequence: {row['nanobody_seq']}", idx)
        for idx, row in sorted_df.iterrows()
    ]

    selector = widgets.RadioButtons(
        options=design_options,
        description='<b>Select a Design:</b>',
        style={'description_width': 'initial'},
        layout={'width': '100%'},
        disabled=False
    )

    download_button = widgets.Button(
        description="Generate and Download Selected PDB",
        button_style='success',
        icon='download'
    )

    output_area = widgets.Output()

    # 3. Define the action when the button is clicked
    def on_download_clicked(b):
        """Write the PDB for the selected design and start its download.

        `b` is the clicked button (unused). Progress and errors are printed
        into `output_area`, which is cleared on every click.
        """
        with output_area:
            clear_output(wait=True)
            selected_index = selector.value
            selected_design = sorted_df.loc[selected_index]

            print(f"✅ Design #{selected_index} selected.")
            print(f"   Score: {selected_design['score']:.4f}")

            try:
                # Read original PDB
                with open(pdb_filename, 'r') as f:
                    original_lines = f.readlines()

                # Generate new PDB content
                print("🧬 Generating new PDB content...")
                final_pdb_content = generate_new_pdb_content(
                    original_lines,
                    selected_design['nanobody_seq'],
                    selected_design
                )

                # Save and download the file
                output_filename = f"ProteinMPNN_design_{selected_index}.pdb"
                with open(output_filename, 'w') as f:
                    f.write(final_pdb_content)

                print(f"📄 File '{output_filename}' created.")
                files.download(output_filename)
                print(f"📥 Download for '{output_filename}' initiated.")

            except Exception as e:
                # Report inside the widget output instead of failing silently
                # in the callback.
                print(f"❌ An error occurred during file generation: {e}")

    # 4. Link the function to the button and display the UI
    download_button.on_click(on_download_clicked)

    print("Please make your selection from the list below and click the button to download.")
    display(selector, download_button, output_area)


except FileNotFoundError:
    # Only the results CSV is opened in this `try`; the PDB file is opened
    # later inside the click handler, which reports its own errors.
    print("❌ Error: The file 'output/mpnn_results.csv' was not found.")
except Exception as e:
    print(f"❌ An unexpected error occurred: {e}")