# Target-Specific De Novo Peptide Binder Design with DiffPepBuilder

<img src="https://github.com/YuzheWangPKU/DiffPepBuilder/blob/main/examples/figures/dpb_model.jpg?raw=true">

This notebook demonstrates how to use [DiffPepBuilder](https://github.com/YuzheWangPKU/DiffPepBuilder) to design peptides that bind to a target protein. As a worked example, we use ALK1 (Activin Receptor-like Kinase 1, PDB ID: [6SF1](https://www.rcsb.org/structure/6SF1)) as the target and walk through the procedure for generating peptide binders.

## Setup

In [1]:
#@title ### Download model assets
import os
import subprocess

# Clone DiffPepBuilder, unpack its SSBLIB archive and fetch the model weights,
# skipping whichever of the three is already present.
#
# All paths are relative to the notebook's working directory, and the working
# directory is never changed, so the existence checks below stay valid for the
# whole cell (checking these repo-prefixed paths after chdir-ing into the repo
# would always report "missing").
diffpep_folder = "DiffPepBuilder"
ssbuilder_dir = os.path.join(diffpep_folder, "SSbuilder")
ssbuilder_lib = os.path.join(ssbuilder_dir, "SSBLIB")
checkpoint_dir = os.path.join(diffpep_folder, "experiments", "checkpoints")
checkpoint_file = os.path.join(checkpoint_dir, "diffpepbuilder_v1.pth")

if not (os.path.isdir(diffpep_folder) and os.path.isdir(ssbuilder_lib) and os.path.isfile(checkpoint_file)):
  print("Installing DiffPepBuilder...")

  # check=True raises CalledProcessError on a non-zero exit status, so a failed
  # step stops the cell instead of being reported as a successful install.
  if not os.path.isdir(diffpep_folder):
    subprocess.run(
        ["git", "clone", "https://github.com/YuzheWangPKU/DiffPepBuilder.git", diffpep_folder],
        check=True,
    )

  if not os.path.isdir(ssbuilder_lib):
    print("Installing SSBLIB...")
    subprocess.run(
        ["tar", "-xf", os.path.join(ssbuilder_dir, "SSBLIB.tar.gz"), "-C", ssbuilder_dir],
        check=True,
    )

  if not os.path.isfile(checkpoint_file):
    print("Downloading model weights...")
    os.makedirs(checkpoint_dir, exist_ok=True)
    # Download under a temporary name and rename only on success, so an
    # interrupted download is never mistaken for a complete checkpoint.
    partial_download = checkpoint_file + ".part"
    subprocess.run(
        ["wget", "-O", partial_download, "https://zenodo.org/records/12794439/files/diffpepbuilder_v1.pth"],
        check=True,
    )
    os.replace(partial_download, checkpoint_file)

  print("DiffPepBuilder is installed and ready.")

else:
  print("DiffPepBuilder is already installed and ready.")

Installing DiffPepBuilder...
Installing SSBLIB...
Downloading model weights...
DiffPepBuilder is installed and ready.


In [2]:
#@title ### Install dependencies
import os
import subprocess
import sys

# Install the Python packages the notebook needs, plus pdbfixer from source.
#
# pip and setup.py are run through the kernel's own interpreter
# (sys.executable) so packages land in the environment the notebook actually
# imports from. check=True raises CalledProcessError on a non-zero exit status,
# so a failed install stops the cell instead of being reported as success.
pip_install = [sys.executable, "-m", "pip", "install"]

subprocess.run(
    pip_install + [
        "wget", "wandb", "fair-esm", "biotite", "pyrootutils", "easydict",
        "biopython", "tqdm", "ml-collections", "mdtraj", "GPUtil", "dm-tree",
        "tmtools", "py3Dmol",
    ],
    check=True,
)

pdbfixer_folder = "pdbfixer"
if not os.path.isdir(pdbfixer_folder):
  print("Installing pdbfixer...")
  subprocess.run(["git", "clone", "https://github.com/openmm/pdbfixer.git", pdbfixer_folder], check=True)
  # cwd= runs the installer inside the clone without changing the notebook's
  # own working directory, so a failure cannot leave the kernel in pdbfixer/.
  subprocess.run([sys.executable, "setup.py", "install"], cwd=pdbfixer_folder, check=True)
  print("pdbfixer is installed.")
else:
  # Only the presence of the clone is checked here, not whether it was installed.
  print("pdbfixer is already cloned.")

subprocess.run(pip_install + ["hydra-core", "hydra-joblib-launcher"], check=True)
print("Dependencies are installed.")

Installing pdbfixer...
pdbfixer is installed.


0

## Inference

In [None]:
#@title ### Specify receptor information
import json
import os
import shutil

from google.colab import files


def _to_residue_spec(residues):
  """Turn a comma-separated residue list into the dash-separated form stored in the JSON.

  Whitespace around entries and empty entries are dropped, so "B40, B58," and
  "B40,B58" both give "B40-B58".
  """
  return "-".join(token.strip() for token in residues.split(",") if token.strip())


os.makedirs("test_case", exist_ok=True)
receptor_type = "default (ALK1)" #@param ["default (ALK1)", "uploaded"]

if receptor_type == "uploaded":
  uploaded_pdb = files.upload(accept=".pdb")
  if not uploaded_pdb:
    raise ValueError("No PDB file was uploaded; upload a receptor or select \"default (ALK1)\".")
  file_name = next(iter(uploaded_pdb))
  # Move with shutil rather than a shell `mv`: uploaded names may contain
  # spaces or parentheses (e.g. "receptor (1).pdb") that an unquoted shell
  # command would split or misinterpret.
  shutil.move(file_name, os.path.join("test_case", file_name))
else:
  file_name = "alk1.pdb"
  shutil.copy("DiffPepBuilder/examples/receptor_data/alk1.pdb", "test_case/")
#@markdown - **Note**: please remove non-protein components from the PDB file and ensure that the CA atoms are present.

lig_chain = None #@param {type:"string"}
#@markdown  - Chain ID of the **reference** ligand. Please set to `None` if no reference ligand is included in the PDB file.
#@markdown  The model will prioritize reference ligand information over the hotspots and motif if both are given.
hotspots = "B40,B58,B59,B71,B72,B87" #@param {type:"string"}
motif = "B38,B39,B40,B41,B42,B47,B48,B49,B50,B52,B53,B54,B55,B56,B57,B58,B59,B60,B61,B64,B65,B66,B67,B69,B70,B71,B72,B73,B74,B75,B76,B77,B82,B83,B84,B85,B86,B87" #@param {type:"string"}
#@markdown - **Note**: the model will prioritize hotspots over the motif when both are given. See the [paper](https://arxiv.org/abs/2405.00128) for more details.

# The receptor entry is keyed by the PDB file name without its extension.
key = os.path.splitext(file_name)[0]
data = {}

# A form field left empty, or holding the literal text "None", is omitted.
chain_id = lig_chain.strip() if lig_chain else ""
if chain_id and chain_id != "None":
  data['lig_chain'] = chain_id

for field, residues in (("hotspots", hotspots), ("motif", motif)):
  if residues and residues.strip() != "None":
    residue_spec = _to_residue_spec(residues)
    if residue_spec:
      data[field] = residue_spec

json_file_write_path = "test_case/de_novo_cases.json"
final_data = {key: data}
with open(json_file_write_path, 'w') as file:
  json.dump(final_data, file, indent=4)

In [4]:
#@title ### Preprocess receptors
# Run DiffPepBuilder's receptor preprocessing script as a shell command:
#   --pdb_dir             directory holding the receptor PDB file(s)
#   --write_dir           directory the processed files are written to
#   --receptor_info_path  JSON written by the "Specify receptor information" cell
# Per the recorded output, the script also computes ESM embeddings and
# downloads the ESM-2 weights into the repo's checkpoints folder on first use.
# NOTE(review): the generation cell reads test_case/metadata_test.csv, which is
# presumably produced by this step; confirm against process_receptor.py.
!python DiffPepBuilder/experiments/process_receptor.py \
  --pdb_dir test_case \
  --write_dir test_case \
  --receptor_info_path test_case/de_novo_cases.json

Files will be written to test_case
Finished test_case/alk1.pdb in 0.04s
Finished processing 1/1 files. Start ESM embedding...
Model file /content/DiffPepBuilder/experiments/checkpoints/esm2_t33_650M_UR50D.pt not found. Downloading...
Model file /content/DiffPepBuilder/experiments/checkpoints/esm2_t33_650M_UR50D-contact-regression.pt not found. Downloading...
Read sequence data with 1 sequences
Processing protein sequence batches:   0% 0/1 [00:00<?, ?it/s]Processing 1 of 1 batches (1 sequences)
Processing protein sequence batches: 100% 1/1 [00:00<00:00,  1.07it/s]
100% 1/1 [00:00<00:00, 556.57it/s]


In [None]:
#@title ### Customize generation settings
import yaml

#@markdown #### Sampling params
denoising_steps = "200" #@param [100, 200, 500]
noise_scale = "1" #@param [0.5, 1, 1.5, 2, 2.5]
seq_temperature = "0.1" #@param [0.1, 0.5, 1, 2.5, 10]

#@markdown #### Peptide length
min_length = 12 #@param {type:"integer"}
max_length = 16 #@param {type:"integer"}
samples_per_length = 4 #@param {type:"integer"}

#@markdown #### Disulfide bond settings
build_ss_bond = True #@param {type:"boolean"}
max_ss_bond = 2 #@param {type:"integer"}
entropy_threshold = 0.01 #@param [0.001, 0.01, 0.05, 0.1, 0.5]

# Reject impossible settings here, before the config file is rewritten, rather
# than letting them surface as a failure inside the long-running inference job.
if not 1 <= min_length <= max_length:
  raise ValueError(
      f"Peptide lengths must satisfy 1 <= min_length <= max_length "
      f"(got min_length={min_length}, max_length={max_length})."
  )
if samples_per_length < 1:
  raise ValueError(f"samples_per_length must be at least 1 (got {samples_per_length}).")

# Form values grouped by the section of the `inference` config they override.
# Dropdown values arrive as strings (or bare numbers), hence the int()/float() casts.
overrides = {
    'denoising': {
        'num_t': int(denoising_steps),
        'noise_scale': float(noise_scale),
    },
    'sampling': {
        'samples_per_length': samples_per_length,
        'min_length': min_length,
        'max_length': max_length,
        'seq_temperature': float(seq_temperature),
    },
    'ss_bond': {
        'build_ss_bond': build_ss_bond,
        'max_ss_bond': max_ss_bond,
        'entropy_threshold': float(entropy_threshold),
    },
}

# The repository's inference config is edited in place: load it, apply the
# overrides, and write it back to the same path.
yaml_file_path = "DiffPepBuilder/config/inference.yaml"
with open(yaml_file_path, 'r') as file:
  yaml_data = yaml.safe_load(file)

for section, values in overrides.items():
  yaml_data['inference'][section].update(values)

with open(yaml_file_path, 'w') as file:
  yaml.dump(yaml_data, file, default_flow_style=False)


In [6]:
#@title ### Run *de novo* generation
os.environ['BASE_PATH'] = "/content/DiffPepBuilder"

!torchrun --nproc-per-node=1 DiffPepBuilder/experiments/run_inference.py \
  data.val_csv_path=test_case/metadata_test.csv \
  experiment.use_ddp=False \
  experiment.num_gpus=1 \
  experiment.num_loader_workers=1

[2025-05-15 04:50:26,648][experiments.train][INFO] - Loading checkpoint from /content/DiffPepBuilder/experiments/checkpoints/diffpepbuilder_v1.pth
[2025-05-15 04:50:32,484][data.so3_diffuser][INFO] - Computing IGSO3. Saving in /content/DiffPepBuilder/runs/cache/eps_1000_omega_1000_min_sigma_0_1_max_sigma_1_5_schedule_logarithmic
[2025-05-15 04:51:52,768][experiments.train][INFO] - Number of model parameters: 103.66 M
[2025-05-15 04:51:57,911][experiments.train][INFO] - Evaluation mode only, no checkpoint being saved.
[2025-05-15 04:51:57,913][experiments.train][INFO] - Evaluation saved to: /content/DiffPepBuilder/runs/inference/15D_05M_2025Y_04h_51m
[2025-05-15 04:51:58,034][experiments.train][INFO] - Using device: cuda:0
[2025-05-15 04:51:58,044][data.pdb_data_loader][INFO] - Validation: 1 examples
  output = torch._nested_tensor_from_mask(
[2025-05-15 04:52:32,305][experiments.train][INFO] - Done sample alk1 (peptide length: 16, sample: 0), saved to /content/DiffPepBuilder/runs/infer

In [7]:
#@title ### Download results

!tar --directory=/content/DiffPepBuilder/runs -czf /content/designed_binders.tar.gz inference
files.download("/content/designed_binders.tar.gz")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>