Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
SPDX-License-Identifier: Apache-2.0

# Generating New Protein Binders with RFDiffusion, ProteinMPNN, and ESMFold

Based on https://github.com/RosettaCommons/RFdiffusion/blob/main/examples/design_ppi.sh

## Table of Contents
0. [Install Dependencies](#0.-install-dependencies)
1. [View Reference Structure](#1.-view-reference-structure)
2. [Submit RFDiffusion Job](#2.-submit-rfdiffusion-job) 
3. [Submit ProteinMPNN Jobs](#3.-submit-proteinmpnn-jobs)
4. [Submit ESMFold Jobs](#4.-submit-esmfold-jobs)

## 0. Install Dependencies

In [None]:
# Install (or upgrade, -U) the packages listed in notebook-requirements.txt
# using the IPython %pip magic; -q keeps pip's output quiet.
%pip install -q -U -r notebook-requirements.txt

Import libraries and create AWS objects.

In [None]:
import boto3
from batchfold.batchfold_environment import BatchFoldEnvironment
from batchfold.rfdiffusion_job import RFDiffusionJob
from batchfold.proteinmpnn_job import ProteinMPNNJob
from batchfold.batchfold_target import BatchFoldTarget
from batchfold.esmfold_job import ESMFoldJob
from batchfold.utils import utils
from datetime import datetime
import os
import py3Dmol
import json
import time
from math import ceil

# Create the AWS session, an S3 client, and the BatchFold environment wrapper
# that the rest of the notebook uses to submit jobs.
boto_session = boto3.session.Session()
s3 = boto_session.client("s3")
batch_environment = BatchFoldEnvironment(boto_session=boto_session)

S3_BUCKET = batch_environment.default_bucket

# Unique per-run prefix: calendar date followed by the Unix epoch seconds.
# The epoch part is computed with timestamp() instead of the "%s" strftime
# directive, which is a glibc extension and is not available on every platform.
run_started = datetime.now()
S3_PREFIX = (
    "binder-experiment-"
    + run_started.strftime("%Y%m%d")
    + str(int(run_started.timestamp()))
)

# S3 URIs always use "/" as the separator, so build the URI explicitly rather
# than with os.path.join (which would use "\\" on Windows).
S3_PATH = f"s3://{S3_BUCKET}/{S3_PREFIX}"
JOB_QUEUE = "G4dnJobQueue"
print(f"S3 path is {S3_PATH}")

# Local PDB file holding the binder target structure.
input_filename = "insulin_target.pdb"


## 1. View Reference Structure

View the structure of our backbone target: a portion of the human insulin receptor.

In [None]:
# Load the target PDB from disk and add it to the viewer explicitly.
# py3Dmol.view's first positional argument is a remote "query" (for example
# "pdb:1ubq"), not a local file path, so the file contents are passed through
# addModel instead, the same way the other viewer cells in this notebook do.
with open(input_filename, "r") as pdb_file:
    target_structure = pdb_file.read()

view = py3Dmol.view(width=600, height=600)
view.addModel(target_structure, "pdb")
view.setStyle({"cartoon": {"color": "spectrum"}})
view.zoomTo()
view.show()

Create a BatchFold target for our design and upload the pdb file.

In [None]:
# Build a unique target id: calendar date followed by the Unix epoch seconds.
# timestamp() replaces the "%s" strftime directive, which is a glibc extension
# and is not portable; the resulting string is the same on Linux.
target_created = datetime.now()
target_id = (
    "BINDER-"
    + target_created.strftime("%Y%m%d")
    + str(int(target_created.timestamp()))
)

# Create the BatchFold target in the default bucket and upload the input PDB.
target = BatchFoldTarget(target_id=target_id, s3_bucket=S3_BUCKET, boto_session=boto_session)
target.upload_pdb(input_filename)

## 2. Submit RFDiffusion Job

In [None]:
# RFDiffusion example based on https://github.com/RosettaCommons/RFdiffusion/blob/main/examples/design_ppi.sh

# Number of binder backbones to generate; later cells loop over this value.
NUM_DESIGNS = 4

params = {
    # Describe the protein we want as residues 1-150 of the A chain of the
    # target protein, then a chain break, then a 70-100 aa binder.
    "contigmap.contigs": "[A1-150/0 70-100]",
    # Target three specific residues on the target, specifically residues
    # 59, 83 and 91 of the A chain.
    "ppi.hotspot_res": "[A59,A83,A91]",
    # Number of designs to generate.
    "inference.num_designs": NUM_DESIGNS,
    # Reduce the noise added during inference to improve the quality of the
    # designs, at a cost of reduced diversity.
    "denoiser.noise_scale_ca": 0,
    "denoiser.noise_scale_frame": 0,
}

# Unique job name: calendar date followed by the Unix epoch seconds.
# timestamp() replaces the non-portable "%s" strftime directive (glibc only).
rfdiffusion_submit_time = datetime.now()
rfdiffusion_job_name = (
    "RFDiffusionJob"
    + rfdiffusion_submit_time.strftime("%Y%m%d")
    + str(int(rfdiffusion_submit_time.timestamp()))
)

# Both S3 URIs are built with "/" so they do not depend on the local OS path
# separator (os.path.join would produce "\\" on Windows).
rfdiffusion_job = RFDiffusionJob(
    boto_session=boto_session,
    job_name=rfdiffusion_job_name,
    input_s3_uri=f"{target.get_pdbs_s3_uri()}/{os.path.basename(input_filename)}",
    output_s3_uri=f"{target.get_predictions_s3_uri()}/{rfdiffusion_job_name}",
    params=params,
)
print(f"Submitting {rfdiffusion_job_name}")
rfdiffusion_submission = batch_environment.submit_job(rfdiffusion_job, job_queue_name=JOB_QUEUE)


Wait for RFDiffusion job to complete

In [None]:
# Block this cell until the submitted RFDiffusion job finishes, so the
# download cell below has results to fetch.
rfdiffusion_submission.wait()

Download designed structures

In [None]:
# Ask the target for the name of its most recent RFDiffusion job, then
# download that job's predictions under the local "data" directory.
# local_predictions_dir and last_rfdiffusion_job_name are reused by the
# visualization cells that follow.
last_rfdiffusion_job_name = target.get_last_job_name(job_type="RFDiffusion")
print(f"Downloading results for job {last_rfdiffusion_job_name}")
local_predictions_dir = target.download_predictions(local_path="data", job=last_rfdiffusion_job_name)

Visualize the designed structures.

In [None]:
# Local directory holding the downloaded RFDiffusion outputs for this job.
rfdiffusion_results_dir = os.path.join(
    local_predictions_dir, target.target_id, "predictions", last_rfdiffusion_job_name
)

# Read every PDB file in the results directory. Names are sorted so the grid
# order is deterministic, and matched by suffix so only real ".pdb" files
# (not names that merely contain ".pdb") are loaded.
structures = []
for obj in sorted(os.listdir(rfdiffusion_results_dir)):
    if obj.endswith(".pdb"):
        with open(os.path.join(rfdiffusion_results_dir, obj), "r") as f:
            structures.append(f.read())

total_cols = 2
# Always keep at least one row so the viewer grid is valid even when no
# structures were found.
total_rows = max(1, ceil(len(structures) / total_cols))
view = py3Dmol.view(viewergrid=(total_rows, total_cols), width=500, height=500)
view.removeAllModels()

# Fill the grid row by row. The viewer index is (row, column); the row index
# must range over total_rows and the column index over total_cols, otherwise
# non-square grids address cells that do not exist.
for k, structure in enumerate(structures):
    row, col = divmod(k, total_cols)
    view.addModel(structure, "pdb", viewer=(row, col))

view.setStyle({"chain": "A"}, {"cartoon": {"color": "grey"}})
view.setStyle({"chain": "B"}, {"cartoon": {"color": "spectrum"}})
view.zoomTo()
view.show()


View the diffusion trajectory for the first design

In [None]:
# Load the trajectory PDB written for the first design (output_0).
trajectory_path = os.path.join(rfdiffusion_results_dir, "traj/output_0_pX0_traj.pdb")
with open(trajectory_path) as trajectory_file:
    trajectory = trajectory_file.read()

# Show the trajectory as an animation, one model per frame.
view = py3Dmol.view(width=600, height=600)
view.addModelsAsFrames(trajectory)

# Chain A is drawn in grey and chain B with the spectrum coloring.
for chain_id, cartoon_color in (("A", "grey"), ("B", "spectrum")):
    view.setStyle({"chain": chain_id}, {"cartoon": {"color": cartoon_color}})

view.animate({"loop": "backward"})
view.zoomTo()
view.show()

## 3. Submit ProteinMPNN Jobs

RFdiffusion gives us the backbone coordinates, but not the side chains. We use ProteinMPNN to generate some amino acid sequences that are most likely to adopt the predicted backbone. In this case, we'll create 4 sequences for each RFdiffusion prediction, so 16 in total (4x4).

Submit ProteinMPNN job.

In [None]:
# Number of sequences ProteinMPNN should generate for each RFDiffusion design.
NUM_SEQS_PER_DESIGN = 4

# Shared base name for this batch of jobs: calendar date followed by the Unix
# epoch seconds. timestamp() replaces the non-portable "%s" strftime directive.
proteinmpnn_submit_time = datetime.now()
proteinmpnn_job_base = (
    "ProteinMPNNJob"
    + proteinmpnn_submit_time.strftime("%Y%m%d")
    + str(int(proteinmpnn_submit_time.timestamp()))
)

# Submit one ProteinMPNN job per RFDiffusion design. Each job reads
# output_<i>.pdb from the RFDiffusion job's prediction folder and is made
# dependent on the RFDiffusion submission.
protein_mpnn_submissions = []
for i in range(NUM_DESIGNS):
    proteinmpnn_job_name = proteinmpnn_job_base + f"_{i}"
    proteinmpnn_job = ProteinMPNNJob(
        boto_session=boto_session,
        job_name=proteinmpnn_job_name,
        # S3 URIs are joined with "/" explicitly so they do not depend on the
        # local OS path separator.
        pdb_s3_uri=f"{target.get_predictions_s3_uri()}/{rfdiffusion_job_name}/output_{i}.pdb",
        output_s3_uri=f"{target.get_predictions_s3_uri()}/{proteinmpnn_job_name}",
        # NOTE(review): presumably selects which chain gets redesigned;
        # confirm that "A" is the binder chain in the RFDiffusion output.
        pdb_path_chains="A",
        num_seq_per_target=NUM_SEQS_PER_DESIGN,
        sampling_temp=0.01,
        batch_size=1,
        remove_input_from_output=True,
    )
    print(f"Submitting {proteinmpnn_job_name}")
    submission = batch_environment.submit_job(
        proteinmpnn_job, job_queue_name=JOB_QUEUE, depends_on=[rfdiffusion_submission]
    )
    protein_mpnn_submissions.append(submission)


Wait for ProteinMPNN jobs to complete.

In [None]:
# Wait on every ProteinMPNN submission in turn; the cell returns once all of
# them have finished.
for submission in protein_mpnn_submissions:
    submission.wait()

Download and process designed sequences

In [None]:
# Download the output of each ProteinMPNN job and pass its FASTA file to the
# target. NUM_DESIGNS is deliberately NOT redefined here: it must match the
# value used when the RFDiffusion and ProteinMPNN jobs were submitted, and a
# hard-coded copy would silently override a changed design count.
for i in range(NUM_DESIGNS):
    job_name = proteinmpnn_job_base + f"_{i}"
    local_predictions_dir = target.download_predictions(local_path="data", job=job_name)
    target.add_fasta(
        os.path.join(
            local_predictions_dir,
            target.target_id,
            "predictions",
            job_name,
            f"seqs/output_{i}.fa",
        )
    )

# Display the sequences now attached to the target.
target.sequences

## 4. Submit ESMFold Jobs

There are many ways we could evaluate the quality of our predictions. For this example, we'll use ESMFold to identify the "foldability" of each sequence, measured by the mean pLDDT score for each structure.

Submit ESMFold jobs

In [None]:
# Shared base name for this batch of jobs: calendar date followed by the Unix
# epoch seconds. timestamp() replaces the non-portable "%s" strftime directive.
esmfold_submit_time = datetime.now()
esmfold_job_base = (
    "ESMFoldJob"
    + esmfold_submit_time.strftime("%Y%m%d")
    + str(int(esmfold_submit_time.timestamp()))
)

# Submit one ESMFold job per design. Job i folds the sequences written by
# ProteinMPNN job i and is made dependent on that ProteinMPNN submission.
esmfold_submissions = []
for i in range(NUM_DESIGNS):
    esmfold_job_name = esmfold_job_base + f"_{i}"
    esmfold_job = ESMFoldJob(
        job_name=esmfold_job_name,
        target_id=target.target_id,
        # S3 URIs are joined with "/" explicitly so they do not depend on the
        # local OS path separator.
        fasta_s3_uri=f"{target.get_predictions_s3_uri()}/{proteinmpnn_job_base}_{i}/seqs/output_{i}.fa",
        output_s3_uri=f"{target.get_predictions_s3_uri()}/{esmfold_job_name}",
        boto_session=boto_session,
    )
    print(f"Submitting {esmfold_job_name}")
    submission = batch_environment.submit_job(
        esmfold_job, job_queue_name=JOB_QUEUE, depends_on=[protein_mpnn_submissions[i]]
    )
    esmfold_submissions.append(submission)

Wait for the ESMFold jobs to complete

In [None]:
# Wait on every ESMFold submission in turn; the cell returns once all of them
# have finished.
for submission in esmfold_submissions:
    submission.wait()

Download and display ESMFold metrics

In [None]:
# Paths of every predicted structure, collected for the visualization cell.
pdb_list = []

for esmfold_submission in esmfold_submissions:
    job_name = esmfold_submission.job_name
    print(f"Downloading results for job {job_name}")
    local_predictions_dir = target.download_predictions(local_path="data", job=job_name)

    # Local folder holding this job's downloaded outputs.
    job_dir = os.path.join(local_predictions_dir, target.target_id, "predictions", job_name)
    with open(os.path.join(job_dir, "metrics.json")) as f:
        metrics = json.load(f)

    # Remove the timing entries so the loop below only sees the remaining
    # per-structure entries. pop() without a default keeps the original
    # behavior of failing loudly if an expected key is missing.
    for timing_key in ("total", "time", "end_time"):
        metrics.pop(timing_key)

    # Each remaining key is used both to look up the sequence on the target
    # and as the stem of the structure's PDB file name. The loop variable is
    # named seq_id so the builtin id() is not shadowed for the rest of the
    # notebook session.
    for seq_id, structure_metrics in metrics.items():
        pdb = os.path.join(job_dir, seq_id + ".pdb")
        pdb_list.append(pdb)

        print("-" * 50)
        print(target.sequences[seq_id].description)
        print(structure_metrics)
        print(str(target.sequences[seq_id].seq))
        print(pdb)

Visualize the predicted structures.

In [None]:
# Read every predicted structure collected by the metrics cell.
structures = []
for p in pdb_list:
    with open(p, "r") as f:
        structures.append(f.read())

total_cols = 4
# Always keep at least one row so the viewer grid is valid even when no
# structures were found.
total_rows = max(1, ceil(len(structures) / total_cols))
view = py3Dmol.view(viewergrid=(total_rows, total_cols), width=750, height=750)
view.removeAllModels()

# Fill the grid row by row. The viewer index is (row, column); the row index
# must range over total_rows and the column index over total_cols, otherwise
# non-square grids address cells that do not exist.
for k, structure in enumerate(structures):
    row, col = divmod(k, total_cols)
    view.addModel(structure, "pdb", viewer=(row, col))

view.setStyle({"cartoon": {"colorscheme": "amino"}})
view.zoomTo()
view.show()


In this case, it looks like ESMFold is able to make confident predictions for several of the designed sequences. These are good candidates for synthesis and additional lab testing.