Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
SPDX-License-Identifier: Apache-2.0

# Generating New Protein Binders with RFDiffusion and ProteinMPNN

Based on https://github.com/RosettaCommons/RFdiffusion/blob/main/examples/design_ppi.sh

## Table of Contents
0. [Install Dependencies](#0.-install-dependencies)
1. [Download Reference Structures](#1.-download-reference-structures)
2. [Submit Hallucination Jobs](#2.-submit-hallucination-jobs)
3. [Submit Inpainting Jobs](#3.-submit-inpainting-jobs)
4. [Submit ProteinMPNN Design Jobs](#4.-submit-proteinmpnn-design-jobs)

## 0. Install Dependencies

In [None]:
# Install (or upgrade, via -U) the packages listed in notebook-requirements.txt
# into the environment backing this notebook's kernel.
%pip install -U -r notebook-requirements.txt

In [1]:
# Import required Python packages
import boto3
from batchfold.batchfold_environment import BatchFoldEnvironment
from batchfold.rfdiffusion_job import RFDiffusionJob
from batchfold.proteinmpnn_job import ProteinMPNNJob

# NOTE(review): the hallucination and inpainting cells instantiate
# RFDesignHallucinateJob / RFDesignInpaintJob, which were never imported.
# They are assumed to live in batchfold.rfdesign_job -- confirm against the
# installed batchfold package.
from batchfold.rfdesign_job import RFDesignHallucinateJob, RFDesignInpaintJob

# NOTE(review): this import was truncated ("from batchfold.utils imp"), which is
# a SyntaxError. It has been completed to its presumed form -- confirm.
from batchfold.utils import utils
from Bio.PDB import PDBParser, PDBIO, Selection
from Bio.PDB.PDBList import PDBList
from datetime import datetime
import os
import numpy as np
import py3Dmol
import json

# Create AWS clients
# NOTE(review): this named profile looks specific to the original author's AWS
# configuration; replace it with a profile defined in your own credentials file.
boto_session = boto3.session.Session(profile_name="bloyal+proteinfolding-Admin")
s3 = boto_session.client("s3")
batch_environment = BatchFoldEnvironment(boto_session=boto_session)

# Every later cell stages inputs in, and reads outputs from, this bucket.
S3_BUCKET = batch_environment.default_bucket
print(f" S3 bucket name is {S3_BUCKET}")

 S3 bucket name is batchfold-230410-batchfolds3bucket-1hb07ye9k8zu0


## 1. Download Reference Structures

In [None]:
# Download the reference structure from the PDB, write each of its two chains
# to its own PDB file, and upload both files to S3 as inputs for the design jobs.

# PDB IDs are four characters long; "4ZQK" is the same entry the design jobs
# below pass as target_id.
pdb_code = "4ZQK"
output_dir = "data/rfdesign/pd1-demo"
s3_prefix = "pd1-demo"
file_format = "pdb"

os.makedirs(output_dir, exist_ok=True)
pdbl = PDBList()
ent_filename = pdbl.retrieve_pdb_file(
    pdb_code=pdb_code, file_format=file_format, pdir=output_dir, overwrite=True
)

p = PDBParser(QUIET=True)
structure = p.get_structure(pdb_code, ent_filename)

# The unpacking requires the entry to contain exactly two chains (ValueError
# otherwise). The first is saved as pdl1.pdb and the second as pd1.pdb.
Chain_A, Chain_B = Selection.unfold_entities(structure, "C")

io = PDBIO()
for chain, file_name in ((Chain_A, "pdl1.pdb"), (Chain_B, "pd1.pdb")):
    local_path = f"{output_dir}/{file_name}"
    io.set_structure(chain)
    io.save(local_path)
    s3.upload_file(local_path, S3_BUCKET, f"{s3_prefix}/{file_name}")

View the structure of our backbone target: PD1

In [None]:
# Display the PD1 chain file in a 200x200 viewer, drawn as a spectrum-coloured
# cartoon with a thin black outline.
pd1_pdb_path = "data/rfdesign/pd1-demo/pd1.pdb"
outline_style = {'style': 'outline', 'color': 'black', 'width': 0.1}
cartoon_style = {'cartoon': {'color': 'spectrum'}}

view = py3Dmol.view(pd1_pdb_path, width=200, height=200)
view.setViewStyle(outline_style)
view.setStyle(cartoon_style)
view.zoomTo()
view.show()

## 2. Submit Hallucination Jobs

In [None]:
# Request `total_num` hallucinated designs, split across several AWS Batch jobs
# of at most `batch` designs each. Every job shares one name prefix so that
# the outputs can later be listed from S3 with a single prefix filter.
total_num = 20
batch = 5
mask = '25-35,B63-82,15-25,B119-140,0-15'

# Job prefix = class name + date + epoch seconds. "%s" is a non-standard,
# platform-dependent strftime directive (it is rejected on some platforms), so
# the epoch part is computed with timestamp() instead.
submit_time = datetime.now()
hallucinate_job_prefix = (
    "RFDesignHallucinateJob"
    + submit_time.strftime("%Y%m%d")
    + str(int(submit_time.timestamp()))
)
job_queue_name = "G4dnJobQueue"

# Built-in range() yields plain Python ints, whereas np.arange() yields NumPy
# integer scalars that, for example, the json module cannot serialize.
for istart in range(0, total_num, batch):
    job_name = f"{hallucinate_job_prefix}_{istart}"
    params = {
        "mask": mask,
        "steps": "g10",
        # The final job only asks for the remainder when total_num is not an
        # exact multiple of batch.
        "num": min(batch, total_num - istart),
        "start_num": istart,
        "w_rog": 1,
        "rog_thresh": 16,
        "w_rep": 2,
        "rep_pdb": "input/pdl1.pdb",
        "rep_sigma": 4,
        "save_pdb": True,
        "track_step": 10,
    }

    new_job = RFDesignHallucinateJob(
        boto_session=boto_session,
        job_name=job_name,
        target_id="4ZQK",
        input_s3_uri=f"s3://{S3_BUCKET}/pd1-demo/",
        output_s3_uri=f"s3://{S3_BUCKET}/{job_name}/",
        pdb="input/pd1.pdb",
        params=params,
    )
    print(f"Submitting {job_name}")
    submission = batch_environment.submit_job(new_job, job_queue_name)

Download hallucinated sequences

In [None]:
# Print the text of every ".fas" object stored under the hallucination job prefix.
bucket_resource = boto_session.resource('s3').Bucket(S3_BUCKET)
items = list(
    filter(
        lambda obj: obj.key.endswith(".fas"),
        bucket_resource.objects.filter(Prefix=hallucinate_job_prefix),
    )
)
for item in items:
    fasta_text = item.get()['Body'].read().decode('utf-8')
    print(fasta_text)

## 3. Submit Inpainting Jobs

In [None]:
# Submit a single inpainting job against the PD1 structure staged in S3.

# Job name = class name + date + epoch seconds. "%s" is a non-standard,
# platform-dependent strftime directive (it is rejected on some platforms), so
# the epoch part is computed with timestamp() instead.
submit_time = datetime.now()
inpainting_job_name = (
    "RFDesignInpaintingJob"
    + submit_time.strftime("%Y%m%d")
    + str(int(submit_time.timestamp()))
)
job_queue_name = "G4dnJobQueue"
params = {
    "contigs": "25-35,B63-82,15-25,B119-140,0-15",
    "len": "80-115",
    "num_designs": 4,
    "dump_all": True,
}
new_job = RFDesignInpaintJob(
    boto_session=boto_session,
    job_name=inpainting_job_name,
    target_id="4ZQK",
    input_s3_uri=f"s3://{S3_BUCKET}/pd1-demo/",
    # Outputs land under a prefix equal to the job name; the viewing cell
    # lists objects by that same prefix.
    output_s3_uri=f"s3://{S3_BUCKET}/{inpainting_job_name}/",
    pdb="input/pd1.pdb",
    params=params,
)
print(f"Submitting {inpainting_job_name}")
submission = batch_environment.submit_job(new_job, job_queue_name)

View infilled structures

In [None]:
# Fetch every ".pdb" object written under the inpainting job prefix and show
# them side by side in a two-column py3Dmol viewer grid.
bucket_resource = boto_session.resource('s3').Bucket(S3_BUCKET)
items = [
    obj
    for obj in bucket_resource.objects.filter(Prefix=inpainting_job_name)
    if obj.key.endswith(".pdb")
]
structures = []
for item in items:
    print(item.key)
    pdb_txt = item.get()['Body'].read().decode('utf-8')
    structures.append(pdb_txt)

if not structures:
    # Nothing to draw yet (e.g. the job has not finished); avoid building a
    # zero-row viewer grid.
    print(f"No .pdb outputs found under prefix {inpainting_job_name}")
else:
    total_cols = 2
    # Ceiling division: round() uses banker's rounding, so an odd number of
    # structures could yield too few rows (dropping a structure) or, combined
    # with the fixed rows x cols loop, index past the end of the list.
    total_rows = (len(structures) + total_cols - 1) // total_cols
    view = py3Dmol.view(viewergrid=(total_rows, total_cols))
    view.removeAllModels()
    for i, pdb_txt in enumerate(structures):
        # divmod maps the flat index onto (row, column); a trailing grid cell
        # is simply left empty when the count is odd.
        view.addModel(pdb_txt, 'pdb', viewer=divmod(i, total_cols))
    # Called without a `viewer` argument, so one call styles every grid cell.
    view.setStyle({'stick': {'colorscheme': 'amino'}})
    view.zoomTo()
    view.show()

## 4. Submit ProteinMPNN Design Jobs

Generate and upload the fixed-positions JSONL file.

In [None]:
# Write the fixed-position specification as one JSON line, then upload it to S3
# next to the input structures.
# NOTE(review): range(63, 82) ends at 81, while the mask/contigs strings in the
# design cells use "B63-82" -- confirm whether position 82 should be included.
fixed_positions_path = 'data/rfdesign/pd1-demo/fixed_positions.jsonl'
fixed_position_dict = {"pd1": {"B": list(range(63, 82))}}
fixed_positions_line = json.dumps(fixed_position_dict) + '\n'
with open(fixed_positions_path, 'w') as f:
    f.write(fixed_positions_line)
s3.upload_file(fixed_positions_path, S3_BUCKET, "pd1-demo/fixed_positions.jsonl")


Submit ProteinMPNN job.

In [None]:
# Submit one ProteinMPNN design job for chain B of the PD1 structure, using the
# fixed-positions file uploaded to S3 by the previous cell.

# Job name = class name + date + epoch seconds. "%s" is a non-standard,
# platform-dependent strftime directive (it is rejected on some platforms), so
# the epoch part is computed with timestamp() instead.
submit_time = datetime.now()
proteinmpnn_job_name = (
    "ProteinMPNNJob"
    + submit_time.strftime("%Y%m%d")
    + str(int(submit_time.timestamp()))
)
job_queue_name = "GravitonOnDemandJobQueue"

new_job = ProteinMPNNJob(
    boto_session=boto_session,
    job_name=proteinmpnn_job_name,
    pdb_s3_uri=f"s3://{S3_BUCKET}/pd1-demo/pd1.pdb",
    # Outputs land under a prefix equal to the job name; the download cell
    # lists objects by that same prefix.
    output_s3_uri=f"s3://{S3_BUCKET}/{proteinmpnn_job_name}",
    pdb_path_chains="B",
    fixed_positions_jsonl=f"s3://{S3_BUCKET}/pd1-demo/fixed_positions.jsonl",
    num_seq_per_target=25,
    sampling_temp=0.01,
    seed=42,
    batch_size=1,
)
submission = batch_environment.submit_job(new_job, job_queue_name)

Download designed sequences

In [None]:
# Print the UTF-8 decoded contents of every object stored under the
# ProteinMPNN job prefix.
bucket_resource = boto_session.resource('s3').Bucket(S3_BUCKET)
items = list(bucket_resource.objects.filter(Prefix=proteinmpnn_job_name))
for item in items:
    body_text = item.get()['Body'].read().decode('utf-8')
    print(body_text)