# Test RFDiffusion3 for GPC3

* PDB: correct_uniprot_gpc3_59-477_renum.pdb
* Hotspots: 67,71,147,153,156,275
* binder length: 60-150

In [1]:
from rfd3.engine import RFD3InferenceConfig, RFD3InferenceEngine

Environment variable CCD_MIRROR_PATH not set. Will not be able to use function requiring this variable. To set it you may:
  (1) add the line 'export VAR_NAME=path/to/variable' to your .bashrc or .zshrc file
  (2) set it in your current shell with 'export VAR_NAME=path/to/variable'
  (3) write it to a .env file in the root of the atomworks.io repository
Environment variable PDB_MIRROR_PATH not set. Will not be able to use function requiring this variable. To set it you may:
  (1) add the line 'export VAR_NAME=path/to/variable' to your .bashrc or .zshrc file
  (2) set it in your current shell with 'export VAR_NAME=path/to/variable'
  (3) write it to a .env file in the root of the atomworks.io repository
04:47:39 DEBUG transforms: Debug mode is on


In [2]:
from rfd3.engine import RFD3InferenceConfig, RFD3InferenceEngine

config = RFD3InferenceConfig(
    specification={
        "dialect": 2,
        "infer_ori_strategy": "hotspots",
        "input": "/home/ubuntu/cancer/gpc3/rfd3_test/correct_uniprot_gpc3_59-477_renum.pdb",
        "contig": "50-150,/0,A1-419",
        "select_hotspots": "A67,A71,A147,A153,A156,A275",
    },
    diffusion_batch_size=1,
)

model = RFD3InferenceEngine(**config)
outputs = model.run(
    inputs=None,
    out_dir=None,
    n_batches=1,
)

04:48:09 INFO rfd3.engine: [rank: 0] Finished inference batch in 20.11 seconds.


In [3]:
first_key = next(iter(outputs.keys()))
atom_array = outputs[first_key][0].atom_array

In [4]:
from mpnn.inference_engines.mpnn import MPNNInferenceEngine

engine_config = {
    'model_type': "protein_mpnn",
    "is_legacy_weights": True,
    "out_directory": "new",
    "write_structures": True,
    "write_fasta": False,
}

input_configs = [
    {
        "batch_size": 8,
        "remove_waters": True,
        "fixed_chains": ["B"],
        "temperature": 0.001,
    }
]

model = MPNNInferenceEngine(**engine_config)
mpnn_outputs = model.run(input_dicts=input_configs, atom_arrays=[atom_array])

In [5]:
from biotite.structure import get_residue_starts
from biotite.sequence import ProteinSequence

# Extract and display the designed sequences
print(f"Generated {len(mpnn_outputs)} designed sequences:\n")

for i, item in enumerate(mpnn_outputs):
    res_starts = get_residue_starts(item.atom_array)
    # Convert 3-letter codes to 1-letter using Biotite
    seq_1letter = ''.join(
        ProteinSequence.convert_letter_3to1(res_name)
        for res_name in item.atom_array.res_name[res_starts]
    )
    print(f"Sequence {i+1}: {seq_1letter}")

Generated 8 designed sequences:

Sequence 1: NFLKKLQKEGIIEEEEAKKMGVSEEFLKELEENNYITSLEIDGKTYYRITLLGIKYAKENNLGSDLQVCLPKGPTCCSRKMEEKYQLTARLNMEQLLQSASMELKFLIIQNAAVFQEAFEIVVRHAKNYTNAMFKNNYPSLTPQAFEFVGEFFTDVSLYILGSDINVDDMVNELFDSLFPVIYTQLMNPGLPDSALDINECLRGARRDLKVFGNFPKLIMTQVSKSLQVTRIFLQALNLGIEVINTTDHLKFSKDCGRMLTRMWYCSYCQGLMMVKPCGGYCNVVMQGCMAGVVEIDKYWREYILSLEELVNGMYRIYDMENVLLGLFSTIHDSIQYVQKNAGKLTTTIGKLCAHSQQRQYRSAYYPEDLFIDKKVLKVAHVEHEETLSSRRRELIQKLKSFISFYSALPGYICSHSPVAENDTLCWNGQELVERYSQKAARNGMKNQFNLHELKMKGPEPVVSQIIDKLKHINQLLRTMS
Sequence 2: KVLQEIKKKGKITLEEAKEMGMSEETIKELIENNYIISIEEDGKTYLVITKEGIDYMKKNGLGSDLQVCLPKGPTCCSRKMEEKYQLTARLNMEQLLQSASMELKFLIIQNAAVFQEAFEIVVRHAKNYTNAMFKNNYPSLTPQAFEFVGEFFTDVSLYILGSDINVDDMVNELFDSLFPVIYTQLMNPGLPDSALDINECLRGARRDLKVFGNFPKLIMTQVSKSLQVTRIFLQALNLGIEVINTTDHLKFSKDCGRMLTRMWYCSYCQGLMMVKPCGGYCNVVMQGCMAGVVEIDKYWREYILSLEELVNGMYRIYDMENVLLGLFSTIHDSIQYVQKNAGKLTTTIGKLCAHSQQRQYRSAYYPEDLFIDKKVLKVAHVEHEETLSSRRRELIQKLKSFISFYSALPGYICSHSPVAENDTLCWNGQELVERYSQKAARNGMKNQFNLHELKMKGPEP

In [6]:
from rf3.inference_engines.rf3 import RF3InferenceEngine
from rf3.utils.inference import InferenceInput

In [26]:
inference_engine = RF3InferenceEngine(ckpt_path='rf3', verbose=False)

In [28]:
input_structure = InferenceInput.from_atom_array(atom_array, example_id="gpc3_binder", template_selection=["B"])
#rf3_outputs = inference_engine.run(inputs=input_structure)
rf3_outputs = inference_engine.run(inputs=input_structure)

04:54:52 INFO rf3.inference_engines.rf3: [rank: 0] Loading checkpoint from /home/ubuntu/.foundry/checkpoints/rf3_foundry_01_24_latest_remapped.ckpt...
04:54:56 INFO rf3.inference_engines.rf3: [rank: 0] Found 1 structures to predict!
04:54:56 INFO rf3.inference_engines.rf3: [rank: 0] Predicting structure 1/1: gpc3_binder


In [29]:
rf3_output = rf3_outputs["gpc3_binder"][0]

In [30]:
# Summary confidences: overall model quality metrics
summary = rf3_output.summary_confidences

print("=== Summary Confidences ===")
print(f"  Overall pLDDT:    {summary['overall_plddt']:.3f}")
print(f"  Overall PAE:      {summary['overall_pae']:.2f} A")
print(f"  Overall PDE:      {summary['overall_pde']:.3f}")
print(f"  pTM:              {summary['ptm']:.3f}")
print(f"  ipTM:             {summary.get('iptm', 'N/A (single chain)')}")
print(f"  Ranking score:    {summary['ranking_score']:.3f}")
print(f"  Has clash:        {summary['has_clash']}")

=== Summary Confidences ===
  Overall pLDDT:    0.808
  Overall PAE:      8.83 A
  Overall PDE:      3.426
  pTM:              0.843
  ipTM:             0.23870883882045746
  Ranking score:    0.359
  Has clash:        False


In [31]:
from biotite.structure import rmsd, superimpose
from atomworks.constants import PROTEIN_BACKBONE_ATOM_NAMES
import numpy as np

aa_generated = atom_array              # Original RFD3 backbone (from Section 1)
aa_refolded = rf3_output.atom_array    # RF3-predicted structure

chA_generated = aa_generated[np.isin(aa_generated.chain_id,"A")]
bb_chA_generated = chA_generated[np.isin(chA_generated.atom_name, PROTEIN_BACKBONE_ATOM_NAMES)]

chA_refolded = aa_refolded[np.isin(aa_refolded.chain_id,"A")]
bb_chA_refolded = chA_refolded[np.isin(chA_refolded.atom_name, PROTEIN_BACKBONE_ATOM_NAMES)]

bb_refolded_fitted, _ = superimpose(bb_chA_generated, bb_chA_refolded)
rmsd_value = rmsd(bb_chA_generated, bb_chA_refolded)

print(f"Backbone RMSD: {rmsd_value:.2f} A")


Backbone RMSD: 19.24 A


In [32]:
res_starts = get_residue_starts(chA_generated)
# Convert 3-letter codes to 1-letter using Biotite
seq_1letter = ''.join(
    ProteinSequence.convert_letter_3to1(res_name)
    for res_name in chA_generated.res_name[res_starts]
)

In [33]:
rf3_output.summary_confidences['bb_sc_rmsd'] = rmsd_value.item()
rf3_output.summary_confidences['name'] = "xyz"
rf3_output.summary_confidences['binder_sequence'] = seq_1letter
rf3_output.summary_confidences['sequence_length'] = len(seq_1letter)


In [34]:
import pandas as pd

In [35]:
df = pd.DataFrame(rf3_output.summary_confidences)

In [36]:
rf3_output.summary_confidences

{'chain_ptm': [0.68, 0.82],
 'chain_pair_pae_min': [[None, 19.18], [None, None]],
 'chain_pair_pde_min': [[None, 9.94], [None, None]],
 'chain_pair_pae': [[None, 24.23], [None, None]],
 'chain_pair_pde': [[None, 13.37], [None, None]],
 'overall_plddt': 0.8079,
 'overall_pde': 3.4261,
 'overall_pae': 8.8338,
 'ptm': 0.8425129652023315,
 'iptm': 0.23870883882045746,
 'has_clash': False,
 'ranking_score': 0.3595,
 'bb_sc_rmsd': 19.243741989135742,
 'name': 'xyz',
 'binder_sequence': 'MVINKIITEGSLELSELLKLGASAATIDELIANASVVALNLDSQTLFAATAAGIAYAINTQA',
 'sequence_length': 62}

In [37]:
df.to_csv("output.csv", index=False) 

In [38]:
from atomworks.io.utils.io_utils import to_cif_file

# Export structures to CIF format for visualization in PyMOL/ChimeraX
to_cif_file(aa_generated, "generated.cif")
to_cif_file(aa_refolded, "refolded.cif")

'/paperspace/Desktop/cancer/gpc3/rfd3_test/refolded.cif'

# Final run

In [3]:
def rfdiffusion(pdb, contig, hotspots, n_mpnn, fixed_chains, struct_ID, job_ID):

    import numpy as np
    import pandas as pd
    
    from rfd3.engine import RFD3InferenceConfig, RFD3InferenceEngine
    from mpnn.inference_engines.mpnn import MPNNInferenceEngine
    from rf3.inference_engines.rf3 import RF3InferenceEngine
    from rf3.utils.inference import InferenceInput
    
    from biotite.structure import get_residue_starts
    from biotite.sequence import ProteinSequence
    from biotite.structure import rmsd, superimpose

    from atomworks.constants import PROTEIN_BACKBONE_ATOM_NAMES
    from atomworks.io.utils.io_utils import to_cif_file
    

    ID=f"job_{job_ID}-struct_{struct_ID}"

    # Run RFD3
    config = RFD3InferenceConfig(
        specification={
            "dialect": 2,
            "infer_ori_strategy": "hotspots",
            "input": pdb,
            "contig": contig,
            "select_hotspots": hotspots,
        },
        diffusion_batch_size=1,
    )

    model = RFD3InferenceEngine(**config)
    outputs = model.run(
        inputs=None,
        out_dir=None,
        n_batches=1,
    )

    first_key = next(iter(outputs.keys()))
    atom_array = outputs[first_key][0].atom_array

    # Run MPNN
    engine_config = {
        'model_type': "protein_mpnn",
        "is_legacy_weights": True,
        "out_directory": "new",
        "write_structures": True,
        "write_fasta": False,
    }

    input_configs = [
        {
            "batch_size": n_mpnn,
            "remove_waters": True,
            "fixed_chains": [fixed_chains],
            "temperature": 0.001,
        }
    ]

    model = MPNNInferenceEngine(**engine_config)
    mpnn_outputs = model.run(input_dicts=input_configs, atom_arrays=[atom_array])

    #Run RF3
    inference_engine = RF3InferenceEngine(ckpt_path='rf3', verbose=False)

    input_structure = InferenceInput.from_atom_array(atom_array, example_id="binder")
    rf3_outputs = inference_engine.run(inputs=input_structure)

    rf3_output = rf3_outputs["binder"][0]

    #Calculate RMSD
    aa_generated = atom_array              # Original RFD3 backbone (from Section 1)
    aa_refolded = rf3_output.atom_array    # RF3-predicted structure

    chA_generated = aa_generated[np.isin(aa_generated.chain_id,"A")]
    bb_chA_generated = chA_generated[np.isin(chA_generated.atom_name, PROTEIN_BACKBONE_ATOM_NAMES)]

    chA_refolded = aa_refolded[np.isin(aa_refolded.chain_id,"A")]
    bb_chA_refolded = chA_refolded[np.isin(chA_refolded.atom_name, PROTEIN_BACKBONE_ATOM_NAMES)]

    bb_refolded_fitted, _ = superimpose(bb_chA_generated, bb_chA_refolded)
    rmsd_value = rmsd(bb_chA_generated, bb_chA_refolded)

    res_starts = get_residue_starts(chA_generated)
    seq_1letter = ''.join(
        ProteinSequence.convert_letter_3to1(res_name)
        for res_name in chA_generated.res_name[res_starts]
    )

    rf3_output.summary_confidences['bb_sc_rmsd'] = rmsd_value.item()
    rf3_output.summary_confidences['binder_sequence'] = seq_1letter
    rf3_output.summary_confidences['sequence_length'] = len(seq_1letter)
    rf3_output.summary_confidences['name'] = ID

    #Save metric file
    df = pd.DataFrame(rf3_output.summary_confidences)
    df.to_csv(f"{ID}_metrics.csv", index=False) 

    # Export structures to CIF format for visualization in PyMOL/ChimeraX
    to_cif_file(aa_generated, f"{ID}_generated.cif")
    to_cif_file(aa_refolded, f"{ID}_refolded.cif")

In [4]:
rfdiffusion("/home/ubuntu/cancer/gpc3/rfd3_test/correct_uniprot_gpc3_59-477_renum.pdb", "50-150,/0,A1-419", "A67,A71,A147,A153,A156,A275", 8, "B", 108, 72)

04:45:47 INFO rfd3.engine: [rank: 0] Finished inference batch in 20.39 seconds.
04:45:50 INFO rf3.inference_engines.rf3: [rank: 0] Loading checkpoint from /home/ubuntu/.foundry/checkpoints/rf3_foundry_01_24_latest_remapped.ckpt...
04:45:53 INFO rf3.inference_engines.rf3: [rank: 0] Found 1 structures to predict!
04:45:53 INFO rf3.inference_engines.rf3: [rank: 0] Predicting structure 1/1: binder


# MPNN and RFD3 from a pdb

In [1]:
import numpy as np
import pandas as pd

from biotite.structure.io import load_structure
from mpnn.inference_engines.mpnn import MPNNInferenceEngine
from rf3.inference_engines.rf3 import RF3InferenceEngine
from rf3.utils.inference import InferenceInput

from biotite.structure import get_residue_starts
from biotite.sequence import ProteinSequence
from biotite.structure import rmsd, superimpose

from atomworks.constants import PROTEIN_BACKBONE_ATOM_NAMES
from atomworks.io.utils.io_utils import to_cif_file

Environment variable CCD_MIRROR_PATH not set. Will not be able to use function requiring this variable. To set it you may:
  (1) add the line 'export VAR_NAME=path/to/variable' to your .bashrc or .zshrc file
  (2) set it in your current shell with 'export VAR_NAME=path/to/variable'
  (3) write it to a .env file in the root of the atomworks.io repository
Environment variable PDB_MIRROR_PATH not set. Will not be able to use function requiring this variable. To set it you may:
  (1) add the line 'export VAR_NAME=path/to/variable' to your .bashrc or .zshrc file
  (2) set it in your current shell with 'export VAR_NAME=path/to/variable'
  (3) write it to a .env file in the root of the atomworks.io repository
05:59:59 INFO rdkit: Enabling RDKit 2025.03.6 jupyter extensions


In [2]:
path = "./generated.cif"
x = load_structure(path)
atom_array = x

In [3]:
engine_config = {
    'model_type': "protein_mpnn",
    "is_legacy_weights": True,
    "out_directory": "new",
    "write_structures": True,
    "write_fasta": False,
}

input_configs = [
    {
        "batch_size": 8,
        "remove_waters": True,
        "fixed_chains": ["B"],
        "temperature": 0.001,
    }
]

model = MPNNInferenceEngine(**engine_config)
mpnn_outputs = model.run(input_dicts=input_configs, atom_arrays=[atom_array])

06:00:02 INFO mpnn.inference_engines.mpnn: [rank: 0] Loading legacy MPNN weights.
06:00:05 INFO mpnn.utils.inference: Annotated AtomArray has 7715 atoms 
06:00:05 INFO mpnn.inference_engines.mpnn: [rank: 0] Running MPNN inference for input 0, batch 0...


In [4]:
print(f"Generated {len(mpnn_outputs)} designed sequences:\n")

for i, item in enumerate(mpnn_outputs):
    res_starts = get_residue_starts(item.atom_array)
    # Convert 3-letter codes to 1-letter using Biotite
    seq_1letter = ''.join(
        ProteinSequence.convert_letter_3to1(res_name)
        for res_name in item.atom_array.res_name[res_starts]
    )
    print(f"Sequence {i+1}: {seq_1letter}")

Generated 8 designed sequences:

Sequence 1: NVLKKIVEKGLITYEEAKKMGMSDEEIKKLIEENKIVSLEINGKKYYRITLAGIKYMKENNIGSDLQVCLPKGPTCCSRKMEEKYQLTARLNMEQLLQSASMELKFLIIQNAAVFQEAFEIVVRHAKNYTNAMFKNNYPSLTPQAFEFVGEFFTDVSLYILGSDINVDDMVNELFDSLFPVIYTQLMNPGLPDSALDINECLRGARRDLKVFGNFPKLIMTQVSKSLQVTRIFLQALNLGIEVINTTDHLKFSKDCGRMLTRMWYCSYCQGLMMVKPCGGYCNVVMQGCMAGVVEIDKYWREYILSLEELVNGMYRIYDMENVLLGLFSTIHDSIQYVQKNAGKLTTTIGKLCAHSQQRQYRSAYYPEDLFIDKKVLKVAHVEHEETLSSRRRELIQKLKSFISFYSALPGYICSHSPVAENDTLCWNGQELVERYSQKAARNGMKNQFNLHELKMKGPEPVVSQIIDKLKHINQLLRTMS
Sequence 2: NVLKKIVEKGKITYEEAKKMGMSDEEIEELIKNNYIISIEEDGKKYLVITKLGIEYRKKNNLGSDLQVCLPKGPTCCSRKMEEKYQLTARLNMEQLLQSASMELKFLIIQNAAVFQEAFEIVVRHAKNYTNAMFKNNYPSLTPQAFEFVGEFFTDVSLYILGSDINVDDMVNELFDSLFPVIYTQLMNPGLPDSALDINECLRGARRDLKVFGNFPKLIMTQVSKSLQVTRIFLQALNLGIEVINTTDHLKFSKDCGRMLTRMWYCSYCQGLMMVKPCGGYCNVVMQGCMAGVVEIDKYWREYILSLEELVNGMYRIYDMENVLLGLFSTIHDSIQYVQKNAGKLTTTIGKLCAHSQQRQYRSAYYPEDLFIDKKVLKVAHVEHEETLSSRRRELIQKLKSFISFYSALPGYICSHSPVAENDTLCWNGQELVERYSQKAARNGMKNQFNLHELKMKGPEP

In [40]:
inference_engine = RF3InferenceEngine(ckpt_path='rf3', verbose=False)
input_structure = InferenceInput.from_atom_array(mpnn_outputs[2].atom_array,example_id="binder", template_selection="B")
rf3_outputs = inference_engine.run(inputs=input_structure)



06:17:49 INFO rf3.inference_engines.rf3: [rank: 0] Loading checkpoint from /home/ubuntu/.foundry/checkpoints/rf3_foundry_01_24_latest_remapped.ckpt...
Using bfloat16 Automatic Mixed Precision (AMP)
06:17:53 INFO rf3.inference_engines.rf3: [rank: 0] Found 1 structures to predict!
06:17:53 INFO rf3.inference_engines.rf3: [rank: 0] Predicting structure 1/1: binder


In [41]:
rf3_output = rf3_outputs['binder'][0]
res_starts = get_residue_starts(rf3_output.atom_array)
    # Convert 3-letter codes to 1-letter using Biotite
seq_1letter = ''.join(
    ProteinSequence.convert_letter_3to1(res_name)
    for res_name in rf3_output.atom_array.res_name[res_starts]
)
print(f"Sequence {i+1}: {seq_1letter}")

Sequence 8: GFLKKLVENGYITYEEAKEMGVSDETLEYLIENNYITSIEENGKTLYVITLEGIKYMKENNLGSDLQVCLPKGPTCCSRKMEEKYQLTARLNMEQLLQSASMELKFLIIQNAAVFQEAFEIVVRHAKNYTNAMFKNNYPSLTPQAFEFVGEFFTDVSLYILGSDINVDDMVNELFDSLFPVIYTQLMNPGLPDSALDINECLRGARRDLKVFGNFPKLIMTQVSKSLQVTRIFLQALNLGIEVINTTDHLKFSKDCGRMLTRMWYCSYCQGLMMVKPCGGYCNVVMQGCMAGVVEIDKYWREYILSLEELVNGMYRIYDMENVLLGLFSTIHDSIQYVQKNAGKLTTTIGKLCAHSQQRQYRSAYYPEDLFIDKKVLKVAHVEHEETLSSRRRELIQKLKSFISFYSALPGYICSHSPVAENDTLCWNGQELVERYSQKAARNGMKNQFNLHELKMKGPEPVVSQIIDKLKHINQLLRTMS


In [39]:
rf3_output = rf3_outputs['binder'][0]
res_starts = get_residue_starts(rf3_output.atom_array)
    # Convert 3-letter codes to 1-letter using Biotite
seq_1letter = ''.join(
    ProteinSequence.convert_letter_3to1(res_name)
    for res_name in rf3_output.atom_array.res_name[res_starts]
)
print(f"Sequence {i+1}: {seq_1letter}")

Sequence 8: MVINKIITEGSLELSELLKLGASAATIDELIANASVVALNLDSQTLFAATAAGIAYAINTQAGSDLQVCLPKGPTCCSRKMEEKYQLTARLNMEQLLQSASMELKFLIIQNAAVFQEAFEIVVRHAKNYTNAMFKNNYPSLTPQAFEFVGEFFTDVSLYILGSDINVDDMVNELFDSLFPVIYTQLMNPGLPDSALDINECLRGARRDLKVFGNFPKLIMTQVSKSLQVTRIFLQALNLGIEVINTTDHLKFSKDCGRMLTRMWYCSYCQGLMMVKPCGGYCNVVMQGCMAGVVEIDKYWREYILSLEELVNGMYRIYDMENVLLGLFSTIHDSIQYVQKNAGKLTTTIGKLCAHSQQRQYRSAYYPEDLFIDKKVLKVAHVEHEETLSSRRRELIQKLKSFISFYSALPGYICSHSPVAENDTLCWNGQELVERYSQKAARNGMKNQFNLHELKMKGPEPVVSQIIDKLKHINQLLRTMS


In [37]:
res_starts = get_residue_starts(atom_array)
    # Convert 3-letter codes to 1-letter using Biotite
seq_1letter = ''.join(
    ProteinSequence.convert_letter_3to1(res_name)
    for res_name in atom_array.res_name[res_starts]
)
print(f"Sequence {i+1}: {seq_1letter}")

Sequence 8: MVINKIITEGSLELSELLKLGASAATIDELIANASVVALNLDSQTLFAATAAGIAYAINTQAGSDLQVCLPKGPTCCSRKMEEKYQLTARLNMEQLLQSASMELKFLIIQNAAVFQEAFEIVVRHAKNYTNAMFKNNYPSLTPQAFEFVGEFFTDVSLYILGSDINVDDMVNELFDSLFPVIYTQLMNPGLPDSALDINECLRGARRDLKVFGNFPKLIMTQVSKSLQVTRIFLQALNLGIEVINTTDHLKFSKDCGRMLTRMWYCSYCQGLMMVKPCGGYCNVVMQGCMAGVVEIDKYWREYILSLEELVNGMYRIYDMENVLLGLFSTIHDSIQYVQKNAGKLTTTIGKLCAHSQQRQYRSAYYPEDLFIDKKVLKVAHVEHEETLSSRRRELIQKLKSFISFYSALPGYICSHSPVAENDTLCWNGQELVERYSQKAARNGMKNQFNLHELKMKGPEPVVSQIIDKLKHINQLLRTMS
