## De novo design of technetium-labeled proteins using technetium-carbonyl

# 
This notebook needs to be launched from the virtual environment created from the `diffusion.yml` file. It covers the pipeline of creating new backbones and generating sequences for them with ProteinMPNN.

In [None]:
import os, sys, glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import getpass
import subprocess
import time
import importlib
from shutil import copy2

%load_ext autoreload
%autoreload 2

### Path to this cloned GitHub repo:
SCRIPT_DIR = os.getcwd()
#SCRIPT_DIR = os.path.dirname("/home/ayobami/Documents/applications/heme_binder_diffusion")  # edit this to the GitHub repo path. Throws an error by default.
assert os.path.exists(SCRIPT_DIR)
sys.path.append(SCRIPT_DIR+"/scripts/utils")
import utils

In [None]:
pwd

In [None]:
import sys

### Every stage is launched with the interpreter that runs this notebook,
### so no environment-specific Python paths are hardcoded.
PYTHON = dict.fromkeys(("diffusion", "af2", "proteinMPNN", "general"), sys.executable)

### Entry points resolved against this repository (SCRIPT_DIR)
proteinMPNN_script = f"{SCRIPT_DIR}/lib/LigandMPNN/run.py"
AF2_script = f"{SCRIPT_DIR}/scripts/af2/af2.py"

### Entry points of external installations -- edit these for your machine
diffusion_script = "/home/projects/protein_design/rf_diffusion_all_atom/run_inference.py"
inpaint_script = "/home/projects/protein_design/RFDesign/inpainting/inpaint.py"

In [None]:
### Name of this design project
PROJECT = "ligand_diffusion"

### GPU switch; presumably read by the AF2 stage later in the pipeline
USE_GPU_for_AF2 = True

### Root directory in which all jobs are run and their outputs written
WDIR = "/home/projects/protein_design/tc_binder_diffusion"

# Create the working directory on first use
if not os.path.exists(WDIR):
    os.makedirs(WDIR, exist_ok=True)
print(f"Working directory: {WDIR}")

In [None]:
# Ligand identifier; it is inserted as `inference.ligand` in the diffusion config
LIGAND = "TCO"

# Rosetta params file(s) for the ligand
params = [f"{SCRIPT_DIR}/theozyme/TCO/TC_CARBONYL.params"]

In [None]:
## Input structures for diffusion.
## NOTE: the pattern below matches only `input/input.pdb`. If it is widened
## (e.g. to `*.pdb`), kindly ensure only the intended pdb files are present in
## that directory, because one diffusion job is set up per matched file.
diffusion_inputs = glob.glob(f"{SCRIPT_DIR}/input/input.pdb")
print(f"Found {len(diffusion_inputs)} PDB files")

In [None]:
import os

# Number of designs to generate and the diffuser's T value
N_designs = 101
T_steps = 50

# Absolute path to your paper weights:
CKPT_PATH = "/home/projects/protein_design/rf_diffusion_all_atom/RFDiffusionAA_paper_weights.pt"

# Stage directory: created on first use, then made the current directory so
# that config.yaml below is written into it
DIFFUSION_DIR = f"{WDIR}/0_diffusion"
if not os.path.exists(DIFFUSION_DIR):
    os.makedirs(DIFFUSION_DIR)
os.chdir(DIFFUSION_DIR)

# Inference configuration shared by all diffusion jobs; the values above and
# LIGAND are substituted into the text
config = f"""
defaults:
  - aa
  - _self_

diffuser:
  T: {T_steps}

inference:
  num_designs: {N_designs}
  model_runner: NRBStyleSelfCond
  ligand: '{LIGAND}'
  ckpt_path: '{CKPT_PATH}'   # ✅ use absolute path!

model:
  freeze_track_motif: True
contigmap:
  contigs: ["30-100,A1-3,30-100"]
  inpaint_str: null
  length: "100-150"
  
potentials:
  guiding_potentials: ["type:ligand_ncontacts,weight:1"]
  guide_scale: 2
  guide_decay: cubic
"""

# Rough runtime estimate in seconds: 3.5 per step, per design
estimated_time = 3.5 * T_steps * N_designs
print(f"Estimated time to produce {N_designs} designs = {estimated_time/60:.0f} minutes")

with open("config.yaml", "w") as cfg_handle:
    cfg_handle.write(config)
print(f"Wrote config file to {os.path.realpath('config.yaml')}")

In [None]:
import subprocess

# Build one diffusion command per input PDB. Each job runs inside its own
# sub-directory (named after the input file) and reads the shared config.yaml
# from the parent directory, so the current working directory is expected to
# be the diffusion stage directory when this cell runs.
commands_diffusion = []
diffusion_rundirs = []

for p in diffusion_inputs:
    # splitext removes only the final extension; str.replace would also strip
    # any other ".pdb" occurrence inside the file name
    pdbname = os.path.splitext(os.path.basename(p))[0]
    os.makedirs(pdbname, exist_ok=True)

    cmd = f"cd {pdbname} && {PYTHON['diffusion']} {diffusion_script} --config-dir=../ " \
          f"--config-name=config.yaml inference.input_pdb={p} " \
          f"inference.output_prefix='./out/{pdbname}_dif'"

    commands_diffusion.append(cmd)
    diffusion_rundirs.append(pdbname)

print(f"Running {len(commands_diffusion)} diffusion jobs locally")

# Run each command sequentially. Output is captured and printed only after
# the job has finished.
failed_diffusion = []
for cmd in commands_diffusion:
    print(f"\nRunning:\n{cmd}\n")
    result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    # errors="replace" keeps a stray non-UTF-8 byte in the tool output from
    # aborting the whole loop
    print("STDOUT:\n", result.stdout.decode(errors="replace"))
    print("STDERR:\n", result.stderr.decode(errors="replace"))
    # Report failures the same way the ProteinMPNN cell does, so a crashed
    # job is not mistaken for a finished one
    if result.returncode != 0:
        print(f"⚠️  Warning: A command failed with return code {result.returncode}")
        failed_diffusion.append(cmd)

if failed_diffusion:
    print(f"\n{len(failed_diffusion)} of {len(commands_diffusion)} diffusion jobs failed")

In [None]:
## If you're done with diffusion and happy with the outputs then mark it as done
import getpass

DIFFUSION_DIR = f"{WDIR}/0_diffusion"
os.chdir(DIFFUSION_DIR)

# `username` was referenced below without ever being assigned, which raised a
# NameError; take it from the account running the notebook.
username = getpass.getuser()

# The marker file records who declared the stage finished; an existing marker
# is left untouched.
done_marker = f"{DIFFUSION_DIR}/.done"
if not os.path.exists(done_marker):
    with open(done_marker, "w") as file:
        file.write(f"Run user: {username}\n")

In [None]:
pip install blosc

It is a good idea to manually inspect the generated backbones. Kindly note that, in addition to predicting backbone coordinates, RFdiffusion also writes alanine side-chain atoms with empty coordinates, which will be a problem for PyRosetta and AlphaFold2.

In [None]:
## This is just an evaluation step for generated backbones. The process_diffusion_outputs.py file will need fixing after removing zero coordinate backbones
import glob
import os
import subprocess

# The relative globs below, and the result files read at the end of this
# cell, are resolved against DIFFUSION_DIR, so make that explicit instead of
# relying on a previous cell having changed directory.
os.chdir(DIFFUSION_DIR)

# Paths
analysis_script = f"{SCRIPT_DIR}/scripts/diffusion_analysis/process_diffusion_outputs.py"

# Get all output PDBs from diffusion directories
diffusion_outputs = []
for d in diffusion_rundirs:
    diffusion_outputs += glob.glob(f"{d}/out/*.pdb")
# Without any outputs the command would carry a bare `--pdb` flag
assert len(diffusion_outputs) > 0, "No diffusion outputs found!"

# Get all reference PDBs
ref_pdbs = glob.glob(f"{SCRIPT_DIR}/input/*.pdb")

# Build command dictionary. Lists are space-joined after the flag, True
# booleans become bare flags, False booleans are omitted.
dif_analysis_cmd_dict = {
    "--pdb": diffusion_outputs,
    "--ref": ref_pdbs,
    "--params": params,
    "--term_limit": "15.0",
    "--SASA_limit": "0.3",
    "--loop_limit": "0.8",
    "--ref_catres": ["A3"],
    "--rethread": True,
    "--fix": True,
    "--exclude_clash_atoms": ["O1", "O2", "O3"],
    "--ligand_exposed_atoms": ["C1", "C2", "C3"],
    "--exposed_atom_SASA": "10.0",
    "--longest_helix": "30",
    "--rog": "30.0",
    "--traj": "5/30",
    "--analyze": False,
    "--nproc": "1"
}

# Build full shell command
analysis_command = f"{PYTHON['general']} {analysis_script}"
for k, val in dif_analysis_cmd_dict.items():
    if val is not None:
        if isinstance(val, list):
            analysis_command += f" {k} " + " ".join(val)
        elif isinstance(val, bool):
            if val:  # only if True
                analysis_command += f" {k}"
        else:
            analysis_command += f" {k} {val}"
print("Final command to run:\n", analysis_command)

# Run locally if small
if len(diffusion_outputs) < 100:
    analysis_result = subprocess.run(analysis_command, shell=True)
    # Surface failures instead of silently continuing to the loading step
    if analysis_result.returncode != 0:
        print(f"⚠️  Warning: diffusion analysis failed with return code {analysis_result.returncode}")
else:
    submit_script = "submit_diffusion_analysis.sh"
    utils.create_slurm_submit_script(
        filename=submit_script,
        name="diffusion_analysis",
        mem="8g",
        N_cores=dif_analysis_cmd_dict["--nproc"],
        time="0:20:00",
        # EMAIL is not defined anywhere in this notebook, so referencing it
        # directly raised a NameError. Use it only when the user has set it.
        # NOTE(review): assumes the helper accepts email=None -- TODO confirm.
        email=globals().get("EMAIL"),
        command=analysis_command,
        outfile_name="output_analysis"
    )
    # This cell only writes the script; it does not submit it.
    print(f"Wrote {os.path.realpath(submit_script)}. Submit it (e.g. with sbatch) "
          "and re-run this cell once the job has finished to load the results.")

# Load results
diffused_backbones_good = glob.glob(f"{DIFFUSION_DIR}/filtered_structures/*.pdb")
score_file = f"{DIFFUSION_DIR}/diffusion_analysis.sc"
if not os.path.exists(score_file):
    # Same exception type pandas would raise, with an actionable message
    raise FileNotFoundError(
        f"{score_file} not found: the diffusion analysis has not produced its "
        "scorefile yet (it may have failed, or the Slurm job has not run)"
    )
dif_analysis_df = pd.read_csv(score_file, header=0, sep=r"\s+")

In [None]:
## This could help debug the previous cell using just a single pdb.
## The limits below are set to 0.0 / 999, presumably so that no structure is
## filtered out while debugging -- confirm against the script's options.
## NOTE(review): the paths here are hardcoded to .../binder_diffusion, whereas
## WDIR above is .../tc_binder_diffusion -- adjust them to your setup.
## NOTE(review): the script name `ppprocess_diffusion_outputs.py` differs from
## `process_diffusion_outputs.py` used in the previous cell -- confirm this
## file exists and is not a typo.
!python3.9 /home/projects/protein_design/binder_diffusion/scripts/diffusion_analysis/ppprocess_diffusion_outputs.py \
--pdb /home/projects/protein_design/binder_diffusion/0_diffusion/5abc_6debug/out/5abc_6debug_dif_2.pdb \
--ref /home/projects/protein_design/binder_diffusion/input/5abc_6debug.pdb \
--params /home/projects/protein_design/binder_diffusion/theozyme/TCO/TC_CARBONYL.params \
--term_limit 999.0 \
--SASA_limit 0.0 \
--loop_limit 0.0 \
--ref_catres A3 \
--rethread \
--fix \
--exclude_clash_atoms O1 O2 O3 \
--ligand_exposed_atoms C1 C2 C3 \
--exposed_atom_SASA 0.0 \
--longest_helix 999 \
--rog 999.0 \
--analyze \
--nproc 1

## ProteinMPNN
The goal here is to design various sequences for the generated backbones. Little consideration is paid to ligand information.

In [None]:
import os
import subprocess
import glob

# Grab the good backbones
diffused_backbones_good = glob.glob(f"{DIFFUSION_DIR}/filtered_structures/*.pdb")
assert len(diffused_backbones_good) > 0, "No good backbones found!"

# Setup ProteinMPNN working dir
os.chdir(WDIR)
MPNN_DIR = f"{WDIR}/1_proteinmpnn"
os.makedirs(MPNN_DIR, exist_ok=True)
os.chdir(MPNN_DIR)

# Mask JSON: parse TRB files for motif residues.
# Each backbone's .trb is expected next to its .pdb. splitext swaps only the
# final extension; str.replace would rewrite every ".pdb" in the whole path.
trb_files = [os.path.splitext(pdb_path)[0] + ".trb" for pdb_path in diffused_backbones_good]
mask_json_cmd = f"{PYTHON['general']} {SCRIPT_DIR}/scripts/design/make_maskdict_from_trb.py --out masked_pos.jsonl --trb"
for trb in trb_files:
    mask_json_cmd += " " + trb

print("Running mask command:")
print(mask_json_cmd)

result = subprocess.run(mask_json_cmd, shell=True)
# Check the exit status as well: a masked_pos.jsonl left over from an earlier
# run would otherwise satisfy the existence check below after a failure.
assert result.returncode == 0, f"Mask command failed with return code {result.returncode}"
assert os.path.exists("masked_pos.jsonl"), "Failed to create masked positions JSONL file"

# Prepare ProteinMPNN runs for multiple temps
MPNN_temperatures = [0.1, 0.2, 0.3]
MPNN_outputs_per_temperature = 5
MPNN_omit_AAs = "CM"

# One command per (temperature, backbone) pair
commands_mpnn = []

for T in MPNN_temperatures:
    for f in diffused_backbones_good:
        cmd = (
            f"{PYTHON['proteinMPNN']} {proteinMPNN_script} "
            f"--model_type protein_mpnn --ligand_mpnn_use_atom_context 0 "
            f"--fixed_residues_multi masked_pos.jsonl --out_folder ./ "
            f"--number_of_batches {MPNN_outputs_per_temperature} --temperature {T} "
            f"--omit_AA {MPNN_omit_AAs} --pdb_path {f} "
            f"--checkpoint_protein_mpnn {SCRIPT_DIR}/lib/LigandMPNN/model_params/proteinmpnn_v_48_020.pt"
        )
        commands_mpnn.append(cmd)

print(f"\nPrepared {len(commands_mpnn)} local ProteinMPNN jobs")
print("Example command:\n", commands_mpnn[0])

# Run them locally, one-by-one. A failing job only produces a warning so the
# remaining jobs still run; failures are summarised at the end.
failed_mpnn = []
for cmd in commands_mpnn:
    print("\nRunning:")
    print(cmd)
    result = subprocess.run(cmd, shell=True)
    if result.returncode != 0:
        print(f"⚠️  Warning: A command failed with return code {result.returncode}")
        failed_mpnn.append(cmd)

if failed_mpnn:
    print(f"\n{len(failed_mpnn)} of {len(commands_mpnn)} ProteinMPNN jobs failed")