# Step 3: Predict Docking Poses with `DiffDock` and PDB Protein

### RORc Nuclear Receptor Antagonist Case-Study
Rene, O.; *et al*. "[Minor Structural Change to Tertiary Sulfonamide RORc Ligands Led to Opposite Mechanisms of Action.](https://pubs.acs.org/doi/10.1021/ml500420y)" *ACS Med. Chem. Lett.* **2015**, *6*, 276-281.

The original [DiffDock](https://arxiv.org/abs/2210.01776) is a GPU-accelerated tool for fast, coarse ligand docking in linear time. The [NVIDIA DiffDock NIM](https://docs.nvidia.com/bionemo-framework/1.10/models/diffdock.html) is a retrained version of DiffDock that offers improved docking accuracy, higher throughput, and support for batch processing.

Uses `4wqp_monomer.pdb` from the GitHub repo, as the parent crystal structure of the protein-ligand complex [PDB: 4WQP](https://www.rcsb.org/structure/4WQP) is a dimer.

## 3.1 Set Up the Environment ##

In [3]:
!pip install pandas numpy matplotlib loguru py3dmol rdkit ipywidgets



In [None]:
import os, shutil, requests, json, time
import zipfile
import glob
import random
import py3Dmol

from loguru import logger
from google.colab import files, userdata
from rdkit import Chem
import ipywidgets as widgets
from IPython.display import display

In [5]:
def prepare_output_directory(output):
    """
    Reset a directory so that it exists and is empty.

    Whatever is already present at the path is removed with ``shutil.rmtree``
    first; the directory (including missing parents) is then created again.

    output: str, path of the directory to (re)create
    return: None
    """
    already_present = os.path.exists(output)
    if already_present:
        # start from a clean slate: drop the results of any previous run
        shutil.rmtree(output)
    os.makedirs(output)

## 3.2 Set Up the Directories

### Upload the PDB protein structure file for the target protein

Uses `4wqp_monomer.pdb` from the GitHub repo, as the parent crystal structure of the protein-ligand complex [PDB: 4WQP](https://www.rcsb.org/structure/4WQP) is a dimer.

https://github.com/bf-nv/bionemo_tutorials/

In [None]:
protein_dir = "/content/output/proteinfolding_result"
prepare_output_directory(protein_dir)

# download the ground truth pdb "4wqp_monomer.pdb"
!wget https://raw.githubusercontent.com/bf-nv/bionemo_tutorials/refs/heads/main/4wqp_monomer.pdb

uploaded = '/content/4wqp_monomer.pdb' # true crystal structure

In [8]:
# Move the uploaded file(s) to the target folder.
# `shutil.move` is used instead of a shell `mv` so that a failed move raises
# an exception and filenames never pass through shell quoting.
for filename in uploaded.keys():
    shutil.move(filename, os.path.join(protein_dir, filename))

In [9]:
# file path of the target protein structure inside `protein_dir`
# (built from the first filename key of `uploaded`)
protein_file_path = os.path.join(protein_dir, list(uploaded.keys())[0])
print(protein_file_path)

/content/output/esmfold_result/predicted_protein (1).pdb


### Upload ligand files

In [None]:
# Directory that will hold the ligand SDF files; any existing copy is deleted and recreated empty
ligand_dir = "/content/output/ligandgeneration_result"
prepare_output_directory(ligand_dir)

### Upload `clean_mols.zip`, which was downloaded at the end of the `Step 2` IPYNB

In [11]:
# choose to upload `clean_mols.zip` which was downloaded at the end of the `Step 2` IPYNB
# NOTE: this rebinds `uploaded`; the following cell reads the archive name from its keys
uploaded = files.upload()

Saving clean_mols.zip to clean_mols.zip


In [12]:
# Name of the uploaded archive (first key of the `uploaded` mapping)
zip_filename = list(uploaded.keys())[0]
print(zip_filename)

clean_mols.zip


In [13]:
# Unpack every member of the uploaded archive into the ligand directory
with zipfile.ZipFile(zip_filename, 'r') as zip_ref:
    zip_ref.extractall(ligand_dir)

In [14]:
# List the extracted ligand files as a quick sanity check
!ls {ligand_dir}

molecule_0.sdf	molecule_1.sdf	molecule_2.sdf	molecule_3.sdf	molecule_4.sdf


### Set up `output` directory for `DiffDock` results

In [15]:
# Directory for the DiffDock results; any existing copy is deleted and recreated empty
docking_dir = "/content/output/diffdock_result"
prepare_output_directory(docking_dir)

## 3.3 Predict the Ligand Poses with `DiffDock`

### Load ligands

In [16]:
# Collect the ligand SDF files found in `ligand_dir`, ordered by the integer that
# follows the underscore in each filename (molecule_0, molecule_1, molecule_2 ....),
# and prefix each one with the directory path
sdf_files = [
    os.path.join(ligand_dir, entry)
    for entry in sorted(
        (item for item in os.listdir(ligand_dir) if item.endswith(".sdf")),
        key=lambda item: int(item.split("_")[1].split(".")[0]),
    )
]

# Ligand name = filename up to its first "."
ligand_names = [os.path.basename(path).split(".")[0] for path in sdf_files]

print(sdf_files)
print(ligand_names)

['/content/output/molmim_result/molecule_0.sdf', '/content/output/molmim_result/molecule_1.sdf', '/content/output/molmim_result/molecule_2.sdf', '/content/output/molmim_result/molecule_3.sdf', '/content/output/molmim_result/molecule_4.sdf']
['molecule_0', 'molecule_1', 'molecule_2', 'molecule_3', 'molecule_4']


### For demo purposes, we'll only use the first 5 ligands

In [None]:
# Upper bound on how many ligands are docked; the slices below simply keep
# everything when fewer files are available
num_ligands = 5

sdf_files = sdf_files[:num_ligands]
ligand_names = ligand_names[:num_ligands]
print(sdf_files)
print(ligand_names)

['/content/output/molmim_result/molecule_0.sdf', '/content/output/molmim_result/molecule_1.sdf', '/content/output/molmim_result/molecule_2.sdf', '/content/output/molmim_result/molecule_3.sdf', '/content/output/molmim_result/molecule_4.sdf']
['molecule_0', 'molecule_1', 'molecule_2', 'molecule_3', 'molecule_4']


### Set up `API_KEY`

In [18]:
# Read the key from the Colab user-data entry named 'API_KEY' instead of hard-coding it in the notebook
API_KEY = userdata.get('API_KEY')
# Value of the HTTP Authorization header sent with the API requests below
header_auth = f"Bearer {API_KEY}"

### Upload the target protein

In [19]:
# get asset-uploading URL & upload the asset
def _upload_asset(input):
    """
    Upload file content as an NVCF asset and return its asset ID.

    Two requests are made: a POST to the NVCF assets endpoint, whose JSON
    response provides an upload URL and an asset ID, followed by a PUT of the
    content to that upload URL. The Authorization header value is read from
    the notebook-level `header_auth` variable.

    input: str or bytes, content of the file to upload (e.g. PDB or SDF text)
    return: the `assetId` value from the assets endpoint's response
    raises: requests.HTTPError if either request returns an error status
    """
    assets_url = "https://api.nvcf.nvidia.com/v2/nvcf/assets"

    headers = {
        "Authorization": header_auth,
        "Content-Type": "application/json",
        "accept": "application/json",
    }

    # headers for the PUT to the upload URL; description and content type
    # repeat the values declared in the payload below
    s3_headers = {
        "x-amz-meta-nvcf-asset-description": "diffdock-file",
        "content-type": "text/plain",
    }

    payload = {
        "contentType": "text/plain",
        "description": "diffdock-file"
    }

    response = requests.post(
        assets_url, headers=headers, json=payload, timeout=30
    )

    response.raise_for_status()

    # parse the response body once and pull both fields out of it
    asset_info = response.json()
    asset_url = asset_info["uploadUrl"]
    asset_id = asset_info["assetId"]

    # Send explicit UTF-8 bytes for text content so the uploaded bytes do not
    # depend on how the HTTP client implicitly encodes a `str` body.
    body = input.encode("utf-8") if isinstance(input, str) else input

    response = requests.put(
        asset_url,
        data=body,
        headers=s3_headers,
        timeout=300,
    )

    response.raise_for_status()
    return asset_id

In [20]:
invoke_url = "https://health.api.nvidia.com/v1/biology/mit/diffdock"

# Read the target protein structure, then register its text as an asset
with open(protein_file_path, "r") as file:
    pdb_content = file.read()
protein_id = _upload_asset(pdb_content)
print(protein_id)

37c233d5-0034-4d0a-8b0e-040e21190d9f


### Main loop to iterate over all ligands to generate docking poses

In [58]:
# Iterate over SDF files for each ligand: upload the ligand, run DiffDock on it,
# then store the raw response and one SDF file per returned pose
for ligand_file_path, ligand_name in zip(sdf_files, ligand_names):
    print(f"************ {ligand_name} ****************")
    # get asset-uploading URL & upload assets for ligand
    # (the file is closed before the network call starts)
    with open(ligand_file_path, "r") as file:
        sdf_content = file.read()
    ligand_id = _upload_asset(sdf_content)
    print(f"ligand_id: {ligand_id}")

    # DiffDock inference
    # the asset-reference header lists both asset IDs used in the payload
    headers = {
        "Content-Type": "application/json",
        "NVCF-INPUT-ASSET-REFERENCES": ",".join([protein_id, ligand_id]),
        "Authorization": header_auth
    }

    # NOTE(review): `is_staged` presumably tells the service that `protein` and
    # `ligand` are asset IDs rather than inline file content -- confirm against the API docs
    payload = {
        "ligand": ligand_id,
        "ligand_file_type": "sdf",
        "protein": protein_id,
        "num_poses": 5,
        "time_divisions": 20,
        "steps": 18,
        "save_trajectory": False,
        "is_staged": True
    }

    start = time.time()
    # a timeout keeps a stalled request from blocking the notebook indefinitely
    # (same upper bound as the asset upload in `_upload_asset`)
    response = requests.post(invoke_url, headers=headers, json=payload, timeout=300)
    end = time.time()
    logger.debug(f"{ligand_name} took {end - start:.2f} seconds")

    response.raise_for_status()

    result = response.json()

    # save result to output.json (the per-ligand directory is recreated empty first)
    docking_ligand_dir = os.path.join(docking_dir, ligand_name)
    prepare_output_directory(docking_ligand_dir)
    with open(f"{docking_ligand_dir}/output.json", "w") as f:
        json.dump(result, f)

    # save ligand atomic positions: one SDF per pose, with the pose index and
    # its confidence encoded in the filename
    for i, ligand_geometry in enumerate(result["ligand_positions"]):
        with open("{}/pose_{}_confidence_{:.2f}.sdf".format(docking_ligand_dir, i, result["position_confidence"][i]), "w") as f:
            f.write(ligand_geometry)

************ molecule_0 ****************
ligand_id: abf3dd29-5431-4fb0-8099-28e3c9ad135a


[32m2025-05-23 23:11:28.959[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<cell line: 0>[0m:[36m31[0m - [34m[1mmolecule_0 took 2.29 seconds[0m


************ molecule_1 ****************
ligand_id: 962723a4-a6e4-4524-a6e2-4fcdde795670


[32m2025-05-23 23:11:31.616[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<cell line: 0>[0m:[36m31[0m - [34m[1mmolecule_1 took 2.30 seconds[0m


************ molecule_2 ****************
ligand_id: 08db5a5d-0f0d-49cb-b5c7-bf7bb2458375


[32m2025-05-23 23:11:33.892[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<cell line: 0>[0m:[36m31[0m - [34m[1mmolecule_2 took 1.93 seconds[0m


************ molecule_3 ****************
ligand_id: 28803958-8e3c-48d6-9484-00c55d711cd8


[32m2025-05-23 23:11:36.617[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<cell line: 0>[0m:[36m31[0m - [34m[1mmolecule_3 took 2.36 seconds[0m


************ molecule_4 ****************
ligand_id: 00cc8096-8633-4861-8b89-865fb33e73bb


[32m2025-05-23 23:11:39.376[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<cell line: 0>[0m:[36m31[0m - [34m[1mmolecule_4 took 2.39 seconds[0m


## 3.4 Visualize the Docking Poses

Only visualizing **one** ligand, multiple poses, for demonstration purposes

In [22]:
# assume we select molecule_0
# NOTE: this rebinds `ligand_name` / `docking_ligand_dir`, replacing the values
# left over from the last iteration of the docking loop
ligand_name = "molecule_0"

docking_ligand_dir = os.path.join(docking_dir, ligand_name)
# take a look at the JSON output file
with open(f"{docking_ligand_dir}/output.json", "r") as f:
    result = json.load(f)
# show the top-level fields of the saved response
result.keys()

dict_keys(['trajectory', 'ligand_positions', 'position_confidence', 'status', 'protein', 'ligand'])

### Fields in the results

- `trajectory`: diffusion trajectory (empty unless `save_trajectory` is set to `True`)
- `ligand_positions`: a list of docking poses
- `position_confidence`: a list of confidence scores, one for each docking pose
- `protein`: input protein
- `ligand`: input ligand

The confidence score is the logit of the probability that the docked pose has an RMSD < 2Å compared to the ground truth. The interpretation of the confidence score (c) is based on the guideline provided by the [DiffDock authors](https://github.com/gcorso/DiffDock?tab=readme-ov-file#faq--).
```
c > 0 : high confidence
-1.5 < c < 0: moderate confidence
c < -1.5: low confidence
```

### Visualize docking poses and [confidence scores](https://github.com/gcorso/DiffDock#faq--)

In [23]:
# define a function for color definitions for visualization
def ansi_color(text, color):
    """Wrap text in the ANSI escape code of the named color, followed by the reset code.

    text: value to colorize (formatted into the result with an f-string)
    color: str, one of the names defined in the table below
    return: str, the colorized text
    raises: KeyError if `color` is not a known name
    """
    escape_codes = dict(
        red="\033[31m",
        green="\033[32m",
        yellow="\033[33m",
        blue="\033[34m",
        magenta="\033[35m",
        cyan="\033[36m",
        white="\033[37m",
        reset="\033[0m",
    )
    start, end = escape_codes[color], escape_codes["reset"]
    return f"{start}{text}{end}"

In [24]:
# load docking poses from the output SDF files extracted from the output.json 'ligand_positions' field
def _pose_order_key(path):
    """Sort key: the integer after the first underscore of the filename; other names sort last, by name."""
    name = os.path.basename(path)
    try:
        return (0, int(name.split("_")[1]), name)
    except (IndexError, ValueError):
        return (1, 0, name)


def load_poses_from_sdf(directory):
    """
    Load the molecules stored in the ``*.sdf`` files of a directory.

    ``glob.glob`` returns files in arbitrary order, so the files are sorted by
    the pose index in their name (``pose_<i>_confidence_<c>.sdf``). The returned
    list then follows the same order in which the poses were written, which is
    what lets callers pair ``poses[i]`` with the i-th confidence score.

    directory: str, directory containing the pose SDF files
    return: list of the molecules read by ``Chem.SDMolSupplier``, in pose order;
            entries the supplier yields as ``None`` are skipped
    """
    sdf_files = sorted(glob.glob(f"{directory}/*.sdf"), key=_pose_order_key)
    poses = []

    for sdf_file in sdf_files:
        supplier = Chem.SDMolSupplier(sdf_file)
        # NOTE(review): a skipped (None) molecule shifts the following poses by
        # one position relative to any per-pose list kept elsewhere
        poses.extend(mol for mol in supplier if mol is not None)
    return poses

In [54]:
# visualize the docking poses in an interactive manner, browsing docked poses using an embedded slider
def update_viewer(pose_index):
    """
    Show the protein together with one docked pose and print that pose's confidence score.

    Reads the notebook-level variables `protein_pdb`, `poses`,
    `confidence_scores` and `ligand_name`; it is used as the callback of the
    pose slider.

    pose_index: int, index into `poses` and `confidence_scores`
    return: the value returned by `view.update()`
    """
    view = py3Dmol.view(width=900, height=600)

    # Add the protein model
    view.addModel(protein_pdb, 'pdb')
    view.setStyle({'model': 0}, {'cartoon': {'color': 'white', 'opacity': 0.7}})
    Prot=view.getModel()
    Prot.setStyle({'cartoon':{'arrows':True, 'style':'oval', 'color':'white'}})
    view.addSurface(py3Dmol.VDW,{'opacity':0.5,'color':'white'})

    # Add the selected docking pose
    pose = poses[pose_index]
    pose_block = Chem.MolToMolBlock(pose)
    view.addModel(pose_block, 'mol')
    view.setStyle({'model': 1}, {'stick': {'radius': 0.35, 'colorscheme': 'magentaCarbon'}})
    view.addSurface(py3Dmol.VDW, {'opacity': 0.8, 'colorscheme': 'magentaCarbon'}, {'model': 1})

    # color the printed score by confidence band: > 0 green, >= -1.5 blue, otherwise red
    # (the band is chosen from the score rounded to 3 decimals; the full value is printed)
    score = round(confidence_scores[pose_index], 3)
    score_color = "green" if score > 0 else "blue" if score >= -1.5 else "red"
    print(f"Loaded {ansi_color(ligand_name, 'magenta')} with confidence score: {ansi_color(confidence_scores[pose_index], score_color)}")
    view.zoomTo()
    return view.update()


In [55]:
# Load the protein model (PDB text, read later by `update_viewer`)
with open(protein_file_path, 'r') as f:
    protein_pdb = f.read()

# Specify the directory containing the dock poses in SDF format for a specific ligand
# (`docking_ligand_dir` is the directory of the ligand selected above)
poses = load_poses_from_sdf(docking_ligand_dir)

# Verify the number of poses loaded
print(f"Number of poses loaded: {len(poses)}")

Number of poses loaded: 3


In [56]:
# Load confidence scores from output.json
# (`update_viewer` indexes this list with the same index it uses for `poses`)
output_json_path = os.path.join(docking_ligand_dir, 'output.json')
with open(output_json_path, 'r') as file:
    data = json.load(file)
    confidence_scores = data['position_confidence']  # list of floats
print(confidence_scores)

[-2.6998205184936523, -2.776026725769043, -3.244274616241455]


In [57]:
# Create a slider widget with one position per loaded pose (0 .. len(poses) - 1)
# NOTE(review): `max` becomes -1 when no poses were loaded -- confirm that at
# least one pose exists before building the slider
pose_slider = widgets.IntSlider(
    value=0,
    min=0,
    max=len(poses) - 1,
    step=1,
    description='Pose:',
    continuous_update=False,
    orientation='horizontal',
    readout=True,
    readout_format='d'
)

# Link the slider to the viewer update function: the slider value is passed as `pose_index`
widgets.interact(update_viewer, pose_index=pose_slider)

interactive(children=(IntSlider(value=0, continuous_update=False, description='Pose:', max=2), Output()), _dom…