# AF metrics: File name parsing, template dependent metrics
Created 04.04.2025 by Andreas B

This script takes structure files and computes various metrics from them

In [1]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.axes._axes import Axes
from matplotlib.figure import Figure
from pathlib import Path
from sklearn.metrics import roc_curve, roc_auc_score
import re
import tempfile
import shutil
import os
import subprocess
import sys
stdout, stderr = sys.stdout, sys.stderr
from typing import Literal

import pymol
from Bio.SeqUtils import seq1
from Bio.PDB import PDBParser
from Bio.PDB.Structure import Structure as BioPy_PDBStructure
from Bio.PDB.Model import Model as BioPy_PDBModel
from Bio.PDB.Chain import Chain
from Bio.PDB.PDBExceptions import PDBConstructionException
parser = PDBParser(QUIET=True)

class bcolors:
    """ANSI escape sequences used to colour / style the text printed by this notebook.

    Put one of the codes in front of a string and append ``ENDC`` to reset the style again.
    """
    # Colours
    HEADER = "\033[95m"
    OKBLUE = "\033[94m"
    OKCYAN = "\033[96m"
    OKGREEN = "\033[92m"
    WARNING = "\033[93m"
    FAIL = "\033[91m"
    # Reset all attributes
    ENDC = "\033[0m"
    # Text attributes
    BOLD = "\033[1m"
    UNDERLINE = "\033[4m"

In [2]:
# Pymol debugging
# Visualising the structures in pymol is useful while debugging.
# To do so, uncomment the following line before any structure is loaded.

#pymol.finish_launching()

# Point the standard streams back to the objects saved right after importing sys
sys.stdout, sys.stderr = stdout, stderr

### 0 Imports and Settings

In [3]:
# Settings

# AlphaFold version whose output is parsed
af_mode: Literal["AF2", "AF3"] = "AF2"

# If set to true, load the previous dataframe
load_previous = True

# Resource folder containing the structures and the metadata tables
path_resources = Path(r"D:\Eigene Datein\dev\Uni\JGU Bio Bachelorthesis\Daten\resources")

# Local folders inside the resource folder
path_AF = path_resources / af_mode
path_solved = path_resources / "solved"

# Luck Drive folder (used for the ipSAE metric to get the json file).
# In AF3 mode the "AlphaFold3" sub-folder is used.
path_AF_luck_drive = Path(r"L:\imb-luckgr2\projects\AlphaFold")
if af_mode == "AF3":
    path_AF_luck_drive /= "AlphaFold3"

# Location of ipsae.py
path_ipsae_script = Path("../code ressources/ipsae.py")



In [6]:
def enhance_dataframe():
    """Normalise the dtypes and the column order of the global ``dataAF`` data frame.

    * Casts the count-like columns to the nullable integer dtype ``Int64``.
    * Moves the known columns into a fixed order, which depends on ``af_mode``.

    Columns that are not (yet) part of the frame are reported and skipped, and the
    ordering continues behind the last column that is present.
    The function rebinds the module-level ``dataAF`` and returns nothing.
    """
    global dataAF

    # Nullable integers keep missing values without turning the column into floats
    int_columns = ["chainA_start", "chainA_end", "chainB_start", "chainB_end", "num_mutations",
                   "num_align_atoms_domain", "num_align_resi_domain", "hbonds", "salt_bridges",
                   "hydrophobic_interactions"]
    for column in int_columns:
        if column not in dataAF.columns:
            print(f"Column {bcolors.FAIL}{column}{bcolors.ENDC} not (yet) in data frame")
            continue
        dataAF[column] = dataAF[column].astype(pd.Int64Dtype())

    def _reorder_column(c: list[str], column: str, prev_column: str = None, index: int = None):
        """Move ``column`` inside the list ``c`` to ``index`` or, if no index is given, directly behind ``prev_column``."""
        if column not in c:
            print(f"Column {bcolors.FAIL}{column}{bcolors.ENDC} not (yet) in data frame")
            return
        if index is None and prev_column not in c:
            # Report the anchor column, since that is the one which is missing
            print(f"Column {bcolors.FAIL}{prev_column}{bcolors.ENDC} (used for sorting) not (yet) in data frame")
            return
        # Remove first: otherwise the anchor index is off by one whenever the column sits before its anchor
        c.remove(column)
        if index is None:
            index = c.index(prev_column) + 1
        c.insert(index, column)

    def _reorder_chain(c: list[str], anchor: str, chain: list[str]):
        """Place the columns of ``chain`` one after another behind ``anchor``, skipping columns that are absent."""
        for column in chain:
            if column not in c:
                print(f"Column {bcolors.FAIL}{column}{bcolors.ENDC} not (yet) in data frame")
                continue
            _reorder_column(c, column, prev_column=anchor)
            anchor = column

    # Reordering of the columns
    c = list(dataAF.columns)
    if af_mode == "AF2":
        _reorder_column(c, "run_id", index=1)
        _reorder_column(c, "benchmark_set", index=2)
    elif af_mode == "AF3":
        _reorder_column(c, "benchmark_set", index=1)

    # Identifier and file name derived columns
    identifier_chain = ["prediction_name", "model_id"]
    if af_mode == "AF3":
        identifier_chain.append("ranking_score")
    identifier_chain += ["chainA_length", "chainB_length", "chainA_id", "chainB_id",
                         "chainA_start", "chainA_end", "chainB_start", "chainB_end",
                         "PDB_id", "ELM_instance", "DDI_pfam_id",
                         "PDB_id_random_paired", "ELM_instance_random_paired", "DDI_pfam_id_random_paired",
                         "sequence_initial", "sequence_mutated", "num_mutations"]
    _reorder_chain(c, "benchmark_set", identifier_chain)

    # Alignment and RMSD columns
    _reorder_chain(c, "num_atom_atom_contact",
                   ["align_score_domain", "num_align_atoms_domain", "num_align_resi_domain", "RMSD_domain",
                    "RMSD_backbone_peptide", "RMSD_all_atom_peptide", "RMSD_all_atom"])

    # Interface columns
    _reorder_chain(c, "Fnonnat",
                   ["buried_area", "min_distance", "disulfide_bonds", "salt_bridges", "hbonds",
                    "hydrophobic_interactions"])

    dataAF = dataAF[c]


In [7]:
# Load
if load_previous:
    # Load the previous data frame
    dataAF = pd.read_csv(path_resources / af_mode / f"{af_mode}_metrics.tsv", sep="\t")
elif af_mode == "AF2":
    # Read in the AF data
    dataAF = pd.read_csv(path_AF / "AF_metrics_all_structures.tsv", sep="\t")
    # Drop columns to recalculate them
    columns_to_recalculate = ["RMSD_domain", "num_align_atoms_domain", "align_score_domain", "num_align_resi_domain", "RMSD_backbone_peptide", "RMSD_all_atom_peptide", "known_motif_plddt", "DockQ", "iRMS", "LRMS", "Fnonnat", "label"]
    dataAF = dataAF.drop(columns=columns_to_recalculate)

    # Adding benchmark set column. The labels "1" and "2" additionally give the number of mutations
    benchmark_set_replace_dict = {"1": "mutations_DMI", "2" : "mutations_DMI", "approved minimal DDI": "known_DDI", "known minimal": "known_DMI", "random minimal": "random_DMI", "random minimal DDI": "random_DDI"}
    dataAF["benchmark_set"] = None
    dataAF["num_mutations"] = None

    for i, row in dataAF.iterrows():
        mutation_label = row["num_mutation_in_motif"]
        if mutation_label in ("1", "2"):
            dataAF.at[i, "num_mutations"] = int(mutation_label)
        benchmark_set = benchmark_set_replace_dict[mutation_label]
        dataAF.at[i, "benchmark_set"] = benchmark_set
    dataAF = dataAF.drop(columns=["num_mutation_in_motif"])
elif af_mode == "AF3":
    dataAF = pd.read_csv(path_AF / "AF3_output.tsv", sep="\t")

    # Translate the benchmark set names of the input table
    benchmark_set_replace_dict = {"mutations": "mutations_DMI", "known_minimal": "known_DMI", "known_ddi": "known_DDI", "random_minimal": "random_DMI", "random_ddi": "random_DDI"}

    for i, row in dataAF.iterrows():
        benchmark_set = benchmark_set_replace_dict[row["benchmark_set"]]
        dataAF.at[i, "benchmark_set"] = benchmark_set

enhance_dataframe()
display(dataAF)

Column [91mdisulfide_bonds[0m not (yet) in data frame
Column [91msalt_bridges[0m (used for sorting) not (yet) in data frame


Unnamed: 0,project_name,run_id,benchmark_set,prediction_name,model_id,chainA_length,chainB_length,chainA_id,chainB_id,chainA_start,...,RMSD_all_atom,DockQ,iRMSD,LRMSD,Fnonnat,buried_area,min_distance,salt_bridges,hbonds,hydrophobic_interactions
0,AlphaFold_benchmark,run37,known_DMI,DEG_APCC_KENBOX_2_4GGD,ranked_0,312,5,A,B,165,...,0.976244,0.878344,0.603831,1.575394,0.086957,662.104,6.072,0,10,3
1,AlphaFold_benchmark,run37,known_DMI,DEG_APCC_KENBOX_2_4GGD,ranked_1,312,5,A,B,165,...,0.979658,0.880716,0.418230,1.100588,0.050000,613.651,6.063,0,9,0
2,AlphaFold_benchmark,run37,known_DMI,DEG_APCC_KENBOX_2_4GGD,ranked_2,312,5,A,B,165,...,0.969753,0.883186,0.641834,1.776257,0.185185,323.304,5.092,0,2,9
3,AlphaFold_benchmark,run37,known_DMI,DEG_APCC_KENBOX_2_4GGD,ranked_3,312,5,A,B,165,...,1.260424,0.475511,1.686332,5.358800,0.363636,853.680,5.658,11,12,6
4,AlphaFold_benchmark,run37,known_DMI,DEG_APCC_KENBOX_2_4GGD,ranked_4,312,5,A,B,165,...,1.759591,0.223400,2.928606,9.908745,0.888889,851.771,5.691,9,13,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3175,AlphaFold_benchmark_DDI,run6,random_DDI,D1PF18773_PF00071_2X19.D2PF00009_PF01873_2D74,ranked_0,60,113,B,B,392,...,5.341712,,,,,,,,,
3176,AlphaFold_benchmark_DDI,run6,random_DDI,D1PF18773_PF00071_2X19.D2PF00009_PF01873_2D74,ranked_1,60,113,B,B,392,...,5.975135,,,,,,,,,
3177,AlphaFold_benchmark_DDI,run6,random_DDI,D1PF18773_PF00071_2X19.D2PF00009_PF01873_2D74,ranked_2,60,113,B,B,392,...,5.962444,,,,,,,,,
3178,AlphaFold_benchmark_DDI,run6,random_DDI,D1PF18773_PF00071_2X19.D2PF00009_PF01873_2D74,ranked_3,60,113,B,B,392,...,5.901696,,,,,,,,,


In [8]:
# Read in solved structure data
# The rows are collected in a list and converted to a data frame once instead of growing the frame row by row.

solved_records = []

# DMI: the PDB ID is the part of the file name before the first underscore; the chain IDs are set to A and B
for structure_file in (path_solved / "DMI").iterdir():
    if not (structure_file.is_file() and structure_file.suffix == ".pdb"):
        continue
    pdb_id = structure_file.name.split("_")[0]
    solved_records.append({"set": "DMI", "PDB_id": pdb_id, "path": structure_file.relative_to(path_solved), "chainA_id": "A", "chainB_id": "B"})

# DDI: the file name is split at the underscores into two Pfam parts, the PDB ID and the two chain letters
for structure_file in (path_solved / "DDI").iterdir():
    if not (structure_file.is_file() and structure_file.suffix == ".pdb"):
        continue
    name_parts = structure_file.name.split("_")
    ddi_pfam_id = "_".join(name_parts[0:2])
    pdb_id = name_parts[2]
    # First two characters of the fourth part are the chain letters
    chainA_id, chainB_id = name_parts[3][0], name_parts[3][1]
    solved_records.append({"set": "DDI", "PDB_id": pdb_id, "DDI_pfam_id": ddi_pfam_id, "path": structure_file.relative_to(path_solved), "chainA_id": chainA_id, "chainB_id": chainB_id})

# Keys missing in a record (DDI_pfam_id for DMI) become NaN
dataSolved = pd.DataFrame(solved_records, columns=["set", "PDB_id", "DDI_pfam_id", "path", "chainA_id", "chainB_id"])

display(dataSolved)

Unnamed: 0,set,PDB_id,DDI_pfam_id,path,chainA_id,chainB_id
0,DMI,1ATP,,DMI\1ATP_min_DMI.pdb,A,B
1,DMI,1AXC,,DMI\1AXC_min_DMI.pdb,A,B
2,DMI,1B72,,DMI\1B72_min_DMI.pdb,A,B
3,DMI,1B8Q,,DMI\1B8Q_min_DMI.pdb,A,B
4,DMI,1BXX,,DMI\1BXX_min_DMI.pdb,A,B
...,...,...,...,...,...,...
183,DDI,3ZNI,PF14447_PF00179,DDI\PF14447_PF00179_3ZNI_AC.pdb,A,C
184,DDI,3J7Y,PF14978_PF00327,DDI\PF14978_PF00327_3J7Y_oZ.pdb,o,Z
185,DDI,6D6Q,PF15985_PF10175,DDI\PF15985_PF10175_6D6Q_GL.pdb,G,L
186,DDI,3KZ1,PF17838_PF00071,DDI\PF17838_PF00071_3KZ1_BE.pdb,B,E


### 1 Parsing the file names
A lot of information (PDB ID, mutation sequence, ...) is included in the filename. This section parses it and adds it to the metrics data frame. The detected values include:
* **PDB_id**: Included in all structures
* **ELM_instance**: Included in DMI structures
* **PDB_id_random_paired** and **ELM_instance_random_paired**: Only included in the randomly paired benchmark sets: random_minimal (both) and random_ddi (only PDB ID)
* **sequence_initial** and **sequence_mutated**: Included in the mutations benchmark set (DMI)
* **chainA_id** and **chainB_id**: The chain IDs in the solved structure file. For DMI it is always A and B. For known_ddi the ids are included in the filename, which are also used for random_ddi.
* **ddi_pfam_id**: Included in DDI structures
* **ddi_pfam_id_random_paired**: Included in the random_ddi benchmark set
* **chainA_id**, **chainA_start**, **chainA_end** and the same three for **chainB**: For DDI structures, the chain ID as well as start and end of the selection are included in the filename. For DMI they will be added later

Note: known_extensions were excluded earlier, but if you need to parse them remove the comments in the code cell below

In [None]:
# Regex checks on filename
# The separator between the two partners is a literal dot and is therefore escaped in every pattern.

# <ELM instance>_<PDB ID>
regex_paired_DMI = r"^([\w\-]+)_(\w{4})$"
# M<ELM instance>_<PDB ID>.D<ELM instance>_<PDB ID>
regex_random_DMI = r"^M([\w\-]+)_(\w{4})\.D([\w\-]+)_(\w{4})$"
# <ELM instance>_<PDB ID>_<sequence>.<second sequence>
regex_mutated_DMI = r"^([\w\-]+)_(\w{4})_(\w+)\.([A-Za-z]+)$"
# <ELM instance>_<motif extension>_<domain extension>
regex_known_extension_DMI = r"^([\w-]+)_((Mmin)|(MFL)|(M[\d]+_M[\d]+))_((DFL)|(Dmin)|(D[\d]+_D[\d]+))$"
# <Pfam>_<Pfam>_<PDB ID>_<chain>_resi<start>_resi<end>.<chain>_resi<start>_resi<end>
regex_ddi_known = r"^([^\W_]+_[^\W_]+)_(\w{4})_(\w+)_resi(\d+)_resi(\d+)\.(\w+)_resi(\d+)_resi(\d+)$"
# D1<Pfam>_<Pfam>_<PDB ID>.D2<Pfam>_<Pfam>_<PDB ID>
regex_ddi_random = r"^D1([^\W_]+_[^\W_]+)_(\w{4})\.D2([^\W_]+_[^\W_]+)_(\w{4})$"


# (Re-)create the columns which are filled from the file name
for new_column in ["PDB_id", "ELM_instance", "DDI_pfam_id", "PDB_id_random_paired", "ELM_instance_random_paired",
                   "DDI_pfam_id_random_paired", "sequence_initial", "sequence_mutated",
                   "chainA_id", "chainB_id", "chainA_start", "chainA_end", "chainB_start", "chainB_end"]:
    dataAF[new_column] = None
# known_extensions have not been run. Therefore exclude them here but keep the code for them
#dataAF["known_extension_motif"] = None 
#dataAF["known_extension_domain"] = None

for i, row in dataAF.iterrows():
    prediction_name = row["prediction_name"]
    # Everything that the file name of this benchmark set does not provide stays None
    pdb_id, pdb_id_2, elm_instance, elm_instance_2, sequence, sequence_f = None, None, None, None, None, None
    known_extensionM, known_extensionD, chain1_letter, chain2_letter, ddi_pfam_id, ddi_pfam_id_random_paired = None, None, None, None, None, None
    c1_start, c1_end, c2_start, c2_end = None, None, None, None

    # A name that does not match its pattern leaves all values at None; those rows are listed at the end of the cell
    if (benchmark_set := row["benchmark_set"]) == "known_DMI":
        if (r := re.search(regex_paired_DMI, prediction_name)) is not None:
            elm_instance, pdb_id = r.groups()
            chain1_letter, chain2_letter = "A", "B"
    elif benchmark_set == "random_DMI":
        if (r := re.search(regex_random_DMI, prediction_name)) is not None:
            # Counterintuitive, but here the motif comes before the dot and the domain after it
            elm_instance_2, pdb_id_2, elm_instance, pdb_id = r.groups()
            chain1_letter, chain2_letter = "A", "B"
    elif benchmark_set == "mutations_DMI":
        if (r := re.search(regex_mutated_DMI, prediction_name)) is not None:
            elm_instance, pdb_id, sequence, sequence_f = r.groups()
            chain1_letter, chain2_letter = "A", "B"

            if af_mode == "AF3":
                # Count the positions at which the two sequences differ
                assert len(sequence) == len(sequence_f), f"Sequences of {prediction_name} differ in length"
                num_mutations = sum(1 for res_initial, res_mutated in zip(sequence, sequence_f) if res_initial != res_mutated)
                dataAF.at[i, "num_mutations"] = num_mutations
    #elif benchmark_set == "known_extension":
    #    if (r := re.search(regex_known_extension_DMI, row["prediction_name"])) is not None and len(r.groups()) == 9:
    #        elm_instance = r.groups()[0]
    #        known_extensionM = r.groups()[1]
    #        known_extensionD = r.groups()[5]
    elif benchmark_set == "known_DDI":
        if (r := re.search(regex_ddi_known, prediction_name)) is not None:
            ddi_pfam_id, pdb_id, chain1_letter, c1_start, c1_end, chain2_letter, c2_start, c2_end = r.groups()

            # Normalise the PDB ID to upper case, also inside the prediction name
            if pdb_id != pdb_id.upper():
                pdb_id = pdb_id.upper()
                new_prediction_name = prediction_name[:r.span(2)[0]] + pdb_id + prediction_name[r.span(2)[1]:]
                print(f"Fixed prediction_name in set {benchmark_set} from {prediction_name} to {new_prediction_name}")
                dataAF.at[i, "prediction_name"] = new_prediction_name
    elif benchmark_set == "random_DDI":
        if (r := re.search(regex_ddi_random, prediction_name)) is not None:
            ddi_pfam_id, pdb_id, ddi_pfam_id_random_paired, pdb_id_2 = r.groups()
    else:
        # This branch is reached for an unexpected benchmark set, not for a failed regex
        raise RuntimeError(f"Unknown benchmark set {benchmark_set} for {prediction_name}")

    dataAF.at[i, "PDB_id"] = pdb_id
    dataAF.at[i, "PDB_id_random_paired"] = pdb_id_2
    dataAF.at[i, "ELM_instance"] = elm_instance
    dataAF.at[i, "ELM_instance_random_paired"] = elm_instance_2
    dataAF.at[i, "sequence_initial"] = sequence
    dataAF.at[i, "sequence_mutated"] = sequence_f
    #dataAF.at[i, "known_extension_motif"] =  known_extensionM
    #dataAF.at[i, "known_extension_domain"] =  known_extensionD
    dataAF.at[i, "chainA_id"] = chain1_letter
    dataAF.at[i, "chainB_id"] = chain2_letter
    dataAF.at[i, "DDI_pfam_id"] = ddi_pfam_id
    dataAF.at[i, "DDI_pfam_id_random_paired"] = ddi_pfam_id_random_paired
    # Only known_DDI names carry start/end; int(None) would raise a TypeError for all other rows
    dataAF.at[i, "chainA_start"] = int(c1_start) if c1_start is not None else None
    dataAF.at[i, "chainA_end"] = int(c1_end) if c1_end is not None else None
    dataAF.at[i, "chainB_start"] = int(c2_start) if c2_start is not None else None
    dataAF.at[i, "chainB_end"] = int(c2_end) if c2_end is not None else None

# The chain ids as well as start and end residues for the random DDI can be obtained from the known DDI
is_known_DDI = dataAF["benchmark_set"] == "known_DDI"
for i, row in dataAF[dataAF["benchmark_set"] == "random_DDI"].iterrows():
    prediction_name = row["prediction_name"]
    pdb_id, ddi_pfam_id = row["PDB_id"], row["DDI_pfam_id"]
    pdb_id_2, ddi_pfam_id_2 = row["PDB_id_random_paired"], row["DDI_pfam_id_random_paired"]

    # Known DDI rows of the first partner (source for chain A)
    _row1 = dataAF[is_known_DDI & (dataAF["PDB_id"] == pdb_id) & (dataAF["DDI_pfam_id"] == ddi_pfam_id)]
    if _row1.empty:
        print(f"Can't find {pdb_id} from {prediction_name} (random_ddi, chain A) in the known_DDI set")
        continue
    # Known DDI rows of the randomly paired partner (source for chain B)
    _row2 = dataAF[is_known_DDI & (dataAF["PDB_id"] == pdb_id_2) & (dataAF["DDI_pfam_id"] == ddi_pfam_id_2)]
    if _row2.empty:
        print(f"Can't find {pdb_id_2} from {prediction_name} (random_ddi, chain B) in the known_DDI set")
        continue

    # Take the values of the first matching row
    for column in ("chainA_id", "chainA_start", "chainA_end"):
        dataAF.at[i, column] = _row1[column].tolist()[0]
    for column in ("chainB_id", "chainB_start", "chainB_end"):
        dataAF.at[i, column] = _row2[column].tolist()[0]

print("\n", "Rows, where the regex failed")
display(dataAF[dataAF["PDB_id"].isna()])
display(dataAF)

Can't find 1G4B from D1PF07525_PF03931_3ZKJ.D2PF07724_PF00227_1G4B (random_ddi, chain B) in the known_DDI set
Can't find 1G4B from D1PF07525_PF03931_3ZKJ.D2PF07724_PF00227_1G4B (random_ddi, chain B) in the known_DDI set
Can't find 1G4B from D1PF07525_PF03931_3ZKJ.D2PF07724_PF00227_1G4B (random_ddi, chain B) in the known_DDI set
Can't find 1G4B from D1PF07525_PF03931_3ZKJ.D2PF07724_PF00227_1G4B (random_ddi, chain B) in the known_DDI set
Can't find 1G4B from D1PF07525_PF03931_3ZKJ.D2PF07724_PF00227_1G4B (random_ddi, chain B) in the known_DDI set
Can't find 1G4B from D1PF07724_PF00227_1G4B.D2PF08644_PF03531_4KHB (random_ddi, chain A) in the known_DDI set
Can't find 1G4B from D1PF07724_PF00227_1G4B.D2PF08644_PF03531_4KHB (random_ddi, chain A) in the known_DDI set
Can't find 1G4B from D1PF07724_PF00227_1G4B.D2PF08644_PF03531_4KHB (random_ddi, chain A) in the known_DDI set
Can't find 1G4B from D1PF07724_PF00227_1G4B.D2PF08644_PF03531_4KHB (random_ddi, chain A) in the known_DDI set
Can't find

Unnamed: 0,model_preset,benchmark_set,prediction_name,model_id,ranking_score,chainA_length,fraction_disordered,chainB_length,has_clash,iptm,...,DDI_pfam_id_random_paired,sequence_initial,sequence_mutated,chainA_id,chainB_id,chainA_start,chainA_end,chainB_start,chainB_end,num_mutations


Unnamed: 0,model_preset,benchmark_set,prediction_name,model_id,ranking_score,chainA_length,fraction_disordered,chainB_length,has_clash,iptm,...,DDI_pfam_id_random_paired,sequence_initial,sequence_mutated,chainA_id,chainB_id,chainA_start,chainA_end,chainB_start,chainB_end,num_mutations
0,alphafold3,known_DMI,DEG_APCC_KENBOX_2_4GGD,ranked_0,0.97,312,0.02,5,0.0,0.96,...,,,,A,B,,,,,
1,alphafold3,known_DMI,DEG_APCC_KENBOX_2_4GGD,ranked_1,0.97,312,0.02,5,0.0,0.96,...,,,,A,B,,,,,
2,alphafold3,known_DMI,DEG_APCC_KENBOX_2_4GGD,ranked_2,0.96,312,0.02,5,0.0,0.96,...,,,,A,B,,,,,
3,alphafold3,known_DMI,DEG_APCC_KENBOX_2_4GGD,ranked_3,0.96,312,0.02,5,0.0,0.95,...,,,,A,B,,,,,
4,alphafold3,known_DMI,DEG_APCC_KENBOX_2_4GGD,ranked_4,0.96,312,0.02,5,0.0,0.95,...,,,,A,B,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3175,alphafold3,random_DDI,D1PF18773_PF00071_2X19.D2PF00009_PF01873_2D74,ranked_0,0.36,60,0.22,113,0.0,0.19,...,PF00009_PF01873,,,B,B,392,451,21,133,
3176,alphafold3,random_DDI,D1PF18773_PF00071_2X19.D2PF00009_PF01873_2D74,ranked_1,0.23,60,0.08,113,0.0,0.12,...,PF00009_PF01873,,,B,B,392,451,21,133,
3177,alphafold3,random_DDI,D1PF18773_PF00071_2X19.D2PF00009_PF01873_2D74,ranked_2,0.22,60,0.14,113,0.0,0.07,...,PF00009_PF01873,,,B,B,392,451,21,133,
3178,alphafold3,random_DDI,D1PF18773_PF00071_2X19.D2PF00009_PF01873_2D74,ranked_3,0.21,60,0.07,113,0.0,0.10,...,PF00009_PF01873,,,B,B,392,451,21,133,


### 2 Adding domain and motif start / end from template file
While for the DDI structures selection start and end are included in the filename, for DMI structures there is absolutely no information about start/end of motif and domain. At least, the DMI structures are cut to only include the minimal domain/motif, but there still may be mutations or missing residues in experimental structures.

To restore this information, use the template and perform a simple search for three consecutive residues in both chains and calculate the offset between the residue IDs. Then take the most common offset; a warning is shown if fewer than 50 % of the AF residue triplets were matched this way

In [8]:
def align_sequences(chain_af:  Chain, chain_template: Chain) -> tuple[int, int, float, str, str]:
    """ Estimate the residue id offset between two chains based on a neighbour local alignment (BioPython has no convenient alignment function).

        Every triplet of consecutive residues of the AF chain is looked up in the template chain. Each hit votes
        for the id offset between the two middle residues, and the most frequent offset wins (the smallest one
        in case of a tie). Start and end are calculated as ``offset + 1`` and ``offset + number of AF residues``,
        i.e. presumably the AF chain is numbered consecutively starting at 1 (TODO confirm).

        :param chain_af: Chain of the predicted structure
        :param chain_template: Chain of the solved structure
        :returns tuple[int, int, float, str, str]: Start ID, End ID, score, Sequence Chain A, Sequence Chain B.
            The score is the fraction of AF triplets found in the template. If no triplet matches at all
            (this includes chains with fewer than three residues), Start ID and End ID are None and the score is 0.
    """
    residues_af = list(chain_af.get_residues())
    residues_tpl = list(chain_template.get_residues())
    seq_af = seq1(''.join(r.resname for r in residues_af))
    seq_tpl = seq1(''.join(r.resname for r in residues_tpl))

    # Index the template triplets once, so that each AF triplet is a dictionary lookup instead of a scan.
    # The ids are stored in template order; the collected offsets therefore equal those of a nested loop.
    template_triplets = {}
    for t0, t1, t2 in zip(residues_tpl, residues_tpl[1:], residues_tpl[2:]):
        template_triplets.setdefault((t0.resname, t1.resname, t2.resname), []).append(t1.id[1])

    offset_list = []
    misscounts = 0
    for a0, a1, a2 in zip(residues_af, residues_af[1:], residues_af[2:]):
        template_ids = template_triplets.get((a0.resname, a1.resname, a2.resname))
        if template_ids is None:
            misscounts += 1
            continue
        offset_list.extend(template_id - a1.id[1] for template_id in template_ids)

    if len(offset_list) == 0:
        return (None, None, 0, seq_af, seq_tpl)

    offsets, counts = np.unique(offset_list, return_counts=True)
    # Plain int instead of a numpy integer
    offset = int(offsets[np.argmax(counts)])
    # At least one triplet exists here, hence the divisor is >= 1
    score = 1 - misscounts/(len(residues_af) - 2)
    return  offset + 1, offset + len(residues_af), score, seq_af, seq_tpl

# Estimate start/end of both chains of every DMI prediction from the solved template structure(s).
# Details are printed for the ranked_0 model of each prediction only.
for i, row in dataAF[dataAF["benchmark_set"].isin(["known_DMI", "random_DMI", "mutations_DMI"])].iterrows():
    pdb_id = str(row["PDB_id"])
    # pd.notna covers None as well as NaN (the latter occurs when the table was read from a file)
    pdb_id_2 = None
    if pd.notna(row["PDB_id_random_paired"]):
        pdb_id_2 = str(row["PDB_id_random_paired"])
    prediction_name = row["prediction_name"]
    benchmark_set = row["benchmark_set"]
    model_id = row["model_id"]

    if model_id == "ranked_0":
        print(bcolors.OKBLUE + f"{prediction_name} ({benchmark_set})" + bcolors.ENDC)

    #if not prediction_name == "MLIG_MYND_2_2ODD.DMOD_SUMO_for_1_1KPS": continue

    af_path = path_AF / "DMI" / benchmark_set / prediction_name / (model_id + ".pdb")
    af_biopy = parser.get_structure("structure", file=af_path)[0]
    chainA_af = af_biopy["A"]
    chainB_af = af_biopy["B"]

    # Chain A is always compared with the first template
    template1_path = path_solved / "DMI" / (pdb_id + "_min_DMI.pdb")
    if not template1_path.exists():
        print("\t", bcolors.WARNING + f"{prediction_name} has no template file for {pdb_id}" + bcolors.ENDC)
        continue
    template1_biopy = parser.get_structure("structure", file=template1_path)[0]
    chainA_tlp = template1_biopy["A"]

    # Chain B comes from the randomly paired template if there is one
    if pdb_id_2 is not None:
        template2_path = path_solved / "DMI" / (pdb_id_2 + "_min_DMI.pdb")
        if not template2_path.exists():
            print("\t", bcolors.WARNING + f"{prediction_name} has no template file for {pdb_id_2}" + bcolors.ENDC)
            continue
        template2_biopy = parser.get_structure("structure", file=template2_path)[0]
        chainB_tlp = template2_biopy["B"]
    else:
        chainB_tlp = template1_biopy["B"]

    chainA_start, chainA_end, chainA_score, seqA_af, seqA_tpl = align_sequences(chain_af=chainA_af, chain_template=chainA_tlp)
    if chainA_start is not None:
        if model_id == "ranked_0":
            print("\t", f"chainA: {chainA_start}-{chainA_end} ({bcolors.WARNING if chainA_score < 0.5 else ''}{chainA_score:0.3f}{bcolors.ENDC})")
        dataAF.at[i, "chainA_start"] = chainA_start
        dataAF.at[i, "chainA_end"] = chainA_end
    else:
        if model_id == "ranked_0":
            print("\t", bcolors.WARNING + "Chain A alignment failed" + bcolors.ENDC)
    # Show both sequences for a weak or failed alignment (a failed one has the score 0)
    if model_id == "ranked_0" and chainA_score < 0.5:
        print("\t\t", seqA_af)
        print("\t\t", seqA_tpl)

    chainB_start, chainB_end, chainB_score, seqB_af, seqB_tpl = align_sequences(chain_af=chainB_af, chain_template=chainB_tlp)
    if chainB_start is not None:
        if model_id == "ranked_0":
            print("\t", f"chainB: {chainB_start}-{chainB_end} ({bcolors.WARNING if chainB_score < 0.5 else ''}{chainB_score:0.3f}{bcolors.ENDC})")
        dataAF.at[i, "chainB_start"] = chainB_start
        dataAF.at[i, "chainB_end"] = chainB_end
    else:
        if model_id == "ranked_0":
            print("\t", bcolors.WARNING + "Chain B alignment failed" + bcolors.ENDC)
    if model_id == "ranked_0" and chainB_score < 0.5:
        print("\t\t", seqB_af)
        print("\t\t", seqB_tpl)
    
    

# For the mutations, the alignment mostly fails. For those restore the information using the known_DMI dataset
is_known_DMI = dataAF["benchmark_set"] == "known_DMI"
for i, row in dataAF[dataAF["benchmark_set"].isin(["mutations_DMI"])].iterrows():
    prediction_name = row["prediction_name"]
    benchmark_set = row["benchmark_set"]
    # Taken from the current row; otherwise the value left over from the previous loop would decide about the messages
    model_id = row["model_id"]
    pdb_id = row["PDB_id"]
    # Without a paired PDB ID (None or NaN) both chains are looked up with the same PDB ID
    pdb_id_2 = row["PDB_id_random_paired"] if pd.notna(row["PDB_id_random_paired"]) else pdb_id

    _row1 = dataAF[is_known_DMI & (dataAF["PDB_id"] == pdb_id)]
    if len(_row1) == 0:
        if model_id == "ranked_0":
            print(f"Can't find {pdb_id} from {prediction_name} ({benchmark_set}, chain A) in the known_DMI set")
        continue
    _row2 = dataAF[is_known_DMI & (dataAF["PDB_id"] == pdb_id_2)]
    if len(_row2) == 0:
        if model_id == "ranked_0":
            print(f"Can't find {pdb_id_2} from {prediction_name} ({benchmark_set}, chain B) in the known_DMI set")
        continue

    # Use the values of the first matching known_DMI row
    dataAF.at[i, "chainA_start"] = list(_row1["chainA_start"])[0]
    dataAF.at[i, "chainA_end"] = list(_row1["chainA_end"])[0]
    dataAF.at[i, "chainB_start"] = list(_row2["chainB_start"])[0]
    dataAF.at[i, "chainB_end"] = list(_row2["chainB_end"])[0]

[94mDEG_APCC_KENBOX_2_4GGD (known_DMI)[0m
	 chainA: 165-476 (1.000[0m)
	 chainB: 6-10 (1.000[0m)
[94mDEG_COP1_1_5IGO (known_DMI)[0m
	 chainA: 352-675 (0.988[0m)
	 chainB: 354-361 (1.000[0m)
[94mDEG_Kelch_Keap1_1_2FLU (known_DMI)[0m
	 chainA: 325-609 (1.000[0m)
	 chainB: 77-82 (1.000[0m)
[94mDEG_Kelch_Keap1_2_3WN7 (known_DMI)[0m
	 chainA: 324-609 (1.000[0m)
	 chainB: 26-32 (1.000[0m)
[94mDEG_MDM2_SWIB_1_1YCR (known_DMI)[0m
	 chainA: 30-109 (1.000[0m)
	 chainB: 19-26 (1.000[0m)
[94mDEG_SCF_COI1_1_3OGL (known_DMI)[0m
	 chainA: 52-592 (0.972[0m)
	 chainB: 203-220 (0.812[0m)
[94mDEG_SCF_FBXO31_1_5VZU (known_DMI)[0m
	 chainA: 139-539 (0.862[0m)
	 chainB: 289-295 (1.000[0m)
[94mDEG_SCF_TIR1_1_2P1Q (known_DMI)[0m
	 chainA: 45-578 (0.996[0m)
	 chainB: 1-13 (1.000[0m)
[94mDEG_SIAH_1_2A25 (known_DMI)[0m
	 chainA: 90-282 (0.723[0m)
	 chainB: 59-67 (1.000[0m)
[94mDEG_SPOP_SBC_1_3HQM (known_DMI)[0m
	 chainA: 28-164 (0.956[0m)
	 chainB: 1362-1366 (1.000[0m)
[9

In [10]:
# ranked_0 rows for which no chain A end is set
dataAF[dataAF["chainA_end"].isna() & (dataAF["model_id"] == "ranked_0")]

Unnamed: 0,model_preset,benchmark_set,prediction_name,model_id,ranking_score,chainA_length,fraction_disordered,chainB_length,has_clash,iptm,...,DDI_pfam_id_random_paired,sequence_initial,sequence_mutated,chainA_id,chainB_id,chainA_start,chainA_end,chainB_start,chainB_end,num_mutations
1125,alphafold3,random_DMI,MLIG_PAM2_1_1JGN.DMOD_CDK_SPxK_1_2CCI,ranked_0,0.54,297,0.04,13,0.0,0.42,...,,,,A,B,,,,,
3095,alphafold3,random_DDI,D1PF07525_PF03931_3ZKJ.D2PF07724_PF00227_1G4B,ranked_0,0.31,48,0.08,170,0.0,0.15,...,PF07724_PF00227,,,,,,,,,
3100,alphafold3,random_DDI,D1PF07724_PF00227_1G4B.D2PF08644_PF03531_4KHB,ranked_0,0.25,143,0.21,95,0.0,0.08,...,PF08644_PF03531,,,,,,,,,
3155,alphafold3,random_DDI,D1PF14447_PF00179_3ZNI.D2PF14978_PF00327_5OOL,ranked_0,0.2,65,0.07,71,0.0,0.1,...,PF14978_PF00327,,,,,,,,,
3160,alphafold3,random_DDI,D1PF14978_PF00327_5OOL.D2PF15985_PF10175_6D6Q,ranked_0,0.57,89,0.95,127,0.0,0.04,...,PF15985_PF10175,,,,,,,,,


### 3 Pymol calculations for RMSD

For all structures, calculate the overall RMSD
- **RMSD_all_atom**: RMSD aligning the whole structure

For DMI, align the domains (chain A) first and calculate then the peptide RMSD
- **align_score_domain**: Score of domain alignment
- **num_align_atoms_domain** and **num_align_resi_domain**: Count of aligned atoms/residues of domain
- **RMSD_domain**: RMSD of the domain (chain A) after aligning on the domain
- **RMSD_backbone_peptide** and **RMSD_all_atom_peptide**: RMSD of the motif chain (chain B) after aligning on the domain

For DDI, use the longest chain (or chain A if both have equal length) as domain and define the shorter one as peptide. Then use the same definition as for DMI

In [10]:
# Calculating the RMSD related values using pymol
#
# For every row of dataAF the predicted model ("af") and the solved template ("solved") are loaded
# into PyMOL, the template chains are renamed to A/B, both are cleaned, and the alignment / RMSD
# values are written back into dataAF. Rows that cannot be processed are skipped with a message.


def _value_or_none(row, column, cast):
    """Return cast(row[column]), or None if that cell is missing (NaN/None)."""
    value = row[column]
    return cast(value) if pd.notnull(value) else None


def _find_template_path(pdb_id, ddi_pfam_id, structure_set, label):
    """Return the path of the unique solved structure for a PDB ID, or None.

    A dataSolved row matches when its "set" and "PDB_id" agree and its "DDI_pfam_id"
    either equals `ddi_pfam_id` or is empty. If no row or more than one row matches,
    a message is printed and None is returned. `label` is only used in that message.
    """
    matches = dataSolved.loc[
        (dataSolved["set"] == structure_set)
        & (dataSolved["PDB_id"] == pdb_id)
        & ((dataSolved["DDI_pfam_id"] == ddi_pfam_id) | dataSolved["DDI_pfam_id"].isna())
    ]
    if len(matches) == 0:
        print(f"\t{bcolors.FAIL}Can't find template structure for {label} and PDB ID {pdb_id}.{bcolors.ENDC} Skip RMSD calculation")
        return None
    if len(matches) >= 2:
        print(f"\t{bcolors.FAIL}Multiple template structures found for {label} and PDB ID {pdb_id}.{bcolors.ENDC} Skip RMSD calculation")
        return None
    return path_resources / "solved" / str(matches["path"].item())


dataAF["align_score_domain"] = None
dataAF["num_align_atoms_domain"] = None
dataAF["num_align_resi_domain"] = None
dataAF["RMSD_all_atom"] = None
dataAF["RMSD_domain"] = None
dataAF["RMSD_backbone_peptide"] = None
dataAF["RMSD_all_atom_peptide"] = None

for i, row in dataAF.iterrows():
    benchmark_set = str(row["benchmark_set"])
    _set = "DDI" if "DDI" in benchmark_set else "DMI"
    pdb_id = _value_or_none(row, "PDB_id", str)
    pdb_id_2 = _value_or_none(row, "PDB_id_random_paired", str)
    ddi_pfam_id = _value_or_none(row, "DDI_pfam_id", str)
    ddi_pfam_id_2 = _value_or_none(row, "DDI_pfam_id_random_paired", str)
    prediction_name = _value_or_none(row, "prediction_name", str)
    model_id = _value_or_none(row, "model_id", str)
    chainA_id = _value_or_none(row, "chainA_id", str)
    chainB_id = _value_or_none(row, "chainB_id", str)
    chainA_start = _value_or_none(row, "chainA_start", int)
    chainB_start = _value_or_none(row, "chainB_start", int)
    chainA_end = _value_or_none(row, "chainA_end", int)
    chainB_end = _value_or_none(row, "chainB_end", int)
    chainA_length = _value_or_none(row, "chainA_length", int)
    chainB_length = _value_or_none(row, "chainB_length", int)

    # A full PyMOL reset is only done once per prediction (on its first model) because it is slow
    if model_id == "ranked_0":
        pymol.cmd.reinitialize()
        print(f"{bcolors.OKBLUE}{prediction_name} ({benchmark_set}){bcolors.ENDC}")

    structure_path = path_resources / af_mode / _set / benchmark_set / prediction_name / (model_id + ".pdb")
    if not structure_path.exists():
        print(f"\t{bcolors.FAIL}{prediction_name} ({benchmark_set}) does not exist.{bcolors.ENDC} Skip RMSD calculation")
        continue

    label = f"{prediction_name} ({benchmark_set})"
    template_path = _find_template_path(pdb_id, ddi_pfam_id, _set, label)
    if template_path is None:
        continue

    # Rows with a second (randomly paired) PDB ID take chain B from that second template
    template2_path = None
    if pdb_id_2 is not None:
        template2_path = _find_template_path(pdb_id_2, ddi_pfam_id_2, _set, label)
        if template2_path is None:
            continue

    # The start AND end residue of both chains are needed for slicing / renumbering below.
    # Checked before anything is loaded so that incomplete rows are skipped cheaply.
    if any(bound is None for bound in (chainA_start, chainA_end, chainB_start, chainB_end)):
        print(f"\t{bcolors.FAIL}Can't find information about the chain start/end in the template.{bcolors.ENDC} This may lead to wrong RMSD peptide values, so skip")
        continue

    #pymol.cmd.reinitialize() # Not needed usually, but slows performance significantly down
    # Drop every object left over from the previous row
    pymol.cmd.delete("all")
    pymol.cmd.sort()

    # First loading the structures. Use two temporary objects to allow renaming the chains even if the chains have the same name or have switched IDs
    pymol.cmd.load(structure_path, "af")
    if template2_path is not None:
        # Updating the object is possible, but turned out to be unstable
        pymol.cmd.load(template_path, "solvedA")
        pymol.cmd.load(template2_path, "solvedB")
        pymol.cmd.create("solved1", f"solvedA and chain {chainA_id}")
        pymol.cmd.create("solved2", f"solvedB and chain {chainB_id}")
        pymol.cmd.delete("solvedA")
        pymol.cmd.delete("solvedB")
    else:
        pymol.cmd.load(template_path, "solvedraw")
        pymol.cmd.create("solved1", f"solvedraw and chain {chainA_id}")
        pymol.cmd.sort()
        pymol.cmd.create("solved2", f"solvedraw and chain {chainB_id}")
        pymol.cmd.delete("solvedraw")
    pymol.cmd.sort()
    # Now rename the chains and create merged object
    pymol.cmd.alter(f"solved1 and chain {chainA_id}", "chain = 'A'")
    pymol.cmd.sort()
    pymol.cmd.alter(f"solved2 and chain {chainB_id}", "chain = 'B'")
    pymol.cmd.sort()
    pymol.cmd.create("solved", "solved1 or solved2")
    pymol.cmd.delete("solved1")
    pymol.cmd.delete("solved2")
    pymol.cmd.sort()

    # Remove hydrogens and hetatm
    #pymol.cmd.remove(selection="elem 'H' or hetatm")
    pymol.cmd.remove(selection="not backbone and not sidechain or elem 'H'")
    pymol.cmd.sort()

    # Remove alternate location identifiers
    pymol.cmd.remove("not alt ''+A") # Using +A syntax to only effect the atoms with an alternate location identifier set
    pymol.cmd.sort()
    pymol.cmd.alter("all", "alt=''")
    pymol.cmd.sort()

    # Slice the chains to the known start/end residues. For chain B and AF a reindexing is performed as the rms_cur cmd of pymol requires same residue numbers for alignment
    # NOTE(review): this re-creates the object "solved" from a selection of itself - confirm in PyMOL that atoms outside the range are really dropped
    pymol.cmd.create("solved", f"solved and ((chain A and resi {chainA_start}-{chainA_end}) or (chain B and resi {chainB_start}-{chainB_end}))", source_state=0, target_state=0)
    pymol.cmd.sort()
    # Shift the AF residue numbers by (start - 1) so that they line up with the template numbering
    offsetA = chainA_start - 1
    pymol.cmd.alter("af and chain A", f"resi = (int(resi) + {offsetA})")
    pymol.cmd.sort()

    offsetB = chainB_start - 1
    pymol.cmd.alter("af and chain B", f"resi = (int(resi) + {offsetB})")
    pymol.cmd.sort()

    # DDI: the longer chain is aligned as "domain", the other one is evaluated as "peptide".
    # Chain A stays the domain for DMI, for equal lengths, and when a chain length is unknown.
    chain_align_1, chain_align_2 = "A", "B"
    lengths_known = chainA_length is not None and chainB_length is not None
    if _set == "DDI" and lengths_known and chainB_length > chainA_length:
        chain_align_1, chain_align_2 = "B", "A"

    #For debugging
    #space = {'solved_resi': [], "af_resi": []}
    #pymol.cmd.iterate("solved and chain B", "solved_resi.append(int(resi))", space=space)
    #pymol.cmd.iterate("af and chain B", "af_resi.append(int(resi))", space=space)

    # Entries of the list returned by pymol.cmd.align:
    #    0: RMSD after refinement
    #    1: Number of aligned atoms after refinement
    #    2: Number of refinement cycles
    #    3: RMSD before refinement
    #    4: Number of aligned atoms before refinement
    #    5: Raw alignment score
    #    6: Number of residues aligned
    # Cycles = 0 to prevent rejection of outliers
    align_output_1 = pymol.cmd.align(mobile=f"af and chain {chain_align_1}", target=f"solved and chain {chain_align_1}", object="algn_domain", cycles=0)
    pymol.cmd.sort()
    RMSD_domain = align_output_1[0]
    num_align_atoms_domain = align_output_1[1]
    align_score_domain = align_output_1[5]
    num_align_resi_domain = align_output_1[6]

    # Peptide RMSD is measured with rms_cur right after the domain alignment, i.e. in the domain-aligned frame
    RMSD_backbone_peptide = pymol.cmd.rms_cur(mobile=f"af and chain {chain_align_2} and bb.", target=f"solved and chain {chain_align_2} and bb.", object="peptide_super_bb", cycles=0)
    RMSD_all_atom_peptide = pymol.cmd.rms_cur(mobile=f"af and chain {chain_align_2}", target=f"solved and chain {chain_align_2}", object="peptide_super_all_atoms", cycles=0)

    # Finally align the complete prediction onto the complete template
    align_output_all = pymol.cmd.align(mobile="af", target="solved", object="algn_all", cycles=0)
    RMSD_all_atoms = align_output_all[0]

    dataAF.at[i, "RMSD_domain"] = RMSD_domain
    dataAF.at[i, "align_score_domain"] = align_score_domain
    dataAF.at[i, "num_align_atoms_domain"] = num_align_atoms_domain
    dataAF.at[i, "num_align_resi_domain"] = num_align_resi_domain

    # The peptide RMSD values are not stored for the random benchmark sets
    if "random" not in benchmark_set:
        dataAF.at[i, "RMSD_backbone_peptide"] = RMSD_backbone_peptide
        dataAF.at[i, "RMSD_all_atom_peptide"] = RMSD_all_atom_peptide

    dataAF.at[i, "RMSD_all_atom"] = RMSD_all_atoms

display(dataAF)

[94mDEG_APCC_KENBOX_2_4GGD (known_DMI)[0m
[94mDEG_COP1_1_5IGO (known_DMI)[0m
[94mDEG_Kelch_Keap1_1_2FLU (known_DMI)[0m
[94mDEG_Kelch_Keap1_2_3WN7 (known_DMI)[0m
[94mDEG_MDM2_SWIB_1_1YCR (known_DMI)[0m
[94mDEG_SCF_COI1_1_3OGL (known_DMI)[0m
[94mDEG_SCF_FBXO31_1_5VZU (known_DMI)[0m
[94mDEG_SCF_TIR1_1_2P1Q (known_DMI)[0m
[94mDEG_SIAH_1_2A25 (known_DMI)[0m
[94mDEG_SPOP_SBC_1_3HQM (known_DMI)[0m
[94mDOC_AGCK_PIF_3_1ATP (known_DMI)[0m
[94mDOC_ANK_TNKS_1_3TWU (known_DMI)[0m
[94mDOC_CDC14_PxL_1_6G84 (known_DMI)[0m
[94mDOC_CYCLIN_RxL_1_1H25 (known_DMI)[0m
[94mDOC_GSK3_Axin_1_1O9U (known_DMI)[0m
[94mDOC_MAPK_DCC_7_2B9J (known_DMI)[0m
[94mDOC_MAPK_GRA24_9_5ETA (known_DMI)[0m
[94mDOC_MAPK_HePTP_8_2GPH (known_DMI)[0m
[94mDOC_MAPK_JIP1_4_4H3B (known_DMI)[0m
[94mDOC_MAPK_MEF2A_6_4H3Q (known_DMI)[0m
[94mDOC_MAPK_NFAT4_5_2XS0 (known_DMI)[0m
[94mDOC_MAPK_RevD_3_3TEI (known_DMI)[0m
[94mDOC_MIT_MIM_1_2JQ9 (known_DMI)[0m
[94mDOC_PP1_MyPhoNE_1_1S70 (known_DMI)

Unnamed: 0,model_preset,benchmark_set,prediction_name,model_id,ranking_score,chainA_length,chainB_length,chainA_id,chainB_id,chainA_start,...,DockQ,iRMSD,LRMSD,Fnonnat,buried_area,min_distance,salt_bridges,hbonds,hydrophobic_interactions,ipSAE
0,alphafold3,known_DMI,DEG_APCC_KENBOX_2_4GGD,ranked_0,0.97,312,5,A,B,165,...,0.967617,0.341276,0.831159,0.000000,848.152,5.371,9,0,6,0.869025
1,alphafold3,known_DMI,DEG_APCC_KENBOX_2_4GGD,ranked_1,0.97,312,5,A,B,165,...,0.943274,0.426121,1.169542,0.076923,603.522,6.264,0,0,0,0.868551
2,alphafold3,known_DMI,DEG_APCC_KENBOX_2_4GGD,ranked_2,0.96,312,5,A,B,165,...,0.908970,0.621263,1.944753,0.111111,602.041,6.241,0,0,0,0.855837
3,alphafold3,known_DMI,DEG_APCC_KENBOX_2_4GGD,ranked_3,0.96,312,5,A,B,165,...,0.961964,0.382246,1.037074,0.038462,615.163,6.207,0,0,0,0.850758
4,alphafold3,known_DMI,DEG_APCC_KENBOX_2_4GGD,ranked_4,0.96,312,5,A,B,165,...,0.931179,0.573091,1.749101,0.074074,662.502,5.890,0,0,0,0.850449
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3175,alphafold3,random_DDI,D1PF18773_PF00071_2X19.D2PF00009_PF01873_2D74,ranked_0,0.36,60,113,B,B,392,...,,,,,1447.877,4.251,1,0,19,0.012324
3176,alphafold3,random_DDI,D1PF18773_PF00071_2X19.D2PF00009_PF01873_2D74,ranked_1,0.23,60,113,B,B,392,...,,,,,1482.311,4.197,1,0,25,0.000000
3177,alphafold3,random_DDI,D1PF18773_PF00071_2X19.D2PF00009_PF01873_2D74,ranked_2,0.22,60,113,B,B,392,...,,,,,1564.435,4.783,2,0,18,0.000000
3178,alphafold3,random_DDI,D1PF18773_PF00071_2X19.D2PF00009_PF01873_2D74,ranked_3,0.21,60,113,B,B,392,...,,,,,1431.933,4.257,0,0,59,0.000000


### 3 DockQ


In [12]:
from DockQ.DockQ import load_PDB, run_on_all_native_interfaces

# Compute DockQ, iRMSD, LRMSD and Fnonnat for every model of the known_DMI / known_DDI sets
# by comparing the predicted structure with its solved template. Rows that cannot be
# evaluated keep NaN in these columns.
dataAF["DockQ"] = np.nan
dataAF["iRMSD"] = np.nan
dataAF["LRMSD"] = np.nan
dataAF["Fnonnat"] = np.nan
for i, row in dataAF[dataAF["benchmark_set"].isin(["known_DMI", "known_DDI"])].iterrows():
    benchmark_set = str(row["benchmark_set"])
    _set = "DDI" if "DDI" in benchmark_set else "DMI"
    pdb_id = str(row["PDB_id"]) if pd.notnull(row["PDB_id"]) else None
    ddi_pfam_id = str(row["DDI_pfam_id"]) if pd.notnull(row["DDI_pfam_id"]) else None
    prediction_name = str(row["prediction_name"]) if pd.notnull(row["prediction_name"]) else None
    model_id = str(row["model_id"]) if pd.notnull(row["model_id"]) else None
    chainA_id = str(row["chainA_id"]) if pd.notnull(row["chainA_id"]) else None
    chainB_id = str(row["chainB_id"]) if pd.notnull(row["chainB_id"]) else None

    # Print the prediction name once, on its first model
    if model_id == "ranked_0":
        print(f"{bcolors.OKBLUE}{prediction_name} ({benchmark_set}){bcolors.ENDC}")

    structure_path = path_resources / af_mode / _set / benchmark_set / prediction_name / (model_id + ".pdb")
    if not structure_path.exists():
        print(f"\t{bcolors.FAIL}{prediction_name} ({benchmark_set}) does not exist.{bcolors.ENDC} Skip DockQ")
        continue

    # A template matches if set and PDB ID agree and its DDI_pfam_id equals ours or is empty
    template_row = dataSolved.loc[np.logical_and(dataSolved["set"] == _set, np.logical_and(dataSolved["PDB_id"] == pdb_id, np.logical_or(dataSolved["DDI_pfam_id"] == ddi_pfam_id, dataSolved["DDI_pfam_id"].isna())))]
    if len(template_row) == 0:
        print(f"\t{bcolors.FAIL}Can't find template structure for {prediction_name} ({benchmark_set}) and PDB ID {pdb_id}.{bcolors.ENDC} Skip")
        continue
    elif len(template_row) >= 2:
        print(f"\t{bcolors.FAIL}Multiple template structures found for {prediction_name} ({benchmark_set}) and PDB ID {pdb_id}.{bcolors.ENDC} Skip")
        continue

    # Without both chain IDs neither the chain map nor the result key can be built
    if chainA_id is None or chainB_id is None:
        print(f"\t{bcolors.FAIL}Missing chain IDs for {prediction_name} ({benchmark_set}).{bcolors.ENDC} Skip")
        continue

    template_path = path_solved / str(template_row["path"].item())
    dockq_structure_af = load_PDB(str(structure_path))
    dockq_structure_solved = load_PDB(str(template_path))

    # Keys: chain IDs from dataAF's chainA_id/chainB_id columns; values: the chains "A"/"B" they are mapped to
    chain_map = {chainA_id: "A", chainB_id: "B"}
    # The per-interface results are looked up under the two chain IDs joined together
    chain_key = chainA_id + chainB_id

    result = run_on_all_native_interfaces(dockq_structure_af, dockq_structure_solved, chain_map=chain_map)[0]
    interface = result.get(chain_key)
    if interface is None:
        # Previously a KeyError here aborted the whole loop; skip this model instead
        print(f"\t{bcolors.FAIL}DockQ returned no result for chains {chain_key} of {prediction_name} ({benchmark_set}).{bcolors.ENDC} Skip")
        continue

    dataAF.at[i, "DockQ"] = interface["DockQ"]
    dataAF.at[i, "iRMSD"] = interface["iRMSD"]
    dataAF.at[i, "LRMSD"] = interface["LRMSD"]
    dataAF.at[i, "Fnonnat"] = np.float64(interface["fnonnat"])

display(dataAF)


[94mDEG_APCC_KENBOX_2_4GGD (known_DMI)[0m
[94mDEG_COP1_1_5IGO (known_DMI)[0m
[94mDEG_Kelch_Keap1_1_2FLU (known_DMI)[0m
[94mDEG_Kelch_Keap1_2_3WN7 (known_DMI)[0m
[94mDEG_MDM2_SWIB_1_1YCR (known_DMI)[0m
[94mDEG_SCF_COI1_1_3OGL (known_DMI)[0m
[94mDEG_SCF_FBXO31_1_5VZU (known_DMI)[0m
[94mDEG_SCF_TIR1_1_2P1Q (known_DMI)[0m
[94mDEG_SIAH_1_2A25 (known_DMI)[0m
[94mDEG_SPOP_SBC_1_3HQM (known_DMI)[0m
[94mDOC_AGCK_PIF_3_1ATP (known_DMI)[0m
[94mDOC_ANK_TNKS_1_3TWU (known_DMI)[0m
[94mDOC_CDC14_PxL_1_6G84 (known_DMI)[0m
[94mDOC_CYCLIN_RxL_1_1H25 (known_DMI)[0m
[94mDOC_GSK3_Axin_1_1O9U (known_DMI)[0m
[94mDOC_MAPK_DCC_7_2B9J (known_DMI)[0m
[94mDOC_MAPK_GRA24_9_5ETA (known_DMI)[0m
[94mDOC_MAPK_HePTP_8_2GPH (known_DMI)[0m
[94mDOC_MAPK_JIP1_4_4H3B (known_DMI)[0m
[94mDOC_MAPK_MEF2A_6_4H3Q (known_DMI)[0m
[94mDOC_MAPK_NFAT4_5_2XS0 (known_DMI)[0m
[94mDOC_MAPK_RevD_3_3TEI (known_DMI)[0m
[94mDOC_MIT_MIM_1_2JQ9 (known_DMI)[0m
[94mDOC_PP1_MyPhoNE_1_1S70 (known_DMI)

Unnamed: 0,model_preset,benchmark_set,prediction_name,model_id,ranking_score,chainA_length,fraction_disordered,chainB_length,has_clash,iptm,...,num_align_atoms_domain,num_align_resi_domain,RMSD_all_atom,RMSD_domain,RMSD_backbone_peptide,RMSD_all_atom_peptide,DockQ,iRMSD,LRMSD,Fnonnat
0,alphafold3,known_DMI,DEG_APCC_KENBOX_2_4GGD,ranked_0,0.97,312,0.02,5,0.0,0.96,...,2414,312,0.774309,0.760539,0.833257,1.382265,0.967617,0.341276,0.831159,0.000000
1,alphafold3,known_DMI,DEG_APCC_KENBOX_2_4GGD,ranked_1,0.97,312,0.02,5,0.0,0.96,...,2414,312,0.779012,0.73746,1.174286,2.126553,0.943274,0.426121,1.169542,0.076923
2,alphafold3,known_DMI,DEG_APCC_KENBOX_2_4GGD,ranked_2,0.96,312,0.02,5,0.0,0.96,...,2414,312,0.886367,0.839664,1.980533,2.412356,0.908970,0.621263,1.944753,0.111111
3,alphafold3,known_DMI,DEG_APCC_KENBOX_2_4GGD,ranked_3,0.96,312,0.02,5,0.0,0.95,...,2414,312,0.769146,0.750947,1.043171,1.522163,0.961964,0.382246,1.037074,0.038462
4,alphafold3,known_DMI,DEG_APCC_KENBOX_2_4GGD,ranked_4,0.96,312,0.02,5,0.0,0.95,...,2414,312,0.788792,0.727131,1.757695,2.539318,0.931179,0.573091,1.749101,0.074074
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3175,alphafold3,random_DDI,D1PF18773_PF00071_2X19.D2PF00009_PF01873_2D74,ranked_0,0.36,60,0.22,113,0.0,0.19,...,906,113,4.124206,4.124205,0.0,0.0,,,,
3176,alphafold3,random_DDI,D1PF18773_PF00071_2X19.D2PF00009_PF01873_2D74,ranked_1,0.23,60,0.08,113,0.0,0.12,...,906,113,4.952358,4.952358,0.0,0.0,,,,
3177,alphafold3,random_DDI,D1PF18773_PF00071_2X19.D2PF00009_PF01873_2D74,ranked_2,0.22,60,0.14,113,0.0,0.07,...,906,113,4.878703,4.878703,0.0,0.0,,,,
3178,alphafold3,random_DDI,D1PF18773_PF00071_2X19.D2PF00009_PF01873_2D74,ranked_3,0.21,60,0.07,113,0.0,0.10,...,906,113,4.806929,4.806929,0.0,0.0,,,,


### 4 IPSAE metric

In [None]:
def calc_ipsae_metric(row: pd.Series) -> pd.DataFrame:
    """Run the ipsae.py script for one AF model and return its result table.

    The model file (row["model_path"], relative to path_AF_luck_drive) and the
    "confidences.json" lying next to it are copied into a temporary directory,
    ipsae.py is executed on these copies, and the text file it writes there
    ("model_10_10.txt") is parsed.

    Args:
        row: A row of dataAF; only row["model_path"] is read.

    Returns:
        The parsed ipsae.py output as a DataFrame.

    Raises:
        subprocess.CalledProcessError: If ipsae.py exits with a non-zero status.
        FileNotFoundError: If the model or confidences file does not exist.
    """
    path_cif = path_AF_luck_drive / Path(row["model_path"])
    path_confidences = path_cif.parent / "confidences.json"

    with tempfile.TemporaryDirectory() as tmpdir:
        tmp_path_cif = Path(tmpdir) / "model.cif"
        tmp_path_confidences = Path(tmpdir) / "confidences.json"
        shutil.copy(path_cif, tmp_path_cif)
        shutil.copy(path_confidences, tmp_path_confidences)

        # sys.executable runs the script with the interpreter of this kernel instead of
        # whatever "python" happens to be first on PATH. check=True makes a failing script
        # raise here rather than surfacing later as a missing/empty output file.
        # The two "10" arguments end up in the output file name (model_10_10.txt);
        # presumably they are the cutoffs ipsae.py expects - TODO confirm against ipsae.py.
        subprocess.run(
            [sys.executable, str(path_ipsae_script), str(tmp_path_confidences), str(tmp_path_cif), "10", "10"],
            env=os.environ.copy(),
            check=True,
        )

        path_output = Path(tmpdir) / "model_10_10.txt"

        # Read inside the with-block: the temporary directory is deleted when it ends.
        # The first line of the file is skipped, the next one is used as header.
        df_ipsae = pd.read_csv(path_output, header=0, skiprows=[0], sep=" ", skipinitialspace=True)
    return df_ipsae

# The json files needed by ipsae.py no longer exist for AF2, so ipSAE is only computed for AF3
if af_mode == "AF3":
    dataAF["ipSAE"] = np.nan
    total_rows = len(dataAF)
    for idx, af_row in dataAF.iterrows():
        # Progress line once per prediction (on its first model)
        if af_row["model_id"] == "ranked_0":
            percent_done = round(100*idx/total_rows)
            print(af_row["prediction_name"], f"({percent_done} %)")
        ipsae_table = calc_ipsae_metric(af_row)
        # The entry labelled 2 of the "ipSAE" column is the value that gets stored
        ipsae_value = ipsae_table["ipSAE"][2]
        dataAF.at[idx, "ipSAE"] = np.float64(ipsae_value)
display(dataAF)

### 5 Interaction metrics

In [9]:
# Make the local "../src" folder (resolved against the current working directory) importable.
# It is inserted at the front of sys.path so that it is searched first when importing measure_PPI.
libpath = Path("../src").resolve()
print(libpath)
sys.path.insert(0, str(libpath))
import measure_PPI

D:\Eigene Datein\Programmieren\Git\abrilka\bachelorthesis\src


In [None]:
# Collect (absolute structure path, prediction name) for every model file that exists on disk
# and hand the whole list to measure_PPI in a single call.
pathObj = []
for _, af_row in dataAF.iterrows():
    set_name = str(af_row["benchmark_set"])
    set_kind = "DDI" if "DDI" in set_name else "DMI"
    has_value = af_row.notnull()
    name = str(af_row["prediction_name"]) if has_value["prediction_name"] else None
    model = str(af_row["model_id"]) if has_value["model_id"] else None

    pdb_file = path_resources / af_mode / set_kind / set_name / name / (model + ".pdb")
    if pdb_file.exists():
        pathObj.append((pdb_file.resolve(), name))
    elif af_row["model_id"] == "ranked_0":
        # Report a missing prediction only once (for its first model), not for every model
        print(f"\t{bcolors.FAIL}{name} ({set_name}) does not exist.{bcolors.ENDC} Skip interface metrics")
df_intf_metrics = measure_PPI.Run(pathObj=pathObj)

	[91mPF07724_PF00227_1OFH_C_resi39_resi340.H_resi1_resi172 (known_DDI) does not exist.[0m Skip interface metrics
	[91mPF14978_PF00327_3J7Y_o_resi13_resi101.Z_resi57_resi127 (known_DDI) does not exist.[0m Skip interface metrics
[2025-04-26 19:06:04,533 | measure_PPI | INFO] Started Taskpool of None processes for 3170 files
[2025-04-26 19:06:09,567 | measure_PPI | INFO] 1% - ETA 0:04:34 | current speed 11.529 s⁻¹ | average speed 11.33 s⁻¹
[2025-04-26 19:06:14,571 | measure_PPI | INFO] 5% - ETA 0:02:58 | current speed 22.381 s⁻¹ | average speed 16.841 s⁻¹
[2025-04-26 19:06:19,586 | measure_PPI | INFO] 10% - ETA 0:02:06 | current speed 33.704 s⁻¹ | average speed 22.459 s⁻¹
[2025-04-26 19:06:24,636 | measure_PPI | INFO] 14% - ETA 0:01:54 | current speed 27.125 s⁻¹ | average speed 23.632 s⁻¹
[2025-04-26 19:06:29,659 | measure_PPI | INFO] 21% - ETA 0:01:34 | current speed 38.218 s⁻¹ | average speed 26.548 s⁻¹
[2025-04-26 19:06:34,883 | measure_PPI | INFO] 26% - ETA 0:01:25 | current speed

In [None]:
# Show the table returned by measure_PPI.Run
display(df_intf_metrics)

Unnamed: 0,structure_name,file,hbonds,salt_bridges,buried_area,min_distance,hydrophobic_interactions
2935,D1PF00009_PF01873_2D74.D2PF00026_PF06394_1F34,ranked_0.pdb,0,0,1894.381,4.469,79
2936,D1PF00009_PF01873_2D74.D2PF00026_PF06394_1F34,ranked_1.pdb,0,2,3106.188,4.490,172
2942,D1PF00009_PF01873_2D74.D2PF00026_PF06394_1F34,ranked_2.pdb,0,1,2040.531,4.759,113
2943,D1PF00009_PF01873_2D74.D2PF00026_PF06394_1F34,ranked_3.pdb,0,0,1331.882,4.668,43
2944,D1PF00009_PF01873_2D74.D2PF00026_PF06394_1F34,ranked_4.pdb,0,0,2068.876,3.861,106
...,...,...,...,...,...,...,...
2704,TRG_PTS1_2C0L_NAKL.NAKD,ranked_0.pdb,0,1,862.138,4.511,9
2708,TRG_PTS1_2C0L_NAKL.NAKD,ranked_1.pdb,0,1,852.458,4.482,9
2706,TRG_PTS1_2C0L_NAKL.NAKD,ranked_2.pdb,0,1,849.137,4.468,9
2707,TRG_PTS1_2C0L_NAKL.NAKD,ranked_3.pdb,0,1,860.881,4.482,9


In [None]:
# Copy the interface metrics from df_intf_metrics into the matching rows of dataAF.
# A metrics row belongs to the dataAF row with the same prediction name and model
# (df_intf_metrics["file"] without the ".pdb" suffix).
intf_metric_columns = [
    "min_distance",
    "buried_area",
    "disulfide_bonds",
    "salt_bridges",
    "hbonds",
    "hydrophobic_interactions",
]
for column in intf_metric_columns:
    dataAF[column] = None

# Only copy metrics that are actually present in df_intf_metrics; a column that is
# missing there (e.g. "disulfide_bonds" in some result tables) stays None instead of raising a KeyError.
available_columns = [column for column in intf_metric_columns if column in df_intf_metrics.columns]

for _, row_intf in df_intf_metrics.iterrows():
    model_id = row_intf["file"].replace(".pdb", "")
    matching_index = dataAF.index[(dataAF["prediction_name"] == row_intf["structure_name"]) & (dataAF["model_id"] == model_id)]
    if len(matching_index) != 1:
        print(f"\t{bcolors.FAIL}Failed to locate {row_intf['structure_name']} {row_intf['file']}{bcolors.ENDC}")
        continue

    # Write to the label of the matched dataAF row. The index of df_intf_metrics is unrelated
    # to the dataAF index, so it must not be used as the target row.
    target = matching_index[0]
    for column in available_columns:
        dataAF.at[target, column] = row_intf[column]
display(dataAF)

Unnamed: 0,model_preset,benchmark_set,prediction_name,model_id,ranking_score,chainA_length,fraction_disordered,chainB_length,has_clash,iptm,...,DockQ,iRMSD,LRMSD,Fnonnat,ipSAE,hbonds,salt_bridges,buried_area,min_distance,hydrophobic_interactions
0,alphafold3,known_DMI,DEG_APCC_KENBOX_2_4GGD,ranked_0,0.97,312,0.02,5,0.0,0.96,...,0.967617,0.341276,0.831159,0.000000,0.869025,0,9,848.152,5.371,6
1,alphafold3,known_DMI,DEG_APCC_KENBOX_2_4GGD,ranked_1,0.97,312,0.02,5,0.0,0.96,...,0.943274,0.426121,1.169542,0.076923,0.868551,0,0,603.522,6.264,0
2,alphafold3,known_DMI,DEG_APCC_KENBOX_2_4GGD,ranked_2,0.96,312,0.02,5,0.0,0.96,...,0.908970,0.621263,1.944753,0.111111,0.855837,0,0,602.041,6.241,0
3,alphafold3,known_DMI,DEG_APCC_KENBOX_2_4GGD,ranked_3,0.96,312,0.02,5,0.0,0.95,...,0.961964,0.382246,1.037074,0.038462,0.850758,0,0,615.163,6.207,0
4,alphafold3,known_DMI,DEG_APCC_KENBOX_2_4GGD,ranked_4,0.96,312,0.02,5,0.0,0.95,...,0.931179,0.573091,1.749101,0.074074,0.850449,0,0,662.502,5.89,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3175,alphafold3,random_DDI,D1PF18773_PF00071_2X19.D2PF00009_PF01873_2D74,ranked_0,0.36,60,0.22,113,0.0,0.19,...,,,,,0.012324,0,1,1447.877,4.251,19
3176,alphafold3,random_DDI,D1PF18773_PF00071_2X19.D2PF00009_PF01873_2D74,ranked_1,0.23,60,0.08,113,0.0,0.12,...,,,,,0.000000,0,1,1482.311,4.197,25
3177,alphafold3,random_DDI,D1PF18773_PF00071_2X19.D2PF00009_PF01873_2D74,ranked_2,0.22,60,0.14,113,0.0,0.07,...,,,,,0.000000,0,2,1564.435,4.783,18
3178,alphafold3,random_DDI,D1PF18773_PF00071_2X19.D2PF00009_PF01873_2D74,ranked_3,0.21,60,0.07,113,0.0,0.10,...,,,,,0.000000,0,0,1431.933,4.257,59


### Save

In [11]:
# Show the current state of the metrics table
dataAF

Unnamed: 0,model_preset,benchmark_set,prediction_name,model_id,ranking_score,chainA_length,chainB_length,chainA_id,chainB_id,chainA_start,...,DockQ,iRMSD,LRMSD,Fnonnat,buried_area,min_distance,salt_bridges,hbonds,hydrophobic_interactions,ipSAE
0,alphafold3,known_DMI,DEG_APCC_KENBOX_2_4GGD,ranked_0,0.97,312,5,A,B,165,...,0.967617,0.341276,0.831159,0.000000,848.152,5.371,9,0,6,0.869025
1,alphafold3,known_DMI,DEG_APCC_KENBOX_2_4GGD,ranked_1,0.97,312,5,A,B,165,...,0.943274,0.426121,1.169542,0.076923,603.522,6.264,0,0,0,0.868551
2,alphafold3,known_DMI,DEG_APCC_KENBOX_2_4GGD,ranked_2,0.96,312,5,A,B,165,...,0.908970,0.621263,1.944753,0.111111,602.041,6.241,0,0,0,0.855837
3,alphafold3,known_DMI,DEG_APCC_KENBOX_2_4GGD,ranked_3,0.96,312,5,A,B,165,...,0.961964,0.382246,1.037074,0.038462,615.163,6.207,0,0,0,0.850758
4,alphafold3,known_DMI,DEG_APCC_KENBOX_2_4GGD,ranked_4,0.96,312,5,A,B,165,...,0.931179,0.573091,1.749101,0.074074,662.502,5.890,0,0,0,0.850449
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3175,alphafold3,random_DDI,D1PF18773_PF00071_2X19.D2PF00009_PF01873_2D74,ranked_0,0.36,60,113,B,B,392,...,,,,,1447.877,4.251,1,0,19,0.012324
3176,alphafold3,random_DDI,D1PF18773_PF00071_2X19.D2PF00009_PF01873_2D74,ranked_1,0.23,60,113,B,B,392,...,,,,,1482.311,4.197,1,0,25,0.000000
3177,alphafold3,random_DDI,D1PF18773_PF00071_2X19.D2PF00009_PF01873_2D74,ranked_2,0.22,60,113,B,B,392,...,,,,,1564.435,4.783,2,0,18,0.000000
3178,alphafold3,random_DDI,D1PF18773_PF00071_2X19.D2PF00009_PF01873_2D74,ranked_3,0.21,60,113,B,B,392,...,,,,,1431.933,4.257,0,0,59,0.000000


In [13]:
# enhance_dataframe() is defined outside this cell and is called before the table is written.
# The table is then saved next to the structures as <af_mode>_metrics.tsv and <af_mode>_metrics.xlsx.
enhance_dataframe()
metrics_dir = path_resources / af_mode
dataAF.to_csv(metrics_dir / (af_mode + "_metrics.tsv"), sep="\t", index=False)
# Name the sheet after the active mode instead of always "AF2 metrics"
dataAF.to_excel(metrics_dir / (af_mode + "_metrics.xlsx"), sheet_name=f"{af_mode} metrics", index=False)