# Benchmark set evaluation
Created 2025-05-15

It was found that some structures are missing in either the known, random, or mutations dataset. I think I now know why: some DMI/DDI types have more than one PDB structure and, vice versa, a PDB structure can link to multiple DMI/DDI types. Now it's time to investigate this once and for all.

In [2]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.axes._axes import Axes
from matplotlib.figure import Figure
from pathlib import Path
from sklearn.metrics import roc_curve, roc_auc_score
import re
import tempfile
import shutil
import os
import subprocess
import sys
stdout, stderr = sys.stdout, sys.stderr
from typing import Literal

import pymol
from Bio.SeqUtils import seq1
from Bio.PDB import PDBParser
from Bio.PDB.Structure import Structure as BioPy_PDBStructure
from Bio.PDB.Model import Model as BioPy_PDBModel
from Bio.PDB.Chain import Chain
from Bio.PDB.PDBExceptions import PDBConstructionException
parser = PDBParser(QUIET=True)

class bcolors:
    """ANSI escape sequences used to colour or style printed text.

    Wrap a piece of text as ``f"{bcolors.FAIL}text{bcolors.ENDC}"``.
    """

    # Colour codes
    HEADER = "\033[95m"
    OKBLUE = "\033[94m"
    OKCYAN = "\033[96m"
    OKGREEN = "\033[92m"
    WARNING = "\033[93m"
    FAIL = "\033[91m"
    # Reset code: ends any colour/style started before it
    ENDC = "\033[0m"
    # Text styles
    BOLD = "\033[1m"
    UNDERLINE = "\033[4m"

In [6]:
# Settings

# Path to resource folder with the structures and metadata tables.
# To run the notebook on another machine without editing this cell, set the
# environment variable BENCHMARK_RESOURCES_DIR; if it is not set, the original
# local default below is used.
path_resources = Path(
    os.environ.get(
        "BENCHMARK_RESOURCES_DIR",
        r"D:\Eigene Datein\dev\Uni\JGU Bio Bachelorthesis\Daten\resources",
    )
)

# Sub-folders of the resource folder
path_AF2 = path_resources / "AF2"
path_AF3 = path_resources / "AF3_hydrogens"
path_solved = path_resources / "solved"

In [8]:
# Read the "Minimal DMI dataset" sheet of the Lee et al. supplemental workbook
lee_minimal_dmi_xlsx = (
    path_resources
    / "benchmark set"
    / "Lee et al supplemental"
    / "44320_2023_5_moesm3_esm.xlsx"
)
dataLee = pd.read_excel(lee_minimal_dmi_xlsx, sheet_name="Minimal DMI dataset")
display(dataLee)

Unnamed: 0,dmi_type,regular_expression,pdb_id,methods,organisms,uniprot_motif,uniprot_domain,chain_motif,chain_domain,chain_motif_start,chain_motif_end,chain_domain_start,chain_domain_end,motif_secondary_structure
0,DEG_APCC_KENBOX_2,.KEN.,4GGD,mutation analysis; pull down; x-ray crystallog...,Homo sapiens,O60566,Q12834,D,B,6,10,165,476,L
1,DEG_COP1_1,"[STDE]{1,3}.{0,2}[TSDE].{2,3}VP[STDE]G{0,1}[FL...",5IGO,coimmunoprecipitation; competition binding; fl...,Homo sapiens,Q96RU8,P43254,X,D,354,361,352,675,L
2,DEG_Kelch_Keap1_1,[DNS].[DES][TNS]GE,2FLU,alanine scanning; coimmunoprecipitation; compe...,Homo sapiens,Q16236,Q14145,P,X,77,82,325,609,L
3,DEG_Kelch_Keap1_2,QD.DLGV,3WN7,alanine scanning; glutathione s tranferase tag...,Mus musculus,Q60795,Q9Z2X8,B,A,26,32,324,609,H
4,DEG_MDM2_SWIB_1,"F[^P]{3}W[^P]{2,3}[VIL]",1YCR,fluorescence polarization spectroscopy; isothe...,Homo sapiens,P04637,Q00987,B,A,19,26,30,109,H
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
131,LIG_ActinCP_TwfCPI_2,"F.[KR]P..[PAS].{0,3}[RK]",7DS2,colocalization; comigration in non denaturing ...,Mus musculus,Q91YR1,P14315,C,B,323,333,2,244,L
132,LIG_KLC1_Yacidic_2,"[ED].{0,1}[IYVLMTF]Y[LIV][DE]",6FUZ,alanine scanning; coimmunoprecipitation; fluor...,Homo sapiens,Q9UQF2,O88447,A,A,707,711,214,436,L
133,LIG_LYPXL_SIV_4,[PA]Y..[AV][^P]{3}L,2XS1,biosensor; mutation analysis; western blot; x-...,Simian immunodeficiency virus - mac K6W,P05893,Q8WUM4,B,A,44,52,2,698,H
134,TRG_DiLeu_BaEn_1,E..[^P]L[LIVM],4NEE,isothermal titration calorimetry; mutation ana...,HIV-1 M:B_HXB2R,P04601,P62744,C,D,160,165,1,142,L


In [17]:
# Read the "AF-MMv2.2 result" sheet of the Lee et al. supplemental workbook
lee_af_results_xlsx = (
    path_resources
    / "benchmark set"
    / "Lee et al supplemental"
    / "44320_2023_5_moesm4_esm.xlsx"
)
dataLee2 = pd.read_excel(lee_af_results_xlsx, sheet_name="AF-MMv2.2 result")
display(dataLee2)

Unnamed: 0,prediction_name,chainA_length,chainB_length,model_id,model_confidence,chainA_intf_avg_plddt,chainB_intf_avg_plddt,intf_avg_plddt,pDockQ,iPAE,num_chainA_intf_res,num_chainB_intf_res,num_res_res_contact,num_atom_atom_contact,num_mutation_in_motif,label
0,DEG_APCC_KENBOX_2_4GGD,312,5,ranked_0,0.887117,96.107999,77.495999,91.454999,0.162263,3.311542,15,5,23,208,known minimal,1
1,DEG_APCC_KENBOX_2_4GGD,312,5,ranked_1,0.871984,95.793846,73.986000,89.736111,0.145001,3.395909,13,5,20,190,known minimal,1
2,DEG_APCC_KENBOX_2_4GGD,312,5,ranked_2,0.760784,95.547501,57.906001,86.585239,0.116743,6.166772,16,5,27,237,known minimal,1
3,DEG_APCC_KENBOX_2_4GGD,312,5,ranked_3,0.413662,94.646667,21.510000,76.362500,0.036380,16.713730,9,3,11,83,known minimal,1
4,DEG_APCC_KENBOX_2_4GGD,312,5,ranked_4,0.359078,94.830001,19.753333,72.307001,0.029969,18.696838,7,3,9,108,known minimal,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2695,TRG_PTS1_2C0L_NAKL.NAKD,305,4,ranked_0,0.877999,96.388571,74.585001,92.900000,0.234389,2.903771,21,4,31,271,1,0
2696,TRG_PTS1_2C0L_NAKL.NAKD,305,4,ranked_1,0.868754,96.614545,75.270000,93.330769,0.233830,3.026700,22,4,31,262,1,0
2697,TRG_PTS1_2C0L_NAKL.NAKD,305,4,ranked_2,0.852158,96.452917,75.197498,93.416428,0.223745,3.121255,24,4,35,272,1,0
2698,TRG_PTS1_2C0L_NAKL.NAKD,305,4,ranked_3,0.819368,95.764286,69.147499,91.505600,0.204897,4.233161,21,4,31,258,1,0


In [7]:
# Load the tab-separated AF2 metrics table
dataAF2 = pd.read_csv(path_AF2 / "AF2_metrics.tsv", sep="\t")

# Columns that are converted to the nullable integer dtype (keeps missing values)
integer_columns = (
    "chainA_start", "chainA_end", "chainB_start", "chainB_end",
    "num_mutations", "num_align_atoms_domain", "num_align_resi_domain",
    "hbonds", "salt_bridges", "hydrophobic_interactions",
)
for column_name in integer_columns:
    if column_name in dataAF2.columns:
        dataAF2[column_name] = dataAF2[column_name].astype("Int64")
    else:
        # Report columns that are absent from the table instead of failing
        print(f"Column {bcolors.FAIL}{column_name}{bcolors.ENDC} not (yet) in data frame")
display(dataAF2)

Unnamed: 0,project_name,run_id,benchmark_set,prediction_name,model_id,chainA_length,chainB_length,chainA_id,chainB_id,chainA_start,...,DockQ,iRMSD,LRMSD,Fnonnat,buried_area,min_distance,disulfide_bonds,salt_bridges,hbonds,hydrophobic_interactions
0,AlphaFold_benchmark,run37,known_DMI,DEG_APCC_KENBOX_2_4GGD,ranked_0,312,5,A,B,165,...,0.878344,0.603831,1.575394,0.086957,613.651,6.063,0.0,0,9,0
1,AlphaFold_benchmark,run37,known_DMI,DEG_APCC_KENBOX_2_4GGD,ranked_1,312,5,A,B,165,...,0.880716,0.418230,1.100588,0.050000,580.310,6.083,0.0,0,9,0
2,AlphaFold_benchmark,run37,known_DMI,DEG_APCC_KENBOX_2_4GGD,ranked_2,312,5,A,B,165,...,0.883186,0.641834,1.776257,0.185185,662.104,6.072,0.0,0,10,3
3,AlphaFold_benchmark,run37,known_DMI,DEG_APCC_KENBOX_2_4GGD,ranked_3,312,5,A,B,165,...,0.475511,1.686332,5.358800,0.363636,398.498,5.417,0.0,0,2,0
4,AlphaFold_benchmark,run37,known_DMI,DEG_APCC_KENBOX_2_4GGD,ranked_4,312,5,A,B,165,...,0.223400,2.928606,9.908745,0.888889,323.304,5.092,0.0,0,2,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3175,AlphaFold_benchmark_DDI,run6,random_DDI,D1PF18773_PF00071_2X19.D2PF00009_PF01873_2D74,ranked_0,60,113,B,B,392,...,,,,,1617.382,5.591,0.0,3,7,56
3176,AlphaFold_benchmark_DDI,run6,random_DDI,D1PF18773_PF00071_2X19.D2PF00009_PF01873_2D74,ranked_1,60,113,B,B,392,...,,,,,791.256,6.373,0.0,0,3,7
3177,AlphaFold_benchmark_DDI,run6,random_DDI,D1PF18773_PF00071_2X19.D2PF00009_PF01873_2D74,ranked_2,60,113,B,B,392,...,,,,,882.547,7.906,0.0,1,2,11
3178,AlphaFold_benchmark_DDI,run6,random_DDI,D1PF18773_PF00071_2X19.D2PF00009_PF01873_2D74,ranked_3,60,113,B,B,392,...,,,,,1020.896,4.628,0.0,3,7,44


### 1 Finding differences in the datasets

In [9]:
# Keep only the ranked_0 model of every known_DMI prediction from the AF2 table
known_dmi_ranked_0 = dataAF2[
    (dataAF2["benchmark_set"] == "known_DMI") & (dataAF2["model_id"] == "ranked_0")
]
# Outer join so that entries present in only one of the two tables are kept
dataMerged = dataLee.merge(
    known_dmi_ranked_0,
    how="outer",
    left_on=["dmi_type"],
    right_on=["ELM_instance"],
)
display(dataMerged)

Unnamed: 0,dmi_type,regular_expression,pdb_id,methods,organisms,uniprot_motif,uniprot_domain,chain_motif,chain_domain,chain_motif_start,...,DockQ,iRMSD,LRMSD,Fnonnat,buried_area,min_distance,disulfide_bonds,salt_bridges,hbonds,hydrophobic_interactions
0,DEG_APCC_KENBOX_2,.KEN.,4GGD,mutation analysis; pull down; x-ray crystallog...,Homo sapiens,O60566,Q12834,D,B,6,...,0.878344,0.603831,1.575394,0.086957,613.651,6.063,0.0,0,9,0
1,DEG_COP1_1,"[STDE]{1,3}.{0,2}[TSDE].{2,3}VP[STDE]G{0,1}[FL...",5IGO,coimmunoprecipitation; competition binding; fl...,Homo sapiens,Q96RU8,P43254,X,D,354,...,0.203083,3.647857,11.785731,0.818182,1025.173,5.075,0.0,0,6,54
2,DEG_Kelch_Keap1_1,[DNS].[DES][TNS]GE,2FLU,alanine scanning; coimmunoprecipitation; compe...,Homo sapiens,Q16236,Q14145,P,X,77,...,0.956840,0.406243,0.409987,0.135135,853.680,5.658,0.0,4,12,6
3,DEG_Kelch_Keap1_2,QD.DLGV,3WN7,alanine scanning; glutathione s tranferase tag...,Mus musculus,Q60795,Q9Z2X8,B,A,26,...,0.627900,1.098010,3.811533,0.357143,847.029,5.699,0.0,2,4,18
4,DEG_MDM2_SWIB_1,"F[^P]{3}W[^P]{2,3}[VIL]",1YCR,fluorescence polarization spectroscopy; isothe...,Homo sapiens,P04637,Q00987,B,A,19,...,0.927639,0.491499,0.974915,0.038462,969.729,4.725,0.0,0,2,104
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
131,TRG_ER_FFAT_1,"[EDS].{0,4}[ED][FY][FYKREM][DE][AC].{1,2}[EDST]",2RR3,classical fluorescence spectroscopy; confocal ...,Homo sapiens,P22059,Q9P0L0,B,A,356,...,0.832137,1.023273,1.642793,0.128205,1121.818,4.348,0.0,0,7,53
132,TRG_LysEnd_GGAAcLL_1,"D..LL.{1,2}$",1JWG,isothermal titration calorimetry; mutation ana...,Homo sapiens,P11717,Q9UJY5,C,A,7,...,0.949574,0.470555,1.451951,0.093750,1066.671,5.933,0.0,1,6,60
133,TRG_NES_CRM1_1,"([DEQ].{0,1}[LIM].{2,3}[LIVMF][^P]{2,3}[LMVF]....",3GB8,mutation analysis; pull down; x-ray crystallog...,Homo sapiens,O95149,O14980,B,A,2,...,0.683162,1.423170,2.824227,0.153846,1389.344,4.832,0.0,3,5,104
134,TRG_NLS_Bipartite_1,"[KR][KR].{7,15}[^DE]((K[RK])|(RK))(([^DE][KR])...",1PJM,colocalization; mutation analysis; x-ray cryst...,Homo sapiens,P06400,P52293,A,B,860,...,0.838144,0.780334,2.332849,0.160000,2256.518,4.881,0.0,4,20,50


In [13]:
# AF2 rows for which no hbonds value is available
dataAF2.loc[dataAF2["hbonds"].isna()]

Unnamed: 0,project_name,run_id,benchmark_set,prediction_name,model_id,chainA_length,chainB_length,chainA_id,chainB_id,chainA_start,...,DockQ,iRMSD,LRMSD,Fnonnat,buried_area,min_distance,disulfide_bonds,salt_bridges,hbonds,hydrophobic_interactions
2860,AlphaFold_benchmark_DDI,run5,known_DDI,PF07724_PF00227_1OFH_C_resi39_resi340.H_resi1_...,ranked_0,168,172,C,H,39,...,,,,,,,,,,
2861,AlphaFold_benchmark_DDI,run5,known_DDI,PF07724_PF00227_1OFH_C_resi39_resi340.H_resi1_...,ranked_1,168,172,C,H,39,...,,,,,,,,,,
2862,AlphaFold_benchmark_DDI,run5,known_DDI,PF07724_PF00227_1OFH_C_resi39_resi340.H_resi1_...,ranked_2,168,172,C,H,39,...,,,,,,,,,,
2863,AlphaFold_benchmark_DDI,run5,known_DDI,PF07724_PF00227_1OFH_C_resi39_resi340.H_resi1_...,ranked_3,168,172,C,H,39,...,,,,,,,,,,
2864,AlphaFold_benchmark_DDI,run5,known_DDI,PF07724_PF00227_1OFH_C_resi39_resi340.H_resi1_...,ranked_4,168,172,C,H,39,...,,,,,,,,,,
2920,AlphaFold_benchmark_DDI,run5,known_DDI,PF14978_PF00327_3J7Y_o_resi13_resi101.Z_resi57...,ranked_0,89,71,o,Z,13,...,,,,,,,,,,
2921,AlphaFold_benchmark_DDI,run5,known_DDI,PF14978_PF00327_3J7Y_o_resi13_resi101.Z_resi57...,ranked_1,89,71,o,Z,13,...,,,,,,,,,,
2922,AlphaFold_benchmark_DDI,run5,known_DDI,PF14978_PF00327_3J7Y_o_resi13_resi101.Z_resi57...,ranked_2,89,71,o,Z,13,...,,,,,,,,,,
2923,AlphaFold_benchmark_DDI,run5,known_DDI,PF14978_PF00327_3J7Y_o_resi13_resi101.Z_resi57...,ranked_3,89,71,o,Z,13,...,,,,,,,,,,
2924,AlphaFold_benchmark_DDI,run5,known_DDI,PF14978_PF00327_3J7Y_o_resi13_resi101.Z_resi57...,ranked_4,89,71,o,Z,13,...,,,,,,,,,,


In [14]:
# Rows of the outer merge that exist in only one of the two tables:
# - "pdb_id" (Lee table) is missing      -> AF2 entry without a match in the Lee table
# - "ELM_instance" (AF2 table) is missing -> Lee entry without a match in the AF2 table
# Both directions have to be empty before the two sets can be called equal.
dataMerged[dataMerged["pdb_id"].isna() | dataMerged["ELM_instance"].isna()]

Unnamed: 0,dmi_type,regular_expression,pdb_id,methods,organisms,uniprot_motif,uniprot_domain,chain_motif,chain_domain,chain_motif_start,...,DockQ,iRMSD,LRMSD,Fnonnat,buried_area,min_distance,disulfide_bonds,salt_bridges,hbonds,hydrophobic_interactions


---> That's good, the known DMI entries are equal. But where are the differences in the other datasets coming from?

In [16]:
# Merged rows whose domain start in the Lee table ("chain_domain_start")
# is not the same as the chain A start in the AF2 table ("chainA_start")
dataMerged.loc[
    dataMerged["chain_domain_start"] != dataMerged["chainA_start"]
]

Unnamed: 0,dmi_type,regular_expression,pdb_id,methods,organisms,uniprot_motif,uniprot_domain,chain_motif,chain_domain,chain_motif_start,...,DockQ,iRMSD,LRMSD,Fnonnat,buried_area,min_distance,disulfide_bonds,salt_bridges,hbonds,hydrophobic_interactions
80,LIG_MYND_2,PP.LI,2ODD,fluorescence polarization spectroscopy; isothe...,Homo sapiens,Q9Y618,Q06455,B,A,1105,...,0.784829,1.963057,1.018266,0.071429,796.717,4.986,0.0,0,6,51
102,LIG_RPA_C_Vert,[KRS]I[^P][^P][NK][KR][^P][^P]A[^P][^P][RKL][R...,1DPU,isothermal titration calorimetry; mutation ana...,Homo sapiens,P13051,P15927,B,A,73,...,0.667361,1.984128,4.182533,0.333333,936.565,3.966,0.0,1,6,14
103,LIG_Rb_LxCxE_1,"[DEST].{0,4}[LI].C.E.{1,4}[FLMIVAWPHY].{0,8}[D...",1GH6,coimmunoprecipitation; glutathione s tranferas...,Simian virus 40,P03070,P06400,A,B,101,...,0.06652,7.156339,19.661015,1.0,1446.21,6.37,0.0,1,6,67


In [25]:
# Outer join of the Lee et al. results with the AF2 metrics per prediction and model;
# columns occurring in both tables get the suffix "_Lee" or "_AF2"
dataMerged2 = dataLee2.merge(
    dataAF2,
    how="outer",
    on=["prediction_name", "model_id"],
    suffixes=("_Lee", "_AF2"),
)
display(dataMerged2)

Unnamed: 0,prediction_name,chainA_length_Lee,chainB_length_Lee,model_id,model_confidence_Lee,chainA_intf_avg_plddt_Lee,chainB_intf_avg_plddt_Lee,intf_avg_plddt_Lee,pDockQ_Lee,iPAE_Lee,...,DockQ,iRMSD,LRMSD,Fnonnat,buried_area,min_distance,disulfide_bonds,salt_bridges,hbonds,hydrophobic_interactions
0,D1PF00009_PF01873_2D74.D2PF00026_PF06394_1F34,,,ranked_0,,,,,,,...,,,,,1692.652,4.191,0.0,0,1,86
1,D1PF00009_PF01873_2D74.D2PF00026_PF06394_1F34,,,ranked_1,,,,,,,...,,,,,1708.317,3.670,0.0,0,3,96
2,D1PF00009_PF01873_2D74.D2PF00026_PF06394_1F34,,,ranked_2,,,,,,,...,,,,,1907.898,4.643,0.0,1,1,79
3,D1PF00009_PF01873_2D74.D2PF00026_PF06394_1F34,,,ranked_3,,,,,,,...,,,,,1577.972,3.718,0.0,0,5,107
4,D1PF00009_PF01873_2D74.D2PF00026_PF06394_1F34,,,ranked_4,,,,,,,...,,,,,2139.539,3.955,0.0,1,15,101
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3175,TRG_PTS1_2C0L_NAKL.NAKD,305.0,4.0,ranked_0,0.877999,96.388571,74.585001,92.900000,0.234389,2.903771,...,,,,,839.162,4.782,0.0,3,10,11
3176,TRG_PTS1_2C0L_NAKL.NAKD,305.0,4.0,ranked_1,0.868754,96.614545,75.270000,93.330769,0.233830,3.026700,...,,,,,862.629,4.900,0.0,3,8,10
3177,TRG_PTS1_2C0L_NAKL.NAKD,305.0,4.0,ranked_2,0.852158,96.452917,75.197498,93.416428,0.223745,3.121255,...,,,,,861.115,4.704,0.0,2,12,9
3178,TRG_PTS1_2C0L_NAKL.NAKD,305.0,4.0,ranked_3,0.819368,95.764286,69.147499,91.505600,0.204897,4.233161,...,,,,,854.038,4.943,0.0,2,11,9


In [26]:
# Rows without a Lee value for chainA_length, leaving out the two DDI benchmark sets
dataMerged2.loc[
    dataMerged2["chainA_length_Lee"].isna()
    & ~dataMerged2["benchmark_set"].isin(["known_DDI", "random_DDI"])
]

Unnamed: 0,prediction_name,chainA_length_Lee,chainB_length_Lee,model_id,model_confidence_Lee,chainA_intf_avg_plddt_Lee,chainB_intf_avg_plddt_Lee,intf_avg_plddt_Lee,pDockQ_Lee,iPAE_Lee,...,DockQ,iRMSD,LRMSD,Fnonnat,buried_area,min_distance,disulfide_bonds,salt_bridges,hbonds,hydrophobic_interactions


In [None]:
# NOTE(review): same filter as the previous cell (rows without a Lee value for
# chainA_length, DDI benchmark sets left out); this cell is a duplicate
dataMerged2.loc[
    dataMerged2["chainA_length_Lee"].isna()
    & ~dataMerged2["benchmark_set"].isin(["known_DDI", "random_DDI"])
]

In [32]:
# Show all column labels of the merged frame, including the "_Lee"/"_AF2" suffixed ones
dataMerged2.columns

Index(['prediction_name', 'chainA_length_Lee', 'chainB_length_Lee', 'model_id',
       'model_confidence_Lee', 'chainA_intf_avg_plddt_Lee',
       'chainB_intf_avg_plddt_Lee', 'intf_avg_plddt_Lee', 'pDockQ_Lee',
       'iPAE_Lee', 'num_chainA_intf_res_Lee', 'num_chainB_intf_res_Lee',
       'num_res_res_contact_Lee', 'num_atom_atom_contact_Lee',
       'num_mutation_in_motif', 'label', 'project_name', 'run_id',
       'benchmark_set', 'chainA_length_AF2', 'chainB_length_AF2', 'chainA_id',
       'chainB_id', 'chainA_start', 'chainA_end', 'chainB_start', 'chainB_end',
       'PDB_id', 'ELM_instance', 'DDI_pfam_id', 'PDB_id_random_paired',
       'ELM_instance_random_paired', 'DDI_pfam_id_random_paired',
       'sequence_initial', 'sequence_mutated', 'num_mutations',
       'model_confidence_AF2', 'chainA_intf_avg_plddt_AF2',
       'chainB_intf_avg_plddt_AF2', 'intf_avg_plddt_AF2', 'pDockQ_AF2',
       'iPAE_AF2', 'num_chainA_intf_res_AF2', 'num_chainB_intf_res_AF2',
       'num_res_res

In [33]:
# DMI rows whose atom-atom contact count differs between the AF2 table and the
# Lee table (squared difference above a small tolerance)
dataMerged2.loc[
    (
        dataMerged2["num_atom_atom_contact_AF2"]
        - dataMerged2["num_atom_atom_contact_Lee"]
    ).pow(2).gt(0.000001)
    & dataMerged2["benchmark_set"].str.contains("DMI")
]

Unnamed: 0,prediction_name,chainA_length_Lee,chainB_length_Lee,model_id,model_confidence_Lee,chainA_intf_avg_plddt_Lee,chainB_intf_avg_plddt_Lee,intf_avg_plddt_Lee,pDockQ_Lee,iPAE_Lee,...,DockQ,iRMSD,LRMSD,Fnonnat,buried_area,min_distance,disulfide_bonds,salt_bridges,hbonds,hydrophobic_interactions
