# Recreating metrics for solved structures

In [1]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.axes._axes import Axes
from matplotlib.figure import Figure
from pathlib import Path
from sklearn.metrics import roc_curve, roc_auc_score
import re
import tempfile
import shutil
import os
import subprocess
import sys
stdout, stderr = sys.stdout, sys.stderr
from typing import Literal

import pymol
from Bio.SeqUtils import seq1
from Bio.PDB import PDBParser
from Bio.PDB.Structure import Structure as BioPy_PDBStructure
from Bio.PDB.Model import Model as BioPy_PDBModel
from Bio.PDB.Chain import Chain
from Bio.PDB.PDBExceptions import PDBConstructionException
parser = PDBParser(QUIET=True)

class bcolors:
    """ANSI escape sequences used to colour and style printed messages.

    Wrap a text fragment as ``f"{bcolors.FAIL}...{bcolors.ENDC}"``; ``ENDC``
    switches all styling off again.
    """

    # Reset
    ENDC = "\033[0m"

    # Text styles
    BOLD = "\033[1m"
    UNDERLINE = "\033[4m"

    # Foreground colours, named after the kind of message they are meant for
    HEADER = "\033[95m"
    OKBLUE = "\033[94m"
    OKCYAN = "\033[96m"
    OKGREEN = "\033[92m"
    WARNING = "\033[93m"
    FAIL = "\033[91m"

In [2]:
# Path to resource folder with the structures and metadata tables.
# The folder can be overridden without editing the notebook by setting the
# environment variable PPI_RESOURCES_DIR; otherwise the original local default is used.
path_resources = Path(
    os.environ.get("PPI_RESOURCES_DIR")
    or r"D:\Eigene Datein\dev\Uni\JGU Bio Bachelorthesis\Daten\resources"
)

# Paths to the local folders
path_solved = path_resources / "solved"

# Which structure set to process: the variant with added hydrogens ("hydrogens",
# read from the "*_hydrogens" sub-folders) or the raw structures ("raw")
solved_set: Literal["raw", "hydrogens"] = "hydrogens"

In [3]:
# Read in solved structure data

def _list_pdb_files(folder: Path) -> list:
    """Return the ``.pdb`` files directly inside ``folder`` in sorted order.

    ``Path.iterdir()`` yields entries in an arbitrary, file-system dependent
    order; sorting keeps the row order of the resulting table reproducible.
    """
    return sorted(p for p in folder.iterdir() if p.is_file() and p.suffix == ".pdb")


# Sub-folders are named "DMI"/"DDI" for the raw set and "DMI_hydrogens"/"DDI_hydrogens" otherwise
folder_suffix = "_hydrogens" if solved_set == "hydrogens" else ""

# Collect one record per structure and build the table once at the end,
# instead of enlarging the DataFrame row by row.
solved_records = []

# DMI: file names start with the PDB id (e.g. "1ATP_min_DMI.pdb"); chains are always A and B.
# No "DDI_pfam_id" key is given, so that column is left empty (NaN) for these rows.
for structure_file in _list_pdb_files(path_solved / f"DMI{folder_suffix}"):
    solved_records.append({
        "set": "DMI",
        "PDB_id": structure_file.name.split("_")[0],
        "path": structure_file.relative_to(path_solved),
        "chainA_id": "A",
        "chainB_id": "B",
    })

# DDI: file names follow "<pfam A>_<pfam B>_<PDB id>_<chain A><chain B>.pdb"
# (e.g. "PF14447_PF00179_3ZNI_AC.pdb"); each chain id is a single character.
for structure_file in _list_pdb_files(path_solved / f"DDI{folder_suffix}"):
    name_parts = structure_file.name.split("_")
    chain_pair = name_parts[3]
    solved_records.append({
        "set": "DDI",
        "PDB_id": name_parts[2],
        "DDI_pfam_id": "_".join(name_parts[0:2]),
        "path": structure_file.relative_to(path_solved),
        "chainA_id": chain_pair[0],
        "chainB_id": chain_pair[1],
    })

# Passing the column list fixes the column order and still yields the expected
# (empty) table when no structure file was found.
dataSolved = pd.DataFrame(
    solved_records,
    columns=["set", "PDB_id", "DDI_pfam_id", "path", "chainA_id", "chainB_id"],
)

display(dataSolved)

Unnamed: 0,set,PDB_id,DDI_pfam_id,path,chainA_id,chainB_id
0,DMI,1ATP,,DMI_hydrogens\1ATP_min_DMI.pdb,A,B
1,DMI,1AXC,,DMI_hydrogens\1AXC_min_DMI.pdb,A,B
2,DMI,1B72,,DMI_hydrogens\1B72_min_DMI.pdb,A,B
3,DMI,1B8Q,,DMI_hydrogens\1B8Q_min_DMI.pdb,A,B
4,DMI,1BXX,,DMI_hydrogens\1BXX_min_DMI.pdb,A,B
...,...,...,...,...,...,...
183,DDI,3ZNI,PF14447_PF00179,DDI_hydrogens\PF14447_PF00179_3ZNI_AC.pdb,A,C
184,DDI,3J7Y,PF14978_PF00327,DDI_hydrogens\PF14978_PF00327_3J7Y_oZ.pdb,o,Z
185,DDI,6D6Q,PF15985_PF10175,DDI_hydrogens\PF15985_PF10175_6D6Q_GL.pdb,G,L
186,DDI,3KZ1,PF17838_PF00071,DDI_hydrogens\PF17838_PF00071_3KZ1_BE.pdb,B,E


In [4]:
# Make the local "src" folder importable so that the project module measure_PPI can be loaded.
# The relative path is resolved against the current working directory of the kernel.
# (`sys` is already imported in the import cell at the top of the notebook.)
libpath = Path("../src").resolve()
print(libpath)
if str(libpath) not in sys.path:  # re-running this cell must not stack duplicate entries
    sys.path.insert(0, str(libpath))
import measure_PPI

# Collect (absolute file path, structure name) pairs for every structure that exists on disk;
# the structure name is the file name without its ".pdb" extension.
pathObj = []
for rel_path in dataSolved["path"]:
    structure_path: Path = path_solved / rel_path
    if not structure_path.exists():
        print(f"\t{bcolors.FAIL}{structure_path.name} does not exist.{bcolors.ENDC} Skip interface metrics")
        continue
    pathObj.append((structure_path.resolve(), rel_path.stem))

# Compute the interface metrics for all collected structures using 12 worker processes
df_intf_metrics = measure_PPI.Run(pathObj=pathObj, num_threads=12)

D:\Eigene Datein\Programmieren\Git\abrilka\bachelorthesis\src
[2025-04-29 18:21:47,769 | measure_PPI | INFO] Started Taskpool of 12 processes for 188 files
[2025-04-29 18:21:52,810 | measure_PPI | INFO] 48% - ETA 0:00:05 | current speed 18.471 s⁻¹ | average speed 18.272 s⁻¹
[2025-04-29 18:21:58,188 | measure_PPI | INFO] 80% - ETA 0:00:02 | current speed 10.97 s⁻¹ | average speed 14.501 s⁻¹
[2025-04-29 18:22:03,584 | measure_PPI | INFO] 92% - ETA 0:00:01 | current speed 4.077 s⁻¹ | average speed 10.943 s⁻¹
[2025-04-29 18:22:09,322 | measure_PPI | INFO] 97% - ETA 0:00:00 | current speed 1.917 s⁻¹ | average speed 8.539 s⁻¹
[2025-04-29 18:22:27,495 | measure_PPI | INFO] 99% - ETA 0:00:00 | current speed 0.165 s⁻¹ | average speed 4.708 s⁻¹
[2025-04-29 18:22:27,564 | measure_PPI | INFO] Finished processing 188 objects in 0:00:39 | average speed 4.725 s⁻¹


In [5]:
# Show the interface-metrics table returned by measure_PPI.Run (one row per processed structure)
display(df_intf_metrics)

Unnamed: 0,structure_name,file,hbonds,salt_bridges,buried_area,min_distance,hydrophobic_interactions,disulfide_bonds
12,1ATP_min_DMI,1ATP_min_DMI.pdb,1,0,808.592,6.273,77,0
9,1AXC_min_DMI,1AXC_min_DMI.pdb,4,0,1209.332,4.174,62,0
0,1B72_min_DMI,1B72_min_DMI.pdb,1,0,547.149,5.182,36,0
2,1B8Q_min_DMI,1B8Q_min_DMI.pdb,4,0,871.756,4.376,42,0
8,1BXX_min_DMI,1BXX_min_DMI.pdb,6,0,783.465,4.435,19,0
...,...,...,...,...,...,...,...,...
180,PF14447_PF00179_3ZNI_AC,PF14447_PF00179_3ZNI_AC.pdb,10,0,1288.881,5.268,85,0
175,PF14978_PF00327_3J7Y_oZ,PF14978_PF00327_3J7Y_oZ.pdb,8,0,2488.556,4.885,70,0
178,PF15985_PF10175_6D6Q_GL,PF15985_PF10175_6D6Q_GL.pdb,12,1,3659.160,3.891,167,0
182,PF17838_PF00071_3KZ1_BE,PF17838_PF00071_3KZ1_BE.pdb,10,2,1249.398,5.276,60,0


In [6]:
# Copy the interface metrics into dataSolved, matching each structure by its file stem.
# The tuple order below is the order in which the new columns are appended.
metric_columns = (
    "min_distance",
    "buried_area",
    "salt_bridges",
    "hbonds",
    "disulfide_bonds",
    "hydrophobic_interactions",
)

# Start every metric column as empty so that unmatched structures keep None
for metric in metric_columns:
    dataSolved[metric] = None

for row_label, rel_path in dataSolved["path"].items():
    structure_name = rel_path.stem
    matches = df_intf_metrics[df_intf_metrics["structure_name"] == structure_name]

    # Only an unambiguous, single match is accepted; otherwise the row is left empty
    if len(matches) != 1:
        print(f"Failed to locate the experimental structure for {structure_name}")
        continue

    for metric in metric_columns:
        dataSolved.at[row_label, metric] = matches[metric].item()

In [7]:
# Write the annotated table into the "solved" folder, once as TSV and once as Excel workbook.
# The file name reflects which structure set (with or without hydrogens) was processed.
name = "solved_hydrogens_metrics" if solved_set == "hydrogens" else "solved_metrics"

# index=False (the documented boolean) leaves the row index out of both files
dataSolved.to_csv(path_solved / f"{name}.tsv", index=False, sep="\t")
dataSolved.to_excel(path_solved / f"{name}.xlsx", sheet_name="solved metrics", index=False)