# Parsing output of AF3 from the cluster
created by Andreas 2025-02-04

Script to parse output from AF3 running on the cluster. Also detects errors on the runs using the reports

In [215]:
# Imports
from pathlib import Path
import pandas as pd
import numpy as np
import re

from Bio.PDB import PDBParser
from Bio.PDB.Structure import Structure as BioPy_PDBStructure
from Bio.PDB.Model import Model as BioPy_PDBModel
from Bio.PDB.PDBExceptions import PDBConstructionException
parser = PDBParser(QUIET=True)

rescode_dict = {
    "ALA" : "A",
    "ARG" : "R",
    "ASN" : "N",
    "ASP" : "D",
    "CYS" : "C",
    "GLN" : "Q",
    "GLU" : "E",
    "GLY" : "G",
    "HIS" : "H",
    "ILE" : "I",
    "LEU" : "L",
    "LYS" : "K",
    "MET" : "M",
    "PHE" : "F",
    "PRO" : "P",
    "SER" : "S",
    "THR" : "T",
    "TRP" : "W",
    "TYR" : "Y",
    "VAL" : "V"
}

In [2]:
# Path
luck_drive_folder = Path(r"L:\imb-luckgr2\projects\AlphaFold\AlphaFold3")

DMI_folders = [p for p in (luck_drive_folder / "AlphaFold_benchmark_DMI").iterdir() if p.is_dir()]
DDI_folders = [p for p in (luck_drive_folder / "AlphaFold_benchmark_DDI").iterdir() if p.is_dir()]
benchmark_folders = DMI_folders + DDI_folders
for p in benchmark_folders:
    print(p)

L:\imb-luckgr2\projects\AlphaFold\AlphaFold3\AlphaFold_benchmark_DMI\known_minimal
L:\imb-luckgr2\projects\AlphaFold\AlphaFold3\AlphaFold_benchmark_DMI\random_minimal
L:\imb-luckgr2\projects\AlphaFold\AlphaFold3\AlphaFold_benchmark_DMI\mutations
L:\imb-luckgr2\projects\AlphaFold\AlphaFold3\AlphaFold_benchmark_DMI\known_extension
L:\imb-luckgr2\projects\AlphaFold\AlphaFold3\AlphaFold_benchmark_DDI\known_ddi
L:\imb-luckgr2\projects\AlphaFold\AlphaFold3\AlphaFold_benchmark_DDI\random_ddi


### Scanning input .json files and report.html files
Scans for all input .json files and corrosponding report_{time}.html files to find failed runs


benchmark_set refers to the pairing method (mutated, randomized, ...)

prediction_name is None if a report file without a unique prediction id exist

report_file is None if the input file has not been run on the cluster

run_ok refers to if there had been an error running the input file on the server and also None if the input file has not been run on the cluster.

In [6]:
report_df = pd.DataFrame(columns=["benchmark_set", "prediction_name", "report_file", "run_ok"])
for folder in benchmark_folders:
    benchmark_set = folder.name
    print(benchmark_set)
    nextflow_inputs = [f for f in folder.iterdir() if f.is_file() and f.suffix.lower() == ".json"]
    for nextflow_input in nextflow_inputs:
        prediction_name = nextflow_input.stem
        report_df.loc[len(report_df)] = {"benchmark_set": benchmark_set, "prediction_name": prediction_name, "report_file": None, "run_ok": None}

    for p in [f for f in folder.iterdir() if f.is_file() and "report_" in f.stem and f.suffix.lower() == ".html"]:
        print("\t", p.name)
        with open(p) as f:
            content = f.read()
        prediction_name = x.groups()[0] if (x := re.search(r"\(\[id:\[([\w\-\.]+)\], jobsize:\d+\]\)", content)) is not None else None
        finished = bool("Workflow execution completed successfully!" in content)
        num_report_df = len(report_df.loc[np.logical_and(report_df["benchmark_set"] == benchmark_set, report_df["prediction_name"] == prediction_name), ["prediction_name"]])
        if num_report_df == 0:
            print("\t\tNo input json file")
            report_df.loc[len(report_df)] = {"benchmark_set": benchmark_set, "report_file": p.name, "prediction_name": prediction_name, "run_ok": finished}
        elif num_report_df > 1:
            print(f"\t\tMultiple reports for same prediction")
            report_df.loc[len(report_df)] = {"benchmark_set": benchmark_set, "report_file": p.name, "prediction_name": prediction_name, "run_ok": finished, "input_json": False}
        else:
            report_df.loc[np.logical_and(report_df["benchmark_set"] == benchmark_set, report_df["prediction_name"] == prediction_name), ["run_ok"]] = finished
            report_df.loc[np.logical_and(report_df["benchmark_set"] == benchmark_set, report_df["prediction_name"] == prediction_name), ["report_file"]] = p.name

known_minimal
	 report_2025-02-05_13-18.html
	 report_2025-02-05_13-35.html
	 report_2025-02-05_13-51.html
	 report_2025-02-05_14-08.html
	 report_2025-02-05_14-25.html
	 report_2025-02-05_16-00.html
	 report_2025-02-05_16-18.html
	 report_2025-02-05_16-36.html
	 report_2025-02-05_16-54.html
	 report_2025-02-05_17-18.html
	 report_2025-02-05_17-36.html
	 report_2025-02-05_17-51.html
	 report_2025-02-05_18-08.html
	 report_2025-02-05_18-24.html
	 report_2025-02-05_18-41.html
	 report_2025-02-05_18-58.html
	 report_2025-02-05_19-15.html
	 report_2025-02-05_19-32.html
	 report_2025-02-05_19-48.html
	 report_2025-02-05_20-04.html
	 report_2025-02-05_20-34.html
	 report_2025-02-05_21-03.html
	 report_2025-02-05_21-24.html
	 report_2025-02-05_21-46.html
	 report_2025-02-05_22-02.html
	 report_2025-02-05_22-29.html
	 report_2025-02-05_22-51.html
	 report_2025-02-05_23-18.html
	 report_2025-02-05_23-36.html
	 report_2025-02-05_23-52.html
	 report_2025-02-06_00-33.html
	 report_2025-02-06_00-58

In [11]:
x = report_df[report_df["run_ok"] != True]
x

Unnamed: 0,benchmark_set,prediction_name,report_file,run_ok
383,mutations,TRG_NLS_Bipartite_1_1PJM_GKRSAEGSNPPKPLKKL.GGR...,,
384,mutations,TRG_NLS_Bipartite_1_1PJM_GKRSAEGSNPPKPLKKL.GGG...,,
385,mutations,TRG_PTS1_2C0L_NAKL.NAGD,,
386,mutations,LIG_Pex14_1_2W84_WAQEF.GAQEF,,
387,mutations,LIG_Pex14_1_2W84_WAQEF.GAQED,,
...,...,...,...,...
1103,known_extension,DOC_SPAK_OSR1_1_MFL_Dmin,,
1104,known_extension,DOC_SPAK_OSR1_1_M443_M1243_D301_D527,,
1105,known_extension,DOC_SPAK_OSR1_1_M443_M1243_D7_D527,,
1106,known_extension,DOC_USP7_MATH_1_Mmin_DFL,,


In [7]:
print(f"Sceduled runs: {len(report_df)}, finished runs {len(report_df[~report_df['report_file'].isna()])}, of which {len(report_df[(report_df['run_ok'] == True)])} were successful. {len(report_df[report_df['prediction_name'].isna()])} runs could not been identified")
print(f"Benchmark sets: {set(report_df['benchmark_set'])}")
report_df

Sceduled runs: 1202, finished runs 617, of which 610 were successful. 1 runs could not been identified
Benchmark sets: {'mutations', 'random_ddi', 'known_minimal', 'known_extension', 'random_minimal', 'known_ddi'}


Unnamed: 0,benchmark_set,prediction_name,report_file,run_ok
0,known_minimal,LIG_HOMEOBOX_1B72,report_2025-02-05_13-18.html,True
1,known_minimal,DOC_SPAK_OSR1_1_2V3S,report_2025-02-05_13-35.html,True
2,known_minimal,DOC_USP7_MATH_1_3MQS,report_2025-02-05_13-51.html,True
3,known_minimal,DOC_USP7_MATH_2_1YY6,report_2025-02-05_14-08.html,True
4,known_minimal,DOC_USP7_UBL2_3_4YOC,report_2025-02-05_16-00.html,True
...,...,...,...,...
1197,random_ddi,D1PF14447_PF00179_3ZNI.D2PF14978_PF00327_5OOL,report_2025-02-11_07-40.html,True
1198,random_ddi,D1PF14978_PF00327_5OOL.D2PF15985_PF10175_6D6Q,report_2025-02-11_07-59.html,True
1199,random_ddi,D1PF15985_PF10175_6D6Q.D2PF17838_PF00071_3KZ1,report_2025-02-11_08-16.html,True
1200,random_ddi,D1PF17838_PF00071_3KZ1.D2PF18773_PF00071_2X19,report_2025-02-11_08-34.html,True


### Creating merged AF3 output file
Going through the output of the cluster and creating a merged tsv file. On the way, check for missing, corrupted or unexpected data.

In [None]:
dataAF = pd.DataFrame()
missformedOutputs = pd.DataFrame(columns=["benchmark_set", "prediction_name", "model_seed", "reason"])
emptyOutputs = pd.DataFrame(columns=["benchmark_set", "nextflow_name"])

for folder in benchmark_folders:
    benchmark_set = folder.name
    print(benchmark_set)
    nextflowFolders = [p for p in folder.iterdir() if p.is_dir()]
    for nextflowFolder in nextflowFolders:
        print("\t", nextflowFolder.name)
        if not (metricPath := (nextflowFolder / "alphafold3_metrics.tsv")).exists():
            emptyOutputs.loc[len(emptyOutputs)] = {"benchmark_set":benchmark_set, "nextflow_name": nextflowFolder.name}
            continue
        metricFile = pd.read_csv(metricPath, delimiter="\t", header=0)
        metricFile["benchmark_set"] = benchmark_set
        metricFile["prediction_file"] = None
        if not metricFile.shape[0] >= 1:
            emptyOutputs.loc[len(emptyOutputs)] = {"benchmark_set":benchmark_set, "nextflow_name": nextflowFolder.name}
            continue

        prediction_name = metricFile["prediction_name"][0]
        if not len(set(metricFile["prediction_name"])) == 1:
            missformedOutputs.loc[len(missformedOutputs)] = {"benchmark_set": benchmark_set, "prediction_name": prediction_name, "reason": "multiple prediction_name for one structure"}
            continue
        
        if not (structureFolder := nextflowFolder / "predictions" / "alphafold3" / prediction_name).exists():
            missformedOutputs.loc[len(missformedOutputs)] = {"benchmark_set": benchmark_set, "prediction_name": prediction_name, "reason": "prediction folder does not exist"}
            continue
        for model_file in [(p / "model.cif") for p in structureFolder.iterdir() if p.is_dir() and (p / "model.cif").exists()]:
            model_seed = model_file.parent.name
            if len(metricFile.loc[metricFile["model_id"] == model_seed, ["prediction_file"]]) == 0:
                missformedOutputs.loc[len(missformedOutputs)] = {"benchmark_set": benchmark_set, "prediction_name": prediction_name, "reason": "model seed is not contained in tsv file"}
                continue
            metricFile.loc[metricFile["model_id"] == model_seed, ["prediction_file"]] = model_file
        
        metricFile.sort_values(by=['ranking_score'], ascending=False, ignore_index=True, inplace=True)
        metricFile["model_id"] = metricFile.apply(lambda r: f"ranked_{int(r.name)}", axis=1)
        dataAF = pd.concat([dataAF, metricFile], ignore_index=True)
dataAF.drop(columns=["project_name"], inplace=True)
dataAF.rename(columns={"chainA_length": "chainB_length", "chainB_length": "chainA_length", 
                       "chainA_intf_avg_plddt": "chainB_intf_avg_plddt", "chainB_intf_avg_plddt": "chainA_intf_avg_plddt",
                       "num_chainA_intf_res": "num_chainB_intf_res", "num_chainB_intf_res": "num_chainA_intf_res",
                       }, inplace=True)
c = list(dataAF.columns)
c.remove("prediction_name")
c.remove("model_preset")
c.remove("benchmark_set")
c.insert(0, "model_preset")
c.insert(1, "benchmark_set")
c.insert(2, "prediction_name")
c.remove("chainA_length")
c.insert(c.index("chainB_length"), "chainA_length")
c.remove("chainA_intf_avg_plddt")
c.insert(c.index("chainB_intf_avg_plddt"), "chainA_intf_avg_plddt")
c.remove("num_chainA_intf_res")
c.insert(c.index("num_chainB_intf_res"), "num_chainA_intf_res")

dataAF = dataAF[c]
display(dataAF)

known_minimal
	 happy_brenner
	 nice_caravaggio
	 zen_tuckerman
	 adoring_mercator
	 disturbed_lichterman
	 peaceful_allen
	 crazy_goodall
	 intergalactic_lavoisier
	 voluminous_gautier
	 dreamy_golick
	 lonely_ride
	 reverent_lichterman
	 sad_borg
	 pensive_spence
	 exotic_spence
	 suspicious_mclean
	 sick_snyder
	 irreverent_bell
	 cheesy_wing
	 sharp_shaw
	 magical_heisenberg
	 hungry_yonath
	 intergalactic_shaw
	 exotic_mestorf
	 trusting_stonebraker
	 sleepy_koch
	 intergalactic_moriondo
	 romantic_ritchie
	 evil_hilbert
	 agitated_colden
	 furious_pasteur
	 cheeky_cajal
	 thirsty_ptolemy
	 ecstatic_woese
	 gloomy_goldberg
	 chaotic_gauss
	 reverent_booth
	 gigantic_shockley
	 stupefied_swirles
	 drunk_franklin
	 berserk_bassi
	 mad_angela
	 distracted_fermi
	 astonishing_albattani
	 backstabbing_marconi
	 thirsty_leakey
	 determined_linnaeus
	 intergalactic_jepsen
	 cranky_kowalevski
	 prickly_tesla
	 hungry_escher
	 marvelous_cajal
	 maniac_leavitt
	 disturbed_carlsson
	 friendl

Unnamed: 0,model_preset,benchmark_set,prediction_name,model_id,chainA_length,chainB_length,fraction_disordered,has_clash,iptm,ptm,...,chainA_intf_avg_plddt,chainB_intf_avg_plddt,intf_avg_plddt,num_chainA_intf_res,num_chainB_intf_res,num_res_res_contact,num_atom_atom_contact,iPAE,pDockQ,prediction_file
0,alphafold3,known_minimal,lig_homeobox_1b72,ranked_0,4,73,0.05,0.0,0.54,0.80,...,65.87,89.16,83.78,3,10,15,125,6.65,0.03,L:\imb-luckgr2\projects\AlphaFold\AlphaFold3\A...
1,alphafold3,known_minimal,lig_homeobox_1b72,ranked_1,4,73,0.05,0.0,0.46,0.76,...,60.69,84.61,78.23,4,11,19,187,8.60,0.03,L:\imb-luckgr2\projects\AlphaFold\AlphaFold3\A...
2,alphafold3,known_minimal,lig_homeobox_1b72,ranked_2,4,73,0.13,0.0,0.43,0.74,...,56.54,86.77,78.71,4,11,17,166,8.85,0.04,L:\imb-luckgr2\projects\AlphaFold\AlphaFold3\A...
3,alphafold3,known_minimal,lig_homeobox_1b72,ranked_3,4,73,0.09,0.0,0.42,0.77,...,63.45,87.81,80.31,4,9,19,183,8.40,0.04,L:\imb-luckgr2\projects\AlphaFold\AlphaFold3\A...
4,alphafold3,known_minimal,lig_homeobox_1b72,ranked_4,4,73,0.05,0.0,0.37,0.77,...,55.09,89.85,79.92,4,10,18,191,10.20,0.03,L:\imb-luckgr2\projects\AlphaFold\AlphaFold3\A...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3085,alphafold3,random_ddi,d1pf06991_pf08082_7aav.d2pf07417_pf00140_6omf,ranked_0,123,123,0.41,0.0,0.27,0.51,...,44.16,72.66,61.35,25,38,70,520,24.66,0.13,L:\imb-luckgr2\projects\AlphaFold\AlphaFold3\A...
3086,alphafold3,random_ddi,d1pf06991_pf08082_7aav.d2pf07417_pf00140_6omf,ranked_1,123,123,0.44,0.0,0.21,0.50,...,38.80,74.97,61.33,23,38,65,443,23.00,0.08,L:\imb-luckgr2\projects\AlphaFold\AlphaFold3\A...
3087,alphafold3,random_ddi,d1pf06991_pf08082_7aav.d2pf07417_pf00140_6omf,ranked_2,123,123,0.42,0.0,0.11,0.50,...,37.38,72.29,56.78,28,35,66,565,22.90,0.09,L:\imb-luckgr2\projects\AlphaFold\AlphaFold3\A...
3088,alphafold3,random_ddi,d1pf06991_pf08082_7aav.d2pf07417_pf00140_6omf,ranked_3,123,123,0.43,0.0,0.09,0.48,...,28.13,74.85,57.13,11,18,29,208,23.81,0.03,L:\imb-luckgr2\projects\AlphaFold\AlphaFold3\A...


In [72]:
# Find missing structures and correct lower case names

# First detecting missing structures
report_df_ = report_df[~report_df["prediction_name"].isna()].copy()
report_df_["prediction_name_lower"] = report_df["prediction_name"].str.lower()
input_output_merge = pd.merge(
    left = dataAF,
    right = report_df_,
    how="outer",
    left_on = ["benchmark_set", "prediction_name"],
    right_on = ["benchmark_set", "prediction_name_lower"],
    suffixes = ["", "_input"]
)
missingOutputs = input_output_merge[np.logical_and(~input_output_merge["run_ok"].isna(), input_output_merge["prediction_name"].isna())]
missingOutputs = missingOutputs[["benchmark_set", "prediction_name_input", "report_file", "run_ok"]]
unidentifiedOutputs = input_output_merge[input_output_merge["prediction_name_input"].isna()]

# Correcting lower case names
dataAF = pd.merge(
    left = dataAF,
    right = report_df_,
    how="left",
    left_on = ["benchmark_set", "prediction_name"],
    right_on = ["benchmark_set", "prediction_name_lower"],
    suffixes = ["", "_input"]
)
dataAF["prediction_name"] = dataAF["prediction_name_input"]
dataAF.drop(columns=["prediction_name_input", "prediction_name_lower", "report_file", "run_ok"], inplace=True)

In [73]:
dataAF

Unnamed: 0,model_preset,benchmark_set,prediction_name,model_id,chainA_length,chainB_length,fraction_disordered,has_clash,iptm,ptm,...,chainA_intf_avg_plddt,chainB_intf_avg_plddt,intf_avg_plddt,num_chainA_intf_res,num_chainB_intf_res,num_res_res_contact,num_atom_atom_contact,iPAE,pDockQ,prediction_file
0,alphafold3,known_minimal,LIG_HOMEOBOX_1B72,ranked_0,73,4,0.05,0.0,0.54,0.80,...,89.16,65.87,83.78,10,3,15,125,6.65,0.03,L:\imb-luckgr2\projects\AlphaFold\AlphaFold3\A...
1,alphafold3,known_minimal,LIG_HOMEOBOX_1B72,ranked_1,73,4,0.05,0.0,0.46,0.76,...,84.61,60.69,78.23,11,4,19,187,8.60,0.03,L:\imb-luckgr2\projects\AlphaFold\AlphaFold3\A...
2,alphafold3,known_minimal,LIG_HOMEOBOX_1B72,ranked_2,73,4,0.13,0.0,0.43,0.74,...,86.77,56.54,78.71,11,4,17,166,8.85,0.04,L:\imb-luckgr2\projects\AlphaFold\AlphaFold3\A...
3,alphafold3,known_minimal,LIG_HOMEOBOX_1B72,ranked_3,73,4,0.09,0.0,0.42,0.77,...,87.81,63.45,80.31,9,4,19,183,8.40,0.04,L:\imb-luckgr2\projects\AlphaFold\AlphaFold3\A...
4,alphafold3,known_minimal,LIG_HOMEOBOX_1B72,ranked_4,73,4,0.05,0.0,0.37,0.77,...,89.85,55.09,79.92,10,4,18,191,10.20,0.03,L:\imb-luckgr2\projects\AlphaFold\AlphaFold3\A...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3085,alphafold3,random_ddi,D1PF06991_PF08082_7AAV.D2PF07417_PF00140_6OMF,ranked_0,123,123,0.41,0.0,0.27,0.51,...,72.66,44.16,61.35,38,25,70,520,24.66,0.13,L:\imb-luckgr2\projects\AlphaFold\AlphaFold3\A...
3086,alphafold3,random_ddi,D1PF06991_PF08082_7AAV.D2PF07417_PF00140_6OMF,ranked_1,123,123,0.44,0.0,0.21,0.50,...,74.97,38.80,61.33,38,23,65,443,23.00,0.08,L:\imb-luckgr2\projects\AlphaFold\AlphaFold3\A...
3087,alphafold3,random_ddi,D1PF06991_PF08082_7AAV.D2PF07417_PF00140_6OMF,ranked_2,123,123,0.42,0.0,0.11,0.50,...,72.29,37.38,56.78,35,28,66,565,22.90,0.09,L:\imb-luckgr2\projects\AlphaFold\AlphaFold3\A...
3088,alphafold3,random_ddi,D1PF06991_PF08082_7AAV.D2PF07417_PF00140_6OMF,ranked_3,123,123,0.43,0.0,0.09,0.48,...,74.85,28.13,57.13,18,11,29,208,23.81,0.03,L:\imb-luckgr2\projects\AlphaFold\AlphaFold3\A...


In [74]:
print(f"Currently {len(set(dataAF['prediction_name']))} predictions have run (not including errors)")
display(dataAF)
print("Processed files with errors or missing output")
display(missingOutputs)
print("Missformed outputs")
display(missformedOutputs)
print("Empty output folders")
display(emptyOutputs)

Currently 618 predictions have run (not including errors)


Unnamed: 0,model_preset,benchmark_set,prediction_name,model_id,chainA_length,chainB_length,fraction_disordered,has_clash,iptm,ptm,...,chainA_intf_avg_plddt,chainB_intf_avg_plddt,intf_avg_plddt,num_chainA_intf_res,num_chainB_intf_res,num_res_res_contact,num_atom_atom_contact,iPAE,pDockQ,prediction_file
0,alphafold3,known_minimal,LIG_HOMEOBOX_1B72,ranked_0,73,4,0.05,0.0,0.54,0.80,...,89.16,65.87,83.78,10,3,15,125,6.65,0.03,L:\imb-luckgr2\projects\AlphaFold\AlphaFold3\A...
1,alphafold3,known_minimal,LIG_HOMEOBOX_1B72,ranked_1,73,4,0.05,0.0,0.46,0.76,...,84.61,60.69,78.23,11,4,19,187,8.60,0.03,L:\imb-luckgr2\projects\AlphaFold\AlphaFold3\A...
2,alphafold3,known_minimal,LIG_HOMEOBOX_1B72,ranked_2,73,4,0.13,0.0,0.43,0.74,...,86.77,56.54,78.71,11,4,17,166,8.85,0.04,L:\imb-luckgr2\projects\AlphaFold\AlphaFold3\A...
3,alphafold3,known_minimal,LIG_HOMEOBOX_1B72,ranked_3,73,4,0.09,0.0,0.42,0.77,...,87.81,63.45,80.31,9,4,19,183,8.40,0.04,L:\imb-luckgr2\projects\AlphaFold\AlphaFold3\A...
4,alphafold3,known_minimal,LIG_HOMEOBOX_1B72,ranked_4,73,4,0.05,0.0,0.37,0.77,...,89.85,55.09,79.92,10,4,18,191,10.20,0.03,L:\imb-luckgr2\projects\AlphaFold\AlphaFold3\A...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3085,alphafold3,random_ddi,D1PF06991_PF08082_7AAV.D2PF07417_PF00140_6OMF,ranked_0,123,123,0.41,0.0,0.27,0.51,...,72.66,44.16,61.35,38,25,70,520,24.66,0.13,L:\imb-luckgr2\projects\AlphaFold\AlphaFold3\A...
3086,alphafold3,random_ddi,D1PF06991_PF08082_7AAV.D2PF07417_PF00140_6OMF,ranked_1,123,123,0.44,0.0,0.21,0.50,...,74.97,38.80,61.33,38,23,65,443,23.00,0.08,L:\imb-luckgr2\projects\AlphaFold\AlphaFold3\A...
3087,alphafold3,random_ddi,D1PF06991_PF08082_7AAV.D2PF07417_PF00140_6OMF,ranked_2,123,123,0.42,0.0,0.11,0.50,...,72.29,37.38,56.78,35,28,66,565,22.90,0.09,L:\imb-luckgr2\projects\AlphaFold\AlphaFold3\A...
3088,alphafold3,random_ddi,D1PF06991_PF08082_7AAV.D2PF07417_PF00140_6OMF,ranked_3,123,123,0.43,0.0,0.09,0.48,...,74.85,28.13,57.13,18,11,29,208,23.81,0.03,L:\imb-luckgr2\projects\AlphaFold\AlphaFold3\A...


Processed files with errors or missing output


Unnamed: 0,benchmark_set,prediction_name_input,report_file,run_ok
409,known_extension,DOC_USP7_UBL2_3_Mmin_D528_D865,report_2025-02-01_10-45.html,False
947,known_extension,LIG_PDZ_Class_1_M1590_M1601_D1084_D1593,report_2025-02-03_15-49.html,False
1191,known_extension,LIG_Vh1_VBS_1_M532_M702_D1_D925,report_2025-02-03_15-34.html,False
1203,known_extension,LIG_Vh1_VBS_1_MFL_Dmin,report_2025-02-03_04-53.html,False
1230,known_extension,LIG_WW_1_Mmin_D2925_D3362,report_2025-01-31_19-26.html,False
1231,known_extension,LIG_WW_1_Mmin_DFL,report_2025-02-02_09-00.html,False


Missformed outputs


Unnamed: 0,benchmark_set,prediction_name,model_seed,reason


Empty output folders


Unnamed: 0,benchmark_set,nextflow_name
0,known_minimal,disturbed_lichterman
1,mutations,angry_newton
2,known_extension,sad_austin
3,known_extension,jolly_dalembert
4,known_extension,magical_boltzmann
5,known_extension,sharp_kay
6,known_extension,nostalgic_swanson
7,known_extension,high_raman
8,known_extension,nasty_fourier
9,known_extension,nasty_bell


#### Parsing file name

In [75]:
regex_paired_DMI = r"^([\w\-]+)_(\w{4})$"
regex_random_DMI = r"^M([\w\-]+)_(\w{4})\.D([\w\-]+)_(\w{4})$"
regex_mutated_DMI = r"^([\w\-]+)_(\w{4})_(\w+)\.([A-Za-z]+)$"
regex_known_extension_DMI = r"^([\w-]+)_((Mmin)|(MFL)|(M[\d]+_M[\d]+))_((DFL)|(Dmin)|(D[\d]+_D[\d]+))$"
regex_ddi_known = r"^([^\W_]+_[^\W_]+)_(\w{4})_(\w+)_resi(\d+)_resi(\d+).(\w+)_resi(\d+)_resi(\d+)$"
regex_ddi_random = r"^(D1[^\W_]+_[^\W_]+)_(\w{4}).(D2[^\W_]+_[^\W_]+)_(\w{4})$"


dataAF["PDB_id"] = None
dataAF["PDB_id_random_paired"] = None
dataAF["ELM_instance"] = None
dataAF["ELM_instance_random_paired"] = None
dataAF["sequence_initial"] = None
dataAF["sequence_mutated"] = None
dataAF["known_extension_motif"] = None
dataAF["known_extension_domain"] = None
dataAF["chain1_letter"] = None
dataAF["chain2_letter"] = None
dataAF["ddi_pfam_id"] = None
dataAF["ddi_pfam_id_random_paired"] = None

for i, row in dataAF.iterrows():
    pdb_id, pdb_id_2, elm_instance, elm_instance_2, sequence, sequence_f = None, None, None, None, None, None
    known_extensionM, known_extensionD, chain1_letter, chain2_letter, ddi_pfam_id, ddi_pfam_id_random_paired = None, None, None, None, None, None
    if (benchmark_set := row["benchmark_set"]) == "known_minimal":
        if (r1 := re.search(regex_paired_DMI, row["prediction_name"])) is not None and len(r1.groups()) == 2:
            elm_instance = r1.groups()[0]
            pdb_id = r1.groups()[1]
    elif benchmark_set == "random_minimal":
        if (r := re.search(regex_random_DMI, row["prediction_name"])) is not None and len(r.groups()) == 4:
            elm_instance = r.groups()[0]
            pdb_id = r.groups()[1]
            elm_instance_2 = r.groups()[2]
            pdb_id_2 = r.groups()[3]
    elif benchmark_set == "mutations":
        if (r := re.search(regex_mutated_DMI, row["prediction_name"])) is not None and len(r.groups()) == 4:
            elm_instance = r.groups()[0]
            pdb_id = r.groups()[1]
            sequence = r.groups()[2]
            sequence_f = r.groups()[3]
    elif benchmark_set == "known_extension":
        if (r := re.search(regex_known_extension_DMI, row["prediction_name"])) is not None and len(r.groups()) == 9:
            elm_instance = r.groups()[0]
            known_extensionM = r.groups()[1]
            known_extensionD = r.groups()[5]
    elif benchmark_set == "known_ddi":
        if (r := re.search(regex_ddi_known, row["prediction_name"])) is not None and len(r.groups()) == 8:
            ddi_pfam_id = r.groups()[0]
            pdb_id = r.groups()[1]
            chain1_letter = r.groups()[2]
            chain2_letter = r.groups()[5]
    elif benchmark_set == "random_ddi":
        if (r := re.search(regex_ddi_random, row["prediction_name"])) is not None and len(r.groups()) == 4:
            ddi_pfam_id = r.groups()[0]
            pdb_id = r.groups()[1]   
            ddi_pfam_id_random_paired = r.groups()[2]
            pdb_id_2 = r.groups()[3]  
    dataAF.at[i, "PDB_id"] =  pdb_id
    dataAF.at[i, "PDB_id_random_paired"] =  pdb_id_2
    dataAF.at[i, "ELM_instance"] =  elm_instance
    dataAF.at[i, "ELM_instance_random_paired"] =  elm_instance_2
    dataAF.at[i, "sequence_initial"] =  sequence
    dataAF.at[i, "sequence_mutated"] =  sequence_f
    dataAF.at[i, "known_extension_motif"] =  known_extensionM
    dataAF.at[i, "known_extension_domain"] =  known_extensionD
    dataAF.at[i, "chain1_letter"] =  chain1_letter
    dataAF.at[i, "chain2_letter"] =  chain2_letter
    dataAF.at[i, "ddi_pfam_id"] =  ddi_pfam_id
    dataAF.at[i, "ddi_pfam_id_random_paired"] =  ddi_pfam_id_random_paired

dataAF

Unnamed: 0,model_preset,benchmark_set,prediction_name,model_id,chainA_length,chainB_length,fraction_disordered,has_clash,iptm,ptm,...,ELM_instance,ELM_instance_random_paired,sequence_initial,sequence_mutated,known_extension_motif,known_extension_domain,chain1_letter,chain2_letter,ddi_pfam_id,ddi_pfam_id_random_paired
0,alphafold3,known_minimal,LIG_HOMEOBOX_1B72,ranked_0,73,4,0.05,0.0,0.54,0.80,...,LIG_HOMEOBOX,,,,,,,,,
1,alphafold3,known_minimal,LIG_HOMEOBOX_1B72,ranked_1,73,4,0.05,0.0,0.46,0.76,...,LIG_HOMEOBOX,,,,,,,,,
2,alphafold3,known_minimal,LIG_HOMEOBOX_1B72,ranked_2,73,4,0.13,0.0,0.43,0.74,...,LIG_HOMEOBOX,,,,,,,,,
3,alphafold3,known_minimal,LIG_HOMEOBOX_1B72,ranked_3,73,4,0.09,0.0,0.42,0.77,...,LIG_HOMEOBOX,,,,,,,,,
4,alphafold3,known_minimal,LIG_HOMEOBOX_1B72,ranked_4,73,4,0.05,0.0,0.37,0.77,...,LIG_HOMEOBOX,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3085,alphafold3,random_ddi,D1PF06991_PF08082_7AAV.D2PF07417_PF00140_6OMF,ranked_0,123,123,0.41,0.0,0.27,0.51,...,,,,,,,,,D1PF06991_PF08082,D2PF07417_PF00140
3086,alphafold3,random_ddi,D1PF06991_PF08082_7AAV.D2PF07417_PF00140_6OMF,ranked_1,123,123,0.44,0.0,0.21,0.50,...,,,,,,,,,D1PF06991_PF08082,D2PF07417_PF00140
3087,alphafold3,random_ddi,D1PF06991_PF08082_7AAV.D2PF07417_PF00140_6OMF,ranked_2,123,123,0.42,0.0,0.11,0.50,...,,,,,,,,,D1PF06991_PF08082,D2PF07417_PF00140
3088,alphafold3,random_ddi,D1PF06991_PF08082_7AAV.D2PF07417_PF00140_6OMF,ranked_3,123,123,0.43,0.0,0.09,0.48,...,,,,,,,,,D1PF06991_PF08082,D2PF07417_PF00140


## Calculations of template dependend metrics

Some columns are yet missing: RMSD_domain	num_align_atoms_domain	align_score_domain	num_align_resi_domain	RMSD_backbone_peptide	RMSD_all_atom_peptide	known_motif_plddt	DockQ	iRMS	LRMS	Fnonnat	num_mutation_in_motif

Also, model_confidence is missing. But where to restore it?

First load solved structure and check for which files a PDB id exist

In [223]:
dataSolved = pd.DataFrame(columns=["set", "PDB_id", "ddi_pfam_id", "chain_A_id", "chain_B_id", "chain_A_sequence", "chain_B_sequence"])

solved_base_path = Path("../ressources/solved").resolve()
if not solved_base_path.exists():
    raise RuntimeError(f"The path {solved_base_path} does not exist")

DDI_solved = [p for p in (solved_base_path / "DDI_solved_structures").iterdir() if p.is_file() and p.suffix.lower() == ".pdb"]
DMI_solved = [p for p in (solved_base_path / "DMI_solved_structures").iterdir() if p.is_file() and p.suffix.lower() == ".pdb"]

# First DMI
for structure_file in DMI_solved:
    pdb_id = structure_file.name.split("_")[0]
    if not len(pdb_id) == 4:
        raise RuntimeError(f"Unexpected file name {structure_file.name}")
    

    structure_biopy = parser.get_structure("structure", file=structure_file)
    chains = [c for c in structure_biopy.get_chains()]
    if len(chains) != 2:
        print(f"Unexpected chains in {structure_file.name}")
        continue
    chainA = structure_biopy[0][chains[0].id]
    chainB = structure_biopy[0][chains[1].id]

    sequenceA = '-'.join([r.get_resname() for r in chainA.get_residues()])
    sequenceB = '-'.join([r.get_resname() for r in chainB.get_residues()])

    dataSolved.loc[len(dataSolved)] = {"set" : "DMI", "PDB_id": pdb_id, "chain_A_sequence": sequenceA, "chain_B_sequence": sequenceB}




# Now DDI
for structure_file in DDI_solved:
    pfam_id = "_".join(structure_file.name.split("_")[0:2])
    pdb_id = structure_file.name.split("_")[2]
    chainA_id = structure_file.name.split("_")[3][0]
    chainB_id = structure_file.name.split("_")[3][1]

    structure_biopy = parser.get_structure("structure", file=structure_file)
    chains = [c for c in structure_biopy.get_chains()]
    chain_ids = [c.id for c in structure_biopy.get_chains()]
    if len(chains) != 2 or chainA_id not in chain_ids or chainB_id not in chain_ids:
        print(f"Unexpected chains in {structure_file.name}: Expected {chainA_id} and {chainB_id}, got {chains}")
        continue
    chainA = structure_biopy[0][chainA_id]
    chainB = structure_biopy[0][chainB_id]

    sequenceA = '-'.join([r.get_resname() for r in chainA.get_residues()])
    sequenceB = '-'.join([r.get_resname() for r in chainB.get_residues()])

    dataSolved.loc[len(dataSolved)] = {"set" : "DDI", "PDB_id": pdb_id, "ddi_pfam_id": ddi_pfam_id, "chain_A_id": chainA_id, "chain_B_id": chainB_id, "chain_A_sequence": sequenceA, "chain_B_sequence": sequenceB}

display(dataSolved)

Unnamed: 0,set,PDB_id,ddi_pfam_id,chain_A_id,chain_B_id,chain_A_sequence,chain_B_sequence
0,DMI,1ATP,,,,GLN-PHE-ASP-ARG-ILE-LYS-THR-LEU-GLY-THR-GLY-SE...,PHE-THR-GLU-PHE
1,DMI,1AXC,,,,MET-PHE-GLU-ALA-ARG-LEU-VAL-GLN-GLY-SER-ILE-LE...,GLN-THR-SER-MET-THR-ASP-PHE-TYR-HIS-SER
2,DMI,1B72,,,,ARG-LYS-ARG-ARG-ASN-PHE-ASN-LYS-GLN-ALA-THR-GL...,PHE-ASP-TRP-MET
3,DMI,1B8Q,,,,ASN-VAL-ILE-SER-VAL-ARG-LEU-PHE-LYS-ARG-LYS-VA...,VAL-LYS-VAL-ASP-SER-VAL
4,DMI,1BXX,,,,ILE-GLY-TRP-ARG-ARG-GLU-GLY-ILE-LYS-TYR-ARG-AR...,TYR-GLN-ARG-LEU
...,...,...,...,...,...,...,...
181,DDI,3ZNI,D1PF06991_PF08082,A,C,GLN-ALA-ALA-ALA-ASP-ARG-ARG-THR-VAL-GLU-LYS-TH...,ALA-LEU-LYS-ARG-ILE-HIS-LYS-GLU-LEU-ASN-ASP-LE...
182,DDI,3J7Y,D1PF06991_PF08082,o,Z,ARG-GLY-ARG-ILE-PRO-GLY-ARG-GLN-TRP-ILE-GLY-LY...,LYS-PHE-THR-ARG-SER-ARG-ILE-PRO-GLU-LYS-VAL-PH...
183,DDI,6D6Q,D1PF06991_PF08082,G,L,ALA-ARG-ALA-ALA-ARG-THR-VAL-LEU-GLY-GLN-VAL-VA...,ARG-LYS-THR-ARG-LEU-SER-LYS-ASN-LEU-LEU-ARG-ME...
184,DDI,3KZ1,D1PF06991_PF08082,B,E,ASN-TRP-GLN-HIS-THR-VAL-GLY-LYS-ASP-VAL-VAL-AL...,ALA-ILE-ARG-LYS-LYS-LEU-VAL-ILE-VAL-GLY-ASP-GL...


KeyError: 'TPO'

### Save metric file

In [45]:
try:
    dataAF
except NameError:
    raise Exception("Please first run the cells to get the dataAF frame")

destination = Path(r"..\ressources\AF3").resolve()
if not destination.exists() or not destination.is_dir():
    raise Exception("Your destination path is not valid")

dataAF.to_csv(destination / "AF3_metrics.tsv", sep="\t", index=False)
dataAF.to_excel(destination / "AF3_metrics.xlsx", sheet_name="AF3", index=False)

## Converting AF3 structure files (.cif) to pdb files

In [76]:
import pymol

In [77]:
dataAF

Unnamed: 0,model_preset,benchmark_set,prediction_name,model_id,chainA_length,chainB_length,fraction_disordered,has_clash,iptm,ptm,...,ELM_instance,ELM_instance_random_paired,sequence_initial,sequence_mutated,known_extension_motif,known_extension_domain,chain1_letter,chain2_letter,ddi_pfam_id,ddi_pfam_id_random_paired
0,alphafold3,known_minimal,LIG_HOMEOBOX_1B72,ranked_0,73,4,0.05,0.0,0.54,0.80,...,LIG_HOMEOBOX,,,,,,,,,
1,alphafold3,known_minimal,LIG_HOMEOBOX_1B72,ranked_1,73,4,0.05,0.0,0.46,0.76,...,LIG_HOMEOBOX,,,,,,,,,
2,alphafold3,known_minimal,LIG_HOMEOBOX_1B72,ranked_2,73,4,0.13,0.0,0.43,0.74,...,LIG_HOMEOBOX,,,,,,,,,
3,alphafold3,known_minimal,LIG_HOMEOBOX_1B72,ranked_3,73,4,0.09,0.0,0.42,0.77,...,LIG_HOMEOBOX,,,,,,,,,
4,alphafold3,known_minimal,LIG_HOMEOBOX_1B72,ranked_4,73,4,0.05,0.0,0.37,0.77,...,LIG_HOMEOBOX,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3085,alphafold3,random_ddi,D1PF06991_PF08082_7AAV.D2PF07417_PF00140_6OMF,ranked_0,123,123,0.41,0.0,0.27,0.51,...,,,,,,,,,D1PF06991_PF08082,D2PF07417_PF00140
3086,alphafold3,random_ddi,D1PF06991_PF08082_7AAV.D2PF07417_PF00140_6OMF,ranked_1,123,123,0.44,0.0,0.21,0.50,...,,,,,,,,,D1PF06991_PF08082,D2PF07417_PF00140
3087,alphafold3,random_ddi,D1PF06991_PF08082_7AAV.D2PF07417_PF00140_6OMF,ranked_2,123,123,0.42,0.0,0.11,0.50,...,,,,,,,,,D1PF06991_PF08082,D2PF07417_PF00140
3088,alphafold3,random_ddi,D1PF06991_PF08082_7AAV.D2PF07417_PF00140_6OMF,ranked_3,123,123,0.43,0.0,0.09,0.48,...,,,,,,,,,D1PF06991_PF08082,D2PF07417_PF00140


In [205]:
try:
    dataAF
except NameError:
    raise Exception("Please first run the cells to get the dataAF frame")

# If this property is not set, pymol will ignore the alter commands on the ID when exporting
pymol.cmd.set("pdb_retain_ids", 0)

destination = Path("../ressources/AF3")
if not destination.exists() or not destination.is_dir():
    raise Exception("Your destination folder does not exist")
for index, row in dataAF.iterrows():
    prediction_file = Path(row["prediction_file"])
    if not prediction_file.exists():
        print(f"{prediction_file.name} does not exist")
        continue


    structure_folder_dest: Path = (destination / ("DDI" if "ddi" in str(row['benchmark_set']).lower() else "DMI") / row["benchmark_set"] / row["prediction_name"])
    structure_folder_dest.mkdir(parents=True, exist_ok=True)

    if (structure_file_dest := structure_folder_dest / (str(row["model_id"]) + ".pdb")).exists():
        print(f"{row["prediction_name"]}/{structure_file_dest.name} already processed. Skip")
        continue
    else:
        print(f"{row["prediction_name"]}")

    pymol.cmd.load(prediction_file, prediction_file.stem)

    # Reorder chains
    pymol.cmd.alter(selection="chain A", expression="chain = 'C'")
    pymol.cmd.alter(selection="chain B", expression="chain = 'A'")
    pymol.cmd.alter(selection="chain C", expression="chain = 'B'")
    pymol.cmd.alter(selection="segi A", expression="segi = 'C'")
    pymol.cmd.alter(selection="segi B", expression="segi = 'A'")
    pymol.cmd.alter(selection="segi C", expression="segi = 'B'")
    pymol.cmd.sort()
    pymol.cmd.alter(selection="chain A", expression=f"ID = (int(ID) - {pymol.cmd.count_atoms('chain B')})")
    pymol.cmd.alter(selection="chain B", expression=f"ID = (int(ID) + {pymol.cmd.count_atoms('chain A')})")
    pymol.cmd.sort()

    pymol.cmd.save(structure_file_dest)
    for o in pymol.cmd.get_object_list():
        pymol.cmd.delete(o)

LIG_HOMEOBOX_1B72/ranked_0.pdb: Overwrite
LIG_HOMEOBOX_1B72/ranked_1.pdb: Overwrite
LIG_HOMEOBOX_1B72/ranked_2.pdb: Overwrite
LIG_HOMEOBOX_1B72/ranked_3.pdb: Overwrite
LIG_HOMEOBOX_1B72/ranked_4.pdb: Overwrite
DOC_SPAK_OSR1_1_2V3S/ranked_0.pdb: Overwrite
DOC_SPAK_OSR1_1_2V3S/ranked_1.pdb: Overwrite
DOC_SPAK_OSR1_1_2V3S/ranked_2.pdb: Overwrite
DOC_SPAK_OSR1_1_2V3S/ranked_3.pdb: Overwrite
DOC_SPAK_OSR1_1_2V3S/ranked_4.pdb: Overwrite
DOC_USP7_MATH_1_3MQS/ranked_0.pdb: Overwrite
DOC_USP7_MATH_1_3MQS/ranked_1.pdb: Overwrite
DOC_USP7_MATH_1_3MQS/ranked_2.pdb: Overwrite
DOC_USP7_MATH_1_3MQS/ranked_3.pdb: Overwrite
DOC_USP7_MATH_1_3MQS/ranked_4.pdb: Overwrite
DOC_USP7_MATH_2_1YY6/ranked_0.pdb: Overwrite
DOC_USP7_MATH_2_1YY6/ranked_1.pdb: Overwrite
DOC_USP7_MATH_2_1YY6/ranked_2.pdb: Overwrite
DOC_USP7_MATH_2_1YY6/ranked_3.pdb: Overwrite
DOC_USP7_MATH_2_1YY6/ranked_4.pdb: Overwrite
DOC_USP7_UBL2_3_4YOC/ranked_0.pdb: Overwrite
DOC_USP7_UBL2_3_4YOC/ranked_1.pdb: Overwrite
DOC_USP7_UBL2_3_4YOC/rank

In [200]:
# Run this cell if the code above crashes to reset pymol
for o in pymol.cmd.get_object_list():
        pymol.cmd.delete(o)

In [118]:
pymol.cmd.iterate('chain B', "print(ID)")

613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
641
642
640
643
644
645
646
647
648
649
650
651
652
653
654


42