# Parsing output of AF3 from the cluster
created by Andreas 2025-02-04

Script to parse the output of AF3 runs on the cluster. It also detects failed runs by inspecting the report files.

In [1]:
# Imports
from pathlib import Path
import pandas as pd
import numpy as np
import re

In [2]:
# Base folder of the AF3 benchmark output; each sub-directory is treated as one benchmark set
outputBase = Path(r"L:\imb-luckgr2\projects\AlphaFold\AlphaFold3\AlphaFold_benchmark_DMI")

# Collect the sub-directories and echo them in the order they are found
outputFolders = []
for entry in outputBase.iterdir():
    if entry.is_dir():
        outputFolders.append(entry)
        print(entry)

L:\imb-luckgr2\projects\AlphaFold\AlphaFold3\AlphaFold_benchmark_DMI\known_extension
L:\imb-luckgr2\projects\AlphaFold\AlphaFold3\AlphaFold_benchmark_DMI\known_minimal
L:\imb-luckgr2\projects\AlphaFold\AlphaFold3\AlphaFold_benchmark_DMI\mutations
L:\imb-luckgr2\projects\AlphaFold\AlphaFold3\AlphaFold_benchmark_DMI\random_minimal


### Scanning input .json files and report.html files
Scans for all input .json files and the corresponding report_{time}.html files to find failed runs


benchmark_set refers to the pairing method (mutated, randomized, ...)

prediction_name is None if a report file exists that does not contain a unique prediction id

report_file is None if the input file has not been run on the cluster

run_ok indicates whether the run of the input file completed successfully on the cluster (False if the report does not show a successful completion); it is None if the input file has not been run on the cluster.

In [9]:
# Build one row per scheduled input (.json file) and attach the matching report
# (report_*.html) of the same benchmark folder, if there is one.
#   report_file / run_ok stay empty for inputs without a report.
#   Reports whose prediction id is unknown (or has no input file) get a row of their own.
report_columns = ["benchmark_set", "prediction_name", "report_file", "run_ok"]
report_id_pattern = re.compile(r"\(\[id:\[([\w\-]+)\], jobsize:\d+\]\)")
success_marker = "Workflow execution completed successfully!"

report_records = []
for outputFolder in outputFolders:
    benchmark_set = outputFolder.name
    print(benchmark_set)
    folder_files = [f for f in outputFolder.iterdir() if f.is_file()]

    # Rows of this benchmark set, grouped by prediction name, so reports can be attached
    # without re-scanning the whole table for every report.
    records_by_name = {}
    for nextflow_input in folder_files:
        if nextflow_input.suffix.lower() != ".json":
            continue
        record = {"benchmark_set": benchmark_set, "prediction_name": nextflow_input.stem}
        report_records.append(record)
        records_by_name.setdefault(nextflow_input.stem, []).append(record)

    # iterdir() gives no ordering guarantee. The report names carry a time stamp, so sorting
    # by name makes the newest report win when several reports refer to the same prediction.
    report_paths = sorted(
        (f for f in folder_files if "report_" in f.stem and f.suffix.lower() == ".html"),
        key=lambda f: f.name,
    )
    for report_path in report_paths:
        print(report_path.name)
        # Explicit encoding: the platform default (a legacy code page on Windows) may fail on
        # or mis-decode the report; undecodable bytes are replaced instead of aborting the scan.
        with open(report_path, encoding="utf-8", errors="replace") as report_handle:
            content = report_handle.read()

        id_match = report_id_pattern.search(content)
        prediction_name = id_match.group(1) if id_match is not None else None
        finished = success_marker in content

        matching_records = records_by_name.get(prediction_name, []) if prediction_name is not None else []
        if not matching_records:
            # No input file with this name (or no id in the report): keep the report as its own row
            record = {"benchmark_set": benchmark_set, "report_file": report_path.name, "prediction_name": prediction_name, "run_ok": finished}
            report_records.append(record)
            if prediction_name is not None:
                records_by_name[prediction_name] = [record]
        else:
            for record in matching_records:
                record["run_ok"] = finished
                record["report_file"] = report_path.name

report_df = pd.DataFrame(report_records, columns=report_columns)


known_extension
report_2025-01-31_15-12.html
report_2025-01-31_15-19.html
report_2025-01-31_15-41.html
report_2025-01-31_16-04.html
report_2025-01-31_16-29.html
report_2025-01-31_16-50.html
report_2025-01-31_17-13.html
report_2025-01-31_17-37.html
report_2025-01-31_17-58.html
report_2025-01-31_18-21.html
report_2025-01-31_18-45.html
report_2025-01-31_19-07.html
report_2025-01-31_19-26.html
report_2025-01-31_20-44.html
report_2025-01-31_21-08.html
report_2025-01-31_21-27.html
report_2025-01-31_21-49.html
report_2025-01-31_22-08.html
report_2025-01-31_22-29.html
report_2025-01-31_22-48.html
report_2025-01-31_23-10.html
report_2025-01-31_23-29.html
report_2025-01-31_23-54.html
report_2025-02-01_00-15.html
report_2025-02-01_00-39.html
report_2025-02-01_01-01.html
report_2025-02-01_01-20.html
report_2025-02-01_01-35.html
report_2025-02-01_01-52.html
report_2025-02-01_02-12.html
report_2025-02-01_02-29.html
report_2025-02-01_02-46.html
report_2025-02-01_03-05.html
report_2025-02-01_03-20.htm

In [11]:
# Summary of the scan: all rows, rows that have a report, rows whose report shows success
n_scheduled = len(report_df)
n_finished = int(report_df['report_file'].notna().sum())
n_successful = int(report_df['run_ok'].eq(True).sum())
print(f"Sceduled runs: {n_scheduled}, finished runs {n_finished}, of which {n_successful} were successful")
report_df

Sceduled runs: 1108, finished runs 140, of which 133 were successful


Unnamed: 0,benchmark_set,prediction_name,report_file,run_ok
0,known_extension,LIG_Pex14_3_M1_M412_D1_D377,,
1,known_extension,LIG_PDZ_Class_1_M1590_M1601_D1084_D1593,report_2025-02-03_15-49.html,False
2,known_extension,LIG_PDZ_Class_1_M1590_M1601_D1185_D1492,,
3,known_extension,LIG_PDZ_Class_1_M1590_M1601_Dmin,,
4,known_extension,LIG_PDZ_Class_1_Mmin_D1084_D1593,,
...,...,...,...,...
1103,random_minimal,MTRG_ER_FFAT_1_2RR3.DDOC_MAPK_HePTP_8_2GPH,,
1104,random_minimal,MTRG_LysEnd_GGAAcLL_1_1JWG.DLIG_SUMO_SIM_anti_...,,
1105,random_minimal,MTRG_NES_CRM1_1_3GB8.DDOC_PP1_MyPhoNE_1_1S70,,
1106,random_minimal,MTRG_NLS_Bipartite_1_1PJM.DLIG_PDZ_Class_1_1D5G,,


### Creating merged AF3 output file
Going through the output of the cluster and creating a merged tsv file. On the way, check for missing, corrupted or unexpected data.

In [None]:
# Collect the metric tables (alphafold3_metrics.tsv) of all run folders into one frame (dataAF).
# Run folders without a usable metric table are listed in emptyOutputs, run folders with
# inconsistent content in missformedOutputs.
metric_frames = []
missformedOutputs = pd.DataFrame(columns=["benchmark_set", "prediction_name", "model_seed", "reason"])
emptyOutputs = pd.DataFrame(columns=["benchmark_set", "nextflow_name"])

for outputFolder in outputFolders:
    benchmark_set = outputFolder.name
    print(benchmark_set)
    nextflowFolders = [p for p in outputFolder.iterdir() if p.is_dir()]
    for nextflowFolder in nextflowFolders:
        print(nextflowFolder.name)
        metricPath = nextflowFolder / "alphafold3_metrics.tsv"
        if not metricPath.exists():
            emptyOutputs.loc[len(emptyOutputs)] = {"benchmark_set": benchmark_set, "nextflow_name": nextflowFolder.name}
            continue
        metricFile = pd.read_csv(metricPath, delimiter="\t", header=0)
        metricFile["benchmark_set"] = benchmark_set
        metricFile["prediction_file"] = None
        if metricFile.shape[0] < 1:
            emptyOutputs.loc[len(emptyOutputs)] = {"benchmark_set": benchmark_set, "nextflow_name": nextflowFolder.name}
            continue

        # A metric table is expected to describe exactly one prediction
        prediction_name = metricFile["prediction_name"].iloc[0]
        if len(set(metricFile["prediction_name"])) != 1:
            missformedOutputs.loc[len(missformedOutputs)] = {"benchmark_set": benchmark_set, "prediction_name": prediction_name, "reason": "multiple prediction_name for one structure"}
            continue

        structureFolder = nextflowFolder / "predictions" / "alphafold3" / prediction_name
        if not structureFolder.exists():
            missformedOutputs.loc[len(missformedOutputs)] = {"benchmark_set": benchmark_set, "prediction_name": prediction_name, "reason": "prediction folder does not exist"}
            continue

        # Attach the model.cif of every model sub-folder to the metric row with the same model_id
        for modelFolder in structureFolder.iterdir():
            model_file = modelFolder / "model.cif"
            if not (modelFolder.is_dir() and model_file.exists()):
                continue
            model_seed = modelFolder.name
            seed_rows = metricFile["model_id"] == model_seed
            if not seed_rows.any():
                # Record the offending seed as well, so the entry can be traced back to its folder
                missformedOutputs.loc[len(missformedOutputs)] = {"benchmark_set": benchmark_set, "prediction_name": prediction_name, "model_seed": model_seed, "reason": "model seed is not contained in tsv file"}
                continue
            metricFile.loc[seed_rows, ["prediction_file"]] = model_file

        # Replace the model ids by their rank (ranked_1 = highest ranking_score)
        metricFile = metricFile.sort_values(by=['ranking_score'], ascending=False, ignore_index=True)
        metricFile["model_id"] = [f"ranked_{rank}" for rank in range(1, len(metricFile) + 1)]
        metric_frames.append(metricFile)

# Concatenate once instead of growing the frame inside the loop
dataAF = pd.concat(metric_frames, ignore_index=True) if metric_frames else pd.DataFrame()

# Outputs are matched to the inputs through the lower-cased input name
report_df_ = report_df[~report_df["prediction_name"].isna()].copy()
report_df_["prediction_name_lower"] = report_df_["prediction_name"].str.lower()
input_output_merge = pd.merge(
    left=dataAF,
    right=report_df_,
    how="outer",
    left_on=["benchmark_set", "prediction_name"],
    right_on=["benchmark_set", "prediction_name_lower"],
    suffixes=["", "_input"]
)
# Inputs that have a report but no rows in the merged metric table
has_report_no_output = input_output_merge["run_ok"].notna() & input_output_merge["prediction_name"].isna()
missingOutputs = input_output_merge.loc[has_report_no_output, ["benchmark_set", "prediction_name_input", "report_file", "run_ok"]]
# Metric rows that cannot be assigned to any input file
unidentifiedOutputs = input_output_merge[input_output_merge["prediction_name_input"].isna()]

# Correcting lower case names
dataAF = pd.merge(
    left=dataAF,
    right=report_df_,
    how="left",
    left_on=["benchmark_set", "prediction_name"],
    right_on=["benchmark_set", "prediction_name_lower"],
    suffixes=["", "_input"]
)
# Use the input spelling where an input was found; keep the output name otherwise, so that
# outputs without a matching input do not end up with an empty prediction_name.
dataAF["prediction_name"] = dataAF["prediction_name_input"].fillna(dataAF["prediction_name"])
dataAF = dataAF.drop(columns=["prediction_name_input", "prediction_name_lower", "report_file", "run_ok", "project_name"])

# Move prediction_name and model_preset to the front, keep the order of the other columns
leading_columns = ["prediction_name", "model_preset"]
dataAF = dataAF[leading_columns + [col for col in dataAF.columns if col not in leading_columns]]

known_extension
sad_austin
sharp_woese
sad_picasso
fervent_bassi
thirsty_visvesvaraya
exotic_wiles
peaceful_elion
loquacious_dijkstra
high_sinoussi
fabulous_waddington
desperate_church
boring_venter
jolly_dalembert
cheeky_kowalevski
magical_ride
compassionate_joliot
deadly_caravaggio
determined_jepsen
extravagant_pesquet
sad_raman
high_goldberg
jovial_bhabha
nice_brazil
loquacious_minsky
thirsty_poitras
tiny_knuth
shrivelled_gates
maniac_wescoff
nice_heisenberg
special_koch
clever_moriondo
zen_perlman
scruffy_miescher
exotic_brazil
mad_kirch
loquacious_cantor
gloomy_goodall
stupefied_hawking
cranky_noether
tiny_poincare
fabulous_kalam
spontaneous_brattain
trusting_caravaggio
adoring_albattani
prickly_gautier
exotic_ekeblad
clever_ardinghelli
nostalgic_bell
friendly_almeida
happy_linnaeus
special_monod
loving_hodgkin
hungry_sanger
jovial_lattes
magical_boltzmann
elegant_mcnulty
curious_engelbart
distracted_avogadro
lethal_tuckerman
boring_edison
nice_volta
infallible_edison
sad_ride
nau

In [16]:
# Overview of the merged metric table followed by the three problem tables
completed_names = set(dataAF['prediction_name'])
print(f"Currenlty {len(completed_names)} predictions have run (not including errors)")
display(dataAF)

for heading, table in (
    ("Processed files with errors or missing output", missingOutputs),
    ("Missformed outputs", missformedOutputs),
    ("Empty output folders", emptyOutputs),
):
    print(heading)
    display(table)

Currenlty 133 predictions have run (not including errors)


Unnamed: 0,prediction_name,model_preset,model_id,chainA_length,chainB_length,fraction_disordered,has_clash,iptm,ptm,ranking_score,...,chainB_intf_avg_plddt,intf_avg_plddt,num_chainA_intf_res,num_chainB_intf_res,num_res_res_contact,num_atom_atom_contact,iPAE,pDockQ,benchmark_set,prediction_file
0,LIG_LRP6_Inhibitor_1_M1_M85_Dmin,alphafold3,ranked_1,300,85,0.22,0.0,0.65,0.75,0.78,...,56.47,75.34,23,9,38,335,6.10,0.08,known_extension,L:\imb-luckgr2\projects\AlphaFold\AlphaFold3\A...
1,LIG_LRP6_Inhibitor_1_M1_M85_Dmin,alphafold3,ranked_2,300,85,0.22,0.0,0.64,0.74,0.77,...,35.59,64.23,35,23,69,679,23.50,0.16,known_extension,L:\imb-luckgr2\projects\AlphaFold\AlphaFold3\A...
2,LIG_LRP6_Inhibitor_1_M1_M85_Dmin,alphafold3,ranked_3,300,85,0.22,0.0,0.62,0.76,0.76,...,51.20,69.70,19,14,38,400,4.40,0.12,known_extension,L:\imb-luckgr2\projects\AlphaFold\AlphaFold3\A...
3,LIG_LRP6_Inhibitor_1_M1_M85_Dmin,alphafold3,ranked_4,300,85,0.22,0.0,0.60,0.76,0.74,...,49.64,71.49,28,16,48,469,7.15,0.13,known_extension,L:\imb-luckgr2\projects\AlphaFold\AlphaFold3\A...
4,LIG_LRP6_Inhibitor_1_M1_M85_Dmin,alphafold3,ranked_5,300,85,0.22,0.0,0.53,0.75,0.69,...,56.76,72.19,18,11,35,317,4.90,0.10,known_extension,L:\imb-luckgr2\projects\AlphaFold\AlphaFold3\A...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
660,LIG_Vh1_VBS_1_M494_M740_D1_D925,alphafold3,ranked_1,925,247,0.06,0.0,0.22,0.63,0.33,...,53.63,50.71,49,59,126,1047,27.90,0.15,known_extension,L:\imb-luckgr2\projects\AlphaFold\AlphaFold3\A...
661,LIG_Vh1_VBS_1_M494_M740_D1_D925,alphafold3,ranked_2,925,247,0.07,0.0,0.18,0.62,0.30,...,56.83,53.03,82,101,307,3339,29.30,0.40,known_extension,L:\imb-luckgr2\projects\AlphaFold\AlphaFold3\A...
662,LIG_Vh1_VBS_1_M494_M740_D1_D925,alphafold3,ranked_3,925,247,0.06,0.0,0.15,0.64,0.28,...,52.20,49.95,29,33,71,698,28.90,0.07,known_extension,L:\imb-luckgr2\projects\AlphaFold\AlphaFold3\A...
663,LIG_Vh1_VBS_1_M494_M740_D1_D925,alphafold3,ranked_4,925,247,0.07,0.0,0.13,0.63,0.27,...,61.45,66.69,14,16,27,141,29.80,0.14,known_extension,L:\imb-luckgr2\projects\AlphaFold\AlphaFold3\A...


Processed files with errors or missing output


Unnamed: 0,benchmark_set,prediction_name_input,report_file,run_ok
179,known_extension,DOC_USP7_UBL2_3_Mmin_D528_D865,report_2025-02-01_10-45.html,False
717,known_extension,LIG_PDZ_Class_1_M1590_M1601_D1084_D1593,report_2025-02-03_15-49.html,False
961,known_extension,LIG_Vh1_VBS_1_M532_M702_D1_D925,report_2025-02-03_15-34.html,False
973,known_extension,LIG_Vh1_VBS_1_MFL_Dmin,report_2025-02-03_04-53.html,False
1000,known_extension,LIG_WW_1_Mmin_D2925_D3362,report_2025-01-31_19-26.html,False
1001,known_extension,LIG_WW_1_Mmin_DFL,report_2025-02-02_09-00.html,False


Missformed outputs


Unnamed: 0,benchmark_set,prediction_name,model_seed,reason


Empty output folders


Unnamed: 0,benchmark_set,nextflow_name
0,known_extension,sad_austin
1,known_extension,jolly_dalembert
2,known_extension,magical_boltzmann
3,known_extension,sharp_kay
4,known_extension,nostalgic_swanson
5,known_extension,high_raman
6,known_extension,nasty_fourier
7,known_extension,nasty_bell
8,known_extension,stupefied_gutenberg
