# Parsing the output of the new AF3 models
2025-02-04 by Andreas

Script for parsing the AF3 output into a single file

In [81]:
from pathlib import Path
import pandas as pd
import numpy as np
import re

In [5]:
outputBase = Path(r"L:\imb-luckgr2\projects\AlphaFold\AlphaFold3\AlphaFold_benchmark_DMI")
outputFolders = [p for p in outputBase.iterdir() if p.is_dir()]
for p in outputFolders:
    print(p)

L:\imb-luckgr2\projects\AlphaFold\AlphaFold3\AlphaFold_benchmark_DMI\known_extension
L:\imb-luckgr2\projects\AlphaFold\AlphaFold3\AlphaFold_benchmark_DMI\known_minimal
L:\imb-luckgr2\projects\AlphaFold\AlphaFold3\AlphaFold_benchmark_DMI\mutations
L:\imb-luckgr2\projects\AlphaFold\AlphaFold3\AlphaFold_benchmark_DMI\random_minimal


In [78]:
dataAF = pd.DataFrame()
missformedOutputs = pd.DataFrame(columns=["benchmark_set", "prediction_name", "model_seed", "reason"])
missingOutputs = pd.DataFrame(columns=["benchmark_set",  "prediction_name"])
emptyOutputs = pd.DataFrame(columns=["benchmark_set", "nextflow_name"])

for outputFolder in outputFolders:
    benchmark_set = outputFolder.name
    nextflowFolders = [p for p in outputFolder.iterdir() if p.is_dir()]
    for nextflowFolder in nextflowFolders:
        print(nextflowFolder.name)
        if not (metricPath := (nextflowFolder / "alphafold3_metrics.tsv")).exists():
            emptyOutputs.loc[len(emptyOutputs)] = {"benchmark_set":benchmark_set, "nextflow_name": nextflowFolder.name}
            continue
        metricFile = pd.read_csv(metricPath, delimiter="\t", header=0)
        metricFile["benchmark_set"] = benchmark_set
        metricFile["prediction_file"] = None
        if not metricFile.shape[0] >= 1:
            emptyOutputs.loc[len(emptyOutputs)] = {"benchmark_set":benchmark_set, "nextflow_name": nextflowFolder.name}
            continue

        prediction_name = metricFile["prediction_name"][0]
        if not len(set(metricFile["prediction_name"])) == 1:
            missformedOutputs.loc[len(missformedOutputs)] = {"benchmark_set": benchmark_set, "prediction_name": prediction_name, "reason": "multiple prediction_name for one structure"}
            continue
        
        if not (structureFolder := nextflowFolder / "predictions" / "alphafold3" / prediction_name).exists():
            missformedOutputs.loc[len(missformedOutputs)] = {"benchmark_set": benchmark_set, "prediction_name": prediction_name, "reason": "prediction folder does not exist"}
            continue
        for model_file in [(p / "model.cif") for p in structureFolder.iterdir() if p.is_dir() and (p / "model.cif").exists()]:
            model_seed = model_file.parent.name
            if len(metricFile.loc[metricFile["model_id"] == model_seed, ["prediction_file"]]) == 0:
                missformedOutputs.loc[len(missformedOutputs)] = {"benchmark_set": benchmark_set, "prediction_name": prediction_name, "reason": "model seed is not contained in tsv file"}
                continue
            metricFile.loc[metricFile["model_id"] == model_seed, ["prediction_file"]] = model_file
        
        metricFile.sort_values(by=['ranking_score'], ascending=False, ignore_index=True, inplace=True)
        metricFile["model_id"] = metricFile.apply(lambda r: f"ranked_{int(r.name) + 1}", axis=1)
        dataAF = pd.concat([dataAF, metricFile], ignore_index=True)

    nextflow_inputs = [f for f in outputFolder.iterdir() if f.is_file() and f.suffix.lower() == ".json"]
    for nextflow_input in nextflow_inputs:
        prediction_name = nextflow_input.stem
        if (dataAF[np.logical_and(dataAF["benchmark_set"] == benchmark_set, dataAF["prediction_name"] == prediction_name.lower())]).shape[0] == 0:
            missingOutputs.loc[len(missingOutputs)] = {"benchmark_set": benchmark_set, "prediction_name": prediction_name}
        else:
            dataAF[np.logical_and(dataAF["benchmark_set"] == benchmark_set, dataAF["prediction_name"] == prediction_name.lower())]["prediction_name"] == prediction_name # Correcting lowercase by AF3

sad_austin
sharp_woese
sad_picasso
fervent_bassi
thirsty_visvesvaraya
exotic_wiles
peaceful_elion
loquacious_dijkstra
high_sinoussi
fabulous_waddington
desperate_church
boring_venter
jolly_dalembert
cheeky_kowalevski
magical_ride
compassionate_joliot
deadly_caravaggio
determined_jepsen
extravagant_pesquet
sad_raman
high_goldberg
jovial_bhabha
nice_brazil
loquacious_minsky
thirsty_poitras
tiny_knuth
shrivelled_gates
maniac_wescoff
nice_heisenberg
special_koch
clever_moriondo
zen_perlman
scruffy_miescher
exotic_brazil
mad_kirch
loquacious_cantor
gloomy_goodall
stupefied_hawking
cranky_noether
tiny_poincare
fabulous_kalam
spontaneous_brattain
trusting_caravaggio
adoring_albattani
prickly_gautier
exotic_ekeblad
clever_ardinghelli
nostalgic_bell
friendly_almeida
happy_linnaeus
special_monod
loving_hodgkin
hungry_sanger
jovial_lattes
magical_boltzmann
elegant_mcnulty
curious_engelbart
distracted_avogadro
lethal_tuckerman
boring_edison
nice_volta
infallible_edison
sad_ride
nauseous_yalow
susp

In [77]:
display(dataAF)
print("Missformed outputs")
display(missformedOutputs)
print("Missing outputs")
display(missingOutputs)
print("Empty outputs")
display(emptyOutputs)

Unnamed: 0,model_id,chainA_length,chainB_length,fraction_disordered,has_clash,iptm,ptm,ranking_score,chainA_intf_avg_plddt,chainB_intf_avg_plddt,...,num_chainB_intf_res,num_res_res_contact,num_atom_atom_contact,iPAE,pDockQ,project_name,prediction_name,model_preset,benchmark_set,prediction_file
0,ranked_1,300,85,0.22,0.0,0.65,0.75,0.78,82.73,56.47,...,9,38,335,6.10,0.08,sharp_woese,lig_lrp6_inhibitor_1_m1_m85_dmin,alphafold3,known_extension,L:\imb-luckgr2\projects\AlphaFold\AlphaFold3\A...
1,ranked_2,300,85,0.22,0.0,0.64,0.74,0.77,83.06,35.59,...,23,69,679,23.50,0.16,sharp_woese,lig_lrp6_inhibitor_1_m1_m85_dmin,alphafold3,known_extension,L:\imb-luckgr2\projects\AlphaFold\AlphaFold3\A...
2,ranked_3,300,85,0.22,0.0,0.62,0.76,0.76,83.34,51.20,...,14,38,400,4.40,0.12,sharp_woese,lig_lrp6_inhibitor_1_m1_m85_dmin,alphafold3,known_extension,L:\imb-luckgr2\projects\AlphaFold\AlphaFold3\A...
3,ranked_4,300,85,0.22,0.0,0.60,0.76,0.74,83.98,49.64,...,16,48,469,7.15,0.13,sharp_woese,lig_lrp6_inhibitor_1_m1_m85_dmin,alphafold3,known_extension,L:\imb-luckgr2\projects\AlphaFold\AlphaFold3\A...
4,ranked_5,300,85,0.22,0.0,0.53,0.75,0.69,81.61,56.76,...,11,35,317,4.90,0.10,sharp_woese,lig_lrp6_inhibitor_1_m1_m85_dmin,alphafold3,known_extension,L:\imb-luckgr2\projects\AlphaFold\AlphaFold3\A...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
660,ranked_1,925,247,0.06,0.0,0.22,0.63,0.33,47.19,53.63,...,59,126,1047,27.90,0.15,furious_cray,lig_vh1_vbs_1_m494_m740_d1_d925,alphafold3,known_extension,L:\imb-luckgr2\projects\AlphaFold\AlphaFold3\A...
661,ranked_2,925,247,0.07,0.0,0.18,0.62,0.30,48.34,56.83,...,101,307,3339,29.30,0.40,furious_cray,lig_vh1_vbs_1_m494_m740_d1_d925,alphafold3,known_extension,L:\imb-luckgr2\projects\AlphaFold\AlphaFold3\A...
662,ranked_3,925,247,0.06,0.0,0.15,0.64,0.28,47.39,52.20,...,33,71,698,28.90,0.07,furious_cray,lig_vh1_vbs_1_m494_m740_d1_d925,alphafold3,known_extension,L:\imb-luckgr2\projects\AlphaFold\AlphaFold3\A...
663,ranked_4,925,247,0.07,0.0,0.13,0.63,0.27,72.69,61.45,...,16,27,141,29.80,0.14,furious_cray,lig_vh1_vbs_1_m494_m740_d1_d925,alphafold3,known_extension,L:\imb-luckgr2\projects\AlphaFold\AlphaFold3\A...


Missformed outputs


Unnamed: 0,benchmark_set,prediction_name,model_seed,reason


Missing outputs


Unnamed: 0,benchmark_set,prediction_name
0,known_extension,LIG_Pex14_3_M1_M412_D1_D377
1,known_extension,LIG_PDZ_Class_1_M1590_M1601_D1084_D1593
2,known_extension,LIG_PDZ_Class_1_M1590_M1601_D1185_D1492
3,known_extension,LIG_PDZ_Class_1_M1590_M1601_Dmin
4,known_extension,LIG_PDZ_Class_1_Mmin_D1084_D1593
...,...,...
969,random_minimal,MTRG_ER_FFAT_1_2RR3.DDOC_MAPK_HePTP_8_2GPH
970,random_minimal,MTRG_LysEnd_GGAAcLL_1_1JWG.DLIG_SUMO_SIM_anti_...
971,random_minimal,MTRG_NES_CRM1_1_3GB8.DDOC_PP1_MyPhoNE_1_1S70
972,random_minimal,MTRG_NLS_Bipartite_1_1PJM.DLIG_PDZ_Class_1_1D5G


Empty outputs


Unnamed: 0,benchmark_set,nextflow_name
0,known_extension,sad_austin
1,known_extension,jolly_dalembert
2,known_extension,magical_boltzmann
3,known_extension,sharp_kay
4,known_extension,nostalgic_swanson
5,known_extension,high_raman
6,known_extension,nasty_fourier
7,known_extension,nasty_bell
8,known_extension,stupefied_gutenberg


In [79]:
input_count = len([f for f in outputFolders[0].iterdir() if f.is_file() and f.suffix.lower() == ".json"])
output_count = len(set(dataAF["prediction_name"]))
print(input_count, output_count)

565 133


## Scanning report.html

In [87]:
report_df = pd.DataFrame(columns=["benchmark_set", "report_file", "prediction_name", "ok"])
for outputFolder in outputFolders:
    benchmark_set = outputFolder.name
    for p in [f for f in outputFolders[0].iterdir() if f.is_file() and "report" in f.stem and f.suffix.lower() == ".html"]:
        print(p.name)
        with open(p) as f:
            content = f.read()
        prediction_name = x.groups()[0] if (x := re.search(r"\(\[id:\[(\w+)\], jobsize:\d+\]\)", content)) is not None else None
        finished = ("Workflow execution completed successfully!" in content)
        report_df.loc[len(report_df)] = {"benchmark_set": benchmark_set, "report_file": p.name, "prediction_name": prediction_name, "ok": finished}
report_df

report_2025-01-31_15-12.html
report_2025-01-31_15-19.html
report_2025-01-31_15-41.html
report_2025-01-31_16-04.html
report_2025-01-31_16-29.html
report_2025-01-31_16-50.html
report_2025-01-31_17-13.html
report_2025-01-31_17-37.html
report_2025-01-31_17-58.html
report_2025-01-31_18-21.html
report_2025-01-31_18-45.html
report_2025-01-31_19-07.html
report_2025-01-31_19-26.html
report_2025-01-31_20-44.html
report_2025-01-31_21-08.html
report_2025-01-31_21-27.html
report_2025-01-31_21-49.html
report_2025-01-31_22-08.html
report_2025-01-31_22-29.html
report_2025-01-31_22-48.html
report_2025-01-31_23-10.html
report_2025-01-31_23-29.html
report_2025-01-31_23-54.html
report_2025-02-01_00-15.html
report_2025-02-01_00-39.html
report_2025-02-01_01-01.html
report_2025-02-01_01-20.html
report_2025-02-01_01-35.html
report_2025-02-01_01-52.html
report_2025-02-01_02-12.html
report_2025-02-01_02-29.html
report_2025-02-01_02-46.html
report_2025-02-01_03-05.html
report_2025-02-01_03-20.html
report_2025-02

Unnamed: 0,benchmark_set,report_file,prediction_name,ok
0,known_extension,report_2025-01-31_15-12.html,,False
1,known_extension,report_2025-01-31_15-19.html,LIG_LRP6_Inhibitor_1_M1_M85_Dmin,True
2,known_extension,report_2025-01-31_15-41.html,LIG_LRP6_Inhibitor_1_M23_M57_D1_D362,True
3,known_extension,report_2025-01-31_16-04.html,LIG_LRP6_Inhibitor_1_M23_M57_D1_D422,True
4,known_extension,report_2025-01-31_16-29.html,LIG_LRP6_Inhibitor_1_M23_M57_Dmin,True
...,...,...,...,...
555,random_minimal,report_2025-02-03_04-53.html,LIG_Vh1_VBS_1_MFL_Dmin,False
556,random_minimal,report_2025-02-03_09-50.html,LIG_Vh1_VBS_1_M196_M1983_D1_D925,True
557,random_minimal,report_2025-02-03_14-47.html,LIG_Vh1_VBS_1_M494_M740_D1_D925,True
558,random_minimal,report_2025-02-03_15-34.html,LIG_Vh1_VBS_1_M532_M702_D1_D925,False


In [88]:
report_df

Unnamed: 0,benchmark_set,report_file,prediction_name,ok
0,known_extension,report_2025-01-31_15-12.html,,False
1,known_extension,report_2025-01-31_15-19.html,LIG_LRP6_Inhibitor_1_M1_M85_Dmin,True
2,known_extension,report_2025-01-31_15-41.html,LIG_LRP6_Inhibitor_1_M23_M57_D1_D362,True
3,known_extension,report_2025-01-31_16-04.html,LIG_LRP6_Inhibitor_1_M23_M57_D1_D422,True
4,known_extension,report_2025-01-31_16-29.html,LIG_LRP6_Inhibitor_1_M23_M57_Dmin,True
...,...,...,...,...
555,random_minimal,report_2025-02-03_04-53.html,LIG_Vh1_VBS_1_MFL_Dmin,False
556,random_minimal,report_2025-02-03_09-50.html,LIG_Vh1_VBS_1_M196_M1983_D1_D925,True
557,random_minimal,report_2025-02-03_14-47.html,LIG_Vh1_VBS_1_M494_M740_D1_D925,True
558,random_minimal,report_2025-02-03_15-34.html,LIG_Vh1_VBS_1_M532_M702_D1_D925,False
