# AF metrics: File name parsing and template-dependent metrics
Created 04.04.2025 by Andreas B

This notebook takes structure files and derives various metrics from them.

In [1]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.axes._axes import Axes
from matplotlib.figure import Figure
from pathlib import Path
from sklearn.metrics import roc_curve, roc_auc_score
import re
import filecmp
import os
from typing import Literal

import pymol
from Bio.SeqUtils import seq1
from Bio.PDB import PDBParser
from Bio.PDB.Structure import Structure as BioPy_PDBStructure
from Bio.PDB.Model import Model as BioPy_PDBModel
from Bio.PDB.Chain import Chain
from Bio.PDB.PDBExceptions import PDBConstructionException
parser = PDBParser(QUIET=True)

class bcolors:
    """ANSI escape sequences used to colour and style text printed by this notebook.

    Concatenate one of the codes in front of a string and append ``ENDC`` behind it,
    otherwise the style stays active for everything printed afterwards.
    """
    # Bright foreground colours (SGR codes 91-96)
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKCYAN = '\033[96m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    # Reset: switches every colour / style off again
    ENDC = '\033[0m'
    # Text styles
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

### 0 Imports and Settings

In [2]:
# Settings

# Path to resource folder with the structures and metadata tables.
# The default is the original author's local folder; set the environment variable
# AF_METRICS_RESOURCES to run the notebook on another machine without editing this cell.
path_resources = Path(os.environ.get("AF_METRICS_RESOURCES", r"D:\Eigene Datein\dev\Uni\JGU Bio Bachelorthesis\Daten\resources"))
# Which AF output should be parsed
af_mode: Literal["AF2", "AF3"] = "AF2"

# Derived folders: predictions of the selected AF mode and the solved template structures
path_AF = path_resources / af_mode
path_solved = path_resources / "solved"


In [3]:
# Read in the AF data
def _fill_benchmark_columns(metrics: pd.DataFrame, label_to_set: dict) -> None:
    """Fill the ``benchmark_set`` and ``num_mutations`` columns of ``metrics`` in place.

    Both columns are derived from the raw ``num_mutation_in_motif`` column:
    ``benchmark_set`` is the raw label translated through ``label_to_set``, and
    ``num_mutations`` becomes 1 / 2 for the labels "1" / "2" and is left untouched
    for every other label. Missing target columns are created filled with None.

    :param metrics: Metrics table containing a ``num_mutation_in_motif`` column.
    :param label_to_set: Mapping from raw label to benchmark set name.
    :raises KeyError: If a raw label has no entry in ``label_to_set``.
    """
    for column in ("benchmark_set", "num_mutations"):
        if column not in metrics.columns:
            metrics[column] = None

    labels = metrics["num_mutation_in_motif"]
    unknown_labels = [label for label in labels.unique() if label not in label_to_set]
    if unknown_labels:
        raise KeyError(f"No benchmark set defined for num_mutation_in_motif value(s): {unknown_labels}")

    metrics["benchmark_set"] = labels.map(label_to_set)
    # The labels "1" and "2" (mutation benchmark) double as the number of mutations
    for count in (1, 2):
        metrics.loc[labels == str(count), "num_mutations"] = count


if af_mode == "AF2":
    dataAF = pd.read_csv(path_AF / "AF_metrics_all_structures.tsv", sep="\t")
    # Drop columns to recalculate them
    dataAF = dataAF.drop(columns=["RMSD_domain", "num_align_atoms_domain", "align_score_domain", "num_align_resi_domain", "RMSD_backbone_peptide", "RMSD_all_atom_peptide", "known_motif_plddt", "DockQ", "iRMS", "LRMS", "Fnonnat", "label"])

    # Adding benchmark set column
    benchmark_set_replace_dict = {"1": "mutations_DMI", "2" : "mutations_DMI", "approved minimal DDI": "known_DDI", "known minimal": "known_DMI", "random minimal": "random_DMI", "random minimal DDI": "random_DDI"}
    # Both derived columns are rebuilt from scratch
    dataAF["benchmark_set"] = None
    dataAF["num_mutations"] = None
    _fill_benchmark_columns(dataAF, benchmark_set_replace_dict)
    dataAF = dataAF.drop(columns=["num_mutation_in_motif"])

elif af_mode == "AF3":
    dataAF = pd.read_csv(path_AF / "AF3_raw_metrics.tsv", sep="\t")

    benchmark_set_replace_dict = {"mutations": "mutations_DMI", "known_minimal": "known_DMI", "known_DDI": "known_DDI", "random_minimal": "random_DMI", "random_DDI": "random_DDI"}
    # NOTE(review): none of the AF3 labels is "1" or "2", so num_mutations stays empty here;
    # the column is still created so that AF2 and AF3 tables share the same layout.
    _fill_benchmark_columns(dataAF, benchmark_set_replace_dict)

else:
    raise ValueError(f"Unsupported af_mode: {af_mode!r} (expected 'AF2' or 'AF3')")

display(dataAF)

Unnamed: 0,project_name,prediction_name,run_id,chainA_length,chainB_length,model_id,model_confidence,chainA_intf_avg_plddt,chainB_intf_avg_plddt,intf_avg_plddt,pDockQ,iPAE,num_chainA_intf_res,num_chainB_intf_res,num_res_res_contact,num_atom_atom_contact,benchmark_set,num_mutations
0,AlphaFold_benchmark,DEG_APCC_KENBOX_2_4GGD,run37,312,5,ranked_0,0.887117,96.107999,77.495999,91.454999,0.162263,3.311542,15,5,23,208,known_DMI,
1,AlphaFold_benchmark,DEG_APCC_KENBOX_2_4GGD,run37,312,5,ranked_1,0.871984,95.793846,73.986000,89.736111,0.145001,3.395909,13,5,20,190,known_DMI,
2,AlphaFold_benchmark,DEG_APCC_KENBOX_2_4GGD,run37,312,5,ranked_2,0.760784,95.547501,57.906001,86.585239,0.116743,6.166772,16,5,27,237,known_DMI,
3,AlphaFold_benchmark,DEG_APCC_KENBOX_2_4GGD,run37,312,5,ranked_3,0.413662,94.646667,21.510000,76.362500,0.036380,16.713730,9,3,11,83,known_DMI,
4,AlphaFold_benchmark,DEG_APCC_KENBOX_2_4GGD,run37,312,5,ranked_4,0.359078,94.830001,19.753333,72.307001,0.029969,18.696838,7,3,9,108,known_DMI,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3175,AlphaFold_benchmark_DDI,D1PF18773_PF00071_2X19.D2PF00009_PF01873_2D74,run6,60,113,ranked_0,0.298701,49.827778,73.161904,62.392307,0.046600,16.147715,18,21,44,494,random_DDI,
3176,AlphaFold_benchmark_DDI,D1PF18773_PF00071_2X19.D2PF00009_PF01873_2D74,run6,60,113,ranked_1,0.246961,54.362000,76.415000,66.390909,0.033111,17.929291,10,12,19,173,random_DDI,
3177,AlphaFold_benchmark_DDI,D1PF18773_PF00071_2X19.D2PF00009_PF01873_2D74,run6,60,113,ranked_2,0.211579,52.651538,72.910908,61.937083,0.024834,18.416771,13,11,19,101,random_DDI,
3178,AlphaFold_benchmark_DDI,D1PF18773_PF00071_2X19.D2PF00009_PF01873_2D74,run6,60,113,ranked_3,0.211144,50.398334,78.798751,66.627144,0.026651,19.303787,12,16,29,402,random_DDI,


In [4]:
# Read in solved structure data
# Records are collected in a list and turned into a DataFrame once (cheaper than appending
# row by row); the files are sorted so the row order does not depend on the file system.

solved_columns = ["set", "PDB_id", "ddi_pfam_id", "path", "chainA_id", "chainB_id"]
solved_records = []

# DMI: file names start with the PDB id (e.g. 1ATP_min_DMI.pdb); chains are always A and B
for structure_file in sorted(p for p in (path_solved / "DMI").iterdir() if p.is_file() and p.suffix == ".pdb"):
    solved_records.append({
        "set": "DMI",
        "PDB_id": structure_file.name.split("_")[0],
        "path": structure_file.relative_to(path_solved),
        "chainA_id": "A",
        "chainB_id": "B",
    })

# DDI: <pfam id 1>_<pfam id 2>_<PDB id>_<chain A letter><chain B letter>.pdb
# (e.g. PF14447_PF00179_3ZNI_AC.pdb)
for structure_file in sorted(p for p in (path_solved / "DDI").iterdir() if p.is_file() and p.suffix == ".pdb"):
    name_parts = structure_file.name.split("_")
    chain_letters = name_parts[3]
    solved_records.append({
        "set": "DDI",
        "PDB_id": name_parts[2],
        "ddi_pfam_id": "_".join(name_parts[0:2]),
        "path": structure_file.relative_to(path_solved),
        "chainA_id": chain_letters[0],
        "chainB_id": chain_letters[1],
    })

# Keys missing from a record (ddi_pfam_id for DMI) become NaN
dataSolved = pd.DataFrame(solved_records, columns=solved_columns)

display(dataSolved)

Unnamed: 0,set,PDB_id,ddi_pfam_id,path,chainA_id,chainB_id
0,DMI,1ATP,,DMI\1ATP_min_DMI.pdb,A,B
1,DMI,1AXC,,DMI\1AXC_min_DMI.pdb,A,B
2,DMI,1B72,,DMI\1B72_min_DMI.pdb,A,B
3,DMI,1B8Q,,DMI\1B8Q_min_DMI.pdb,A,B
4,DMI,1BXX,,DMI\1BXX_min_DMI.pdb,A,B
...,...,...,...,...,...,...
183,DDI,3ZNI,PF14447_PF00179,DDI\PF14447_PF00179_3ZNI_AC.pdb,A,C
184,DDI,3J7Y,PF14978_PF00327,DDI\PF14978_PF00327_3J7Y_oZ.pdb,o,Z
185,DDI,6D6Q,PF15985_PF10175,DDI\PF15985_PF10175_6D6Q_GL.pdb,G,L
186,DDI,3KZ1,PF17838_PF00071,DDI\PF17838_PF00071_3KZ1_BE.pdb,B,E


### 1 Parsing the file names
A lot of information (PDB ID, mutation sequence, ...) is encoded in the file names. This section parses it and adds it to the metrics data frame. The detected values include:
* **PDB_id**: Included in all structures
* **ELM_instance**: Included in DMI structures
* **PDB_id_random_paired** and **ELM_instance_random_paired**: Only included in the randomly paired benchmark sets: random_minimal (both) and random_ddi (only PDB ID)
* **sequence_initial** and **sequence_mutated**: Included in the mutations benchmark set (DMI)
* **chainA_id** and **chainB_id**: The chain IDs in the solved structure file. For DMI it is always A and B. For known_ddi the ids are included in the filename, which are also used for random_ddi.
* **ddi_pfam_id**: Included in DDI structures
* **ddi_pfam_id_random_paired**: Included in the random_ddi benchmark set
* **chainA_id**, **chainA_start**, **chainA_end** and the same three for **chainB**: For DDI structures, the chain ID as well as start and end of the selection are included in the filename. For DMI they will be added later

Note: known_extensions were excluded earlier, but if you need to parse them remove the comments in the code cell below

In [5]:
# Regex checks on filename
# known_DMI: <ELM instance>_<PDB id>
regex_paired_DMI = r"^([\w\-]+)_(\w{4})$"
# random_DMI: M<ELM instance>_<PDB id>.D<ELM instance>_<PDB id>
regex_random_DMI = r"^M([\w\-]+)_(\w{4})\.D([\w\-]+)_(\w{4})$"
# mutations_DMI: <ELM instance>_<PDB id>_<initial sequence>.<mutated sequence>
regex_mutated_DMI = r"^([\w\-]+)_(\w{4})_(\w+)\.([A-Za-z]+)$"
# known extensions (currently not parsed): <ELM instance>_<motif extension>_<domain extension>
regex_known_extension_DMI = r"^([\w-]+)_((Mmin)|(MFL)|(M[\d]+_M[\d]+))_((DFL)|(Dmin)|(D[\d]+_D[\d]+))$"
# known_DDI: <pfam pair>_<PDB id>_<chain>_resi<start>_resi<end>.<chain>_resi<start>_resi<end>
# The "." separating the two chains is escaped so it only matches a literal dot.
regex_ddi_known = r"^([^\W_]+_[^\W_]+)_(\w{4})_(\w+)_resi(\d+)_resi(\d+)\.(\w+)_resi(\d+)_resi(\d+)$"
# random_DDI: D1<pfam pair>_<PDB id>.D2<pfam pair>_<PDB id>
regex_ddi_random = r"^D1([^\W_]+_[^\W_]+)_(\w{4})\.D2([^\W_]+_[^\W_]+)_(\w{4})$"


# Create the columns that are filled from the file names; every cell starts out as None.
# known_extensions have not been run. Therefore "known_extension_motif" and
# "known_extension_domain" are excluded here (they would follow "sequence_mutated").
for _new_column in (
    "PDB_id",
    "ELM_instance",
    "ddi_pfam_id",
    "PDB_id_random_paired",
    "ELM_instance_random_paired",
    "ddi_pfam_id_random_paired",
    "sequence_initial",
    "sequence_mutated",
    "chainA_id",
    "chainB_id",
    "chainA_start",
    "chainA_end",
    "chainB_start",
    "chainB_end",
):
    dataAF[_new_column] = None

# Parse every prediction_name with the regex of its benchmark set and store the parts.
for i, row in dataAF.iterrows():
    prediction_name = row["prediction_name"]
    benchmark_set = row["benchmark_set"]

    # All parsed fields default to None, so a name that does not match its regex
    # leaves the row empty instead of keeping values from a previous row.
    parsed = dict.fromkeys([
        "PDB_id", "PDB_id_random_paired", "ELM_instance", "ELM_instance_random_paired",
        "sequence_initial", "sequence_mutated", "chainA_id", "chainB_id",
        "ddi_pfam_id", "ddi_pfam_id_random_paired",
        "chainA_start", "chainA_end", "chainB_start", "chainB_end",
    ])

    if benchmark_set == "known_DMI":
        if (match := re.search(regex_paired_DMI, prediction_name)) is not None:
            parsed["ELM_instance"], parsed["PDB_id"] = match.groups()
            parsed["chainA_id"], parsed["chainB_id"] = "A", "B"
    elif benchmark_set == "random_DMI":
        if (match := re.search(regex_random_DMI, prediction_name)) is not None:
            # Counterintuitive, but here before dot is motif and after dot is domain
            (parsed["ELM_instance_random_paired"], parsed["PDB_id_random_paired"],
             parsed["ELM_instance"], parsed["PDB_id"]) = match.groups()
            parsed["chainA_id"], parsed["chainB_id"] = "A", "B"
    elif benchmark_set == "mutations_DMI":
        if (match := re.search(regex_mutated_DMI, prediction_name)) is not None:
            (parsed["ELM_instance"], parsed["PDB_id"],
             parsed["sequence_initial"], parsed["sequence_mutated"]) = match.groups()
            parsed["chainA_id"], parsed["chainB_id"] = "A", "B"
    #elif benchmark_set == "known_extension":
    #    if (match := re.search(regex_known_extension_DMI, prediction_name)) is not None:
    #        parsed["ELM_instance"] = match.group(1)
    #        parsed["known_extension_motif"] = match.group(2)
    #        parsed["known_extension_domain"] = match.group(6)
    elif benchmark_set == "known_DDI":
        if (match := re.search(regex_ddi_known, prediction_name)) is not None:
            (parsed["ddi_pfam_id"], pdb_id,
             parsed["chainA_id"], parsed["chainA_start"], parsed["chainA_end"],
             parsed["chainB_id"], parsed["chainB_start"], parsed["chainB_end"]) = match.groups()

            # Some known_DDI names carry a lower-case PDB id: normalise it to upper case,
            # both in the parsed value and in the stored prediction_name.
            if pdb_id != pdb_id.upper():
                pdb_id = pdb_id.upper()
                id_start, id_end = match.span(2)
                new_prediction_name = prediction_name[:id_start] + pdb_id + prediction_name[id_end:]
                print(f"Fixed prediction_name in set {benchmark_set} from {prediction_name} to {new_prediction_name}")
                dataAF.at[i, "prediction_name"] = new_prediction_name
            parsed["PDB_id"] = pdb_id
    elif benchmark_set == "random_DDI":
        # Chain ids and residue ranges are not part of these names
        if (match := re.search(regex_ddi_random, prediction_name)) is not None:
            (parsed["ddi_pfam_id"], parsed["PDB_id"],
             parsed["ddi_pfam_id_random_paired"], parsed["PDB_id_random_paired"]) = match.groups()
    else:
        # This branch is reached for an unknown benchmark set (not for a failed regex).
        # The previous message indexed row['pdb_id'], a column that does not exist, and
        # therefore raised a KeyError instead of this RuntimeError.
        raise RuntimeError(f"Unknown benchmark_set {benchmark_set!r} for prediction {prediction_name}")

    for column, value in parsed.items():
        dataAF.at[i, column] = value

# The chain ids as well as start and end residues for the random DDI can be obtained from the known DDI
# (chain A from the first partner's known_DDI entry, chain B from the second partner's entry).
known_ddi_rows = dataAF[dataAF["benchmark_set"] == "known_DDI"]
random_ddi_rows = dataAF[dataAF["benchmark_set"] == "random_DDI"]

for i, row in random_ddi_rows.iterrows():
    prediction_name = row["prediction_name"]
    pdb_id = row["PDB_id"]
    pdb_id_2 = row["PDB_id_random_paired"]
    ddi_pfam_id = row["ddi_pfam_id"]
    ddi_pfam_id_2 = row["ddi_pfam_id_random_paired"]

    # known_DDI rows of the first partner (same PDB id and pfam pair)
    source_a = known_ddi_rows[(known_ddi_rows["PDB_id"] == pdb_id) & (known_ddi_rows["ddi_pfam_id"] == ddi_pfam_id)]
    if len(source_a) == 0:
        print(f"Can't find {pdb_id} from {prediction_name} (random_ddi, chain A) in the known_DDI set")
        continue

    # known_DDI rows of the randomly paired second partner
    source_b = known_ddi_rows[(known_ddi_rows["PDB_id"] == pdb_id_2) & (known_ddi_rows["ddi_pfam_id"] == ddi_pfam_id_2)]
    if len(source_b) == 0:
        print(f"Can't find {pdb_id_2} from {prediction_name} (random_ddi, chain B) in the known_DDI set")
        continue

    # Copy the values of the first matching row of each partner
    for field in ("chainA_id", "chainA_start", "chainA_end"):
        dataAF.at[i, field] = source_a[field].iloc[0]
    for field in ("chainB_id", "chainB_start", "chainB_end"):
        dataAF.at[i, field] = source_b[field].iloc[0]

print("\n", "Rows, where the regex failed")
display(dataAF[dataAF["PDB_id"].isna()])
display(dataAF)

Fixed prediction_name in set known_DDI from PF00009_PF01873_2d74_A_resi12_resi200.B_resi21_resi133 to PF00009_PF01873_2D74_A_resi12_resi200.B_resi21_resi133
Fixed prediction_name in set known_DDI from PF00009_PF01873_2d74_A_resi12_resi200.B_resi21_resi133 to PF00009_PF01873_2D74_A_resi12_resi200.B_resi21_resi133
Fixed prediction_name in set known_DDI from PF00009_PF01873_2d74_A_resi12_resi200.B_resi21_resi133 to PF00009_PF01873_2D74_A_resi12_resi200.B_resi21_resi133
Fixed prediction_name in set known_DDI from PF00009_PF01873_2d74_A_resi12_resi200.B_resi21_resi133 to PF00009_PF01873_2D74_A_resi12_resi200.B_resi21_resi133
Fixed prediction_name in set known_DDI from PF00009_PF01873_2d74_A_resi12_resi200.B_resi21_resi133 to PF00009_PF01873_2D74_A_resi12_resi200.B_resi21_resi133
Fixed prediction_name in set known_DDI from PF00026_PF06394_1f34_A_resi13_resi326.B_resi62_resi120 to PF00026_PF06394_1F34_A_resi13_resi326.B_resi62_resi120
Fixed prediction_name in set known_DDI from PF00026_PF0639

Unnamed: 0,project_name,prediction_name,run_id,chainA_length,chainB_length,model_id,model_confidence,chainA_intf_avg_plddt,chainB_intf_avg_plddt,intf_avg_plddt,...,ELM_instance_random_paired,ddi_pfam_id_random_paired,sequence_initial,sequence_mutated,chainA_id,chainB_id,chainA_start,chainA_end,chainB_start,chainB_end


Unnamed: 0,project_name,prediction_name,run_id,chainA_length,chainB_length,model_id,model_confidence,chainA_intf_avg_plddt,chainB_intf_avg_plddt,intf_avg_plddt,...,ELM_instance_random_paired,ddi_pfam_id_random_paired,sequence_initial,sequence_mutated,chainA_id,chainB_id,chainA_start,chainA_end,chainB_start,chainB_end
0,AlphaFold_benchmark,DEG_APCC_KENBOX_2_4GGD,run37,312,5,ranked_0,0.887117,96.107999,77.495999,91.454999,...,,,,,A,B,,,,
1,AlphaFold_benchmark,DEG_APCC_KENBOX_2_4GGD,run37,312,5,ranked_1,0.871984,95.793846,73.986000,89.736111,...,,,,,A,B,,,,
2,AlphaFold_benchmark,DEG_APCC_KENBOX_2_4GGD,run37,312,5,ranked_2,0.760784,95.547501,57.906001,86.585239,...,,,,,A,B,,,,
3,AlphaFold_benchmark,DEG_APCC_KENBOX_2_4GGD,run37,312,5,ranked_3,0.413662,94.646667,21.510000,76.362500,...,,,,,A,B,,,,
4,AlphaFold_benchmark,DEG_APCC_KENBOX_2_4GGD,run37,312,5,ranked_4,0.359078,94.830001,19.753333,72.307001,...,,,,,A,B,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3175,AlphaFold_benchmark_DDI,D1PF18773_PF00071_2X19.D2PF00009_PF01873_2D74,run6,60,113,ranked_0,0.298701,49.827778,73.161904,62.392307,...,,PF00009_PF01873,,,B,B,392,451,21,133
3176,AlphaFold_benchmark_DDI,D1PF18773_PF00071_2X19.D2PF00009_PF01873_2D74,run6,60,113,ranked_1,0.246961,54.362000,76.415000,66.390909,...,,PF00009_PF01873,,,B,B,392,451,21,133
3177,AlphaFold_benchmark_DDI,D1PF18773_PF00071_2X19.D2PF00009_PF01873_2D74,run6,60,113,ranked_2,0.211579,52.651538,72.910908,61.937083,...,,PF00009_PF01873,,,B,B,392,451,21,133
3178,AlphaFold_benchmark_DDI,D1PF18773_PF00071_2X19.D2PF00009_PF01873_2D74,run6,60,113,ranked_3,0.211144,50.398334,78.798751,66.627144,...,,PF00009_PF01873,,,B,B,392,451,21,133


### 2 Adding domain and motif start / end from template file
While for the DDI structures selection start and end are included in the filename, for DMI structures there is absolutely no information about start/end of motif and domain. At least, the DMI structures are cut to only include the minimal domain/motif, but there still may be mutations or missing residues in experimental structures.

To restore this information, use the template and perform a simple search for three consecutive residues in both chains and calculate the offset between the residue IDs. Then take the most common offset and use it if at least 50 % of the AF residues were matched this way (note: the code cell below currently only prints a warning below this threshold and still stores the offset).

In [6]:
def _one_letter_sequence(residues: list) -> str:
    """Return the one-letter sequence of ``residues``, with "X" for every unknown residue name.

    Each residue name is converted on its own: converting the joined names in one call
    would shift the reading frame as soon as one name is not exactly three characters long.
    """
    return "".join(seq1(residue.resname) or "X" for residue in residues)


def align_sequences(chain_af:  Chain, chain_template: Chain) -> "tuple[int | None, int | None, float, str, str]":
    """ Estimate the residue id offset between two chains based on a neighbour local alignment (BioPython has no convenient alignment function).

        Every window of three consecutive residues of ``chain_af`` is looked up (by residue
        names) among the three-residue windows of ``chain_template``. Each hit votes for the
        offset "template residue id - AF residue id" of the window's middle residue; the most
        frequent offset wins (the smallest one on a tie).

        :param chain_af: Chain of the predicted structure
        :param chain_template: Chain of the template structure
        :returns tuple[int | None, int | None, float, str, str]: Start ID (offset + 1), End ID
            (offset + number of AF residues), score, one-letter sequence of ``chain_af``,
            one-letter sequence of ``chain_template``. The score is the fraction of AF windows
            with at least one hit. Start and End are None and the score is 0 if no window matched.
    """
    residues_af = list(chain_af.get_residues())
    residues_tpl = list(chain_template.get_residues())
    seq_af = _one_letter_sequence(residues_af)
    seq_tpl = _one_letter_sequence(residues_tpl)

    # Index the template windows once: residue-name triplet -> ids of the middle residues
    # (in chain order), so each AF window is a dictionary lookup instead of a template scan.
    template_windows: dict = {}
    for t0, t1, t2 in zip(residues_tpl, residues_tpl[1:], residues_tpl[2:]):
        template_windows.setdefault((t0.resname, t1.resname, t2.resname), []).append(t1.id[1])

    offset_list = []
    misscounts = 0
    for a0, a1, a2 in zip(residues_af, residues_af[1:], residues_af[2:]):
        middle_ids = template_windows.get((a0.resname, a1.resname, a2.resname))
        if middle_ids:
            offset_list.extend(template_id - a1.id[1] for template_id in middle_ids)
        else:
            misscounts += 1

    # No hit at all (also the case for chains shorter than three residues)
    if len(offset_list) == 0:
        return (None, None, 0.0, seq_af, seq_tpl)

    # np.unique returns the offsets sorted, so argmax picks the smallest of equally frequent ones
    offsets, counts = np.unique(offset_list, return_counts=True)
    offset = offsets[np.argmax(counts)]
    # At least one window exists here, so the divisor is >= 1
    score = 1 - misscounts/(len(residues_af) - 2)
    return  offset + 1, offset + len(residues_af), score, seq_af, seq_tpl

def _align_and_store(row_index, chain_label: str, chain_af, chain_template, verbose: bool) -> None:
    """Align one predicted chain to its template chain and store the range in ``dataAF``.

    Writes ``chain<label>_start`` / ``chain<label>_end`` of row ``row_index`` when the alignment
    found an offset. With ``verbose`` the result is printed; scores below 0.5 are highlighted
    and followed by both sequences for manual inspection.
    """
    start, end, score, seq_af, seq_tpl = align_sequences(chain_af=chain_af, chain_template=chain_template)
    if start is not None:
        if verbose:
            print("\t", f"chain{chain_label}: {start}-{end} ({bcolors.WARNING if score < 0.5 else ''}{score:0.3f}{bcolors.ENDC})")
        dataAF.at[row_index, f"chain{chain_label}_start"] = start
        dataAF.at[row_index, f"chain{chain_label}_end"] = end
    elif verbose:
        print("\t", bcolors.WARNING + f"Chain {chain_label} alignment failed" + bcolors.ENDC)
    if verbose and score < 0.5:
        print("\t\t", seq_af)
        print("\t\t", seq_tpl)


for i, row in dataAF[dataAF["benchmark_set"].isin(["known_DMI", "random_DMI", "mutations_DMI"])].iterrows():
    pdb_id = str(row["PDB_id"])
    # Only randomly paired predictions have a second template; treat None and NaN as "no second template"
    pdb_id_2 = str(row["PDB_id_random_paired"]) if pd.notna(row["PDB_id_random_paired"]) else None
    prediction_name = row["prediction_name"]
    benchmark_set = row["benchmark_set"]
    model_id = row["model_id"]
    # Alignment details are only logged for the top-ranked model of each prediction
    verbose = model_id == "ranked_0"

    if verbose:
        print(bcolors.OKBLUE + f"{prediction_name} ({benchmark_set})" + bcolors.ENDC)

    af_path = path_AF / "DMI" / benchmark_set / prediction_name / (model_id + ".pdb")
    af_biopy = parser.get_structure("structure", file=af_path)[0]
    chainA_af = af_biopy["A"]
    chainB_af = af_biopy["B"]

    template1_path = path_solved / "DMI" / (pdb_id + "_min_DMI.pdb")
    if not template1_path.exists():
        # Terminate the colour with ENDC, otherwise all following output stays coloured
        print("\t", bcolors.WARNING + f"{prediction_name} has no template file for {pdb_id}" + bcolors.ENDC)
        continue
    template1_biopy = parser.get_structure("structure", file=template1_path)[0]
    chainA_tlp = template1_biopy["A"]

    if pdb_id_2 is not None:
        # Random pairing: chain B comes from the template of the paired structure
        template2_path = path_solved / "DMI" / (pdb_id_2 + "_min_DMI.pdb")
        if not template2_path.exists():
            # Report the id of the file that is actually missing (the paired one)
            print("\t", bcolors.WARNING + f"{prediction_name} has no template file for {pdb_id_2}" + bcolors.ENDC)
            continue
        template2_biopy = parser.get_structure("structure", file=template2_path)[0]
        chainB_tlp = template2_biopy["B"]
    else:
        chainB_tlp = template1_biopy["B"]

    _align_and_store(i, "A", chainA_af, chainA_tlp, verbose)
    _align_and_store(i, "B", chainB_af, chainB_tlp, verbose)
    
    

# For the mutations, the alignment mostly fails. For those restore the information using the known_DMI dataset
# Note: the ranges of every mutations_DMI row with a known_DMI counterpart are overwritten here.
known_dmi_rows = dataAF[dataAF["benchmark_set"] == "known_DMI"]

for i, row in dataAF[dataAF["benchmark_set"] == "mutations_DMI"].iterrows():
    prediction_name = row["prediction_name"]
    benchmark_set = row["benchmark_set"]
    pdb_id = row["PDB_id"]
    # Without a randomly paired structure both chains come from the same known_DMI entry
    pdb_id_2 = row["PDB_id_random_paired"]
    if pdb_id_2 is None:
        pdb_id_2 = pdb_id

    source_a = known_dmi_rows[known_dmi_rows["PDB_id"] == pdb_id]
    if len(source_a) == 0:
        print(f"Can't find {pdb_id} from {prediction_name} ({benchmark_set}, chain A) in the known_DMI set")
        continue

    source_b = known_dmi_rows[known_dmi_rows["PDB_id"] == pdb_id_2]
    if len(source_b) == 0:
        print(f"Can't find {pdb_id_2} from {prediction_name} ({benchmark_set}, chain B) in the known_DMI set")
        continue

    # Take the values of the first matching known_DMI row
    for field in ("chainA_start", "chainA_end"):
        dataAF.at[i, field] = source_a[field].iloc[0]
    for field in ("chainB_start", "chainB_end"):
        dataAF.at[i, field] = source_b[field].iloc[0]

[94mDEG_APCC_KENBOX_2_4GGD (known_DMI)[0m
	 chainA: 165-476 (1.000[0m)
	 chainB: 6-10 (1.000[0m)
[94mDEG_COP1_1_5IGO (known_DMI)[0m
	 chainA: 352-675 (0.988[0m)
	 chainB: 354-361 (1.000[0m)
[94mDEG_Kelch_Keap1_1_2FLU (known_DMI)[0m
	 chainA: 325-609 (1.000[0m)
	 chainB: 77-82 (1.000[0m)
[94mDEG_Kelch_Keap1_2_3WN7 (known_DMI)[0m
	 chainA: 324-609 (1.000[0m)
	 chainB: 26-32 (1.000[0m)
[94mDEG_MDM2_SWIB_1_1YCR (known_DMI)[0m
	 chainA: 30-109 (1.000[0m)
	 chainB: 19-26 (1.000[0m)
[94mDEG_SCF_COI1_1_3OGL (known_DMI)[0m
	 chainA: 52-592 (0.972[0m)
	 chainB: 203-220 (0.812[0m)
[94mDEG_SCF_FBXO31_1_5VZU (known_DMI)[0m
	 chainA: 139-539 (0.862[0m)
	 chainB: 289-295 (1.000[0m)
[94mDEG_SCF_TIR1_1_2P1Q (known_DMI)[0m
	 chainA: 45-578 (0.996[0m)
	 chainB: 1-13 (1.000[0m)
[94mDEG_SIAH_1_2A25 (known_DMI)[0m
	 chainA: 90-282 (0.723[0m)
	 chainB: 59-67 (1.000[0m)
[94mDEG_SPOP_SBC_1_3HQM (known_DMI)[0m
	 chainA: 28-164 (0.956[0m)
	 chainB: 1362-1366 (1.000[0m)
[9

In [7]:
# Top-ranked models that still have no chain A end, i.e. where no residue range could be assigned
dataAF[dataAF["chainA_end"].isna() & (dataAF["model_id"] == "ranked_0")]

Unnamed: 0,project_name,prediction_name,run_id,chainA_length,chainB_length,model_id,model_confidence,chainA_intf_avg_plddt,chainB_intf_avg_plddt,intf_avg_plddt,...,ELM_instance_random_paired,ddi_pfam_id_random_paired,sequence_initial,sequence_mutated,chainA_id,chainB_id,chainA_start,chainA_end,chainB_start,chainB_end
1120,AlphaFold_benchmark,MLIG_PAM2_1_1JGN.DMOD_CDK_SPxK_1_2CCI,run38,297,13,ranked_0,0.402063,88.842,37.006667,69.40375,...,LIG_PAM2_1,,,,A,B,,,,
3095,AlphaFold_benchmark_DDI,D1PF07525_PF03931_3ZKJ.D2PF07724_PF00227_1G4B,run6,48,170,ranked_0,0.260652,50.000001,86.291668,72.921054,...,,PF07724_PF00227,,,,,,,,
3100,AlphaFold_benchmark_DDI,D1PF07724_PF00227_1G4B.D2PF08644_PF03531_4KHB,run6,143,95,ranked_0,0.286443,66.624443,77.482667,73.410833,...,,PF08644_PF03531,,,,,,,,
3155,AlphaFold_benchmark_DDI,D1PF14447_PF00179_3ZNI.D2PF14978_PF00327_5OOL,run6,65,71,ranked_0,0.429536,80.666154,83.33,81.824348,...,,PF14978_PF00327,,,,,,,,
3160,AlphaFold_benchmark_DDI,D1PF14978_PF00327_5OOL.D2PF15985_PF10175_6D6Q,run6,89,127,ranked_0,0.094312,38.565715,24.708462,31.893704,...,,PF15985_PF10175,,,,,,,,


### Save

In [8]:
# Write the annotated metrics into the folder of the selected AF mode, as TSV and as Excel workbook
output_dir = path_resources / af_mode
dataAF.to_csv(output_dir / (af_mode + "_metrics.tsv"), sep="\t", index=False)
# Name the sheet after the selected mode (it was hard-coded to "AF2 metrics", also for AF3 runs)
dataAF.to_excel(output_dir / (af_mode + "_metrics.xlsx"), sheet_name=f"{af_mode} metrics", index=False)