# Calculation of template-dependent metrics for AF3

created by Andreas 2025-02-19

This notebook calculates the template-dependent metrics on the AF3 output. It is based on the script *calculate_template_dependent_metrics.py* by Chop Yan Lee.

It will add the following columns: RMSD_domain, num_align_atoms_domain, align_score_domain, num_align_resi_domain, RMSD_backbone_peptide, RMSD_all_atom_peptide, known_motif_plddt, DockQ, iRMS, LRMS and num_mutation_in_motif. The column Fnonnat is not calculated by this script.

In [1]:
# Imports
from pathlib import Path
import pandas as pd
import numpy as np
import re
import difflib

import pymol
from Bio.PDB import PDBParser
from Bio.PDB.Structure import Structure as BioPy_PDBStructure
from Bio.PDB.Model import Model as BioPy_PDBModel
from Bio.PDB.PDBExceptions import PDBConstructionException
# Shared Biopython PDB parser, reused by every cell that reads a structure file.
# QUIET=True is passed to PDBParser (presumably to silence construction warnings — confirm against the Biopython docs).
parser = PDBParser(QUIET=True)


# Absolute path of the "../ressources" directory, resolved relative to the current working directory.
ressources_path = Path("../ressources").resolve()


In [2]:
# Start the PyMOL session that the pymol.cmd calls in the later cells operate on.
# NOTE(review): called without arguments, so PyMOL's default launch options apply
# (GUI vs. headless) — confirm this is what is wanted for batch runs.
pymol.finish_launching()

In [2]:
# Build `dataSolved`: one row per solved (template) structure file that contains
# exactly two chains, holding the residue-name sequence of each chain.
# DMI rows only carry the PDB id; DDI rows additionally carry the Pfam pair id and
# the two chain ids, all of which are parsed from the file name.
SOLVED_COLUMNS = ["set", "PDB_id", "ddi_pfam_id", "chain_A_id", "chain_B_id", "chain_A_sequence", "chain_B_sequence"]

solved_base_path = Path("../ressources/solved").resolve()
if not solved_base_path.exists():
    raise RuntimeError(f"The path {solved_base_path} does not exist")


def _list_pdb_files(directory):
    """Return the files with a (case-insensitive) .pdb suffix directly inside `directory`.

    The result is sorted so that the row order of `dataSolved` does not depend on
    the order in which the file system happens to list the directory.
    """
    return sorted(p for p in directory.iterdir() if p.is_file() and p.suffix.lower() == ".pdb")


def _chain_sequence(chain):
    """Return the residue names of `chain` joined by '-' (e.g. 'PHE-THR-GLU-PHE')."""
    return '-'.join(r.get_resname() for r in chain.get_residues())


DDI_solved = _list_pdb_files(solved_base_path / "DDI_solved_structures")
DMI_solved = _list_pdb_files(solved_base_path / "DMI_solved_structures")

# Rows are collected as plain dicts and turned into a DataFrame once at the end;
# keys missing from a record (e.g. chain ids for DMI) become NaN.
solved_records = []

# First DMI: the file name starts with the 4-character PDB id, chains are taken in file order
for structure_file in DMI_solved:
    pdb_id = structure_file.name.split("_")[0]
    if len(pdb_id) != 4:
        raise RuntimeError(f"Unexpected file name {structure_file.name}")

    structure_biopy = parser.get_structure("structure", file=structure_file)
    chains = list(structure_biopy.get_chains())
    if len(chains) != 2:
        print(f"Unexpected chains in {structure_file.name}")
        continue
    chainA = structure_biopy[0][chains[0].id]
    chainB = structure_biopy[0][chains[1].id]

    solved_records.append({
        "set": "DMI",
        "PDB_id": pdb_id,
        "chain_A_sequence": _chain_sequence(chainA),
        "chain_B_sequence": _chain_sequence(chainB),
    })

# Now DDI: file names look like <pfamA>_<pfamB>_<PDB id>_<chain A id><chain B id>...
for structure_file in DDI_solved:
    name_parts = structure_file.name.split("_")
    if len(name_parts) < 4 or len(name_parts[3]) < 2:
        # Same failure mode as for DMI instead of an unexplained IndexError
        raise RuntimeError(f"Unexpected file name {structure_file.name}")
    ddi_pfam_id = "_".join(name_parts[0:2])
    pdb_id = name_parts[2]
    chainA_id = name_parts[3][0]
    chainB_id = name_parts[3][1]

    structure_biopy = parser.get_structure("structure", file=structure_file)
    chain_ids = [c.id for c in structure_biopy.get_chains()]
    if len(chain_ids) != 2 or chainA_id not in chain_ids or chainB_id not in chain_ids:
        print(f"Unexpected chains in {structure_file.name}: Expected {chainA_id} and {chainB_id}, got {chain_ids}")
        continue
    chainA = structure_biopy[0][chainA_id]
    chainB = structure_biopy[0][chainB_id]

    solved_records.append({
        "set": "DDI",
        "PDB_id": pdb_id,
        "ddi_pfam_id": ddi_pfam_id,
        "chain_A_id": chainA_id,
        "chain_B_id": chainB_id,
        "chain_A_sequence": _chain_sequence(chainA),
        "chain_B_sequence": _chain_sequence(chainB),
    })

dataSolved = pd.DataFrame(solved_records, columns=SOLVED_COLUMNS)

display(dataSolved)

Unnamed: 0,set,PDB_id,ddi_pfam_id,chain_A_id,chain_B_id,chain_A_sequence,chain_B_sequence
0,DMI,1ATP,,,,GLN-PHE-ASP-ARG-ILE-LYS-THR-LEU-GLY-THR-GLY-SE...,PHE-THR-GLU-PHE
1,DMI,1AXC,,,,MET-PHE-GLU-ALA-ARG-LEU-VAL-GLN-GLY-SER-ILE-LE...,GLN-THR-SER-MET-THR-ASP-PHE-TYR-HIS-SER
2,DMI,1B72,,,,ARG-LYS-ARG-ARG-ASN-PHE-ASN-LYS-GLN-ALA-THR-GL...,PHE-ASP-TRP-MET
3,DMI,1B8Q,,,,ASN-VAL-ILE-SER-VAL-ARG-LEU-PHE-LYS-ARG-LYS-VA...,VAL-LYS-VAL-ASP-SER-VAL
4,DMI,1BXX,,,,ILE-GLY-TRP-ARG-ARG-GLU-GLY-ILE-LYS-TYR-ARG-AR...,TYR-GLN-ARG-LEU
...,...,...,...,...,...,...,...
181,DDI,3ZNI,PF14447_PF00179,A,C,GLN-ALA-ALA-ALA-ASP-ARG-ARG-THR-VAL-GLU-LYS-TH...,ALA-LEU-LYS-ARG-ILE-HIS-LYS-GLU-LEU-ASN-ASP-LE...
182,DDI,3J7Y,PF14978_PF00327,o,Z,ARG-GLY-ARG-ILE-PRO-GLY-ARG-GLN-TRP-ILE-GLY-LY...,LYS-PHE-THR-ARG-SER-ARG-ILE-PRO-GLU-LYS-VAL-PH...
183,DDI,6D6Q,PF15985_PF10175,G,L,ALA-ARG-ALA-ALA-ARG-THR-VAL-LEU-GLY-GLN-VAL-VA...,ARG-LYS-THR-ARG-LEU-SER-LYS-ASN-LEU-LEU-ARG-ME...
184,DDI,3KZ1,PF17838_PF00071,B,E,ASN-TRP-GLN-HIS-THR-VAL-GLY-LYS-ASP-VAL-VAL-AL...,ALA-ILE-ARG-LYS-LYS-LEU-VAL-ILE-VAL-GLY-ASP-GL...


In [3]:
# Load the table of already parsed AF3 metrics (tab-separated) and show it
af3_metrics_file = Path("../ressources/AF3/AF3_metrics.tsv")
dataAF = pd.read_csv(af3_metrics_file, sep="\t")
display(dataAF)

Unnamed: 0,model_preset,benchmark_set,prediction_name,model_id,chainA_length,chainB_length,fraction_disordered,has_clash,iptm,ptm,...,ELM_instance,ELM_instance_random_paired,sequence_initial,sequence_mutated,known_extension_motif,known_extension_domain,ddi_pfam_id,ddi_pfam_id_random_paired,chain1_letter,chain2_letter
0,alphafold3,known_minimal,LIG_HOMEOBOX_1B72,ranked_0,4,73,0.05,0.0,0.54,0.80,...,LIG_HOMEOBOX,,,,,,,,,
1,alphafold3,known_minimal,LIG_HOMEOBOX_1B72,ranked_1,4,73,0.05,0.0,0.46,0.76,...,LIG_HOMEOBOX,,,,,,,,,
2,alphafold3,known_minimal,LIG_HOMEOBOX_1B72,ranked_2,4,73,0.13,0.0,0.43,0.74,...,LIG_HOMEOBOX,,,,,,,,,
3,alphafold3,known_minimal,LIG_HOMEOBOX_1B72,ranked_3,4,73,0.09,0.0,0.42,0.77,...,LIG_HOMEOBOX,,,,,,,,,
4,alphafold3,known_minimal,LIG_HOMEOBOX_1B72,ranked_4,4,73,0.05,0.0,0.37,0.77,...,LIG_HOMEOBOX,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3085,alphafold3,random_ddi,D1PF06991_PF08082_7AAV.D2PF07417_PF00140_6OMF,ranked_0,123,123,0.41,0.0,0.27,0.51,...,,,,,,,D1PF06991_PF08082,D2PF07417_PF00140,,
3086,alphafold3,random_ddi,D1PF06991_PF08082_7AAV.D2PF07417_PF00140_6OMF,ranked_1,123,123,0.44,0.0,0.21,0.50,...,,,,,,,D1PF06991_PF08082,D2PF07417_PF00140,,
3087,alphafold3,random_ddi,D1PF06991_PF08082_7AAV.D2PF07417_PF00140_6OMF,ranked_2,123,123,0.42,0.0,0.11,0.50,...,,,,,,,D1PF06991_PF08082,D2PF07417_PF00140,,
3088,alphafold3,random_ddi,D1PF06991_PF08082_7AAV.D2PF07417_PF00140_6OMF,ranked_3,123,123,0.43,0.0,0.09,0.48,...,,,,,,,D1PF06991_PF08082,D2PF07417_PF00140,,


In [16]:
# Compute the template-dependent RMSD / alignment metrics for every AF3 model that
# has a solved DMI template, and store them in new columns of `dataAF`.
# Rows that are skipped (other benchmark sets, missing files, failed consistency
# checks) keep None in all of these columns.
TEMPLATE_METRIC_COLUMNS = [
    "RMSD_domain",
    "align_score_domain",
    "num_align_atoms_domain",
    "num_align_resi_domain",
    "RMSD_backbone_peptide",
    "RMSD_all_atom_peptide",
]
for column in TEMPLATE_METRIC_COLUMNS:
    dataAF[column] = None


def _two_chain_sequences(pdb_path):
    """Return (sequence_A, sequence_B) for a PDB file, or None if it does not hold exactly two chains.

    Each sequence is the chain's residue names joined by '-'. Chains are taken in
    file order from the first model.
    """
    structure = parser.get_structure("structure", file=pdb_path)
    chains = list(structure.get_chains())
    if len(chains) != 2:
        return None
    first_model = structure[0]
    return tuple(
        '-'.join(r.get_resname() for r in first_model[c.id].get_residues())
        for c in chains
    )


for i, row in dataAF.iterrows():
    # RMSD can only be calculated for DMI. Mutations also allow limited RMSD calculations, as only chain B is mutated.
    # (The previous walrus expression bound the *boolean* comparison result, so "mutations" never matched.)
    benchmark_set = row["benchmark_set"]
    if benchmark_set not in ("known_minimal", "mutations"):
        continue

    pdb_id = str(row["PDB_id"])
    prediction_name = str(row["prediction_name"])
    model_id = str(row["model_id"])

    # NOTE(review): models are looked up in a directory named after the benchmark set.
    # This is unchanged for "known_minimal"; for "mutations" the layout is assumed — confirm.
    structure_path = ressources_path / "AF3" / "DMI" / benchmark_set / prediction_name / f"{model_id}.pdb"
    if not structure_path.exists():
        print(f"For structure {prediction_name}_{model_id} the pdb file does not exist. Skip")
        continue
    structure_template_path = ressources_path / "solved" / "DMI_solved_structures" / f"{pdb_id}_min_DMI.pdb"
    if not structure_template_path.exists():
        print(f"For structure {prediction_name}_{model_id} the template pdb file does not exist. Skip")
        continue

    model_sequences = _two_chain_sequences(structure_path)
    if model_sequences is None:
        print(f"Unexpected chains in {structure_path.name}")
        continue
    # The template sequences must come from the template file itself; previously they were
    # read from the AF model again, which made both sequence checks below trivially pass.
    template_sequences = _two_chain_sequences(structure_template_path)
    if template_sequences is None:
        print(f"Unexpected chains in {structure_template_path.name}")
        continue
    sequenceA, sequenceB = model_sequences
    sequenceA_template, sequenceB_template = template_sequences

    # autojunk=False: with the default heuristic, characters that are frequent in strings of
    # 200+ characters are ignored, which distorts the ratio for these small-alphabet sequences.
    chainA_ratio = difflib.SequenceMatcher(None, sequenceA, sequenceA_template, autojunk=False).ratio()

    if chainA_ratio < 0.9:
        print(f"For structure {prediction_name}_{model_id} the sequences for chain A differ more than expected")
        print("\t", sequenceA, "\n\t", sequenceA_template)
        continue
    if sequenceB != sequenceB_template:
        print(f"For structure {prediction_name}_{model_id} the sequences for chain B differ")
        print("\t", sequenceB, "\n\t", sequenceB_template)
        continue

    # Start from an empty PyMOL session for every model
    for o in pymol.cmd.get_object_list():
        pymol.cmd.delete(o)

    pymol.cmd.load(str(structure_path), "AF")
    pymol.cmd.load(str(structure_template_path), "solved")
    pymol.cmd.remove(selection="elem 'H'")

    # Collect the residue numbers of chain B in both objects (one entry per atom).
    # The object name is spelled "AF" exactly as loaded, so the selection does not
    # depend on PyMOL's case-sensitivity setting.
    space = {'solved_resi': [], "af_resi": []}
    pymol.cmd.iterate("solved and chain B", "solved_resi.append(int(resi))", space=space)
    pymol.cmd.iterate("AF and chain B", "af_resi.append(int(resi))", space=space)
    if not space["solved_resi"] or not space["af_resi"]:
        print(f"No chain B atoms found for {prediction_name} (model {model_id})")
        continue

    # Calculate two offsets from beginning and end of chain B to check for potential missing IDs
    offset_low = int(np.min(space["solved_resi"]) - np.min(space["af_resi"]))
    offset_high = int(np.max(space["solved_resi"]) - np.max(space["af_resi"]))
    if offset_low != offset_high:
        print(f"Offset error for {prediction_name} (model {model_id})")
        continue

    # Renumber the predicted peptide so that it uses the residue numbers of the template
    pymol.cmd.alter("AF and chain B", f"resi = (int(resi) + {offset_low})")
    pymol.cmd.sort()
    pymol.cmd.alter("chain B", "segi = chain")
    pymol.cmd.sort()

    # Return value of pymol.cmd.align:
    #    0: RMSD after refinement
    #    1: Number of aligned atoms after refinement
    #    2: Number of refinement cycles
    #    3: RMSD before refinement
    #    4: Number of aligned atoms before refinement
    #    5: Raw alignment score
    #    6: Number of residues aligned

    # Cycles = 0 to prevent refinement of the structure (we don't want any modifications to the structure)
    align_output = pymol.cmd.align(mobile="AF and chain A", target="solved and chain A", object="algn_domain", cycles=0)

    # The peptide RMSDs are measured with rms_cur after the domains (chain A) were aligned above
    metrics = {
        "RMSD_domain": align_output[0],
        "align_score_domain": align_output[5],
        "num_align_atoms_domain": align_output[1],
        "num_align_resi_domain": align_output[6],
        "RMSD_backbone_peptide": pymol.cmd.rms_cur(mobile="AF and chain B and bb.", target="solved and chain B and bb.", object="peptide_super_bb"),
        "RMSD_all_atom_peptide": pymol.cmd.rms_cur(mobile="AF and chain B", target="solved and chain B", object="peptide_super_all_atoms"),
    }
    for column, value in metrics.items():
        dataAF.at[i, column] = value

NameError: name 'dataAF' is not defined

In [None]:
# Scratch check: show the underscore-separated parts of an example prediction name
prediction_name = "DEG_SCF_COI1_1_3OGL"
pdb_id = prediction_name.split(sep="_")
print(pdb_id)

In [11]:
# Debugging cell: run the RMSD calculation for one hand-picked known_minimal model
# and write the result into the matching row(s) of `dataAF`.
prediction_name = "DEG_SCF_COI1_1_3OGL"
pdb_id = prediction_name.split("_")[-1]
model_id = "ranked_0"
structure_path = ressources_path / "AF3" / "DMI" / "known_minimal" / prediction_name / str(model_id + ".pdb")
structure_template_path = ressources_path / "solved" / "DMI_solved_structures" / str(pdb_id + "_min_DMI.pdb")

# Start from an empty PyMOL session
for o in pymol.cmd.get_object_list():
    pymol.cmd.delete(o)

pymol.cmd.load(str(structure_path), "AF")
pymol.cmd.load(str(structure_template_path), "solved")
pymol.cmd.remove(selection="elem 'H'")

# Residue numbers of chain B in both objects (one entry per atom). The object name is
# spelled "AF" exactly as loaded so the selection does not depend on PyMOL's case handling.
space = {'solved_resi': [], "af_resi": []}
pymol.cmd.iterate("solved and chain B", "solved_resi.append(int(resi))", space=space)
pymol.cmd.iterate("AF and chain B", "af_resi.append(int(resi))", space=space)
assert space["solved_resi"] and space["af_resi"], "chain B selection is empty in the template or the AF model"

# Calculate two offsets from beginning and end of chain B to check for potential missing IDs
offset_low = int(np.min(space["solved_resi"]) - np.min(space["af_resi"]))
offset_high = int(np.max(space["solved_resi"]) - np.max(space["af_resi"]))
assert offset_low == offset_high, f"Offset mismatch for chain B: low={offset_low}, high={offset_high}"

# Renumber the predicted peptide so that it uses the residue numbers of the template
pymol.cmd.alter("AF and chain B", f"resi = (int(resi) + {offset_low})")
pymol.cmd.sort()
pymol.cmd.alter("chain B", "segi = chain")
pymol.cmd.sort()

# Return value of pymol.cmd.align:
#    0: RMSD after refinement
#    1: Number of aligned atoms after refinement
#    2: Number of refinement cycles
#    3: RMSD before refinement
#    4: Number of aligned atoms before refinement
#    5: Raw alignment score
#    6: Number of residues aligned

# Cycles = 0 to prevent refinement of the structure (we don't want any modifications to the structure)
align_output = pymol.cmd.align(mobile="AF and chain A", target="solved and chain A", object="algn_domain", cycles=0)
RMSD_domain = align_output[0]
num_align_atoms_domain = align_output[1]
align_score_domain = align_output[5]
num_align_resi_domain = align_output[6]

RMSD_backbone_peptide = pymol.cmd.rms_cur(mobile="AF and chain B and bb.", target="solved and chain B and bb.", object="peptide_super_bb")
RMSD_all_atom_peptide = pymol.cmd.rms_cur(mobile="AF and chain B", target="solved and chain B", object="peptide_super_all_atoms")

# Address the row(s) by content. The previous version wrote to `dataAF.at[i, ...]` with an
# `i` left over from an earlier loop, i.e. into whatever row that loop happened to end on.
row_mask = (
    (dataAF["benchmark_set"] == "known_minimal")
    & (dataAF["prediction_name"] == prediction_name)
    & (dataAF["model_id"] == model_id)
)
dataAF.loc[row_mask, "RMSD_domain"] = RMSD_domain
dataAF.loc[row_mask, "align_score_domain"] = align_score_domain
dataAF.loc[row_mask, "num_align_atoms_domain"] = num_align_atoms_domain
dataAF.loc[row_mask, "num_align_resi_domain"] = num_align_resi_domain
dataAF.loc[row_mask, "RMSD_backbone_peptide"] = RMSD_backbone_peptide
dataAF.loc[row_mask, "RMSD_all_atom_peptide"] = RMSD_all_atom_peptide

AssertionError: 

In [15]:
# Scratch check: print the residue number of every chain B atom of the solved template
pymol.cmd.iterate(selection="chain B and solved", expression="print(resi)")

132