**Test Area for Prescient Project on Druggability**

**Install dependencies in the terminal:**

In [6]:
# pip install biopandas biopython pandas numpy

**Import libraries:**

In [7]:
from biopandas.pdb import PandasPdb
from Bio.PDB import PDBParser, PDBIO, Select, NeighborSearch
from Bio.PDB.SASA import ShrakeRupley
from Bio.SeqUtils import seq1
from Bio.SeqUtils.ProtParam import ProteinAnalysis
import pandas as pd, tempfile

**Configuration:**

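An antigen residue is counted as part of the epitope only if it satisfies both definitions: at least one of its heavy atoms lies within `DIST_CUTOFF` Å of a binder (non-antigen chain) atom (distance criterion) AND its solvent-accessible surface area drops by at least `DSASA_THRESH` between the free antigen and the complex (ΔSASA criterion).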

In [8]:
# --- Structures to analyse: the main loop emits one output row per PDB ID ---
PDB_IDS = ["5X8L", "5XXY"]

# --- Chain assignments, applied to every structure in PDB_IDS ---
antigen_chains = ["A"]   # chains treated as antigen; all other chains count as binder
heavy_chain = "H"        # chain whose sequence is reported as heavy_chain_seq
light_chain = "L"        # chain whose sequence is reported as light_chain_seq

# --- Epitope criteria (a residue must satisfy both) and property settings ---
DIST_CUTOFF = 4.5        # contact distance in Å between antigen heavy atoms and binder atoms
DSASA_THRESH = 2.0       # minimum SASA lost on binding (free antigen minus complex)
PH = 7.4                 # pH at which per-residue charge is evaluated

# Single PDBParser instance shared by every cell below (used for both the full
# complex and the antigen-only copy). QUIET=True presumably silences the
# parser's construction warnings — confirm against the Biopython docs.
parser = PDBParser(QUIET=True)

**Helper functions:**


In [9]:
def per_residue_sasa(model):
    """Return the solvent-accessible surface area summed per residue.

    Runs ``ShrakeRupley().compute(model, level="A")`` and then adds up the
    ``sasa`` attribute of every atom in each residue. Atoms without a ``sasa``
    attribute contribute 0.0.

    Parameters
    ----------
    model : iterable of chains
        Each chain exposes ``.id`` and iterates over residues; each residue
        exposes a three-part ``.id`` (hetero flag, residue number, insertion
        code) and iterates over atoms.

    Returns
    -------
    dict
        Maps ``(chain_id, resnum, icode)`` to the summed SASA. ``icode`` is the
        insertion code with whitespace stripped (e.g. 23, 24A, 24B, 25 give
        "", "A", "B", ""). Residues whose hetero flag is not a blank are
        skipped.
    """
    calculator = ShrakeRupley()
    calculator.compute(model, level="A")

    totals = {}
    for chain in model:
        for residue in chain:
            hetero_flag, resnum, icode = residue.id
            if hetero_flag != " ":
                continue
            key = (chain.id, resnum, icode.strip())
            totals[key] = sum(getattr(atom, "sasa", 0.0) for atom in residue)
    return totals

def chain_seq(model, chain_id):
    """Return the one-letter amino-acid sequence of the chain(s) with ID ``chain_id``.

    Residues whose hetero flag (``res.id[0]``) is not a blank are skipped.
    Three-letter names are converted with ``seq1`` using a custom map for
    SEC -> U and PYL -> O; any conversion result that is empty or longer than
    one character is dropped. Returns "" when no chain has the requested ID.
    """
    def one_letter_codes():
        # Lazily yield the converted code of every standard residue in the chain.
        for chain in model:
            if chain.id == chain_id:
                for residue in chain:
                    if residue.id[0] == " ":
                        yield seq1(residue.get_resname(), custom_map={'SEC':'U','PYL':'O'})

    return "".join(code for code in one_letter_codes() if code and len(code) == 1)

class OnlyChains(Select):
    """Selection filter that keeps only the listed chains when saving a PDB file.

    An instance is passed as ``select=`` to ``PDBIO.save`` in the main loop to
    write an antigen-only copy of a structure.
    """

    def __init__(self, keep):
        # Stored as a set so each chain check is a simple membership test.
        self.keep = set(keep)

    def accept_chain(self, chain):
        """Return True when ``chain.id`` is one of the IDs given at construction."""
        return chain.id in self.keep

def residue_in_contact(res, ns, cutoff=4.5):
    """Return True if any heavy atom of ``res`` has a neighbour in ``ns`` within ``cutoff`` Å.

    Parameters
    ----------
    res : iterable of atoms
        Each atom exposes ``.element`` and ``.coord``; atoms whose element is
        "H" are ignored.
    ns : object with ``search(center, radius)``
        Only the atoms indexed by ``ns`` can be matched. In this notebook it is
        a NeighborSearch built exclusively from non-antigen chains, so an
        antigen residue is never matched against its own chain.
    cutoff : float
        Search radius passed to ``ns.search``.

    The search stops at the first heavy atom that returns a non-empty result.
    """
    heavy_atoms = (atom for atom in res if atom.element != "H")
    return any(ns.search(atom.coord, cutoff) for atom in heavy_atoms)

def aa_props(aa1: str, ph: float):
    """Return ``(hydrophobicity, charge)`` for one single-letter amino acid.

    The letter is wrapped in a ``ProteinAnalysis`` object; hydrophobicity is
    its ``gravy()`` value and charge is ``charge_at_pH(ph)``. If any step
    raises (for example an unknown letter), ``(None, None)`` is returned
    instead of propagating the error, so callers can keep per-residue lists
    aligned with the sequence.

    NOTE(review): ProteinAnalysis is given the single letter as a complete
    sequence, so the charge presumably includes free N-/C-terminal groups
    rather than only the side chain — confirm this is intended.
    """
    try:
        analysis = ProteinAnalysis(aa1)
        hydrophobicity = analysis.gravy()
        charge = analysis.charge_at_pH(ph)
    except Exception:
        # Deliberate best-effort: unknown residues yield empty property slots.
        return None, None
    return hydrophobicity, charge

def antigen_seq_and_props(model, ag_chains, ph):
    """Build the full antigen sequence and per-residue properties aligned to it.

    Chains are visited in the order given by ``ag_chains`` (not in the order
    they appear in ``model``). This matches how the ``antigen_seq`` column is
    assembled by joining ``chain_seq(model, ch) for ch in antigen_chains``, so
    position *i* of the returned lists always describes letter *i* of that
    sequence, also when several antigen chains are configured.

    Parameters
    ----------
    model : iterable of chains
        Each chain exposes ``.id`` and iterates over residues that expose
        ``.id`` and ``.get_resname()``. It is iterated once per entry of
        ``ag_chains``, so it must be re-iterable.
    ag_chains : iterable of str
        Chain IDs that make up the antigen.
    ph : float
        pH forwarded to ``aa_props`` for the charge calculation.

    Returns
    -------
    (seq_str, hydros_list, charges_list)
        The concatenated one-letter sequence and two lists of the same length.
        Entries are ``None`` where ``aa_props`` could not compute a value.
        Hetero residues and residues without a single-letter code are skipped.
    """
    # aa_props depends only on (letter, ph), so compute each letter once per call.
    props_by_letter = {}
    seq_chars, hydros, charges = [], [], []

    for chain_id in ag_chains:
        for ch in model:
            if ch.id != chain_id:
                continue
            for res in ch:
                if res.id[0] != " ":
                    continue
                aa = seq1(res.get_resname(), custom_map={'SEC':'U','PYL':'O'})
                if not (aa and len(aa) == 1):
                    continue
                if aa not in props_by_letter:
                    props_by_letter[aa] = aa_props(aa, ph)
                hydro, charge = props_by_letter[aa]
                seq_chars.append(aa)
                hydros.append(hydro)
                charges.append(charge)

    return "".join(seq_chars), hydros, charges

**Main Loop Over PDBs**

In [10]:
rows = []  # one summary dict per PDB entry; converted to a DataFrame after the loop
for PDB_ID in PDB_IDS:

    # The structure file is expected in the working directory as "<PDB_ID>.pdb".
    # To download it first, uncomment:
    #   ppdb = PandasPdb().fetch_pdb(PDB_ID)
    #   ppdb.to_pdb(f"{PDB_ID}.pdb", records=None, gz=False)
    pdb_path = f"{PDB_ID}.pdb"

    structure = parser.get_structure(PDB_ID, pdb_path)
    model = structure[0]  # only the first model is analysed

    # Fail early with a readable message instead of an obscure lookup error
    # further down when none of the configured antigen chains exists.
    chain_ids_present = {ch.id for ch in model}
    if not any(cid in chain_ids_present for cid in antigen_chains):
        raise ValueError(
            f"{PDB_ID}: none of the antigen chains {antigen_chains} found in {pdb_path} "
            f"(chains present: {sorted(chain_ids_present)})"
        )

    # Antigen-only structure: write just the antigen chains and parse them back
    # so SASA can be computed without the binder present. An anonymous temporary
    # file is used so nothing is left behind in the shared temp directory and
    # concurrent runs cannot overwrite each other's files.
    pdbio = PDBIO()
    pdbio.set_structure(structure)
    with tempfile.TemporaryFile("w+") as antigen_file:
        pdbio.save(antigen_file, select=OnlyChains(antigen_chains))
        antigen_file.seek(0)  # rewind so the parser reads what was just written
        antigen_model = parser.get_structure(PDB_ID + "_ag", antigen_file)[0]

    # Per-residue SASA in the complex and in the free antigen (ΔSASA = free - complex)
    sasa_complex = per_residue_sasa(model)
    sasa_free    = per_residue_sasa(antigen_model)

    # Binder atoms: every non-hydrogen atom of standard residues in non-antigen
    # chains, indexed in a NeighborSearch for fast distance queries.
    # NOTE(review): this includes every chain not listed in antigen_chains, not
    # only heavy_chain/light_chain — confirm that is intended for multi-copy entries.
    binder_atoms = [
        atom
        for ch in model if ch.id not in antigen_chains
        for res in ch if res.id[0] == " "
        for atom in res if atom.element != "H"
    ]
    if not binder_atoms:
        raise ValueError(f"{PDB_ID}: no binder atoms found outside antigen chains {antigen_chains}")
    ns_binder = NeighborSearch(binder_atoms)

    # Collect antigen residues and flag epitope members (distance ∩ ΔSASA)
    residues = []
    for ch in model:
        if ch.id not in antigen_chains:
            continue
        for res in ch:
            if res.id[0] != " ":
                continue
            rid = (ch.id, res.id[1], res.id[2].strip() or "")
            d_asa = sasa_free.get(rid, 0.0) - sasa_complex.get(rid, 0.0)
            contact = residue_in_contact(res, ns_binder, DIST_CUTOFF)
            # Stored as (resnum, resname, is_epitope), e.g. (24, 'LYS', True).
            # NOTE(review): chain ID and insertion code are not part of the stored position.
            residues.append((rid[1], res.get_resname(), contact and (d_asa >= DSASA_THRESH)))

    # Epitope positions and one-letter residues, for flagged residues only
    epi_positions = [r[0] for r in residues if r[2]]
    epi_resnames  = [r[1] for r in residues if r[2]]
    epi_residues  = [seq1(rn, custom_map={'SEC':'U','PYL':'O'}) for rn in epi_resnames]

    # Full-antigen sequence + per-residue properties (aligned to entire antigen)
    antigen_seq_str, antigen_hydros, antigen_charges = antigen_seq_and_props(model, antigen_chains, PH)

    # Add one row. The sequence comes from the same call that produced the
    # property lists, so the three antigen columns are aligned by construction.
    rows.append({
        "PDBID": PDB_ID,
        "antigen_seq": antigen_seq_str,
        "light_chain_seq": chain_seq(model, light_chain),
        "heavy_chain_seq": chain_seq(model, heavy_chain),
        "epitope_positions": ",".join(map(str, epi_positions)),
        "epitope_residues": ",".join(epi_residues),
        "antigen_hydrophobicity": ",".join("" if v is None else f"{v:.3f}" for v in antigen_hydros),
        "antigen_charge": ",".join("" if v is None else f"{v:.3f}" for v in antigen_charges),
    })

**Final Data Frame**

In [11]:
# Assemble the per-PDB dicts collected in `rows` into one table (one row per PDB ID).
df_complexes = pd.DataFrame(rows)
print(df_complexes)
# Write the table to the current working directory; index=False omits the row index column.
df_complexes.to_csv("complex_summaries.csv", index=False)
print("Saved complex_summaries.csv")

  PDBID                                        antigen_seq  \
0  5X8L  AFTVTVPKDLYVVEYGSNMTIECKFPVEKELDLAALIVYWEMEDKN...   
1  5XXY  AFTVTVPKDLYVVEYGSNMTIECKFPVDLAALIVYWEMEDKNIIQF...   

                                     light_chain_seq  \
0  DIQMTQSPSSLSASVGDRVTITCRASQDVSTAVAWYQQKPGKAPKL...   
1  DIQMTQSPSSLSASVGDRVTITCRASQDVSTAVAWYQQKPGKAPKL...   

                                     heavy_chain_seq  \
0  EVQLVESGGGLVQPGGSLRLSCAASGFTFSDSWIHWVRQAPGKGLE...   
1  EVQLVESGGGLVQPGGSLRLSCAASGFTFSDSWIHWVRQAPGKGLE...   

                                   epitope_positions  \
0  20,24,25,37,39,41,42,43,44,45,46,47,49,51,52,5...   
1  18,49,51,52,54,56,58,59,60,61,62,63,66,68,69,1...   

                                    epitope_residues  \
0  T,P,K,T,E,K,F,P,V,E,K,E,D,A,A,I,Y,E,M,E,D,K,N,...   
1  A,D,A,A,I,Y,E,M,E,D,K,N,Q,V,H,V,R,M,S,Y,G,G,A,...   

                              antigen_hydrophobicity  \
0  1.800,2.800,-0.700,4.200,-0.700,4.200,-1.600,-...   
1  1.800,2.800,-0.700,4.2