**Test Area for Prescient Project on Druggability**

**Install dependencies in the terminal:**

In [1]:
# pip install biopandas biopython freesasa pandas numpy certifi

In [2]:
import ssl
import certifi


def _certifi_https_context():
    """Build a default SSL context that trusts the certifi CA bundle."""
    return ssl.create_default_context(cafile=certifi.where())


# Make HTTPS downloads issued through the standard library verify certificates
# against certifi's CA bundle instead of the system certificate store.
ssl._create_default_https_context = _certifi_https_context


**Import libraries:**

In [3]:
from biopandas.pdb import PandasPdb          # To fetch PDB files
from Bio.PDB import PDBParser, PDBIO, Select # To manipulate structures
from Bio.SeqUtils import seq1                # To convert 3-letter -> 1-letter residues
import freesasa                              # To compute solvent accessible surface area (SASA)
import pandas as pd                          # To handle dataframes
import numpy as np                           # To handle arrays
import tempfile, re, urllib.request          # To handle temporary files, regex, and downloading files

**Configuration:**

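Use two epitope definitions: a distance cutoff AND a ΔSASA (change in solvent-accessible surface area upon binding) threshold. Their intersection is robust: it keeps residues that are in real contact with the antibody and are also truly buried by it.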

In [4]:
# Structure from the PDB: PD-L1 + atezolizumab Fab (Antigen Binding Fragment)
PDB_ID = "5XXY"

# A PD-L1 residue counts as contacting if any of its heavy atoms is ≤ 4.5 Å from any antibody atom
DIST_CUTOFF = 4.5

# A residue counts as epitope if at least 2 Å² of its surface is buried upon binding
DSASA_THRESH = 2.0

# Side-chain charge depends on pH; 7.4 is the physiological standard
PH = 7.4

**Download the structure:**

In [5]:
# Local filename the structure is written to (e.g. 5XXY.pdb — same case as PDB_ID)
pdb_path = f"{PDB_ID}.pdb"

# Fetch the entry into a PandasPdb object
ppdb = PandasPdb().fetch_pdb(PDB_ID)

# Write the fetched records to disk as an uncompressed PDB file
ppdb.to_pdb(
    path=pdb_path,
    records=None,
    gz=False,
    append_newline=True,
)

# Confirm download
print(f"Downloaded PDB structure {PDB_ID} to {pdb_path}")

Downloaded PDB structure 5XXY to 5XXY.pdb


**Define chains:**

For 5XXY, chain A = PD-L1; chains H and L = the antibody heavy and light chains.

In [6]:
# PD-L1 antigen chain ID(s)
pdl1_chains = ["A"]

# Antibody chain IDs (heavy, light)
binder_chains = ["H", "L"]

**Load structure with Biopython:**

A PDB file is just a plain text file with atomic coordinates.

PDBParser is the tool that reads that text and builds a hierarchical data structure you can work with in Python.

In [7]:
# PDBParser reads the coordinates into Python objects
# (Structure → Model → Chain → Residue → Atom); QUIET=True silences parser warnings.
parser = PDBParser(QUIET=True)

# Load the structure from the file written in the download step
structure = parser.get_structure(PDB_ID, pdb_path)

# Work with model 0 (X-ray entries usually hold one model, NMR ensembles hold many)
model = structure[0]

**Helper functions:**

**residue_id** - Return unique ID for a residue (chain, number, insertion code)

**residue_name** - Return one-letter amino acid code (or X if unknown)

**heavy_atom_coords** - Get all non-hydrogen atom coordinates of a residue


In [8]:
def residue_id(res):
    """Return a hashable identifier ``(chain_id, residue_number, insertion_code)``.

    - ``res.get_parent().id`` is the ID of the chain that owns the residue (e.g. "A").
    - ``res.get_id()`` is a tuple describing the residue: index 1 is the residue
      number (e.g. 45) and index 2 is the insertion code (a letter used when two
      residues share the same number).
    - The insertion code is stripped of whitespace, so a blank code becomes "".

    Example result: ("A", 45, "")
    """
    chain_id = res.get_parent().id
    full_id = res.get_id()
    number = full_id[1]
    insertion_code = full_id[2].strip() or ""
    return (chain_id, number, insertion_code)

def residue_name(res):
    """Return the one-letter amino-acid code for a residue, or "X" if unknown.

    - ``res.get_resname()`` gives the residue name (e.g. "ALA" for alanine).
    - ``seq1`` converts a 3-letter code into its 1-letter code (e.g. "A").
    - ``custom_map`` pins SEC → U (selenocysteine) and PYL → O (pyrrolysine).

    Always returns exactly one character.
    """
    three = res.get_resname().strip()

    # seq1 splits its input into 3-character chunks: a shorter name (e.g. a
    # nucleotide such as "DA") would come back as "" and a longer one as several
    # letters. Neither is a single amino-acid code, so report them as unknown.
    if len(three) != 3:
        return "X"

    try:
        # seq1 itself maps unrecognised 3-letter codes (e.g. "HOH") to "X"
        return seq1(three, custom_map={'SEC':'U','PYL':'O'})
    except Exception:
        # Safety net: any unexpected conversion failure is also reported as unknown
        return "X"

def heavy_atom_coords(res):
    """Return the coordinates of all non-hydrogen atoms of a residue as a NumPy array.

    Iterates over ``res.get_atoms()`` and keeps ``atom.get_coord()`` for every atom
    whose ``element`` is not "H". Hydrogens are skipped because they are often
    missing in crystal structures. The array is empty (``size == 0``) when the
    residue has no heavy atoms.
    """
    return np.array(
        [atom.get_coord() for atom in res.get_atoms() if atom.element != "H"]
    )


**Distance based epitope:**

In [9]:
# Gather all antibody atom coordinates in order to see which PD-L1 residues are close to any antibody atoms
# NOTE(review): get_residues() yields every residue stored under the chain, which may
# include waters/ligands if the file assigns them to H or L — confirm that is intended.

binder_residue_coords = []                    # one array of heavy-atom coordinates per antibody residue
for ch in model:
    if ch.id in binder_chains:                # binder = antibody chains H and L
        for res in ch.get_residues():         # loop over residues in this chain
            coords = heavy_atom_coords(res)   # NumPy array of all non-hydrogen atom coordinates for that residue
            if coords.size:                   # skip residues without heavy atoms
                binder_residue_coords.append(coords)

# np.vstack([]) fails with an opaque "need at least one array to concatenate";
# raise the same exception type with a message that points at the real cause.
if not binder_residue_coords:
    raise ValueError(
        f"No heavy atoms found for binder chains {binder_chains}; "
        f"chains present in the model: {[chain.id for chain in model]}"
    )

binder_coords = np.vstack(binder_residue_coords)  # stack into one array with all antibody atom coordinates

print(f"Collected {binder_coords.shape[0]} antibody heavy atom coordinates") # shape might be (1234, 3) for 1234 atoms in 3 dimensions


# Minimum heavy-atom distance from each PD-L1 residue to the antibody

dist_epitope = {}
for ch in model:
    if ch.id not in pdl1_chains:              # only the antigen chain(s), i.e. PD-L1 (chain A)
        continue
    for res in ch.get_residues():
        coords = heavy_atom_coords(res)       # non-hydrogen atom coordinates of this residue
        if not coords.size:                   # nothing to measure for residues with no atoms
            continue
        # Broadcasting pairs every residue atom with every antibody atom:
        # (n_res_atoms, 1, 3) - (1, n_binder_atoms, 3) -> (n_res_atoms, n_binder_atoms, 3)
        offsets = coords[:, None, :] - binder_coords[None, :, :]
        pair_dists = np.sqrt((offsets ** 2).sum(axis=2))     # Euclidean distance for every atom pair
        dist_epitope[residue_id(res)] = pair_dists.min()     # closest approach, keyed by residue ID

print(f"Computed min distance to antibody for {len(dist_epitope)} PD-L1 residues") # should be number of residues in chain A

# Preview the first five entries
for key in list(dist_epitope)[:5]:
    chain, num, ins = key
    dist = dist_epitope[key]
    print(f"Chain {chain}, Residue {num}{ins or ''}: {dist:.2f} Å")



Collected 3197 antibody heavy atom coordinates
Computed min distance to antibody for 99 PD-L1 residues
Chain A, Residue 18: 4.20 Å
Chain A, Residue 19: 5.34 Å
Chain A, Residue 20: 7.69 Å
Chain A, Residue 21: 7.50 Å
Chain A, Residue 22: 11.89 Å
