In [160]:
%%bash
pip install spektral
pip install rdkit

Process is terminated.


In [299]:
# Arrays and linear algebra
import numpy as np

# Structured data
import pandas as pd

# OS I/O
import os

# Execution timing
from time import process_time

# G/TPU tensors and neural networks
import tensorflow as tf
import spektral

# Computational biology/chemistry
import Bio
from Bio.PDB import PDBParser

from rdkit import Chem
import rdkit

# Visualization
from matplotlib import pyplot as plt

# Nearest-neighbor searching
from scipy.spatial import KDTree

# Jupyter settings
from IPython.core.display import HTML
np.set_printoptions(precision=4, linewidth=np.inf)

In [164]:
DATA_DIR = "../input/koff-dataset/koff_dataset"

In [377]:
class DataReader():
    def __init__(self, data_dir:str):
        """A data reader for processing the Koff Dataset. Reads the metadata and
        provides methods for reading ligands and proteins contain the data
        directory.
        
        Args:
          data_dir (str): The directory containing the koff_dataset
        """
        self.data_dir = data_dir
        self.structures_dir = os.path.join(self.data_dir, "initial_structure")
        self.metadata_fname = os.path.join(self.data_dir, "koff_index.xlsx")
        self.metadata_full = pd.read_excel(self.metadata_fname)
        self.metadata = self.metadata_full[["ligand name", "koff/s-1"]]
        
        # The PDBParser can be used to read any number of proteins from .pdb
        self.pdb_parser = PDBParser()
        
        # Create a dictionary of the metadata, so we can query it in O(1)
        ligand_names = self.metadata["ligand name"]
        k_offs = self.metadata["koff/s-1"]
#         self.ligand_to_koff = {}
#         for ligand_name, k_off in zip(ligand_names, k_offs):
#             try:
#                 self.ligand_to_koff[ligand_name] = float(k_off)
#             except ValueError as e:
#                 # TODO: THIS IS NOT SECURE... BE CAREFUL...
#                 self.ligand_to_koff[ligand_name] = float(exec(k_off))
        
        # Map bond types to integers (one-hot)
        self.bond_type_to_int = {
            "SINGLE": 1, "DOUBLE": 2, "TRIPLE": 3, "QUADRATIC": 4, "AROMATIC": 5}        
    
    @np.vectorize
    def _three_to_index(x:str):
        """Converts a byte-string of a 3-letter amino acid acronym to an integer
        index (one-hot)
        
        Args:
          x (str):
        Returns:
          index (int): The index of the amino acid
        """
        index = PDB.Polypeptide.three_to_index(x)
        return index
                
    def read_pdb(self, ligand_fname:str):
        """Read the associated protein in a .pdb file of a given ligand.
        
        Args:
          ligand_fname (str): The ligand file name associated with the protein to
            be queried. Ends with an extension: e.g., `.mol2`.
        Returns:
          protein (Bio.PDB.Structure.Structure): The protein structure
        """
        ligand_name, extension = os.path.splitext(ligand_fname)
        parent_dir = os.path.join(self.structures_dir, ligand_name)
        pdb_path = os.path.join(parent_dir, f"{ligand_name}_protein.pdb")
        
        protein = self.pdb_parser.get_structure(ligand_name, pdb_path)        
        return protein
    
    def read_ligand(self, ligand_fname:str):
        """Read the ligand in a .mol2 file
        
        Args:
          ligand_fname (str): The ligand file name to be queried.
        Returns:
          ligand (rdkit.Chem.Molecule): The ligand molecule
        """
        ligand_name, extension = os.path.splitext(ligand_fname)
        parent_dir = os.path.join(self.structures_dir, ligand_name)
        ligand_path = os.path.join(parent_dir, ligand_fname)
        
        ligand = Chem.MolFromMol2File(ligand_path)
        return ligand
    
    def _get_neighbor_residues(
        self,
        ligand:Chem.Mol,
        protein:PDB.Structure.Structure,
        n_neighbors:int=5,
        workers:int=1):
        """Extract the neighboring residues to each atom of a ligand. For full
        documentation on the nearest-neighbor process, see self.get_features().
        
        Args:
          ligand (rdkit.Chem.Mol)
          protein (Bio.PDB.Structure.Structure)
        
        Returns:
          residue_distances (np.ndarray): The residue distances from each
            residue. Given the array A, A[i, j] denotes the distance of the
            (j+1)^th residue-nearest-neighbor for atom i in the ligand.
          residue_names_index (np.ndarray): The corresponding residue name of
          the residue_distances.
        """
        atom_pos = np.array([
            list(ligand.GetConformer().GetAtomPosition(i))
            for i, atom in enumerate(atoms)])

        residue_pos = np.array([
            residue.center_of_mass() for residue in protein.get_residues()])
        residue_names = np.array([
            residue.get_resname() for residue in protein.get_residues()])
        
        # Query a KD-tree to find the 5 nearest residues to each ligand atom
        kd_tree = KDTree(residue_pos)
        residue_distances, residue_indices = kd_tree.query(
            x=atom_pos,
            k=n_neighbors,
            workers=workers)
        
        # Extract the name of each residue from the indices
        residue_names_nearest = residue_names[residue_indices]
        residue_names_index = self._three_to_index(residue_names_nearest)
        
        return (residue_distances, residue_names_index)
    
    def _get_ligand_adjacency(self, ligand:Chem.Mol):
        """Extract the adjacency matrix for a given ligand. See
        self.get_features() for information on the adjacency matrix.
        
        Args:
          ligand (rdkit.Mol)
        Returns:
          adjacency (np.ndarray)
          edge_features (np.ndarray)
        """
        adjacency = []
        edge_features = []
        
        n = ligand.GetNumAtoms()
        for i, atom in enumerate(ligand.GetAtoms()):
            for atom_neighbor in atom.GetNeighbors():
                j = atom_neighbor.GetIdx()
                adjacency.append((i, j, 1))
                
                bond = ligand.GetBondBetweenAtoms(i, j)
                bond_type = str(bond.GetBondType())
                edge_features.append((i, j, self.bond_type_to_int[bond_type]))
        
        adjacency = np.array(adjacency)
        edge_features = np.array(edge_features)
        return adjacency, edge_features
    
    def _get_ligand_atoms_names(self, ligand:Chem.Mol):
        """Extract the name (symbol) for each atom in a molecule. See
        self.get_features() for more information.
        
        Args:
          ligand (rdkit.Chem.Mol)
        Returns:
          atom_names (np.ndarray): The atom names in the form of a (n, 1) NumPy
            array, where the index of each row corresponds the atom index.
        """
        atom_names = np.array([
            atom.GetAtomicNum()
            for atom in ligand.GetAtoms()]).reshape(-1, 1)
        
        return atom_names
    
    def _get_protein_ligand_label(self, ligand_fname:str):
        """Extract the k_off values for a given protein-ligand complex. For more
        information, see the self.get_features().
        
        Args:
          ligand (str): The file name for 
        Returns:
          label (float): The k_off value corresponding to the protein-ligand
            complex.
        """
        label = self.ligand_to_koff[ligand_fname]
        return label
    
    def get_features(self, ligand_fname:str, n_neighbors:int=5, workers:int=1):
        """Extract the adjacency matrix, node features, edge features, and labels
        for a given protein-ligand complex.
        
        The adjacency matrix, a, is given in dense form such that the first and
        second elements in a row represent a source and destination node with an
        edge between the two. The third element in a row represents the weight of
        their edge (a constant 1 for our cases).
        
        The node features matrix, x, contains a single row for each node, and a
        corresponding column for each feature. There are (1 + n_neighbors * 2)
        total node features. They are...
        (1) The ligand atom type (1 element in the feature vector)
        (2) The protein nearest neighbor distances (n_neighbors elements)
        (3) The protein nearest neighbor residue names (n_neighbors elements)
        
        The edge features matrix, e, is similar to the adjacency matrix. Instead
        of utilizing an edge weight, the edge features matrix instead provides
        an additional column for each feature in the edges. Here, we only have a
        single feature, the bond type (e.g., SINGLE, DOUBLE, TRIPLE, ...).
        
        The label, y, provides a value for the protein-ligand complex which we
        are trying to predict. The label here is the off- or dissociation-rate
        constant, `k_off`. A smaller `k_off` constant indicates that the ligand
        is more tightly bound to its protein.
        
        Args:
          ligand_fname (str): The ligand file name. We will extract both the
            protein and the associated ligand.
          n_neighbors (int): The number of nearest-protein-neighbors to query for
            each atom in the ligand.
          workers (int): The number of parallel calls to make to the KD-tree when
            querying nearest neighbors. A value of -1 maximizes the number of
            workers utilized.
        
        Returns:
          a (np.ndarray): The adjacency matrix (list)
          e (np.ndarray): The edge feature adjacency matrix (list)
          x (np.ndarray): The edge features matrix
          y (np.ndarray): The label vector
        """
        ligand = self.read_ligand(ligand_fname)
        protein = self.read_pdb(ligand_fname)
        
        atom_names = self._get_ligand_atoms_names(ligand)
        distances, names = self._get_neighbor_residues(
            ligand,
            protein,
            n_neighbors=n_neighbors,
            workers=workers
        )
        
        a, e = self._get_ligand_adjacency(ligand)
        x = np.column_stack((atom_names, distances, names))
        y = self._get_protein_ligand_label(ligand_fname)
        return (a, x, e, y)

In [378]:
data_reader = DataReader(DATA_DIR)
HTML(data_reader.metadata.to_html())

Unnamed: 0,ligand name,koff/s-1
0,3hec_ligand_native_1.mol2,0.38
1,3heg_ligand_native_2.mol2,0.018
2,1kv2_ligand_native_3.mol2,0.000008
3,1kv1_ligand_native_4.mol2,0.062
4,3gcq_ligand_native_5.mol2,1.695e-3/0.046
5,3gcv_ligand_native_6.mol2,1.159e-3/0.119
6,3d83_ligand_native_10.mol2,0.0084
7,3d7z_ligand_native_11.mol2,0.023
8,3gcp_ligand_native_12.mol2,0.016
9,2yiw_ligand_native_16.mol2,0.00004


In [379]:
data_reader.get_features("3heg_ligand_native_2.mol2")

AttributeError: 'DataReader' object has no attribute 'ligand_to_koff'