In [36]:
%%bash
pip install spektral
pip install py3Dmol
pip install rdkit

Collecting rdkit
  Downloading rdkit-2022.3.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (36.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 36.8/36.8 MB 22.0 MB/s eta 0:00:00
Installing collected packages: rdkit
Successfully installed rdkit-2022.3.5




In [166]:
# Arrays and linear algebra
import numpy as np

# Structured data
import pandas as pd

# OS I/O
import os

# Execution timing
from time import process_time

# G/TPU tensors and neural networks
import tensorflow as tf
import spektral

# Computational biology/chemistry
from Bio.PDB import PDBParser
import py3Dmol
from rdkit import Chem

# Jupyter settings
from IPython.core.display import HTML

In [167]:
DATA_DIR = "../input/koff-dataset/koff_dataset"

In [230]:
class DataReader():
    def __init__(self, data_dir:str):
        """A data reader for processing the Koff Dataset. Reads the metadata and
        provides methods for reading ligands and proteins contain the data
        directory.
        
        Args:
          data_dir (str): The directory containing the koff_dataset
        """
        self.data_dir = data_dir
        self.structures_dir = os.path.join(self.data_dir, "initial_structure")
        self.metadata_fname = os.path.join(self.data_dir, "koff_index.xlsx")
        self.metadata_full = pd.read_excel(self.metadata_fname)
        self.metadata = self.metadata_full[["ligand name", "koff/s-1"]]
        self.pdb_parser = PDBParser()
                
    def read_pdb(self, ligand_fname:str):
        """Read the associated protein in a .pdb file of a given ligand.
        
        Args:
          ligand_fname (str): The ligand file name associated with the protein to
            be queried. Ends with an extension: e.g., `.mol2`.
        Returns:
          protein (BioPython Structure): The protein structure
        """
        ligand_name, extension = os.path.splitext(ligand_fname)
        parent_dir = os.path.join(self.structures_dir, ligand_name)
        pdb_path = os.path.join(parent_dir, f"{ligand_name}_protein.pdb")
        
        protein = self.pdb_parser.get_structure(ligand_name, pdb_path)        
        return protein
    
    def read_ligand(self, ligand_fname:str):
        """Read the ligand in a .mol2 file
        
        Args:
          ligand_fname (str): The ligand file name to be queried.
        Returns:
          ligand (rdkit.Chem.Molecule): The ligand molecule
        """
        ligand_name, extension = os.path.splitext(ligand_fname)
        parent_dir = os.path.join(self.structures_dir, ligand_name)
        ligand_path = os.path.join(parent_dir, ligand_fname)
        
        ligand = Chem.MolFromMol2File(ligand_path)
        return ligand
    
    def distance_matrix(self, ligand_fname:str):
        """Extract the pairwise distance matrix from a protein-ligand complex.
        
        Args:
          ligand_fname (str): The ligand file name. We will extract both the
          protein and the associated ligand.
        
        Returns:
          dists (np.ndarray): The pairwise distance matrix. Suppose M denotes our
            pairwise matrix. `M[i, j]` represents the euclidean distance from
            ligand element `i` to protein residue `j`.
        """
        protein = self.read_pdb(ligand_fname)
        residues = protein.get_residues()
        residue_names = (residue.get_resname() for residue in residues)
        residue_centers = (residue.center_of_mass() for residue in residues)
        
        ligand = self.read_ligand(ligand_fname)
        atoms = ligand.GetAtoms()
        atom_names = (atom.GetSymbol() for atom in atoms)
        atom_centers = ()
        

In [231]:
data_reader = DataReader(DATA_DIR)
data_reader.metadata

Unnamed: 0,ligand name,koff/s-1
0,3hec_ligand_native_1.mol2,0.38
1,3heg_ligand_native_2.mol2,0.018
2,1kv2_ligand_native_3.mol2,0.000008
3,1kv1_ligand_native_4.mol2,0.062
4,3gcq_ligand_native_5.mol2,1.695e-3/0.046
...,...,...
675,3oxc_ligand_native_783.mol2,0.00023
676,1hxw_ligand_native_784.mol2,0.00216
677,1ohr_ligand_native_785.mol2,0.00067
678,2bpx_ligand_native_786.mol2,0.00158


In [233]:
ligand = data_reader.read_ligand("3heg_ligand_native_2.mol2")
protein = data_reader.read_pdb("3heg_ligand_native_2.mol2")

In [247]:
atoms = ligand.GetAtoms()
for atom in atoms:
    print(atom.)

<rdkit.Chem.rdchem.Atom object at 0x7f6ec2ec19e0>
<rdkit.Chem.rdchem.Atom object at 0x7f6ec2e38ee0>
<rdkit.Chem.rdchem.Atom object at 0x7f6ec2eddf30>
<rdkit.Chem.rdchem.Atom object at 0x7f6ec2e38ee0>
<rdkit.Chem.rdchem.Atom object at 0x7f6ec2eddf30>
<rdkit.Chem.rdchem.Atom object at 0x7f6ec2e38ee0>
<rdkit.Chem.rdchem.Atom object at 0x7f6ec2eddf30>
<rdkit.Chem.rdchem.Atom object at 0x7f6ec2e38ee0>
<rdkit.Chem.rdchem.Atom object at 0x7f6ec2eddf30>
<rdkit.Chem.rdchem.Atom object at 0x7f6ec2e38ee0>
<rdkit.Chem.rdchem.Atom object at 0x7f6ec2eddf30>
<rdkit.Chem.rdchem.Atom object at 0x7f6ec2e38ee0>
<rdkit.Chem.rdchem.Atom object at 0x7f6ec2eddf30>
<rdkit.Chem.rdchem.Atom object at 0x7f6ec2e38ee0>
<rdkit.Chem.rdchem.Atom object at 0x7f6ec2eddf30>
<rdkit.Chem.rdchem.Atom object at 0x7f6ec2e38ee0>
<rdkit.Chem.rdchem.Atom object at 0x7f6ec2eddf30>
<rdkit.Chem.rdchem.Atom object at 0x7f6ec2e38ee0>
<rdkit.Chem.rdchem.Atom object at 0x7f6ec2eddf30>
<rdkit.Chem.rdchem.Atom object at 0x7f6ec2e38ee0>
