In [1]:
import pandas as pd
from pathlib import Path
import rdkit.Chem as Chem
from tqdm import tqdm
import requests

from torch_geometric.data import HeteroData
import itertools

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Disable RDKit logging for every channel matching "rdApp.*".
# This runs before any molecule is parsed further down in the notebook.
from rdkit import RDLogger
RDLogger.DisableLog("rdApp.*")

In [3]:
# Root of the raw inputs: the relative path data/raw, resolved against the working directory.
RAW_DATA = Path("data", "raw")

#### Read Kinodata data

In [4]:
# Load the cleaned table; the "ident" column becomes the row index.
clean_csv_path = RAW_DATA / "data_clean.csv"
df = pd.read_csv(clean_csv_path, index_col="ident")

# Show which columns are available.
df.columns

Index(['activities.standard_type', 'activities.standard_units',
       'activities.standard_value', 'assays.chembl_id',
       'assays.confidence_score', 'component_sequences.sequence',
       'compound_structures.canonical_smiles',
       'compound_structures.standard_inchi', 'docking.posit_probability',
       'docking.runtime', 'docking.score', 'docking.similar_kinase_chain',
       'docking.similar_kinase_pdb', 'docking.similar_ligand_pdb',
       'docs.authors', 'docs.chembl_id', 'docs.klifs_kinase_id',
       'docs.klifs_structure_id', 'docs.year', 'molecule_dictionary.chembl_id',
       'molecule_dictionary.max_phase', 'structure_ID',
       'target_dictionary.chembl_id', 'target_dictionary.kinase',
       'target_dictionary.species', 'target_dictionary.uniprot_id'],
      dtype='object')

#### Assign ligand PDB filepaths

In [5]:
# Map each integer identifier to its ligand PDB file.
# Ligand files are named "<ident>_ligand.pdb". Globbing for that pattern skips any other
# entry in the directory (hidden files, checkpoint folders, ...) whose name would make
# the int() conversion raise ValueError.
ligand_pdb_files = {
    int(fp.stem.split("_")[0]): fp
    for fp in (RAW_DATA / "pdbs" / "ligand").glob("*_ligand.pdb")
}
# Preview one entry and the total count; the None default keeps an empty directory
# from raising StopIteration.
next(iter(ligand_pdb_files.items()), None), len(ligand_pdb_files)

((23120667, PosixPath('data/raw/pdbs/ligand/23120667_ligand.pdb')), 155718)

In [6]:
# Look up the ligand file of every row; identifiers without a file map to None.
df["ligand_pdb_file"] = [ligand_pdb_files.get(ident) for ident in df.index]

# sanity check: no row may be left without a ligand file
assert not df["ligand_pdb_file"].isna().any()
df["ligand_pdb_file"].head()

ident
16291323    data/raw/pdbs/ligand/16291323_ligand.pdb
16306943    data/raw/pdbs/ligand/16306943_ligand.pdb
16264754    data/raw/pdbs/ligand/16264754_ligand.pdb
16340050    data/raw/pdbs/ligand/16340050_ligand.pdb
16340956    data/raw/pdbs/ligand/16340956_ligand.pdb
Name: ligand_pdb_file, dtype: object

#### Get pocket PDB files from KLIFS

In [7]:
# NOTE: not every row carries a structure ID, so this check is expected to be False
pd.notna(df["structure_ID"]).all()

False

In [8]:
# Drop the rows that have no structure ID.
# The explicit .copy() makes the result an independent frame, so writing columns to it
# later does not trigger pandas' SettingWithCopyWarning.
df = df[df["structure_ID"].notna()].copy()

In [9]:
# pandas parsed structure_ID as float because the column contained missing values
# (NaN is a float). With those rows removed, the column can be cast to int.
# .assign builds a new frame instead of writing into the filtered one, which avoids
# pandas' SettingWithCopyWarning.
df = df.assign(structure_ID=df["structure_ID"].astype(int))

In [22]:
# get pocket pdbs
# Download the KLIFS pocket PDB of every row that has no file on disk yet. Results are
# cached as <ident>_pocket.pdb, so the cell can be interrupted and re-run safely.
pocket_dir = RAW_DATA / "pdbs" / "pocket"
pocket_dir.mkdir(parents=True, exist_ok=True)  # write_bytes does not create missing directories

num_failed = 0
pbar = tqdm(df["structure_ID"].items(), total=len(df))
with requests.Session() as session:  # one session reuses the connection across requests
    for ident, structure_id in pbar:
        fp = pocket_dir / f"{ident}_pocket.pdb"
        if fp.exists():
            continue
        try:
            resp = session.get(
                "https://klifs.net/api/structure_get_pocket",
                params={"structure_ID": int(structure_id)},
                timeout=30,  # seconds; without a timeout a stalled connection blocks forever
            )
            succeeded = resp.ok
        except requests.RequestException:
            # A network error counts as a failed request instead of aborting the whole run.
            succeeded = False
        num_failed += int(not succeeded)
        pbar.set_description(f"Num. failed klifs requests: {num_failed}")
        if succeeded:
            # Write to a temporary name and rename afterwards, so an interrupted write
            # never leaves a truncated file that the exists() check would later skip.
            partial_fp = fp.with_name(fp.name + ".part")
            partial_fp.write_bytes(resp.content)
            partial_fp.replace(fp)

Num. failed klifs requests: 0:   0%|          | 148/110274 [00:38<7:51:20,  3.89it/s]


KeyboardInterrupt: 

In [38]:
# Build one HeteroData object per row (work in progress: only the ligand is read so far).
data_list = []
for ident, row in tqdm(df.iterrows(), total=len(df)):
    data = HeteroData()
    # The ligand path column is named "ligand_pdb_file"; the frame has no "pdb_file"
    # column, so attribute access via row.pdb_file fails on a fresh kernel.
    # NOTE(review): confirm how a failed parse should be handled — RDKit's file readers
    # are understood to return None rather than raise.
    ligand = Chem.MolFromPDBFile(str(row["ligand_pdb_file"]))
    pocket = ...
    # TODO: load the pocket, fill `data`, and (presumably) append it to data_list
    ...

  3%|▎         | 2857/110274 [00:03<02:13, 803.98it/s]


KeyboardInterrupt: 