# overview

We start from the raw PDBbind dataset downloaded from http://www.pdbbind.org.cn/download.php

1. Filter out the entries whose ligand file cannot be processed by RDKit.

2. Process each protein by keeping only the chains that have at least one atom within 10Å of any atom of the ligand.

3. Use p2rank to segment protein into blocks.

4. extract protein and ligand features.

5. construct the training and test dataset.


In [1]:
tankbind_src_folder_path = "/home/zoujl/TankBind/tankbind/"
import sys
sys.path.insert(0, tankbind_src_folder_path)

In [2]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm

# process the raw PDBbind dataset.

In [3]:

from utils import read_pdbbind_data

In [4]:
# The raw PDBbind dataset can be downloaded from http://www.pdbbind.org.cn/download.php
# Root folder of the downloaded dataset. NOTE: it already ends with "/", so the
# f-strings below produce paths containing "//" (visible in the printed paths later on).
pre = "../pdbbind2020/"
# Parse the name index: fields are split on two consecutive spaces and lines starting
# with '#' are skipped. The placeholder column names 'd'..'o' presumably absorb the
# extra fields created when a free-text column itself contains double spaces -- TODO confirm.
df_pdb_id = pd.read_csv(f'{pre}/index/INDEX_refined_name.2020', sep="  ", comment='#', header=None, names=['pdb', 'year', 'uid', 'd', 'e','f','g','h','i','j','k','l','m','n','o'], engine='python')
# Only the PDB code and the 'uid' column are needed from the name index.
df_pdb_id = df_pdb_id[['pdb','uid']]
# Load the data index with the project helper, then attach 'uid' to every entry.
data = read_pdbbind_data(f'{pre}/index/INDEX_refined_data.2020')
# merge() defaults to an inner join, so entries missing from df_pdb_id are dropped.
data = data.merge(df_pdb_id, on=['pdb'])


In [5]:
data

Unnamed: 0,pdb,resolution,year,affinity,raw,ligand,uid
0,2r58,2.00,2007,2.00,Kd=10mM,MLY,Q9VHA0
1,3c2f,2.35,2008,2.00,Kd=10.1mM,PRP,P43619
2,3g2y,1.31,2009,2.00,Ki=10mM,GF4,Q9L5C8
3,3pce,2.06,1998,2.00,Ki=10mM,3HP,P00436
4,4qsu,1.90,2014,2.00,Kd=10mM,TDR,Q6PL18
...,...,...,...,...,...,...,...
5311,4f3c,1.93,2013,11.82,Ki=1.5pM,BIG,E8NLP5
5312,5bry,1.34,2015,11.82,Ki=0.0015nM,4UY,P03366
5313,1sl3,1.81,2004,11.85,Ki=1.4pM,170,P00734
5314,1ctu,2.30,1995,11.92,Ki=1.2pM,ZEB,P0ABF6


# ligand file should be readable by RDKit.

In [6]:
from feature_utils import read_mol

No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda'


In [7]:
from rdkit import RDLogger

# Silence RDKit log output; unreadable ligands are tracked explicitly below.
RDLogger.DisableLog('rdApp.*')

# pdb_list collects the entries whose ligand could be read,
# probem_list (sic -- name kept because later cells use it) collects the others.
pdb_list, probem_list = [], []
for pdb in tqdm(data.pdb):
    # read_mol returns (mol, problem); only the problem flag matters here.
    _, problem = read_mol(
        f"{pre}/refined_set/{pdb}/{pdb}_ligand.sdf",
        f"{pre}/refined_set/{pdb}/{pdb}_ligand.mol2",
    )
    (probem_list if problem else pdb_list).append(pdb)

  0%|          | 1/5316 [00:00<10:07,  8.75it/s]

100%|██████████| 5316/5316 [00:47<00:00, 112.57it/s]


In [8]:
print(len(pdb_list))
print(len(probem_list))
data

5236
80


Unnamed: 0,pdb,resolution,year,affinity,raw,ligand,uid
0,2r58,2.00,2007,2.00,Kd=10mM,MLY,Q9VHA0
1,3c2f,2.35,2008,2.00,Kd=10.1mM,PRP,P43619
2,3g2y,1.31,2009,2.00,Ki=10mM,GF4,Q9L5C8
3,3pce,2.06,1998,2.00,Ki=10mM,3HP,P00436
4,4qsu,1.90,2014,2.00,Kd=10mM,TDR,Q6PL18
...,...,...,...,...,...,...,...
5311,4f3c,1.93,2013,11.82,Ki=1.5pM,BIG,E8NLP5
5312,5bry,1.34,2015,11.82,Ki=0.0015nM,4UY,P03366
5313,1sl3,1.81,2004,11.85,Ki=1.4pM,170,P00734
5314,1ctu,2.30,1995,11.92,Ki=1.2pM,ZEB,P0ABF6


In [10]:
data = data.query("pdb in @pdb_list").reset_index(drop=True)

In [11]:
data.shape

(5236, 7)

### For ease of RMSD evaluation later, we renumber the ligand atom indices to be consistent with the atom order of the SMILES

In [12]:
from feature_utils import write_renumbered_sdf

In [12]:
# Folder that receives one renumbered ligand file ({pdb}.sdf) per entry.
toFolder = f"{pre}/renumber_atom_index_same_as_smiles"
# Create the folder (and parents) without spawning a shell: unlike
# os.system("mkdir -p ..."), this works for paths containing spaces or shell
# metacharacters and raises on failure instead of silently returning a status code.
os.makedirs(toFolder, exist_ok=True)
for pdb in tqdm(pdb_list):
    sdf_fileName = f"{pre}/refined_set/{pdb}/{pdb}_ligand.sdf"
    mol2_fileName = f"{pre}/refined_set/{pdb}/{pdb}_ligand.mol2"
    toFile = f"{toFolder}/{pdb}.sdf"
    # Project helper: writes the ligand to toFile with renumbered atom indices.
    write_renumbered_sdf(toFile, sdf_fileName, mol2_fileName)


100%|██████████| 5236/5236 [00:07<00:00, 694.88it/s]


# process PDBbind proteins, removing extra chains, cutoff 10A

In [14]:
toFolder = f"{pre}/protein_remove_extra_chains_10A/"
os.system(f"mkdir -p {toFolder}")

0

In [15]:
# Distance cutoff passed to the chain-selection helper (10 Å per the section heading).
cutoff = 10
# One (pdbFile, ligandFile, cutoff, toFile) work item per remaining entry.
input_ = [
    (
        f"{pre}/refined_set/{pdb}/{pdb}_protein.pdb",
        f"{pre}/renumber_atom_index_same_as_smiles/{pdb}.sdf",
        cutoff,
        f"{toFolder}/{pdb}_protein.pdb",
    )
    for pdb in data.pdb.values
]

In [16]:
input_

[('../pdbbind2020//refined_set/2r58/2r58_protein.pdb',
  '../pdbbind2020//renumber_atom_index_same_as_smiles/2r58.sdf',
  10,
  '../pdbbind2020//protein_remove_extra_chains_10A//2r58_protein.pdb'),
 ('../pdbbind2020//refined_set/3c2f/3c2f_protein.pdb',
  '../pdbbind2020//renumber_atom_index_same_as_smiles/3c2f.sdf',
  10,
  '../pdbbind2020//protein_remove_extra_chains_10A//3c2f_protein.pdb'),
 ('../pdbbind2020//refined_set/3g2y/3g2y_protein.pdb',
  '../pdbbind2020//renumber_atom_index_same_as_smiles/3g2y.sdf',
  10,
  '../pdbbind2020//protein_remove_extra_chains_10A//3g2y_protein.pdb'),
 ('../pdbbind2020//refined_set/3pce/3pce_protein.pdb',
  '../pdbbind2020//renumber_atom_index_same_as_smiles/3pce.sdf',
  10,
  '../pdbbind2020//protein_remove_extra_chains_10A//3pce_protein.pdb'),
 ('../pdbbind2020//refined_set/4qsu/4qsu_protein.pdb',
  '../pdbbind2020//renumber_atom_index_same_as_smiles/4qsu.sdf',
  10,
  '../pdbbind2020//protein_remove_extra_chains_10A//4qsu_protein.pdb'),
 ('../pdbb

In [17]:
from feature_utils import select_chain_within_cutoff_to_ligand_v2

In [18]:
import mlcrate as mlc
import os
pool = mlc.SuperPool(64)
pool.pool.restart()
_ = pool.map(select_chain_within_cutoff_to_ligand_v2,input_)
pool.exit()

[mlcrate] 64 CPUs: 100%|██████████| 5236/5236 [00:36<00:00, 142.64it/s]


In [18]:
# previously, I found that 2r1w has no chain near the ligand.
data = data.query("pdb != '2r1w'").reset_index(drop=True)

In [31]:
import nglview   # conda install nglview -c conda-forge if import failure

# Visual sanity check: overlay the raw and the processed files of one complex.
pdb_id = "3p8p"

# Raw PDBbind files.
proteinFile = f"../pdbbind2020/refined_set/{pdb_id}/{pdb_id}_protein.pdb"
molFile = f"../pdbbind2020/refined_set/{pdb_id}/{pdb_id}_ligand.sdf"

# Files produced by the preprocessing cells.
removed_proteinFile = f"../pdbbind2020/protein_remove_extra_chains_10A/{pdb_id}_protein.pdb"
renumbered_molFile = f"../pdbbind2020/renumber_atom_index_same_as_smiles/{pdb_id}.sdf"

# Raw protein: white cartoon.
view = nglview.show_file(nglview.FileStructure(proteinFile), default=False)
view.add_representation('cartoon', selection='protein', color='white')

# Raw ligand: red ball-and-stick.
view.add_component(nglview.FileStructure(molFile), default=False).add_ball_and_stick(color='red')

# Chain-filtered protein: yellow cartoon.
view.add_component(nglview.FileStructure(removed_proteinFile), default=False).add_representation(
    'cartoon', selection='protein', color='yellow'
)

# Renumbered ligand: green ball-and-stick.
view.add_component(nglview.FileStructure(renumbered_molFile), default=False).add_ball_and_stick(color='green')

view

NGLWidget()

# p2rank segmentation

In [23]:
# Folder holding the p2rank dataset file and, later, the p2rank predictions.
p2rank_prediction_folder = f"{pre}/p2rank_protein_remove_extra_chains_10A"
# Create the folder directly instead of shelling out to `mkdir -p`: no shell
# quoting issues, and a failure raises instead of being ignored.
os.makedirs(p2rank_prediction_folder, exist_ok=True)

# Dataset file listing one protein path per line.
# NOTE(review): the paths start with "../", so they are presumably resolved
# relative to the location of the .ds file -- confirm against the p2rank docs.
ds = f"{p2rank_prediction_folder}/protein_list.ds"
with open(ds, "w") as out:
    out.writelines(
        f"../protein_remove_extra_chains_10A/{pdb}_protein.pdb\n"
        for pdb in data.pdb.values
    )

In [21]:
# takes about 30 minutes.
p2rank = "bash /home/zoujl/TankBind/packages/p2rank_2.3/prank"
cmd = f"{p2rank} predict {ds} -o {p2rank_prediction_folder}/p2rank -threads 16"
os.system(cmd)

----------------------------------------------------------------------------------------------
 P2Rank 2.3
----------------------------------------------------------------------------------------------

predicting pockets for proteins from dataset [protein_list.ds]
processing [4qsu_protein.pdb] (5/5236)
processing [4cs9_protein.pdb] (9/5236)
processing [3g2y_protein.pdb] (3/5236)
processing [3pce_protein.pdb] (4/5236)
processing [3c2f_protein.pdb] (2/5236)
processing [2r58_protein.pdb] (1/5236)
processing [4qsv_protein.pdb] (6/5236)
processing [2w8w_protein.pdb] (10/5236)
processing [3gv9_protein.pdb] (11/5236)
processing [4u54_protein.pdb] (7/5236)
processing [5cs3_protein.pdb] (15/5236)
processing [3ao4_protein.pdb] (8/5236)
processing [6abx_protein.pdb] (13/5236)
processing [4q90_protein.pdb] (14/5236)
processing [4tim_protein.pdb] (16/5236)
processing [6r9u_protein.pdb] (12/5236)
processing [5fe6_protein.pdb] (17/5236)
processing [6ghj_protein.pdb] (18/5236)
processing [3gqz_protei

0

In [25]:
pdb_list = data.pdb.values

In [46]:
tankbind_data_path = f"{pre}/tankbind_data"
os.system(f"mkdir -p {tankbind_data_path}")

0

In [47]:
# Gather the per-protein p2rank prediction tables into a single DataFrame.
name_list = pdb_list
d_list = []

for name in tqdm(name_list):
    predictions = pd.read_csv(
        f"{pre}/p2rank_protein_remove_extra_chains_10A/p2rank/{name}_protein.pdb_predictions.csv"
    )
    # Column headers are stripped of surrounding whitespace *before* the 'name'
    # column is set, so that the PDB id ends up in a column called exactly 'name'.
    predictions = predictions.rename(columns=str.strip)
    d_list.append(predictions.assign(name=name))

# Concatenate with a fresh 0..N-1 index and persist the combined table.
d = pd.concat(d_list).reset_index(drop=True)
d.to_feather(f"{tankbind_data_path}/p2rank_result.feather")

  0%|          | 0/5236 [00:00<?, ?it/s]

100%|██████████| 5236/5236 [01:05<00:00, 79.88it/s] 


In [48]:
d = pd.read_feather(f"{tankbind_data_path}/p2rank_result.feather")
d

Unnamed: 0,name,rank,score,probability,sas_points,surf_atoms,center_x,center_y,center_z,residue_ids,surf_atom_ids
0,2r58,1,7.74,0.249,51,26,-22.5809,-17.7873,-19.9953,A_324 A_327 A_330 A_332 A_348 A_351 A_355 A_3...,2291 2293 2347 2348 2349 2394 2430 2658 2661 ...
1,2r58,2,3.28,0.057,44,24,-33.7310,12.2990,-16.9301,A_213 A_214 A_220 A_222 A_255 A_256 A_258 A_2...,596 598 604 613 619 708 736 738 1244 1255 125...
2,2r58,3,1.75,0.013,41,23,-24.9639,13.8173,-15.1200,A_254 A_255 A_257 A_258 A_259 A_262 A_265 A_2...,1233 1241 1245 1249 1273 1298 1302 1303 1309 ...
3,3c2f,1,40.52,0.915,301,114,12.3269,-9.3542,31.6212,A_13 A_14 A_148 A_151 A_154 A_155 A_158 A_16 ...,72 85 87 88 89 94 98 117 129 132 172 178 182 ...
4,3c2f,2,33.04,0.880,229,99,24.7129,-15.0894,44.9685,A_141 A_142 A_143 A_144 A_165 A_166 A_169 A_1...,2137 2140 2144 2148 2152 2158 2161 2167 2169 ...
...,...,...,...,...,...,...,...,...,...,...,...
40322,1ctu,17,0.91,0.002,27,10,48.3192,73.2073,-0.8277,A_273 A_277 A_281 A_286 A_288,4082 4087 4135 4138 4198 4201 4257 4284 4285 ...
40323,6e9a,1,31.54,0.869,129,60,16.5019,22.1874,17.4029,A_23 A_25 A_27 A_28 A_29 A_30 A_32 A_47 A_48 ...,88 227 245 259 262 265 266 271 273 274 275 27...
40324,6e9a,2,1.77,0.014,30,16,3.5404,16.0525,29.4310,B_60 B_61 B_72 B_73 B_74 B_88 B_92,1492 1493 1495 1497 1500 1501 1598 1605 1608 ...
40325,6e9a,3,1.47,0.008,26,13,21.9780,40.2209,11.3321,A_60 A_61 A_72 A_73 A_74 A_88 A_92,571 575 581 678 685 688 693 694 813 814 817 8...


In [49]:
# Split the combined p2rank table into one DataFrame per protein.
# A single groupby pass replaces the previous full-table boolean mask per name,
# which rescanned every row of `d` once for each protein.
pockets_by_name = {
    name: pockets.reset_index(drop=True)
    for name, pockets in d.groupby("name", sort=False)
}

pockets_dict = {}
for name in tqdm(name_list):
    if name in pockets_by_name:
        pockets_dict[name] = pockets_by_name[name]
    else:
        # Keep the previous contract: a protein without any predicted pocket
        # still gets an entry, namely an empty frame with the same columns.
        pockets_dict[name] = d.iloc[:0].reset_index(drop=True)

100%|██████████| 5236/5236 [00:14<00:00, 373.77it/s]


In [50]:
pockets_dict['3p8p']

Unnamed: 0,name,rank,score,probability,sas_points,surf_atoms,center_x,center_y,center_z,residue_ids,surf_atom_ids
0,3p8p,1,19.39,0.706,119,64,3.2127,0.5526,4.006,A_10 A_11 A_12 A_128 A_131 A_132 A_133 A_135 ...,17 18 23 25 29 33 37 41 43 45 48 49 55 1043 1...
1,3p8p,2,9.02,0.317,41,25,-2.992,-3.8299,17.8217,A_129 A_145 A_171 A_173 A_176 A_268 A_269 A_2...,322 323 324 977 1015 1055 1067 1335 1340 1798...
2,3p8p,3,0.99,0.002,21,13,11.2635,-9.4229,-2.0525,A_183 A_185 A_210 A_232 A_254 A_255,2596 2600 2603 2626 2631 2635 2978 2981 2982 ...


In [52]:
import pickle

with open(f"{tankbind_data_path}/pockets_dict.pkl", 'wb') as f:
    pickle.dump(pockets_dict, f)


# protein feature

In [53]:
from feature_utils import get_protein_feature

In [54]:
# Folder that receives one feature file ({pdb}.pt) per protein.
protein_embedding_folder = f"{tankbind_data_path}/gvp_protein_embedding"
# Create the folder directly instead of shelling out to `mkdir -p`: no shell
# quoting issues, and a failure raises instead of being ignored.
os.makedirs(protein_embedding_folder, exist_ok=True)

# One (pdb, proteinFile, toFile) work item per protein, consumed by batch_run.
input_ = [
    (
        pdb,
        f"{pre}/protein_remove_extra_chains_10A/{pdb}_protein.pdb",
        f"{protein_embedding_folder}/{pdb}.pt",
    )
    for pdb in pdb_list
]

In [55]:
input_

[('2r58',
  '../pdbbind2020//protein_remove_extra_chains_10A/2r58_protein.pdb',
  '../pdbbind2020//tankbind_data/gvp_protein_embedding/2r58.pt'),
 ('3c2f',
  '../pdbbind2020//protein_remove_extra_chains_10A/3c2f_protein.pdb',
  '../pdbbind2020//tankbind_data/gvp_protein_embedding/3c2f.pt'),
 ('3g2y',
  '../pdbbind2020//protein_remove_extra_chains_10A/3g2y_protein.pdb',
  '../pdbbind2020//tankbind_data/gvp_protein_embedding/3g2y.pt'),
 ('3pce',
  '../pdbbind2020//protein_remove_extra_chains_10A/3pce_protein.pdb',
  '../pdbbind2020//tankbind_data/gvp_protein_embedding/3pce.pt'),
 ('4qsu',
  '../pdbbind2020//protein_remove_extra_chains_10A/4qsu_protein.pdb',
  '../pdbbind2020//tankbind_data/gvp_protein_embedding/4qsu.pt'),
 ('4qsv',
  '../pdbbind2020//protein_remove_extra_chains_10A/4qsv_protein.pdb',
  '../pdbbind2020//tankbind_data/gvp_protein_embedding/4qsv.pt'),
 ('4u54',
  '../pdbbind2020//protein_remove_extra_chains_10A/4u54_protein.pdb',
  '../pdbbind2020//tankbind_data/gvp_protein

In [56]:
from Bio.PDB import PDBParser
from feature_utils import get_clean_res_list

pdb = "2r58"
proteinFile = f"../pdbbind2020//protein_remove_extra_chains_10A/{pdb}_protein.pdb"

parser = PDBParser(QUIET=True)
s = parser.get_structure(pdb, proteinFile)
print(f"> original residues: ", end=": ")
for i, res in enumerate(s.get_residues()):
    print(f"{i}: {res.resname}", end=" ")
print()

res_list = get_clean_res_list(s.get_residues(), verbose=False, ensure_ca_exist=True)
print(f"> cleaded residues length: {len(res_list)}")

protein_feature = get_protein_feature(res_list)
print(f"> protein_features (size: {len(protein_feature)}): ")
for feature in protein_feature:
    print(feature.shape)
protein_feature

> original residues: : 0: ALA 1: PHE 2: ASP 3: TRP 4: ASP 5: ALA 6: TYR 7: LEU 8: GLU 9: GLU 10: THR 11: GLY 12: SER 13: GLU 14: ALA 15: ALA 16: PRO 17: ALA 18: LYS 19: CYS 20: PHE 21: LYS 22: GLN 23: ALA 24: GLN 25: ASN 26: PRO 27: PRO 28: ASN 29: ASN 30: ASP 31: PHE 32: LYS 33: ILE 34: GLY 35: MET 36: LYS 37: LEU 38: GLU 39: ALA 40: LEU 41: ASP 42: PRO 43: ARG 44: ASN 45: VAL 46: THR 47: SER 48: THR 49: CYS 50: ILE 51: ALA 52: THR 53: VAL 54: VAL 55: GLY 56: VAL 57: LEU 58: GLY 59: SER 60: ARG 61: LEU 62: ARG 63: LEU 64: ARG 65: LEU 66: ASP 67: GLY 68: SER 69: ASP 70: SER 71: GLN 72: ASN 73: ASP 74: PHE 75: TRP 76: ARG 77: LEU 78: VAL 79: ASP 80: SER 81: THR 82: GLU 83: ILE 84: HIS 85: ALA 86: ILE 87: GLY 88: HIS 89: CYS 90: GLU 91: LYS 92: ASN 93: GLY 94: GLY 95: MET 96: LEU 97: GLN 98: PRO 99: PRO 100: LEU 101: GLY 102: PHE 103: CYS 104: MET 105: ASN 106: ALA 107: SER 108: SER 109: TRP 110: PRO 111: GLY 112: TYR 113: LEU 114: CYS 115: LYS 116: ILE 117: LEU 118: ASN 119: ASN 120: AL

(tensor([[-7.6500e+00, -1.3176e+01, -1.4296e+01],
         [-8.3320e+00, -1.4639e+01, -1.7768e+01],
         [-5.2450e+00, -1.5139e+01, -1.9945e+01],
         [-6.3580e+00, -1.3245e+01, -2.3043e+01],
         [-2.8680e+00, -1.3331e+01, -2.4631e+01],
         [-2.6540e+00, -1.7135e+01, -2.4507e+01],
         [-6.2790e+00, -1.7369e+01, -2.5657e+01],
         [-5.8080e+00, -1.5076e+01, -2.8679e+01],
         [-2.6760e+00, -1.6977e+01, -2.9711e+01],
         [-4.3860e+00, -2.0389e+01, -2.9578e+01],
         [-7.5050e+00, -1.9294e+01, -3.1444e+01],
         [-5.3490e+00, -1.7353e+01, -3.3945e+01],
         [-7.4220e+00, -1.4267e+01, -3.3148e+01],
         [-7.1560e+00, -1.0521e+01, -3.2521e+01],
         [-8.9540e+00, -8.5990e+00, -2.9779e+01],
         [-1.0928e+01, -5.5810e+00, -3.0978e+01],
         [-8.7970e+00, -2.4590e+00, -3.0298e+01],
         [-9.8890e+00, -1.2200e-01, -2.7473e+01],
         [-1.0768e+01,  2.5620e+00, -3.0078e+01],
         [-1.3819e+01,  4.7900e-01, -3.1172e+01],


In [57]:
from Bio.PDB import PDBParser
from feature_utils import get_clean_res_list
import torch
torch.set_num_threads(1)

def batch_run(x):
    """Extract the features of one protein and save them to disk.

    ``x`` is a ``(pdb, proteinFile, toFile)`` tuple: the PDB id, the path of the
    protein PDB file to parse, and the path the result is written to.
    The saved object is a one-entry dict ``{pdb: features}``.
    """
    pdb, proteinFile, toFile = x
    structure = PDBParser(QUIET=True).get_structure(pdb, proteinFile)
    residues = get_clean_res_list(
        structure.get_residues(), verbose=False, ensure_ca_exist=True
    )
    torch.save({pdb: get_protein_feature(residues)}, toFile)

In [58]:
import mlcrate as mlc
import os
pool = mlc.SuperPool(64)
pool.pool.restart()
_ = pool.map(batch_run,input_)
pool.exit()

[mlcrate] 64 CPUs: 100%|██████████| 5236/5236 [01:32<00:00, 56.72it/s] 


In [39]:
protein_dict = {}
for pdb in tqdm(pdb_list):
    protein_dict.update(torch.load(f"{protein_embedding_folder}/{pdb}.pt"))


100%|██████████| 5236/5236 [05:03<00:00, 17.24it/s]


# Compound Features

In [40]:
from feature_utils import extract_torchdrug_feature_from_mol

# compound_dict maps PDB id -> ligand features; skip_pdb_list records the ids
# whose feature extraction raised.
compound_dict = {}
skip_pdb_list = []
for pdb in tqdm(pdb_list):
    # Only the renumbered SDF is read here (no mol2 file is supplied).
    renumbered_sdf = f"{pre}/renumber_atom_index_same_as_smiles/{pdb}.sdf"
    mol, _ = read_mol(renumbered_sdf, None)
    try:
        # self-dock set has_LAS_mask to true
        features = extract_torchdrug_feature_from_mol(mol, has_LAS_mask=True)
    except Exception as e:
        # Best effort: report the failure and remember the entry so that it
        # can be dropped from `data` afterwards.
        print(e)
        skip_pdb_list.append(pdb)
        print(pdb)
    else:
        compound_dict[pdb] = features

100%|██████████| 5236/5236 [01:08<00:00, 76.84it/s] 


In [41]:
compound_features = compound_dict['2r58']
print(f"> compound_features (size: {len(compound_features)}): ")
for feature in compound_features:
    print(feature.shape)

compound_features

> compound_features (size: 5): 
(12, 3)
torch.Size([12, 56])
torch.Size([22, 3])
torch.Size([22, 19])
torch.Size([12, 12, 16])


(array([[-22.985, -16.836, -21.812],
        [-23.231, -17.077, -20.381],
        [-21.954, -17.324, -19.698],
        [-24.105, -18.251, -20.22 ],
        [-24.578, -18.371, -18.775],
        [-25.624, -19.467, -18.673],
        [-25.82 , -19.943, -17.242],
        [-27.107, -20.759, -17.144],
        [-27.431, -20.995, -15.702],
        [-26.969, -22.041, -17.928],
        [-26.241, -22.951, -17.527],
        [-27.567, -22.21 , -18.999]]),
 tensor([[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          1, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
          0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 1, 0, 0, 0, 0, 0, 0],
         [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0

In [42]:
torch.save(compound_dict, f"{tankbind_data_path}/compound_torchdrug_features.pt")

In [43]:
skip_pdb_list

[]

In [44]:
data = data.query("pdb not in @skip_pdb_list").reset_index(drop=True)

# construct dataset.

In [65]:
data

Unnamed: 0.1,Unnamed: 0,pdb,resolution,year,affinity,raw,ligand,uid,group,name
0,0,2r58,2.00,2007,2.00,Kd=10mM,MLY,Q9VHA0,train,2r58
1,1,3c2f,2.35,2008,2.00,Kd=10.1mM,PRP,P43619,train,3c2f
2,2,3g2y,1.31,2009,2.00,Ki=10mM,GF4,Q9L5C8,train,3g2y
3,3,3pce,2.06,1998,2.00,Ki=10mM,3HP,P00436,train,3pce
4,4,4qsu,1.90,2014,2.00,Kd=10mM,TDR,Q6PL18,train,4qsu
...,...,...,...,...,...,...,...,...,...,...
5231,5231,4f3c,1.93,2013,11.82,Ki=1.5pM,BIG,E8NLP5,train,4f3c
5232,5232,5bry,1.34,2015,11.82,Ki=0.0015nM,4UY,P03366,train,5bry
5233,5233,1sl3,1.81,2004,11.85,Ki=1.4pM,170,P00734,train,1sl3
5234,5234,1ctu,2.30,1995,11.92,Ki=1.2pM,ZEB,P0ABF6,train,1ctu


In [60]:
# load pre-processed data...

import pickle
import torch

data = pd.read_csv(f"{pre}/data.csv")
pdb_list = data.pdb.values

pockets_dict = {}
with open(f"{tankbind_data_path}/pockets_dict.pkl", 'rb') as f:
    pockets_dict = pickle.load(f)

protein_dict = {}
for pdb in tqdm(pdb_list):
    protein_dict.update(torch.load(f"{protein_embedding_folder}/{pdb}.pt"))

compound_dict = torch.load(f"{tankbind_data_path}/compound_torchdrug_features.pt")


100%|██████████| 5236/5236 [07:28<00:00, 11.67it/s]


In [62]:
# we use the time-split defined in EquiBind paper.
# https://github.com/HannesStark/EquiBind/tree/main/data
valid = np.loadtxt("/home/zoujl/TankBind/packages/EquiBind/timesplit_no_lig_overlap_val", dtype=str)
test = np.loadtxt("/home/zoujl/TankBind/packages/EquiBind/timesplit_test", dtype=str)
def assign_group(pdb, valid=valid, test=test):
    """Return the split a PDB id belongs to: 'valid', 'test' or 'train'.

    ``valid`` and ``test`` are the collections of ids of the two held-out
    splits; ``valid`` is checked first, and any id found in neither collection
    is assigned to 'train'.
    """
    for label, members in (('valid', valid), ('test', test)):
        if pdb in members:
            return label
    return 'train'

data['group'] = data.pdb.map(assign_group)

In [None]:
data.value_counts("group")

group
train    4858
valid     268
test      110
Name: count, dtype: int64

In [64]:
data['name'] = data['pdb']

In [77]:
print(f"pocket_coms from pocket_dict:\n{pockets_dict['2r58'].head(10)[['center_x', 'center_y', 'center_z']].values}")

print(f"protein_com from protein_dict:\n{protein_dict['2r58'][0].numpy().mean(axis=0).astype(float).reshape(1, 3)}")

print(f"compound_dict:\n{compound_dict['2r58'][0]}")

pocket_coms from pocket_dict:
[[-22.5809 -17.7873 -19.9953]
 [-33.731   12.299  -16.9301]
 [-24.9639  13.8173 -15.12  ]]
protein_com from protein_dict:
[[-24.28586197  -1.69748116 -17.80907059]]
compound_dict:
[[-22.985 -16.836 -21.812]
 [-23.231 -17.077 -20.381]
 [-21.954 -17.324 -19.698]
 [-24.105 -18.251 -20.22 ]
 [-24.578 -18.371 -18.775]
 [-25.624 -19.467 -18.673]
 [-25.82  -19.943 -17.242]
 [-27.107 -20.759 -17.144]
 [-27.431 -20.995 -15.702]
 [-26.969 -22.041 -17.928]
 [-26.241 -22.951 -17.527]
 [-27.567 -22.21  -18.999]]


In [78]:
# Build the sample table: for every complex one "native" row, one row centred
# on the protein centre, and one row per p2rank pocket (first 10 rows at most).
info = []
for _, line in tqdm(data.iterrows(), total=data.shape[0]):
    pdb = line['pdb']
    uid = line['uid']
    # No SMILES is read from `data`; the column is filled with an empty string.
    smiles = ""
    affinity = line['affinity']
    group = line['group']

    compound_name = line['name']
    protein_name = line['name']

    # rename() returns a new frame, so the table stored in pockets_dict is not touched.
    pocket = pockets_dict[pdb].head(10).rename(columns=str.strip)
    pocket_coms = pocket[['center_x', 'center_y', 'center_z']].values

    # native block.
    info.append([protein_name, compound_name, pdb, smiles, affinity, uid, None, True, False, group])
    # protein center as a block.
    protein_com = protein_dict[protein_name][0].numpy().mean(axis=0).astype(float).reshape(1, 3)
    info.append([protein_name, compound_name, pdb+"_c", smiles, affinity, uid, protein_com, False, False, group])
    # each pocket as a block.
    # Iterate over the centre array itself: the previous code used the frame's
    # index labels as positions into pocket_coms, which is only valid when the
    # frame carries a default 0..n-1 index.
    for idx, pocket_com in enumerate(pocket_coms):
        pdb_idx = f"{pdb}_{idx}"
        info.append([protein_name, compound_name, pdb_idx, smiles, affinity, uid, pocket_com.reshape(1, 3), False, False, group])
info = pd.DataFrame(info, columns=['protein_name', 'compound_name', 'pdb', 'smiles', 'affinity', 'uid', 'pocket_com', 
                                   'use_compound_com', 'use_whole_protein',
                                  'group'])



  0%|          | 0/5236 [00:00<?, ?it/s]

100%|██████████| 5236/5236 [00:06<00:00, 816.95it/s]


In [79]:
info[info['protein_name'] == '2r58']

Unnamed: 0,protein_name,compound_name,pdb,smiles,affinity,uid,pocket_com,use_compound_com,use_whole_protein,group
0,2r58,2r58,2r58,,2.0,Q9VHA0,,True,False,train
1,2r58,2r58,2r58_c,,2.0,Q9VHA0,"[[-24.28586196899414, -1.6974811553955078, -17...",False,False,train
2,2r58,2r58,2r58_0,,2.0,Q9VHA0,"[[-22.5809, -17.7873, -19.9953]]",False,False,train
3,2r58,2r58,2r58_1,,2.0,Q9VHA0,"[[-33.731, 12.299, -16.9301]]",False,False,train
4,2r58,2r58,2r58_2,,2.0,Q9VHA0,"[[-24.9639, 13.8173, -15.12]]",False,False,train


In [80]:
info.to_csv(f"../pdbbind2020/tankbind_data/dataset.csv")

In [81]:
info.keys()

Index(['protein_name', 'compound_name', 'pdb', 'smiles', 'affinity', 'uid',
       'pocket_com', 'use_compound_com', 'use_whole_protein', 'group'],
      dtype='object')

In [82]:
from data import TankBindDataSet

In [83]:
# Root folder of the full dataset.
toFilePre = f"{pre}/dataset"
# Create the folder directly instead of shelling out to `mkdir -p`: no shell
# quoting issues, and a failure raises instead of being ignored.
os.makedirs(toFilePre, exist_ok=True)

# Build the dataset from the sample table and the two feature dictionaries.
dataset = TankBindDataSet(toFilePre, data=info, protein_dict=protein_dict, compound_dict=compound_dict)

['../pdbbind2020/dataset/processed/data.pt', '../pdbbind2020/dataset/processed/protein.pt', '../pdbbind2020/dataset/processed/compound.pt']


In [84]:
dataset.data

Unnamed: 0,protein_name,compound_name,pdb,smiles,affinity,uid,pocket_com,use_compound_com,use_whole_protein,group,p_length,c_length,y_length,num_contact,native_num_contact
0,2r58,2r58,2r58,,2.00,Q9VHA0,,True,False,train,57,12,684,52,52
1,2r58,2r58,2r58_c,,2.00,Q9VHA0,"[[-24.28586196899414, -1.6974811553955078, -17...",False,False,train,141,12,1692,41,52
2,2r58,2r58,2r58_0,,2.00,Q9VHA0,"[[-22.5809, -17.7873, -19.9953]]",False,False,train,83,12,996,52,52
3,2r58,2r58,2r58_1,,2.00,Q9VHA0,"[[-33.731, 12.299, -16.9301]]",False,False,train,82,12,984,0,52
4,2r58,2r58,2r58_2,,2.00,Q9VHA0,"[[-24.9639, 13.8173, -15.12]]",False,False,train,84,12,1008,0,52
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42785,6e9a,6e9a,6e9a_c,,11.92,Q5RZ08,"[[13.7789945602417, 26.364662170410156, 19.357...",False,False,train,151,40,6040,396,396
42786,6e9a,6e9a,6e9a_0,,11.92,Q5RZ08,"[[16.5019, 22.1874, 17.4029]]",False,False,train,130,40,5200,396,396
42787,6e9a,6e9a,6e9a_1,,11.92,Q5RZ08,"[[3.5404, 16.0525, 29.431]]",False,False,train,90,40,3600,206,396
42788,6e9a,6e9a,6e9a_2,,11.92,Q5RZ08,"[[21.978, 40.2209, 11.3321]]",False,False,train,83,40,3320,150,396


In [85]:
dataset = TankBindDataSet(toFilePre)


['../pdbbind2020/dataset/processed/data.pt', '../pdbbind2020/dataset/processed/protein.pt', '../pdbbind2020/dataset/processed/compound.pt']


In [86]:
# Collect per-sample size statistics by materialising every dataset item once.
t = []
data = dataset.data
# NOTE(review): pre_pdb is never read in this cell.
pre_pdb = None
for i, line in tqdm(data.iterrows(), total=data.shape[0]):
    sample = dataset[i]
    t.append([
        i,
        line['compound_name'],
        sample['node_xyz'].shape[0],   # p_length
        sample['coords'].shape[0],     # c_length
        sample['y'].shape[0],          # y_length
        (sample.y > 0).sum(),          # num_contact: number of entries of y above 0
    ])



100%|██████████| 42790/42790 [02:28<00:00, 288.14it/s]


In [97]:
t

Unnamed: 0,index,pdb,p_length,c_length,y_length,num_contact
0,0,2r58,57,12,684,52
1,1,2r58,141,12,1692,41
2,2,2r58,83,12,996,52
3,3,2r58,82,12,984,0
4,4,2r58,84,12,1008,0
...,...,...,...,...,...,...
42785,42785,6e9a,151,40,6040,396
42786,42786,6e9a,130,40,5200,396
42787,42787,6e9a,90,40,3600,206
42788,42788,6e9a,83,40,3320,150


In [95]:
data

Unnamed: 0,protein_name,compound_name,pdb,smiles,affinity,uid,pocket_com,use_compound_com,use_whole_protein,group,p_length,c_length,y_length,num_contact,native_num_contact
0,2r58,2r58,2r58,,2.00,Q9VHA0,,True,False,train,57,12,684,52,52
1,2r58,2r58,2r58_c,,2.00,Q9VHA0,"[[-24.28586196899414, -1.6974811553955078, -17...",False,False,train,141,12,1692,41,52
2,2r58,2r58,2r58_0,,2.00,Q9VHA0,"[[-22.5809, -17.7873, -19.9953]]",False,False,train,83,12,996,52,52
3,2r58,2r58,2r58_1,,2.00,Q9VHA0,"[[-33.731, 12.299, -16.9301]]",False,False,train,82,12,984,0,52
4,2r58,2r58,2r58_2,,2.00,Q9VHA0,"[[-24.9639, 13.8173, -15.12]]",False,False,train,84,12,1008,0,52
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42785,6e9a,6e9a,6e9a_c,,11.92,Q5RZ08,"[[13.7789945602417, 26.364662170410156, 19.357...",False,False,train,151,40,6040,396,396
42786,6e9a,6e9a,6e9a_0,,11.92,Q5RZ08,"[[16.5019, 22.1874, 17.4029]]",False,False,train,130,40,5200,396,396
42787,6e9a,6e9a,6e9a_1,,11.92,Q5RZ08,"[[3.5404, 16.0525, 29.431]]",False,False,train,90,40,3600,206,396
42788,6e9a,6e9a,6e9a_2,,11.92,Q5RZ08,"[[21.978, 40.2209, 11.3321]]",False,False,train,83,40,3320,150,396


In [96]:
data = data.drop(['p_length', 'c_length', 'y_length', 'num_contact', 'native_num_contact'], axis=1)

In [91]:
t = pd.DataFrame(t, columns=['index', 'pdb' ,'p_length', 'c_length', 'y_length', 'num_contact'])
t['num_contact'] = t['num_contact'].apply(lambda x: x.item())

In [98]:
data = pd.concat([data, t[['p_length', 'c_length', 'y_length', 'num_contact']]], axis=1)

In [99]:
native_num_contact = data.query("use_compound_com").set_index("protein_name")['num_contact'].to_dict()
data['native_num_contact'] = data.protein_name.map(native_num_contact)
# data['fract_of_native_contact'] = data['num_contact'] / data['native_num_contact']

In [100]:
torch.save(data, f"{toFilePre}/processed/data.pt")

In [101]:
info = torch.load(f"{toFilePre}/processed/data.pt")


In [105]:
test = info.query("group == 'test'").reset_index(drop=True)
test_pdb_list = info.query("group == 'test'").protein_name.unique()

In [106]:
test_pdb_list.size

110

In [108]:
subset_protein_dict = {}
for pdb in tqdm(test_pdb_list):
    subset_protein_dict[pdb] = protein_dict[pdb]

100%|██████████| 110/110 [00:00<00:00, 273487.52it/s]


In [109]:
subset_compound_dict = {}
for pdb in tqdm(test_pdb_list):
    subset_compound_dict[pdb] = compound_dict[pdb]

100%|██████████| 110/110 [00:00<00:00, 520737.52it/s]


In [110]:

# Root folder of the test-only dataset.
toFilePre = f"{pre}/test_dataset"
# Create the folder directly instead of shelling out to `mkdir -p`: no shell
# quoting issues, and a failure raises instead of being ignored.
os.makedirs(toFilePre, exist_ok=True)
# Build the dataset from the test rows and the matching feature subsets.
dataset = TankBindDataSet(toFilePre, data=test, protein_dict=subset_protein_dict, compound_dict=subset_compound_dict)

['../pdbbind2020/test_dataset/processed/data.pt', '../pdbbind2020/test_dataset/processed/protein.pt', '../pdbbind2020/test_dataset/processed/compound.pt']


In [None]:
def canonical_smiles(smiles):
    """Return the SMILES that RDKit writes after parsing ``smiles``.

    NOTE(review): ``Chem`` is not imported in the visible part of this file;
    presumably ``from rdkit import Chem`` is expected to be in scope -- confirm.
    """
    mol = Chem.MolFromSmiles(smiles)
    return Chem.MolToSmiles(mol)