In [1]:
import pickle
import os
import sys

from conf_ensemble_library import ConfEnsembleLibrary
from pdbbind_metadata_processor import PDBBindMetadataProcessor
from tqdm import tqdm
from ccdc.pharmacophore import Pharmacophore
from ccdc.utilities import Colour
from ccdc import io
from ccdc.protein import Protein
from ccdc.io import MoleculeReader, MoleculeWriter

In [2]:
with open('results/random_split_0_new_pdbbind/conf_results.p', 'rb') as f :
    results = pickle.load(f)

In [3]:
cel = ConfEnsembleLibrary()
cel.load_metadata()

In [4]:
def get_pdb_ids(smiles, cel) :
    try :
        ce = cel.load_ensemble_from_smiles(smiles)
        mol = ce.mol
        pdb_ids = [conf.GetProp('PDB_ID') for conf in mol.GetConformers()]
        return list(set(pdb_ids))
    except :
        return list(set())

In [5]:
test_set_pdbs = []
for smiles in results :
    pdb_ids = get_pdb_ids(smiles, cel)
    results[smiles]['PDB_ID'] = str(pdb_ids)
    test_set_pdbs.extend(pdb_ids)

In [6]:
len(test_set_pdbs)

1324

In [7]:
pmp = PDBBindMetadataProcessor()

In [8]:
table = pmp.get_master_dataframe()

In [9]:
active_table = table[(table['-logKd/Ki'] > 6) & (table['resolution'].str.replace('NMR', '0').astype(float) <= 2)]

In [10]:
protein_counts = active_table['protein name'].value_counts()

In [11]:
top_proteins_counts = protein_counts[protein_counts > 20]

In [12]:
top_proteins_counts

CARBONIC ANHYDRASE 2                             298
HIV-1 PROTEASE                                   195
BETA-SECRETASE 1                                 149
HEAT SHOCK PROTEIN HSP90-ALPHA                    91
BROMODOMAIN-CONTAINING PROTEIN 4                  89
CELL DIVISION PROTEIN KINASE 2                    51
TANKYRASE-2                                       49
BETA-LACTAMASE                                    47
SERINE/THREONINE-PROTEIN KINASE CHK1              45
MITOGEN-ACTIVATED PROTEIN KINASE 14               43
GLUTAMATE CARBOXYPEPTIDASE 2                      38
TRANSTHYRETIN                                     36
HEPATOCYTE GROWTH FACTOR RECEPTOR                 34
TYROSINE-PROTEIN KINASE BTK                       34
E3 UBIQUITIN-PROTEIN LIGASE MDM2                  34
CYCLIN-DEPENDENT KINASE 2                         33
GLUTAMATE RECEPTOR 2                              32
TRYPSIN                                           32
MITOGEN-ACTIVATED PROTEIN KINASE 1            

In [13]:
top_proteins = top_proteins_counts.index

In [15]:
for protein_name in top_proteins :
    print(protein_name)
    actives = active_table[(active_table['protein name'].str.contains(protein_name))]
    active_pdbs = actives['PDB code'].values
    #print(active_pdbs)
    train_set_actives = [pdb for pdb in active_pdbs if not pdb in test_set_pdbs]
    joined_name = protein_name.replace(" ", "_").replace("/", "-")
    ligands_path = os.path.join('pharmacophores',
                                f'{joined_name}_ligands_aligned.mol2')
    proteins_path = os.path.join('pharmacophores',
                                f'{joined_name}_proteins_aligned.mol2')
    with MoleculeWriter(ligands_path) as writer, MoleculeWriter(proteins_path) as protein_writer :
        for i, pdb_id in enumerate(tqdm(train_set_actives[:30])) :
            try :
                protein_path, ligand_pathes = pmp.get_pdb_id_pathes(pdb_id)
                ligand_path = ligand_pathes[0]
                ligand = MoleculeReader(ligand_path)[0]
                if i == 0 :
                    reference_protein = Protein.from_file(protein_path)
                    #print("Protein has {} chains and {} residues.".format(len(reference_protein.chains),len(reference_protein.residues)))
                    current_protein = reference_protein
                else :
                    current_protein = Protein.from_file(protein_path)
                    #print("Protein has {} chains and {} residues.".format(len(current_protein.chains),len(current_protein.residues)))
                    chain_superposition = Protein.ChainSuperposition()
                    (rmsd, transformation) = chain_superposition.superpose(reference_protein.chains[0], current_protein.chains[0])
                    #print(rmsd)
                    ligand.transform(transformation)
                protein_writer.write(current_protein)
                writer.write(ligand)
            except KeyboardInterrupt :
                sys.exit(0)
            except :
                print(f'error with {pdb_id}')

  7%|██▉                                         | 2/30 [00:00<00:02, 10.70it/s]

CARBONIC ANHYDRASE 2


100%|███████████████████████████████████████████| 30/30 [00:03<00:00,  9.18it/s]
  3%|█▍                                          | 1/30 [00:00<00:03,  9.22it/s]

HIV-1 PROTEASE


100%|███████████████████████████████████████████| 30/30 [00:02<00:00, 10.49it/s]
  0%|                                                    | 0/30 [00:00<?, ?it/s]

BETA-SECRETASE 1


100%|███████████████████████████████████████████| 30/30 [00:06<00:00,  4.99it/s]
  7%|██▉                                         | 2/30 [00:00<00:02, 12.38it/s]

HEAT SHOCK PROTEIN HSP90-ALPHA


100%|███████████████████████████████████████████| 30/30 [00:03<00:00,  9.35it/s]
 10%|████▍                                       | 3/30 [00:00<00:01, 23.12it/s]

BROMODOMAIN-CONTAINING PROTEIN 4


100%|███████████████████████████████████████████| 30/30 [00:01<00:00, 19.52it/s]
  7%|██▉                                         | 2/30 [00:00<00:04,  6.71it/s]

CELL DIVISION PROTEIN KINASE 2


100%|███████████████████████████████████████████| 30/30 [00:05<00:00,  5.92it/s]
  7%|██▉                                         | 2/30 [00:00<00:02, 10.91it/s]

TANKYRASE-2


100%|███████████████████████████████████████████| 30/30 [00:02<00:00, 11.64it/s]
  7%|██▉                                         | 2/30 [00:00<00:02, 10.84it/s]

BETA-LACTAMASE


100%|███████████████████████████████████████████| 30/30 [00:03<00:00,  8.54it/s]
  7%|██▉                                         | 2/30 [00:00<00:04,  6.98it/s]

SERINE/THREONINE-PROTEIN KINASE CHK1


100%|███████████████████████████████████████████| 30/30 [00:04<00:00,  7.17it/s]
  3%|█▍                                          | 1/30 [00:00<00:05,  5.11it/s]

MITOGEN-ACTIVATED PROTEIN KINASE 14


100%|███████████████████████████████████████████| 30/30 [00:05<00:00,  5.42it/s]
  3%|█▍                                          | 1/30 [00:00<00:08,  3.56it/s]

GLUTAMATE CARBOXYPEPTIDASE 2


100%|███████████████████████████████████████████| 30/30 [00:16<00:00,  1.84it/s]
  3%|█▍                                          | 1/30 [00:00<00:04,  5.94it/s]

TRANSTHYRETIN


100%|███████████████████████████████████████████| 30/30 [00:05<00:00,  5.02it/s]
  3%|█▍                                          | 1/30 [00:00<00:05,  5.78it/s]

HEPATOCYTE GROWTH FACTOR RECEPTOR


100%|███████████████████████████████████████████| 30/30 [00:04<00:00,  6.75it/s]
  7%|██▉                                         | 2/30 [00:00<00:03,  7.86it/s]

TYROSINE-PROTEIN KINASE BTK


100%|███████████████████████████████████████████| 30/30 [00:03<00:00,  9.50it/s]
 11%|████▉                                       | 3/27 [00:00<00:00, 24.44it/s]

E3 UBIQUITIN-PROTEIN LIGASE MDM2


100%|███████████████████████████████████████████| 27/27 [00:01<00:00, 22.74it/s]
  4%|█▌                                          | 1/28 [00:00<00:03,  6.90it/s]

CYCLIN-DEPENDENT KINASE 2


100%|███████████████████████████████████████████| 28/28 [00:04<00:00,  6.22it/s]
  7%|██▉                                         | 2/30 [00:00<00:04,  6.35it/s]

GLUTAMATE RECEPTOR 2


100%|███████████████████████████████████████████| 30/30 [00:08<00:00,  3.47it/s]
  3%|█▍                                          | 1/30 [00:00<00:05,  5.07it/s]

TRYPSIN


100%|███████████████████████████████████████████| 30/30 [00:03<00:00,  8.03it/s]
  3%|█▍                                          | 1/30 [00:00<00:03,  7.97it/s]

MITOGEN-ACTIVATED PROTEIN KINASE 1


100%|███████████████████████████████████████████| 30/30 [00:05<00:00,  5.84it/s]
  4%|█▊                                          | 1/25 [00:00<00:02,  8.31it/s]

TRYPSIN BETA


100%|███████████████████████████████████████████| 25/25 [00:02<00:00,  8.90it/s]
  5%|██                                          | 1/22 [00:00<00:03,  6.36it/s]

NEURAMINIDASE


100%|███████████████████████████████████████████| 22/22 [00:18<00:00,  1.19it/s]
  0%|                                                    | 0/25 [00:00<?, ?it/s]

CASEIN KINASE II, ALPHA SUBUNIT


100%|███████████████████████████████████████████| 25/25 [00:05<00:00,  4.62it/s]
  8%|███▌                                        | 2/25 [00:00<00:03,  6.44it/s]

COAGULATION FACTOR XA


100%|███████████████████████████████████████████| 25/25 [00:03<00:00,  6.32it/s]
  5%|██                                          | 1/22 [00:00<00:03,  6.32it/s]

TYROSINE-PROTEIN KINASE SYK


100%|███████████████████████████████████████████| 22/22 [00:02<00:00,  8.71it/s]
  9%|███▊                                        | 2/23 [00:00<00:02,  8.36it/s]

CGMP-DEPENDENT 3',5'-CYCLIC PHOSPHODIESTERASE


100%|███████████████████████████████████████████| 23/23 [00:04<00:00,  5.52it/s]
 24%|██████████▌                                 | 6/25 [00:00<00:00, 23.66it/s]

CREB-BINDING PROTEIN


100%|███████████████████████████████████████████| 25/25 [00:01<00:00, 21.96it/s]
  return func(self, *args, **kwargs)
0it [00:00, ?it/s]
 10%|████▍                                       | 2/20 [00:00<00:01, 11.12it/s]

MACROPHAGE METALLOELASTASE (MMP-12)
EPHRIN TYPE-A RECEPTOR 2


100%|███████████████████████████████████████████| 20/20 [00:01<00:00, 10.52it/s]
  0%|                                                    | 0/28 [00:00<?, ?it/s]

RNA-DIRECTED RNA POLYMERASE


100%|███████████████████████████████████████████| 28/28 [00:33<00:00,  1.20s/it]
  5%|██                                          | 1/22 [00:00<00:07,  2.66it/s]

QUEUINE TRNA-RIBOSYLTRANSFERASE


100%|███████████████████████████████████████████| 22/22 [00:06<00:00,  3.27it/s]
  0%|                                                    | 0/20 [00:00<?, ?it/s]

THROMBIN LIGHT CHAIN


100%|███████████████████████████████████████████| 20/20 [00:13<00:00,  1.50it/s]
  5%|██▎                                         | 1/19 [00:00<00:02,  8.72it/s]

TYROSINE-PROTEIN KINASE JAK2


100%|███████████████████████████████████████████| 19/19 [00:02<00:00,  7.25it/s]


In [14]:
test_set_actives = [pdb for pdb in active_pdbs if pdb in test_set_pdbs]

In [15]:
train_set_actives = [pdb for pdb in active_pdbs if not pdb in test_set_pdbs]

In [16]:
train_set_actives

['4hzt',
 '2zdz',
 '4r8y',
 '4b72',
 '4i0d',
 '2viz',
 '2qu3',
 '5hu0',
 '4djw',
 '4i0z',
 '4zpe',
 '4j0y',
 '4r91',
 '5f01',
 '4i12',
 '4dpf',
 '5he5',
 '5kqf',
 '4frj',
 '4fs4',
 '4ybi',
 '5f00',
 '2wf0',
 '4rrs',
 '4djv',
 '4zpf',
 '3ine',
 '3ufl',
 '3lpj',
 '4pzw',
 '2b8v',
 '4jpc',
 '4h3j',
 '5ezx',
 '4b1c',
 '6bfx',
 '4acx',
 '4dpi',
 '4r5n',
 '2p8h',
 '5enm',
 '6fgy',
 '5t1u',
 '3in3',
 '4djx',
 '4ha5',
 '3veu',
 '3vg1',
 '4zpg',
 '4j17',
 '4xxs',
 '4j0p',
 '4d88',
 '4jpe',
 '4rrn',
 '5hdx',
 '3ivh',
 '5hdz',
 '4ewo',
 '4acu',
 '2vj7',
 '3inf',
 '4r92',
 '2vie',
 '4b1d',
 '6uwp',
 '2ph6',
 '2qzk',
 '3l5e',
 '4b05',
 '2wf2',
 '4jp9',
 '2wez',
 '3zmg',
 '4x7i',
 '3inh',
 '5he4',
 '4j1e',
 '6e3z',
 '6ej2',
 '2b8l',
 '2xfi',
 '3tpp',
 '5hdv',
 '3cib',
 '3ooz',
 '4r95',
 '2wf4',
 '2irz',
 '2oah',
 '6eqm',
 '6uwv',
 '4n00',
 '4exg',
 '2qp8',
 '2qzl',
 '3l58',
 '5i3x',
 '4h3g',
 '4di2',
 '4djy',
 '2iqg',
 '2qmd',
 '3cid',
 '5he7',
 '6bfd',
 '4rcf',
 '5hd0',
 '2qmf',
 '2vnm',
 '2xfk',
 

In [17]:
test_set_actives

['3l5f',
 '4i0f',
 '6bfw',
 '4azy',
 '2vj9',
 '4fm7',
 '4j0v',
 '4rro',
 '4r93',
 '5i3v',
 '2vj6',
 '2xfj',
 '4i1c',
 '6od6',
 '3k5g',
 '4ke1']

In [18]:
with MoleculeWriter(f'{protein_name.replace(" ", "_")}_ligands_aligned.mol2') as writer, MoleculeWriter(f'{protein_name.replace(" ", "_")}_protein_aligned.pdb') as protein_writer :
    for i, pdb_id in enumerate(train_set_actives) :
        protein_path, ligand_pathes = pmp.get_pdb_id_pathes(pdb_id)
        ligand_path = ligand_pathes[0]
        ligand = MoleculeReader(ligand_path)[0]
        if i == 0 :
            reference_protein = Protein.from_file(protein_path)
            print("Protein has {} chains and {} residues.".format(len(reference_protein.chains),len(reference_protein.residues)))
            writer.write(ligand)
            protein_writer.write(reference_protein)
        else :
            current_protein = Protein.from_file(protein_path)
            print("Protein has {} chains and {} residues.".format(len(current_protein.chains),len(current_protein.residues)))
            chain_superposition = Protein.ChainSuperposition()
            (rmsd, transformation) = chain_superposition.superpose(reference_protein.chains[0], current_protein.chains[0])
            print(rmsd)
            ligand.transform(transformation)
            writer.write(ligand)
            protein_writer.write(current_protein)

Protein has 1 chains and 393 residues.
Protein has 1 chains and 370 residues.
0.44145808088663124
Protein has 1 chains and 387 residues.
0.48189136130929516
Protein has 1 chains and 376 residues.
0.541223537201981
Protein has 1 chains and 387 residues.
0.23158644292357541
Protein has 1 chains and 371 residues.
0.505283325730455
Protein has 1 chains and 368 residues.
0.5256661425655665
Protein has 1 chains and 390 residues.
0.4736642371788168
Protein has 1 chains and 390 residues.
0.4776349850383905
Protein has 1 chains and 390 residues.
0.1497160679046012
Protein has 1 chains and 369 residues.
0.5437287614312016
Protein has 1 chains and 379 residues.
0.4865058584906452
Protein has 1 chains and 387 residues.
0.48326760936517693
Protein has 1 chains and 384 residues.
0.49748154141366296
Protein has 1 chains and 374 residues.
0.42857625833854607
Protein has 1 chains and 375 residues.
0.5319889501246754
Protein has 1 chains and 387 residues.
0.49759031383948116
Protein has 1 chains and 358

In [19]:


csd = io.EntryReader('csd')
name = protein_name.replace(" ", "_")
mol2_file = f'{name}_ligands_aligned.mol2'
mol2_info = Pharmacophore.FeatureDatabase.DatabaseInfo(mol2_file, 0, Colour(0, 255, 0, 255))
csd_info = Pharmacophore.FeatureDatabase.DatabaseInfo(csd.file_name, 1000, Colour(255, 0, 0, 255))
pharmacophore_dir = 'pharmacophore_features/'
csdsqlx = os.path.join(os.getcwd(), os.path.basename(mol2_file).replace('.mol2', '.csdsqlx'))
mol2_sdb = Pharmacophore.FeatureDatabase.Creator.StructureDatabase(mol2_info, use_crystal_symmetry=False, structure_database_path=csdsqlx)
csd_sdb = Pharmacophore.FeatureDatabase.Creator.StructureDatabase(csd_info, use_crystal_symmetry=True)
creator = Pharmacophore.FeatureDatabase.Creator()
db = creator.create((mol2_sdb, csd_sdb))

In [20]:
db.write(f'{name}.feat')

In [8]:
csdsqlx

'/home/benoit/bioactive_conformation_predictor/HIV-1_PROTEASE_ligands_aligned.csdsqlx'

In [9]:
print(creator.StructureDatabase.default_csd_filters()) 

Settings(
	has_3d_coordinates = True
	no_disorder = Non-hydrogen
	no_powder = False
	only_organic = False
	only_organometallic = False
	max_r_factor = 10.0
	no_errors = False
	not_polymeric = True
	no_metals = False
	no_ions = False
	must_have_elements = []
	must_not_have_elements = [He (2), Be (4), Ne (10), Al (13), Si (14), Ar (18), Sc (21), Ti (22), V (23), Cr (24), Ga (31), Ge (32), As (33), Se (34), Kr (36), Rb (37), Y (39), Zr (40), Nb (41), Mo (42), Tc (43), Ru (44), Rh (45), Pd (46), Ag (47), Cd (48), In (49), Sn (50), Sb (51), Te (52), Xe (54), Cs (55), Ba (56), La (57), Ce (58), Pr (59), Nd (60), Pm (61), Sm (62), Eu (63), Gd (64), Tb (65), Dy (66), Ho (67), Er (68), Tm (69), Yb (70), Lu (71), Hf (72), Ta (73), W (74), Re (75), Os (76), Ir (77), Pt (78), Au (79), Hg (80), Tl (81), Pb (82), Bi (83), Po (84), Rn (86), Fr (87), Ra (88), Ac (89), Th (90), Pa (91), U (92)]
	max_hit_structures = 0
)
