<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Dataset-Statistics" data-toc-modified-id="Dataset-Statistics-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Dataset Statistics</a></span><ul class="toc-item"><li><span><a href="#BindingDB-Dataset-(Kd-value)" data-toc-modified-id="BindingDB-Dataset-(Kd-value)-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>BindingDB Dataset (Kd value)</a></span></li><li><span><a href="#Competition-Dataset" data-toc-modified-id="Competition-Dataset-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Competition Dataset</a></span></li><li><span><a href="#Overlapping-Data" data-toc-modified-id="Overlapping-Data-1.3"><span class="toc-item-num">1.3&nbsp;&nbsp;</span>Overlapping Data</a></span></li></ul></li><li><span><a href="#Chemical-Sampling" data-toc-modified-id="Chemical-Sampling-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Chemical Sampling</a></span><ul class="toc-item"><li><span><a href="#Structural-Similarity-with-Tanimoto-Distance" data-toc-modified-id="Structural-Similarity-with-Tanimoto-Distance-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Structural Similarity with Tanimoto Distance</a></span></li></ul></li></ul></div>

In [1]:
from Bio import SeqIO
import pandas as pd

# Dataset Statistics

## BindingDB Dataset (Kd value)

In [10]:
# 6 unique drugs in BindingDB with KD
chem_bdb = {'Nc1nc(N)c2cc(NCc3ccc(cc3)[N+]([O-])=O)ccc2n1',
            'Nc1nc(N)c2cc(ccc2n1)[N+]([O-])=O',
            'Nc1nc(N)c2cc(NCc3ccc(O)cc3)ccc2n1',
            'COc1ccc(CNc2ccc3nc(N)nc(N)c3c2)cc1',
            'Nc1nc(N)c2cc(NCCn3c(nc4cc(Cl)c(Cl)cc34)C(F)(F)F)ccc2n1',
            'Nc1ccc2nc(N)nc(N)c2c1'}

# 2 unique protein in BindingDB with KD
prot_bdb = {'MTAPTVPVALVTGAAKRLGRSIAEGLHAEGYAVCLHYHRSAAEANALSATLNARRPNSAITVQADLSNVATAPVSGADGSAPVTLFTRCAELVAACYTHWGRCDVLVNNASSFYPTPLLRNDEDGHEPCVGDREAMETATADLFGSNAIAPYFLIKAFAHRFAGTPAKHRGTNYSIINMVDAMTNQPLLGYTIYTMAKGALEGLTRSAALELAPLQIRVNGVGPGLSVLVDDMPPAVWEGHRSKVPLYQRDSSAAEVSDVVIFLCSSKAKYITGTCVKVDGGYSLTRA',
            'MSRAAARFKIPMPETKADFAFPSLRAFSIVVALDMQHGIGDGESIPWRVPEDMTFFKNQTTLLRNKKPPTEKKRNAVVMGRKTWESVPVKFRPLKGRLNIVLSSKATVEELLAPLPEGQRAAAAQDVVVVNGGLAEALRLLARPLYCSSIETAYCVGGAQVYADAMLSPCIEKLQEVYLTRIYATAPACTRFFPFPPENAATAWDLASSQGRRKSEAEGLEFEICKYVPRNHEERQYLELIDRIMKTGIVKEDRTGVGTISLFGAQMRFSLRDNRLPLLTTKRVFWRGVCEELLWFLRGETSAQLLADKDIHIWDGNGSREFLDSRGLTENKEMDLGPVYGFQWRHFGADYKGFEANYDGEGVDQIKLIVETIKTNPNDRRLLVTAWNPCALQKMALPPCHLLAQFYVNTDTSELSCMLYQRSCDMGLGVPFNIASYALLTILIAKATGLRPGELVHTLGDAHVYRNHVDALKAQLERVPHAFPTLIFKEERQYLEDYELTDMEVIDYVPHPAIKMEMAV'}


## Competition Dataset

In [6]:
# proteins
path = "data/Targets/l.major.fasta"
records = list(SeqIO.parse(path, "fasta"))
prot_cha = {i.seq._data for i in records}               # 8,495 proteins

path = "data/Targets/preferredTargets.unique.fasta"
records_pref = list(SeqIO.parse(path, "fasta"))
prot_cha_pref = {i.seq._data for i in records_pref}     # 34,594 proteins

path = "data/Targets/all_targets.fasta"
records_all = list(SeqIO.parse(path, "fasta"))
prot_cha_all = {i.seq._data for i in records_all}       # 79,982 proteins

# chemicals
ddd = pd.read_csv("data/Molecules/drugBank_leishmania.smiles")
d = set(ddd['smiles'])

ddd = pd.read_csv("data/Molecules/drugCentral.csv", sep=',', header=0, usecols=[1, 5])
dp = set(ddd['SMILES'])

ddd = pd.read_csv("data/Molecules/endogenous.csv", sep=',', header=0)
dpp = set(ddd['smiles'])

ddd = pd.read_csv("data/Molecules/in-trials.csv", sep=',', header=0)
dppp = set(ddd['smiles'])

ddd = pd.read_csv("data/Molecules/world.csv", sep=',', header=0)
dpppp = set(ddd['smiles'])

chem_all = d | dp | dpp | dppp | dpppp                  # 94,053 chemiclas


## Overlapping Data

In [11]:
# proteins
overlap_prot = prot_cha & prot_bdb              # 2 overlapping protein with l.major
overlap_prot_pref = prot_cha_pref & prot_bdb    # 2 overlapping protein with l.prefferredTargets
overlap_prot_all = prot_cha_all & prot_bdb      # 2 overlapping protein with l.all

# chemicals
overlap_chem = chem_all & chem_bdb              # No overlapping chemiclas


# Chemical Sampling

## Structural Similarity with Tanimoto Distance

In [16]:
from rdkit import Chem
from rdkit.DataStructs import FingerprintSimilarity
from sklearn.cluster import AffinityPropagation
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt
from itertools import cycle


In [23]:
m1 = Chem.MolFromSmiles('Nc1ccc2nc(N)nc(N)c2c1')
m2 = Chem.MolFromSmiles('COc1ccc(CNc2ccc3nc(N)nc(N)c3c2)cc1')
FingerprintSimilarity(Chem.RDKFingerprint(m1), Chem.RDKFingerprint(m2))

n_bdb = len(chem_bdb)
chems = list(chem_bdb) + list(chem_all)                                          # 94,059 chemicals


In [24]:
chem_hasMol = [Chem.MolFromSmiles(i) for i in chems if Chem.MolFromSmiles(i)]    # 94,059 chemicals


In [27]:
import time

n = 100
start = time.time()
similarity = np.zeros((n, n))
for i in range(1, n):
    for j in range(i, n):
        similarity[i, j] = FingerprintSimilarity(Chem.RDKFingerprint(chem_hasMol[i]), 
                                                 Chem.RDKFingerprint(chem_hasMol[j]))

es = (time.time() - start)/n/n
print("each similarity calculation: ", es, " s")
nn = len(chems)
print("estimate time for 94,053 chems: ", es*nn*nn/60/60, " h")


each similarity calculation:  0.0019063586950302126  s
estimate time for 94,053 chems:  4684.927054435236  h


In [29]:
es*nn*6/60

17.931019249584676

In [None]:
similarity = similarity + similarity.T + np.eye(n)


In [None]:
# Clusting
clustering = AffinityPropagation(random_state=5).fit(similarity)
cluster_centers_indices = af.cluster_centers_indices_
labels = af.labels_

n_clusters_ = len(cluster_centers_indices)
clustering.cluster_centers_

print('Estimated number of clusters: %d' % n_clusters_)
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X, labels, metric='sqeuclidean'))
