In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes and edits to project code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append("..")  # noqa: E402

import random

from rdkit import Chem
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import plotly.express as px
import pandas as pd
import nanopq

from src.dataset.components.lmdb import LMDBDataset
from src.dataset.components.smiles_writer import parse_mol

  from .autonotebook import tqdm as notebook_tqdm
2024-03-16 14:36:17,626	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


In [3]:
import matplotlib.pyplot as plt
import matplotlib.font_manager
# Show which font families matplotlib has registered, so the 'font.family'
# choice made below can be checked against what is actually installed.
font_names = sorted(
    font.name for font in matplotlib.font_manager.fontManager.ttflist
)
print(f"available fonts: {font_names}")

# Start from the muted seaborn style sheet, then layer the overrides on top.
plt.style.use('seaborn-v0_8-muted')

plt.rcParams.update(
    {
        # Rendering resolution and how figures are written to disk.
        "figure.dpi": 300,
        "savefig.dpi": 300,
        "savefig.format": "pdf",
        "savefig.bbox": "tight",
        "savefig.pad_inches": 0.1,
        # Title and base font settings.
        'figure.titlesize': 18,
        'axes.titlesize': 18,
        'font.family': 'Helvetica',
        'font.size': 18,
        # Line, label, tick, legend and axes-frame sizing.
        "lines.linewidth": 2,
        'axes.labelsize': 16,
        'axes.labelweight': 'bold',
        'xtick.labelsize': 16,
        'ytick.labelsize': 16,
        'legend.fontsize': 16,
        'axes.linewidth': 2,
        'axes.titlepad': 6,
        # Math text fonts, no default line markers, frameless legends.
        'mathtext.fontset': 'dejavuserif',
        'mathtext.it': 'serif:italic',
        'lines.marker': "",
        'legend.frameon': False,
    }
)

available fonts: ['Arial', 'DejaVu Sans', 'DejaVu Sans', 'DejaVu Sans', 'DejaVu Sans', 'DejaVu Sans', 'DejaVu Sans', 'DejaVu Sans Display', 'DejaVu Sans Mono', 'DejaVu Sans Mono', 'DejaVu Sans Mono', 'DejaVu Sans Mono', 'DejaVu Sans Mono', 'DejaVu Sans Mono', 'DejaVu Serif', 'DejaVu Serif', 'DejaVu Serif', 'DejaVu Serif', 'DejaVu Serif', 'DejaVu Serif', 'DejaVu Serif Display', 'Helvetica', 'Helvetica', 'Helvetica', 'Helvetica', 'Helvetica Compressed', 'Helvetica Light', 'Helvetica Rounded', 'Palatino Linotype', 'Palatino Linotype', 'STIXGeneral', 'STIXGeneral', 'STIXGeneral', 'STIXGeneral', 'STIXNonUnicode', 'STIXNonUnicode', 'STIXNonUnicode', 'STIXNonUnicode', 'STIXSizeFiveSym', 'STIXSizeFourSym', 'STIXSizeFourSym', 'STIXSizeOneSym', 'STIXSizeOneSym', 'STIXSizeThreeSym', 'STIXSizeThreeSym', 'STIXSizeTwoSym', 'STIXSizeTwoSym', 'Times New Roman', 'cmb10', 'cmex10', 'cmmi10', 'cmr10', 'cmss10', 'cmsy10', 'cmtt10']


In [4]:
# Open the LMDB-backed SMILES database and print its summary; the recorded
# output is a mapping from a name to a count (presumably entries per split —
# confirm against LMDBDataset).
dataset = LMDBDataset("/data/screening/smilesdb/smilesdb.lmdb")
print(dataset.summary)

{'dude': 1197947, 'litpcba': 383085, 'muv': 93085, 'pcba': 437195, 'pdbbind2020': 15556, 'full': 1741307}


In [5]:
def get_random_fp(dataset, key="fp", n_point=1000):
    """Draw a random sample of entries from ``dataset`` and collect one field.

    Indices are drawn without replacement using the global ``random`` module,
    so seed it beforehand for repeatable samples.

    Args:
        dataset: Object supporting ``len()`` and integer indexing; each item
            must support ``item[key]`` and the ``in`` operator.
        key: Name of the field to collect from every sampled item.
        n_point: Number of distinct entries to sample.

    Returns:
        A tuple ``(fps, indices, sources)``: ``fps`` is a NumPy array built
        from the ``key`` field of every sampled item, ``indices`` is the list
        of sampled positions, and ``sources`` holds each item's ``"src"``
        field, or ``""`` for items that have none.

    Raises:
        ValueError: If ``n_point`` is larger than ``len(dataset)`` (raised by
            ``random.sample``).
    """
    indices = random.sample(range(len(dataset)), n_point)
    # Read every sampled record exactly once; indexing the dataset repeatedly
    # for the same entry would repeat the lookup for each field.
    records = [dataset[i] for i in indices]
    fps = np.array([record[key] for record in records])
    sources = [record["src"] if "src" in record else "" for record in records]
    return fps, indices, sources

In [8]:
# Seed Python's `random` module so the index draws made inside get_random_fp
# are repeatable from run to run.
random.seed(0)
# Number of entries sampled from each split.
SET_SIZE = 1000
# Record field passed to get_random_fp as the feature to collect.
feature = "fp"

# Re-open the database, then select each split in turn and draw SET_SIZE
# random entries from it. All draws consume the same RNG stream, so the order
# of these calls determines which indices each split receives.
dataset = LMDBDataset("/data/screening/smilesdb/smilesdb.lmdb")
dataset.set_default_split("pdbbind2020")
pdbbind_set, pdbbind_indices, pdbbind_sources = get_random_fp(
    dataset, feature, SET_SIZE
)

dataset.set_default_split("dude")
dude_set, dude_indices, dude_sources = get_random_fp(dataset, feature, SET_SIZE)

dataset.set_default_split("litpcba")
litpcba_set, litpcba_indices, litpcba_sources = get_random_fp(
    dataset, feature, SET_SIZE
)

dataset.set_default_split("muv")
muv_set, muv_indices, muv_sources = get_random_fp(dataset, feature, SET_SIZE)

dataset.set_default_split("pcba")
pcba_set, pcba_indices, pcba_sources = get_random_fp(dataset, feature, SET_SIZE)

In [14]:
# Stack the five per-split samples row-wise (pdbbind, dude, litpcba, muv,
# pcba) and project the stacked matrix to two dimensions with t-SNE.
split_names = ["pdbbind", "dude", "litpcba", "muv", "pcba"]
X = np.vstack([pdbbind_set, dude_set, litpcba_set, muv_set, pcba_set])
tsne = TSNE(n_components=2, random_state=0)
X_2d = tsne.fit_transform(X)

# Attach the sampled index, the source string and the split label to every
# embedded point; rows follow the stacking order used for X.
df = pd.DataFrame(X_2d, columns=["TSNE-1", "TSNE-2"]).assign(
    **{
        "index": np.concatenate(
            [
                pdbbind_indices,
                dude_indices,
                litpcba_indices,
                muv_indices,
                pcba_indices,
            ]
        ),
        "src": np.concatenate(
            [
                pdbbind_sources,
                dude_sources,
                litpcba_sources,
                muv_sources,
                pcba_sources,
            ]
        ),
        "dataset": [name for name in split_names for _ in range(SET_SIZE)],
    }
)

# Interactive scatter coloured by split; index and source appear on hover.
fig = px.scatter(
    df,
    x="TSNE-1",
    y="TSNE-2",
    color="dataset",
    hover_data=["index", "src"],
)
fig.update_layout(
    title_text="t-SNE visualization of fingerprints from different datasets"
)
fig.show()

In [15]:
# Re-seed Python's `random` module so the index draws made inside
# get_random_fp are repeatable from run to run.
random.seed(0)
# Number of entries sampled from each split.
SET_SIZE = 1000
# Record field passed to get_random_fp as the feature to collect.
feature = "unimol"

# Re-open the database, then select each split in turn and draw SET_SIZE
# random entries from it. All draws consume the same RNG stream, so the order
# of these calls determines which indices each split receives. The result
# variables overwrite any earlier samples held under the same names.
dataset = LMDBDataset("/data/screening/smilesdb/smilesdb.lmdb")
dataset.set_default_split("pdbbind2020")
pdbbind_set, pdbbind_indices, pdbbind_sources = get_random_fp(
    dataset, feature, SET_SIZE
)

dataset.set_default_split("dude")
dude_set, dude_indices, dude_sources = get_random_fp(dataset, feature, SET_SIZE)

dataset.set_default_split("litpcba")
litpcba_set, litpcba_indices, litpcba_sources = get_random_fp(
    dataset, feature, SET_SIZE
)

dataset.set_default_split("muv")
muv_set, muv_indices, muv_sources = get_random_fp(dataset, feature, SET_SIZE)

dataset.set_default_split("pcba")
pcba_set, pcba_indices, pcba_sources = get_random_fp(dataset, feature, SET_SIZE)

In [16]:
# Stack the five per-split samples row-wise (pdbbind, dude, litpcba, muv,
# pcba) and project the stacked matrix to two dimensions with t-SNE.
split_names = ["pdbbind", "dude", "litpcba", "muv", "pcba"]
X = np.vstack([pdbbind_set, dude_set, litpcba_set, muv_set, pcba_set])
tsne = TSNE(n_components=2, random_state=0)
X_2d = tsne.fit_transform(X)

# Attach the sampled index, the source string and the split label to every
# embedded point; rows follow the stacking order used for X.
df = pd.DataFrame(X_2d, columns=["TSNE-1", "TSNE-2"]).assign(
    **{
        "index": np.concatenate(
            [
                pdbbind_indices,
                dude_indices,
                litpcba_indices,
                muv_indices,
                pcba_indices,
            ]
        ),
        "src": np.concatenate(
            [
                pdbbind_sources,
                dude_sources,
                litpcba_sources,
                muv_sources,
                pcba_sources,
            ]
        ),
        "dataset": [name for name in split_names for _ in range(SET_SIZE)],
    }
)

# Interactive scatter coloured by split; index and source appear on hover.
fig = px.scatter(
    df,
    x="TSNE-1",
    y="TSNE-2",
    color="dataset",
    hover_data=["index", "src"],
)
fig.update_layout(
    title_text="t-SNE visualization of unimol features from different datasets"
)
fig.show()

In [15]:
def pq_index(X, M, Ks, use_optim: bool = False):
    """Fit a product quantizer on ``X`` and return the encoded vectors.

    Trains ``nanopq.OPQ`` (when ``use_optim`` is true) or ``nanopq.PQ`` on a
    float32 copy of ``X``, encodes that same copy, and prints the mean squared
    error between ``X`` and its decoded reconstruction.

    Args:
        X: Array-like of vectors, one per row.
        M: Forwarded to the quantizer as ``M`` (the number of sub-spaces in
            nanopq's API).
        Ks: Forwarded to the quantizer as ``Ks`` (the number of codewords per
            sub-space in nanopq's API).
        use_optim: Use ``nanopq.OPQ`` instead of ``nanopq.PQ``.

    Returns:
        The codes returned by the quantizer's ``encode`` for every row of ``X``.
    """
    quantizer_cls = nanopq.OPQ if use_optim else nanopq.PQ
    pq = quantizer_cls(M=M, Ks=Ks, verbose=False)

    # Cast once and reuse the float32 copy for both training and encoding
    # instead of converting the whole matrix twice. np.asarray lets plain
    # nested lists through as well.
    X = np.asarray(X)
    X_f32 = X.astype(np.float32)
    pq.fit(X_f32)
    X_code = pq.encode(X_f32)

    # The error is measured against the input in its original dtype.
    X_recon = pq.decode(X_code)
    recon_error = np.mean((X - X_recon) ** 2)
    print(f"M: {M}, Ks: {Ks}, use_optim: {use_optim}:")
    print(f"Reconstruction error: {recon_error:.4f}")
    print()
    return X_code


In [11]:
# Quantize the stacked feature matrix X with the plain PQ branch of pq_index
# (M=64, Ks=64) and keep the resulting codes for the embedding below.
X_code = pq_index(X, M=64, Ks=64, use_optim=False)

M: 64, Ks: 64, metric : <class 'numpy.uint8'>, code_dtype: l2
iter: 20, seed: 123
Training the subspace: 0 / 64
Training the subspace: 1 / 64
Training the subspace: 2 / 64



One of the clusters is empty. Re-run kmeans with a different initialization.



Training the subspace: 3 / 64
Training the subspace: 4 / 64
Training the subspace: 5 / 64


Training the subspace: 6 / 64
Training the subspace: 7 / 64
Training the subspace: 8 / 64
Training the subspace: 9 / 64
Training the subspace: 10 / 64
Training the subspace: 11 / 64
Training the subspace: 12 / 64
Training the subspace: 13 / 64
Training the subspace: 14 / 64
Training the subspace: 15 / 64
Training the subspace: 16 / 64
Training the subspace: 17 / 64
Training the subspace: 18 / 64
Training the subspace: 19 / 64
Training the subspace: 20 / 64
Training the subspace: 21 / 64
Training the subspace: 22 / 64
Training the subspace: 23 / 64
Training the subspace: 24 / 64
Training the subspace: 25 / 64
Training the subspace: 26 / 64
Training the subspace: 27 / 64
Training the subspace: 28 / 64
Training the subspace: 29 / 64
Training the subspace: 30 / 64
Training the subspace: 31 / 64
Training the subspace: 32 / 64
Training the subspace: 33 / 64
Training the subspace: 34 / 64
Training the subspace: 35 / 64
Training the subspace: 36 / 64
Training the subspace: 37 / 64
Training the

In [12]:
# Embed the PQ codes (one row per sampled molecule) in two dimensions.
# NOTE(review): TSNE defaults to a Euclidean metric, while PQ codes are
# presumably codeword ids rather than magnitudes — a categorical metric such
# as "hamming" may be more meaningful; confirm the intent before changing it.
tsne = TSNE(n_components=2, random_state=0)
X_2d = tsne.fit_transform(X_code)

# X (and therefore X_code) stacks five splits, pcba included, so the metadata
# columns must cover all five in the same order. Leaving pcba out makes the
# column assignment fail with a length mismatch.
split_names = ["pdbbind", "dude", "litpcba", "muv", "pcba"]
assert len(X_2d) == len(split_names) * SET_SIZE, (
    f"expected {len(split_names) * SET_SIZE} embedded rows, got {len(X_2d)}"
)

df = pd.DataFrame(X_2d, columns=["TSNE-1", "TSNE-2"])
df["index"] = np.concatenate(
    [pdbbind_indices, dude_indices, litpcba_indices, muv_indices, pcba_indices]
)
df["src"] = np.concatenate(
    [pdbbind_sources, dude_sources, litpcba_sources, muv_sources, pcba_sources]
)
df["dataset"] = [name for name in split_names for _ in range(SET_SIZE)]

fig = px.scatter(
    df, x="TSNE-1", y="TSNE-2", color="dataset", hover_data=["index", "src"]
)
# The points are quantizer codes, not raw fingerprints, so the title says so.
fig.update_layout(
    title_text="t-SNE visualization of PQ codes (M=64, Ks=64) from different datasets"
)
fig.show()

In [13]:
# Quantize the stacked feature matrix X again, this time through the OPQ
# branch of pq_index (M=64, Ks=64); this overwrites the earlier X_code.
X_code = pq_index(X, M=64, Ks=64, use_optim=True)

M: 64, Ks: 64, metric : <class 'numpy.uint8'>, code_dtype: l2
OPQ rotation training: 0 / 10
M: 64, Ks: 64, metric : <class 'numpy.uint8'>, code_dtype: l2
iter: 1, seed: 123
Training the subspace: 0 / 64
Training the subspace: 1 / 64
Training the subspace: 2 / 64
Training the subspace: 3 / 64
Training the subspace: 4 / 64
Training the subspace: 5 / 64
Training the subspace: 6 / 64
Training the subspace: 7 / 64
Training the subspace: 8 / 64
Training the subspace: 9 / 64
Training the subspace: 10 / 64
Training the subspace: 11 / 64
Training the subspace: 12 / 64
Training the subspace: 13 / 64
Training the subspace: 14 / 64
Training the subspace: 15 / 64
Training the subspace: 16 / 64
Training the subspace: 17 / 64
Training the subspace: 18 / 64
Training the subspace: 19 / 64
Training the subspace: 20 / 64
Training the subspace: 21 / 64
Training the subspace: 22 / 64
Training the subspace: 23 / 64
Training the subspace: 24 / 64
Training the subspace: 25 / 64
Training the subspace: 26 / 64



One of the clusters is empty. Re-run kmeans with a different initialization.



Training the subspace: 44 / 64
Training the subspace: 45 / 64
Training the subspace: 46 / 64
Training the subspace: 47 / 64
Training the subspace: 48 / 64
Training the subspace: 49 / 64
Training the subspace: 50 / 64
Training the subspace: 51 / 64
Training the subspace: 52 / 64
Training the subspace: 53 / 64
Training the subspace: 54 / 64
Training the subspace: 55 / 64
Training the subspace: 56 / 64
Training the subspace: 57 / 64
Training the subspace: 58 / 64
Training the subspace: 59 / 64
Training the subspace: 60 / 64
Training the subspace: 61 / 64
Training the subspace: 62 / 64
Training the subspace: 63 / 64
Encoding the subspace: 0 / 64
Encoding the subspace: 1 / 64
Encoding the subspace: 2 / 64
Encoding the subspace: 3 / 64
Encoding the subspace: 4 / 64
Encoding the subspace: 5 / 64
Encoding the subspace: 6 / 64
Encoding the subspace: 7 / 64
Encoding the subspace: 8 / 64
Encoding the subspace: 9 / 64
Encoding the subspace: 10 / 64
Encoding the subspace: 11 / 64
Encoding the subsp

In [14]:
# Embed the OPQ codes (one row per sampled molecule) in two dimensions.
# NOTE(review): TSNE defaults to a Euclidean metric, while quantizer codes are
# presumably codeword ids rather than magnitudes — a categorical metric such
# as "hamming" may be more meaningful; confirm the intent before changing it.
tsne = TSNE(n_components=2, random_state=0)
X_2d = tsne.fit_transform(X_code)

# X (and therefore X_code) stacks five splits, pcba included, so the metadata
# columns must cover all five in the same order. Leaving pcba out makes the
# column assignment fail with a length mismatch.
split_names = ["pdbbind", "dude", "litpcba", "muv", "pcba"]
assert len(X_2d) == len(split_names) * SET_SIZE, (
    f"expected {len(split_names) * SET_SIZE} embedded rows, got {len(X_2d)}"
)

df = pd.DataFrame(X_2d, columns=["TSNE-1", "TSNE-2"])
df["index"] = np.concatenate(
    [pdbbind_indices, dude_indices, litpcba_indices, muv_indices, pcba_indices]
)
df["src"] = np.concatenate(
    [pdbbind_sources, dude_sources, litpcba_sources, muv_sources, pcba_sources]
)
df["dataset"] = [name for name in split_names for _ in range(SET_SIZE)]

fig = px.scatter(
    df, x="TSNE-1", y="TSNE-2", color="dataset", hover_data=["index", "src"]
)
# The points are quantizer codes, not raw fingerprints, so the title says so.
fig.update_layout(
    title_text="t-SNE visualization of OPQ codes (M=64, Ks=64) from different datasets"
)
fig.show()

In [17]:
# Sweep the quantizer settings: every (M, Ks) pair is run first through the
# PQ branch and then the OPQ branch of pq_index, which prints the
# reconstruction error of each run. The codes are stored under a key that
# names the configuration.
pq_settings = [(256, 16), (128, 32), (64, 64), (32, 128), (16, 256), (8, 512)]
encodes = {}
for M, Ks in pq_settings:
    for use_optim in (False, True):
        encodes[f"M{M}_Ks{Ks}_use_optim{use_optim}"] = pq_index(
            X, M, Ks, use_optim
        )


One of the clusters is empty. Re-run kmeans with a different initialization.



M: 256, Ks: 16, use_optim: False:
Reconstruction error: 0.0178

M: 256, Ks: 16, use_optim: True:
Reconstruction error: 0.0082

M: 128, Ks: 32, use_optim: False:
Reconstruction error: 0.0152

M: 128, Ks: 32, use_optim: True:
Reconstruction error: 0.0084

M: 64, Ks: 64, use_optim: False:
Reconstruction error: 0.0151

M: 64, Ks: 64, use_optim: True:
Reconstruction error: 0.0095

M: 32, Ks: 128, use_optim: False:
Reconstruction error: 0.0177

M: 32, Ks: 128, use_optim: True:
Reconstruction error: 0.0115

M: 16, Ks: 256, use_optim: False:
Reconstruction error: 0.0214

M: 16, Ks: 256, use_optim: True:
Reconstruction error: 0.0134

M: 8, Ks: 512, use_optim: False:
Reconstruction error: 0.0233

M: 8, Ks: 512, use_optim: True:
Reconstruction error: 0.0145

