In [24]:

import sys

sys.path.append('..')

import pandas as pd
import plotly.express as px
import umap.umap_ as umap
import torch
import numpy as np

# ---------------------------------------------------------------------------
# Embed every annotated compound with GCMol, project the embeddings to 2-D
# with UMAP, and collect the coordinates plus hover metadata in `plot_df`.
# ---------------------------------------------------------------------------

# Tunables, gathered in one place so they are easy to find and change.
ANNOTATIONS_CSV = "../datasets/cpg0012/chemical_annotations.csv"
GCMOL_WEIGHTS = "../models/gcmol"
ENCODE_BATCH_SIZE = 256  # SMILES per mol_encode call; lower it if memory is tight
UMAP_SEED = 42

# 1. Load your CSV
# Rows without a SMILES string cannot be encoded, so they are dropped; the
# index is then renumbered 0..N-1 so that it lines up positionally with the
# rows of the embedding matrix when `plot_df` is assembled below.
df_chem = (
    pd.read_csv(ANNOTATIONS_CSV)
    .dropna(subset=["CPD_SMILES"])
    .reset_index(drop=True)
)
# For a quick experiment, swap in a random sample instead:
#   df_chem.sample(n=1000, random_state=42).reset_index(drop=True)
df_subset = df_chem

# 2. Encode SMILES to embeddings
# Project-local import; presumably resolvable only because '..' was appended
# to sys.path at the top of this cell.
from src.models.gcmol import GCMol

model = GCMol(model_name=GCMOL_WEIGHTS)


def encode_smiles_in_batches(encoder, smiles, batch_size=ENCODE_BATCH_SIZE):
    """Encode SMILES strings chunk by chunk and return one CPU tensor.

    Encoding runs under ``torch.no_grad()`` and every chunk is moved to the
    CPU as soon as it is produced, so peak memory scales with ``batch_size``
    rather than with the size of the whole dataset.

    Assumes ``encoder.mol_encode`` embeds each molecule independently of the
    other molecules in the same call, so that chunking does not change the
    result -- TODO confirm against the GCMol implementation.

    Args:
        encoder: Object exposing ``mol_encode(list_of_smiles)`` that returns a
            tensor with one row per input string.
        smiles: List of SMILES strings.
        batch_size: Number of strings passed to ``mol_encode`` per call.

    Returns:
        A CPU tensor with one row per input string, in input order.

    Raises:
        ValueError: If ``smiles`` is empty or ``batch_size`` is below 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")
    if not smiles:
        raise ValueError("no SMILES strings to encode")

    chunks = []
    with torch.no_grad():  # inference only: skip building the autograd graph
        for start in range(0, len(smiles), batch_size):
            encoded = encoder.mol_encode(smiles[start:start + batch_size])
            chunks.append(encoded.detach().cpu())
    return torch.cat(chunks, dim=0)


all_smiles = df_subset["CPD_SMILES"].tolist()
emb_list = encode_smiles_in_batches(model, all_smiles)  # tensor, one row per compound
embeddings = emb_list.numpy()
assert embeddings.shape[0] == len(df_subset), "expected one embedding row per compound"

# 3. Dimensionality reduction with UMAP
# A fixed random_state keeps the layout reproducible; UMAP then runs with
# n_jobs=1, which is what the warning printed under this cell reports.
reducer = umap.UMAP(n_neighbors=15, min_dist=0.1, random_state=UMAP_SEED)
umap_2d = reducer.fit_transform(embeddings)  # shape: [N, 2]

# 4. Build a dataframe with the 2D coords
# CPD_SMILES needs no fillna: rows with a missing SMILES were dropped in step 1.
plot_df = pd.DataFrame({
    "x": umap_2d[:, 0],
    "y": umap_2d[:, 1],
    "cpd_name": df_subset["CPD_NAME"].fillna("unknown"),
    "cpd_smiles": df_subset["CPD_SMILES"],
    "source_name": df_subset["SOURCE_NAME"].fillna("unknown"),
    # add more columns if you want them as hover data
})


'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



In [25]:
# Interactive scatter of the 2-D UMAP projection held in `plot_df`.
# Hovering a point shows the compound name as the tooltip heading, followed
# by its source, name and SMILES string.
fig = px.scatter(
    data_frame=plot_df,
    x="x",
    y="y",
    title="UMAP of GCMol Embeddings",
    hover_name="cpd_name",
    hover_data=[
        "source_name",
        "cpd_name",
        "cpd_smiles",
    ],
)
fig.show()

In [20]:
# Show the compound annotation table; pandas abbreviates long frames to their
# first and last rows when rendering.
# NOTE(review): this cell's execution count (20) is lower than the cells above
# (24, 25), so the saved output may predate the current `df_subset` -- re-run
# with Restart & Run All before relying on it.
df_subset

Unnamed: 0,BROAD_ID,CPD_NAME,CPD_NAME_TYPE,CPD_SAMPLE_ID,DOS_LIBRARY,SOURCE_NAME,CHEMIST_NAME,VENDOR_CATALOG_ID,CPD_SMILES,USERCOMMENT
0,BRD-A63834895-001-05-0,BRD-A63834895,BROAD_CPD_ID,SA1856343,,Enamine Ltd.,,T0515-2859,CC1CC(C)CN(C1)C(=S)NC(=O)c1cc(ccc1C)S(=O)(=O)N...,
1,BRD-K92904754-001-01-3,BRD-K92904754,BROAD_CPD_ID,SA430467,RCM full Library,Broad Institute of MIT and Harvard,,,CNC[C@H]1OCCCC[C@H](C)Oc2ccc(NS(=O)(=O)c3ccc(C...,
2,BRD-K26603007-001-01-7,BRD-K26603007,BROAD_CPD_ID,SA861031,SnAr Sulfonamide Library,Broad Institute of MIT and Harvard,,,C[C@@H](CO)N1C[C@H](C)[C@H](CN(C)C(=O)c2cnccn2...,
3,BRD-A84245072-003-05-5,BRD-A84245072,BROAD_CPD_ID,SA1857023,,Asinex Ltd.,,ASN 05346383,COCCn1nnnc1C(N1CCc2ccccc2C1)c1ccc(O)c(OC)c1,
4,BRD-K88797027-001-01-8,BRD-K88797027,BROAD_CPD_ID,SA1459264,Oxazocane Library,Broad Institute of MIT and Harvard,,,COC(=O)C[C@@H]1CC[C@H]2[C@@H](COC[C@@H](O)CN2S...,
...,...,...,...,...,...,...,...,...,...,...
995,BRD-K02707544-001-02-2,BRD-K02707544,BROAD_CPD_ID,SA666699,Click Library,Broad Institute of MIT and Harvard,,,C[C@@H](CO)N1C[C@@H](C)[C@H](CN(C)C(=O)c2ccc3c...,
996,BRD-K09971865-001-03-6,BRD-K09971865,BROAD_CPD_ID,SA1861029,,Asinex Ltd.,,BAS 00512945,CCCCc1ccc(\C=C(/C#N)C(S)=N)s1,
997,BRD-K15471766-001-01-3,BRD-K15471766,BROAD_CPD_ID,SA797909,Benzofuran Library,Broad Institute of MIT and Harvard,,,OC[C@H]1O[C@@H](CC(O)=O)C[C@H]2[C@@H]1Oc1ccc(N...,
998,BRD-A88772644-001-05-7,BRD-A88772644,BROAD_CPD_ID,SA1860236,,InterBioScreen Ltd.,,STOCK2S-25882,CC1CCCN(C1)C1=C(NS(=O)(=O)c2ccc(Cl)cc2)C(=O)c2...,
