## **t-SNE of Molecular Structures (Fingerprint Space)**
This workflow merges curated SMILES from multiple sources, builds molecular fingerprints (Morgan, radius 2, 1024 bits, chirality-aware), standardizes the resulting bit matrix, and runs t-SNE on it to visualize chemotype neighborhoods. Points are colored by dataset, and the embedding is exported for reuse.

### **1. Notebook and data preparation**

#### **1.1. Configure the environment**

In [11]:
### Import necessary libraries and configure settings

import os
from pathlib import Path
import random

import pandas as pd
import numpy as np

from rdkit import Chem
from rdkit.Chem import rdFingerprintGenerator

from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
import plotly.express as px
from plotly import io as pio

In [40]:
### Reproducibility settings, input/output locations, and shared plot styling

# Seed both the NumPy and the stdlib global random generators.
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

# Input tables (relative paths).
INPUT_SIDAS_DIANAT = "../raw_data/dbs_curated_sidas_dianat.csv"
INPUT_UNPD_A = "../raw_data/0_UNPD_A_curated.xlsx"

# Output folder; created here if it does not exist yet.
OUTPUT_DIR = Path("../results") / "outputs_tsne"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Column names shared by every table in the notebook.
COL_ID, COL_DATASET, COL_SMILES = "ID", "DATASET", "curated_smiles"

# Dataset label -> hex color. Insertion order doubles as the plotting order.
COLOR_MAP = dict(
    [
        ("UNPD-A", "#D9DDDC"),
        ("DiaNat-DB", "#979745"),
        ("SG-accumulated", "#4AB82B"),
        ("SR-accumulated", "#304BE3"),
        ("SH-accumulated", "#19A3A6"),
    ]
)

# Same sequence as the keys of COLOR_MAP.
DATASET_ORDER = list(COLOR_MAP)

#### **1.2. Load, harmonize, and validate SMILES**
Keep only required columns, standardize names, and drop invalid SMILES.

In [34]:
### Read both sources and stack them into a single three-column table

required_cols = [COL_ID, COL_DATASET, COL_SMILES]

# Main table: only the three required columns are kept.
df_main = pd.read_csv(INPUT_SIDAS_DIANAT, dtype=str).loc[:, required_cols]

# UNPD-A table: harmonize the SMILES column name and set the dataset label
# before selecting the same three columns.
df_unpd = (
    pd.read_excel(INPUT_UNPD_A, dtype=str)
    .rename(columns={"curated_SMILES": COL_SMILES})
    .assign(**{COL_DATASET: "UNPD-A"})
    .loc[:, required_cols]
)

# Main rows first, then UNPD-A, with a fresh 0..N-1 index.
df = pd.concat([df_main, df_unpd], ignore_index=True)

print(df.shape)
df.head(3)

(15606, 3)


Unnamed: 0,ID,DATASET,curated_smiles
0,DiaNatDB-1,DiaNat-DB,O=C(O)C1=C[C@@H](O)[C@@H](O)[C@H](O)C1
1,DiaNatDB-10,DiaNat-DB,C=C(C)[C@@H]1CC[C@]2(C)CC[C@]3(C)[C@H](CC[C@@H...
2,DiaNatDB-100,DiaNat-DB,C=C(CC[C@@H](C)[C@H]1CC[C@@]2(C)C3CC[C@H]4C(C)...


In [7]:
### Validate SMILES and filter invalid rows

def is_valid_smiles(smiles: str) -> bool:
    """
    Return True if SMILES parses into a valid RDKit Mol, else False.

    Non-string values and empty / whitespace-only strings are rejected up
    front: RDKit parses an empty SMILES into an atom-less Mol instead of
    returning None, so without this guard a blank entry would count as valid.
    """
    if not isinstance(smiles, str) or not smiles.strip():
        return False
    try:
        return Chem.MolFromSmiles(smiles) is not None
    except Exception:
        # Best-effort validation: any parser-side error means "not usable".
        return False
    
# A row is kept when its SMILES is present and passes is_valid_smiles.
smiles_col = df[COL_SMILES]
parses_ok = smiles_col.astype(str).map(is_valid_smiles)
mask_valid = smiles_col.notna() & parses_ok

df = df.loc[mask_valid].reset_index(drop=True)
print(f"After filtering invalid SMILES: {df.shape}")
df.head(3)

After filtering invalid SMILES: (15606, 3)


Unnamed: 0,ID,DATASET,curated_smiles
0,DiaNatDB-1,DiaNat-DB,O=C(O)C1=C[C@@H](O)[C@@H](O)[C@H](O)C1
1,DiaNatDB-10,DiaNat-DB,C=C(C)[C@@H]1CC[C@]2(C)CC[C@]3(C)[C@H](CC[C@@H...
2,DiaNatDB-100,DiaNat-DB,C=C(CC[C@@H](C)[C@H]1CC[C@@]2(C)C3CC[C@H]4C(C)...


### **2. Compute structural fingerprints**
Generate a bit vector per molecule.

In [None]:
### Morgan fingerprints (radius 2, 1024 bits, chirality-aware) as one 0/1 column per bit

FP_SIZE = 1024

# Build the generator once; it does not depend on the molecule.
morgan_gen = rdFingerprintGenerator.GetMorganGenerator(
    radius=2, fpSize=FP_SIZE, includeChirality=True
)

# One row per molecule, stored as uint8 rather than '0'/'1' characters so the
# matrix is numeric. The reshape keeps a (0, FP_SIZE) shape if df is empty.
fp_matrix = np.array(
    [
        morgan_gen.GetFingerprintAsNumPy(Chem.MolFromSmiles(smiles))
        for smiles in df[COL_SMILES]
    ],
    dtype=np.uint8,
).reshape(-1, FP_SIZE)

# Bit columns are named 0..FP_SIZE-1 and share df's index, so the
# column-wise concat lines up row by row.
df_fps = pd.concat([df, pd.DataFrame(fp_matrix, index=df.index)], axis=1)

fingerprints_csv = OUTPUT_DIR / "sidas_dianat_unpda_fingerprints_morgan2_1024.csv"
df_fps.to_csv(fingerprints_csv, index=False)
print(f"Saved: {fingerprints_csv}")

print(f"After computing fingerprints: {df_fps.shape}")
df_fps.tail(3)

Saved: ../results/outputs_tsne/sidas_dianat_unpda_fingerprints_morgan2_1024.csv
After computing fingerprints: (15606, 1027)


Unnamed: 0,ID,DATASET,curated_smiles,0,1,2,3,4,5,6,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
15603,UNPD_subset_A_14998,UNPD-A,C[C@]12CC[C@H](O[C@@H]3OC[C@H](O)[C@@H](O)[C@H...,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
15604,UNPD_subset_A_14999,UNPD-A,COc1ccc(C(=O)O[C@H]2CC[C@@]3(C)[C@@H]4CC[C@H]5...,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0
15605,UNPD_subset_A_15000,UNPD-A,CO[C@@]12C[C@H](C)O[C@@H](O[C@@H]3C[C@@H]4C[C@...,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0


### **3. t-SNE workflow overview**
* Select only descriptor (bits) columns.
* Standardize columns (mean 0, variance 1).
* Fit t-SNE and compute transformed components.
* Visualize **dimension 1 vs. dimension 2**.

In [None]:
# Drop the identifier/label/SMILES columns so only fingerprint bits remain,
# then scale every bit column with StandardScaler.
metadata_cols = [COL_ID, COL_DATASET, COL_SMILES]
bit_scaler = StandardScaler()
data_tsne = bit_scaler.fit_transform(df_fps.drop(columns=metadata_cols))

[[-0.1236468  -0.54046401 -0.1490126  ... -0.12391363 -0.19978665
  -0.09684233]
 [-0.1236468  -0.54046401 -0.1490126  ... -0.12391363  5.00533939
  -0.09684233]]


In [29]:
N = len(data_tsne)

# Default perplexity is 30, shrunk to roughly N/3 for small inputs.
# scikit-learn requires perplexity < n_samples, so it is also capped at N - 1;
# for N >= 6 this gives the same value as before.
perp_max = max(5, min(50, int((N - 1) / 3)))
PERPLEXITY = min(30, perp_max, max(1, N - 1))

print(f"t-SNE with N={N}, perplexity={PERPLEXITY}")

# The iteration count is left at scikit-learn's default of 1000. The keyword
# was renamed from `n_iter` to `max_iter` in scikit-learn 1.5 and the old name
# was later removed, so omitting it keeps this cell working on every version.
tsne = TSNE(
    n_components=2,
    verbose=1,
    perplexity=PERPLEXITY,
    random_state=SEED,
)

tsne_results = tsne.fit_transform(data_tsne) # shape (N, 2)
print(tsne_results[:5, :])

t-SNE with N=15606, perplexity=30
[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 15606 samples in 0.027s...




[t-SNE] Computed neighbors for 15606 samples in 1.258s...
[t-SNE] Computed conditional probabilities for sample 1000 / 15606
[t-SNE] Computed conditional probabilities for sample 2000 / 15606
[t-SNE] Computed conditional probabilities for sample 3000 / 15606
[t-SNE] Computed conditional probabilities for sample 4000 / 15606
[t-SNE] Computed conditional probabilities for sample 5000 / 15606
[t-SNE] Computed conditional probabilities for sample 6000 / 15606
[t-SNE] Computed conditional probabilities for sample 7000 / 15606
[t-SNE] Computed conditional probabilities for sample 8000 / 15606
[t-SNE] Computed conditional probabilities for sample 9000 / 15606
[t-SNE] Computed conditional probabilities for sample 10000 / 15606
[t-SNE] Computed conditional probabilities for sample 11000 / 15606
[t-SNE] Computed conditional probabilities for sample 12000 / 15606
[t-SNE] Computed conditional probabilities for sample 13000 / 15606
[t-SNE] Computed conditional probabilities for sample 14000 / 15606

In [35]:
### Assemble tidy embedding DataFrame
tsne_df = df.copy()
tsne_df["tSNE1"] = tsne_results[:, 0]
tsne_df["tSNE2"] = tsne_results[:, 1]

### Stable category order for coloring

# pd.Categorical turns any label missing from DATASET_ORDER into NaN, which
# would make those rows vanish from the plots. Report such labels explicitly.
unknown_labels = sorted(set(tsne_df[COL_DATASET].dropna().unique()) - set(DATASET_ORDER))
if unknown_labels:
    print(f"WARNING: dataset labels not in DATASET_ORDER will become NaN: {unknown_labels}")

tsne_df[COL_DATASET] = pd.Categorical(tsne_df[COL_DATASET], categories=DATASET_ORDER, ordered=True)

tsne_csv = OUTPUT_DIR / "tsne_embedding_morgan2_1024.csv"
tsne_df.to_csv(tsne_csv, index=False)
print(f"Saved: {tsne_csv}")
tsne_df.head(3)

Saved: ../results/outputs_tsne/tsne_embedding_morgan2_1024.csv


Unnamed: 0,ID,DATASET,curated_smiles,tSNE1,tSNE2
0,DiaNatDB-1,DiaNat-DB,O=C(O)C1=C[C@@H](O)[C@@H](O)[C@H](O)C1,22.562714,38.051029
1,DiaNatDB-10,DiaNat-DB,C=C(C)[C@@H]1CC[C@]2(C)CC[C@]3(C)[C@H](CC[C@@H...,-68.434692,-16.327606
2,DiaNatDB-100,DiaNat-DB,C=C(CC[C@@H](C)[C@H]1CC[C@@]2(C)C3CC[C@H]4C(C)...,-36.625305,-40.842918


### **4. Interactive t-SNE scatter (all datasets)**

Points are colored by dataset with a stable color map. Hover shows ID.

In [41]:
### Plotly scatter with stable colors and category order
fig_tsne = px.scatter(
    tsne_df,
    x="tSNE1",
    y="tSNE2",
    color=COL_DATASET,
    color_discrete_map=COLOR_MAP,
    category_orders={COL_DATASET: DATASET_ORDER},
    title=False,
    hover_data=[COL_ID],
    width=1000,
    height=800,
)

### Marker size, boxed axes without grid, white background, legend title
fig_tsne.update_traces(marker=dict(size=10))
fig_tsne.update_layout(
    xaxis=dict(showgrid=False, showline=True, linewidth = 1, linecolor = 'black', mirror = True, tickfont = dict(size = 24, color = 'black')),
    yaxis = dict(showgrid = False, showline = True, linewidth = 1, linecolor = 'black', mirror = True, tickfont = dict(size = 24, color = 'black')),
    plot_bgcolor = 'white', paper_bgcolor = 'white',
    legend_title_text="Dataset", showlegend=True
)

### Fixed, identical ranges on both axes so every figure shares one frame
fig_tsne.update_xaxes(
    title_text="tSNE dimension 1", range=[-92, 92],
    title_font=dict(size=20, color= "black")
)
# The y axis shows the second embedding coordinate (column tSNE2).
fig_tsne.update_yaxes(
    title_text="tSNE dimension 2", range=[-92, 92],
    title_font=dict(size=20, color= "black")
)

### Save the static PNG
png_all = OUTPUT_DIR / "tsne_all_morgan2_1024.png"
pio.write_image(fig_tsne, str(png_all), width=1000, height=800, scale=3)
print(f"Saved: {png_all}")

fig_tsne.show()

Saved: ../results/outputs_tsne/tsne_all_morgan2_1024.png


### **5. Individual t-SNE scatter plots**

Each figure overlays one dataset on top of the UNPD-A points, reusing the same styling and axis ranges as the combined plot, and is saved as its own PNG.

In [42]:
### Generate individual figures per dataset (each one drawn together with UNPD-A)

# Axis frame shared by x and y: no grid, black mirrored border, large ticks.
axis_frame = dict(
    showgrid=False, showline=True, linewidth=1, linecolor='black',
    mirror=True, tickfont=dict(size=24, color='black'),
)

for dataset in [d for d in DATASET_ORDER if d != "UNPD-A"]:
    # Keep UNPD-A plus the current dataset.
    subset = tsne_df[tsne_df[COL_DATASET].isin(["UNPD-A", dataset])].copy()

    fig_ind = px.scatter(
        subset,
        x="tSNE1", y="tSNE2",
        color=COL_DATASET,
        color_discrete_map=COLOR_MAP,
        category_orders={COL_DATASET: ["UNPD-A", dataset]},
        hover_data=[COL_ID],
        width=1000, height=800
    )

    fig_ind.update_traces(marker=dict(size=10))
    fig_ind.update_layout(
        xaxis=axis_frame,
        yaxis=axis_frame,
        plot_bgcolor='white', paper_bgcolor='white',
        showlegend=False
    )
    fig_ind.update_xaxes(
        title_text="tSNE dimension 1", range=[-92, 92],
        title_font=dict(size=20, color="black")
    )
    # The y axis shows the second embedding coordinate (column tSNE2).
    fig_ind.update_yaxes(
        title_text="tSNE dimension 2", range=[-92, 92],
        title_font=dict(size=20, color="black")
    )

    ### Save the static PNG
    png_individual = OUTPUT_DIR / f"tsne_{dataset.lower().replace('-', '_')}_morgan2_1024.png"
    pio.write_image(fig_ind, str(png_individual), width=1000, height=800, scale=3)
    print(f"Saved: {png_individual}")

    fig_ind.show()

Saved: ../results/outputs_tsne/tsne_dianat_db_morgan2_1024.png


Saved: ../results/outputs_tsne/tsne_sg_accumulated_morgan2_1024.png


Saved: ../results/outputs_tsne/tsne_sr_accumulated_morgan2_1024.png


Saved: ../results/outputs_tsne/tsne_sh_accumulated_morgan2_1024.png
