# UMAP Training Pipeline

This notebook trains UMAP for chemical space visualization using:
- MACCS fingerprints (166 bits)
- RDKit descriptors (131 features)

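The trained model is saved to `backend/ml/models/umap.pkl` for use by the API. The data and output paths in the configuration cell are relative (`training_data.csv`, `ml/models`), so run this notebook from the `backend/` directory.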

In [1]:
import numpy as np
import pandas as pd
import joblib
import umap
from pathlib import Path

from rdkit import Chem
from rdkit.Chem import Descriptors
from sklearn.preprocessing import StandardScaler
from skfp.fingerprints import MACCSFingerprint

import plotly.express as px

  from .autonotebook import tqdm as notebook_tqdm


## Configuration

In [25]:
# Input / output locations (relative to the current working directory)
DATA_PATH = "training_data.csv"
OUTPUT_DIR = Path("ml") / "models"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)  # create the folder if it is missing

# Keyword arguments unpacked into umap.UMAP(...) when the reducer is built
UMAP_PARAMS = dict(
    n_neighbors=150,
    min_dist=0.2,
    metric="euclidean",
    random_state=42,
)

# Descriptor names used as features; list order is the descriptor column order
# (must match backend/services/core_utils/descriptors.py)
FINAL_DESC_COLS = [
    # --- entries without a shared prefix ---
    "MaxAbsEStateIndex", "MinAbsEStateIndex", "MinEStateIndex",
    "qed", "SPS", "MolWt", "NumRadicalElectrons",
    "MaxPartialCharge", "MinPartialCharge",
    "FpDensityMorgan1", "AvgIpc", "BalabanJ", "Ipc",
    # --- PEOE_VSA* ---
    "PEOE_VSA1", "PEOE_VSA10", "PEOE_VSA11", "PEOE_VSA12",
    "PEOE_VSA13", "PEOE_VSA14", "PEOE_VSA2", "PEOE_VSA3",
    "PEOE_VSA4", "PEOE_VSA5", "PEOE_VSA6", "PEOE_VSA7",
    "PEOE_VSA8", "PEOE_VSA9",
    # --- SMR_VSA* ---
    "SMR_VSA1", "SMR_VSA10", "SMR_VSA2", "SMR_VSA3", "SMR_VSA4",
    "SMR_VSA5", "SMR_VSA6", "SMR_VSA7", "SMR_VSA9",
    # --- SlogP_VSA* and TPSA ---
    "SlogP_VSA1", "SlogP_VSA10", "SlogP_VSA11", "SlogP_VSA12",
    "SlogP_VSA2", "SlogP_VSA3", "SlogP_VSA4", "SlogP_VSA7",
    "SlogP_VSA8",
    "TPSA",
    # --- EState_VSA* ---
    "EState_VSA1", "EState_VSA11", "EState_VSA2", "EState_VSA3",
    "EState_VSA4", "EState_VSA5", "EState_VSA6", "EState_VSA7",
    "EState_VSA8", "EState_VSA9",
    # --- VSA_EState* ---
    "VSA_EState2", "VSA_EState3", "VSA_EState4", "VSA_EState5",
    "VSA_EState7", "VSA_EState8", "VSA_EState9",
    # --- FractionCSP3, counts and MolLogP ---
    "FractionCSP3", "NHOHCount",
    "NumAliphaticCarbocycles", "NumAliphaticHeterocycles",
    "NumAromaticCarbocycles", "NumAromaticHeterocycles",
    "NumAromaticRings", "RingCount",
    "MolLogP",
    # --- fr_* ---
    "fr_Al_COO", "fr_Al_OH", "fr_ArN", "fr_Ar_COO", "fr_Ar_NH", "fr_Ar_OH",
    "fr_C_O", "fr_C_S", "fr_HOCCN", "fr_Imine",
    "fr_NH1", "fr_NH2", "fr_N_O",
    "fr_Ndealkylation1", "fr_Ndealkylation2",
    "fr_SH", "fr_alkyl_carbamate", "fr_allylic_oxid", "fr_amidine",
    "fr_aniline", "fr_aryl_methyl", "fr_azo", "fr_barbitur",
    "fr_benzodiazepine", "fr_bicyclic", "fr_dihydropyridine",
    "fr_epoxide", "fr_ester", "fr_ether", "fr_furan", "fr_guanido",
    "fr_hdrzine", "fr_hdrzone", "fr_imidazole", "fr_imide", "fr_ketone",
    "fr_lactam", "fr_lactone", "fr_methoxy", "fr_morpholine", "fr_nitro",
    "fr_oxazole", "fr_oxime", "fr_para_hydroxylation", "fr_phos_acid",
    "fr_piperdine", "fr_piperzine", "fr_priamide", "fr_pyridine",
    "fr_quatN", "fr_sulfide", "fr_sulfonamd", "fr_sulfone",
    "fr_term_acetylene", "fr_tetrazole", "fr_thiazole", "fr_thiophene",
    "fr_unbrch_alkane", "fr_urea",
]

## Helper Functions

In [3]:
def compute_descriptors(smiles_list: list[str], descriptor_names: list[str]) -> pd.DataFrame:
    """Evaluate the requested RDKit descriptors for every SMILES string.

    Args:
        smiles_list: SMILES strings, one per output row.
        descriptor_names: Descriptor names to evaluate. Names that are not
            present in ``Descriptors._descList`` are skipped, so the result
            may have fewer columns than names requested.

    Returns:
        A DataFrame with one row per input SMILES and one column per
        recognised descriptor, in the order requested. A SMILES that cannot
        be parsed gives an all-NaN row; a descriptor call that raises gives
        NaN in that cell.
    """
    registry = dict(Descriptors._descList)
    known_names = [name for name in descriptor_names if name in registry]
    calculators = [registry[name] for name in known_names]

    def _evaluate(calc, mol):
        # Best effort: a failing descriptor becomes NaN instead of aborting the run.
        try:
            return calc(mol)
        except Exception:
            return np.nan

    rows = []
    for smiles in smiles_list:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # Unparseable SMILES: keep the row so the output stays aligned with the input.
            rows.append([np.nan] * len(known_names))
        else:
            rows.append([_evaluate(calc, mol) for calc in calculators])

    return pd.DataFrame(rows, columns=known_names)


def build_hybrid_features(
    smiles_list: list[str],
    maccs_gen: MACCSFingerprint,
    scaler: StandardScaler,
    desc_cols: list[str],
    fit_scaler: bool = False
) -> pd.DataFrame:
    """Assemble the feature table: MACCS columns followed by scaled descriptors.

    Args:
        smiles_list: SMILES strings, one per output row.
        maccs_gen: Fingerprint generator; its ``transform`` is called on the
            parsed molecules.
        scaler: Scaler applied to the descriptor columns.
        desc_cols: Descriptor names to compute; also the names and order of
            the descriptor columns in the result.
        fit_scaler: When true the scaler is fitted on these descriptors
            (``fit_transform``); otherwise it is only applied (``transform``).

    Returns:
        A DataFrame whose first columns are ``MACCS_0 ... MACCS_{k-1}`` and
        whose remaining columns are the scaled descriptors named by
        ``desc_cols``.
    """
    # Fingerprint part: parse every SMILES, then let the generator encode the molecules.
    parsed_mols = list(map(Chem.MolFromSmiles, smiles_list))
    maccs_bits = maccs_gen.transform(parsed_mols)
    maccs_names = [f"MACCS_{idx}" for idx in range(maccs_bits.shape[1])]
    maccs_frame = pd.DataFrame(maccs_bits, columns=maccs_names)

    # Descriptor part: compute, then select the columns in the requested order.
    raw_desc = compute_descriptors(smiles_list, desc_cols)[desc_cols]

    # Fit-and-apply during training, apply-only at inference.
    apply_scaler = scaler.fit_transform if fit_scaler else scaler.transform
    scaled_desc = pd.DataFrame(apply_scaler(raw_desc), columns=desc_cols)

    return pd.concat([maccs_frame, scaled_desc], axis=1)

## Load Training Data

In [4]:
# Read the training table and print a short summary
df = pd.read_csv(DATA_PATH)

n_samples, column_names = len(df), list(df.columns)
print(f"Loaded {n_samples} samples")
print(f"Columns: {column_names}")

print(f"\nClass distribution:")
class_counts = df["ACTIVITY"].value_counts()
print(class_counts)

Loaded 11930 samples
Columns: ['ACTIVITY', 'smiles']

Class distribution:
ACTIVITY
1    6314
0    5616
Name: count, dtype: int64


In [None]:
# Pull the model inputs (SMILES strings) and the activity labels out of the table
smiles_col, activity_col = df["smiles"], df["ACTIVITY"]
smiles_list = smiles_col.tolist()
y = activity_col.values
print(f"Processing {len(smiles_list)} molecules...")

## Build Features (MACCS + Descriptors)

In [6]:
# Fingerprint generator and descriptor scaler (the scaler is fitted by the call below)
scaler = StandardScaler()
maccs_gen = MACCSFingerprint(n_jobs=-1)

# Training feature matrix; fit_scaler=True fits `scaler` on these descriptors
X = build_hybrid_features(
    smiles_list,
    maccs_gen=maccs_gen,
    scaler=scaler,
    desc_cols=FINAL_DESC_COLS,
    fit_scaler=True,
)

# Report the overall shape and the size of each feature group
n_maccs_cols = sum(col.startswith("MACCS_") for col in X.columns)
n_desc_cols = len(FINAL_DESC_COLS)
print(f"Feature matrix shape: {X.shape}")
print(f"  - MACCS: {n_maccs_cols} features")
print(f"  - Descriptors: {n_desc_cols} features")

Feature matrix shape: (11930, 297)
  - MACCS: 166 features
  - Descriptors: 131 features


## Train UMAP

In [23]:
print(f"Training UMAP with params: {UMAP_PARAMS}")

# Fit the reducer on the plain feature array and keep the resulting embedding
feature_array = X.values
reducer = umap.UMAP(**UMAP_PARAMS)
X_2d = reducer.fit_transform(feature_array)

print(f"UMAP coordinates shape: {X_2d.shape}")

Training UMAP with params: {'n_neighbors': 150, 'min_dist': 0.2, 'metric': 'euclidean', 'random_state': 42}



n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.


Spectral initialisation failed! The eigenvector solver
failed. This is likely due to too small an eigengap. Consider
adding some noise or jitter to your data.

Falling back to random initialisation!



UMAP coordinates shape: (11930, 2)


## Visualize Chemical Space

In [27]:
# Plotting table: one row per training molecule
activity_labels = ["Toxic" if label == 1 else "Non-toxic" for label in y]
umap_1, umap_2 = X_2d[:, 0], X_2d[:, 1]
viz_df = pd.DataFrame({
    "UMAP_1": umap_1,
    "UMAP_2": umap_2,
    "Activity": activity_labels,
    "SMILES": smiles_list,
})

# Scatter of the embedding, coloured by activity label, with SMILES on hover
activity_palette = {"Toxic": "red", "Non-toxic": "blue"}
fig = px.scatter(
    viz_df,
    x="UMAP_1",
    y="UMAP_2",
    color="Activity",
    color_discrete_map=activity_palette,
    hover_data=["SMILES"],
    title="Chemical Space (UMAP of MACCS + Descriptors)",
    opacity=0.6,
)
fig.update_layout(width=900, height=700)

fig.show()

## Save Artifacts

In [19]:
# Destination files, all inside OUTPUT_DIR
umap_path = OUTPUT_DIR / "umap.pkl"
scaler_path = OUTPUT_DIR / "desc_scaler.pkl"
coords_path = OUTPUT_DIR / "umap_coords.csv"

# Fitted UMAP reducer
joblib.dump(reducer, umap_path)
print(f"Saved UMAP model to: {umap_path}")

# Descriptor scaler (written unconditionally; an existing file at this path is replaced)
joblib.dump(scaler, scaler_path)
print(f"Saved descriptor scaler to: {scaler_path}")

# Training-set coordinates with labels and SMILES, for reference
viz_df.to_csv(coords_path, index=False)
print(f"Saved UMAP coordinates to: {coords_path}")

print(f"\nAll artifacts saved to: {OUTPUT_DIR.absolute()}")

Saved UMAP model to: ml\models\umap.pkl
Saved descriptor scaler to: ml\models\desc_scaler.pkl
Saved UMAP coordinates to: ml\models\umap_coords.csv

All artifacts saved to: d:\ml_week_chemistry\backend\ml\models


## Test Inference

In [20]:
# Smoke-test inference on a few sample molecules
test_smiles = ["CCO", "c1ccccc1", "CC(=O)Oc1ccccc1C(=O)O"]  # ethanol, benzene, aspirin

# Build features for the test molecules with the already-fitted scaler
# (fit_scaler=False, so the scaler is applied but not refitted)
X_test = build_hybrid_features(
    test_smiles,
    maccs_gen=maccs_gen,
    scaler=scaler,
    desc_cols=FINAL_DESC_COLS,
    fit_scaler=False
)

# Project into the fitted UMAP space
coords_test = reducer.transform(X_test.values)

print("Test molecule projections:")
# The loop targets are deliberately not named `x`/`y`: a for-loop target stays
# bound after the loop ends, and `y` holds the training labels that the
# visualization cell reads. Rebinding it here would break re-running that cell.
for smi, (umap_1, umap_2) in zip(test_smiles, coords_test):
    print(f"  {smi}: ({umap_1:.4f}, {umap_2:.4f})")

Test molecule projections:
  CCO: (28.7222, 21.6113)
  c1ccccc1: (28.8688, 22.2589)
  CC(=O)Oc1ccccc1C(=O)O: (28.5807, 21.4524)


## Notes

After running this notebook:

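1. The UMAP model is saved to `backend/ml/models/umap.pkl`
2. The descriptor scaler is saved to `backend/ml/models/desc_scaler.pkl` (an existing file at that path is overwritten)
3. The training-set UMAP coordinates (with activity labels and SMILES) are saved to `backend/ml/models/umap_coords.csv`
4. Make sure the XGBoost model (`backend/ml/models/xgb_model.pkl`) was trained with this same descriptor scaler; if it was trained with a different one, do not let this notebook overwrite `desc_scaler.pkl`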

The backend API will use these artifacts for inference.