In [3]:
from rdkit import Chem
from rdkit.Chem import AllChem
import numpy as np
import umap
import pandas as pd


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
def morgan_fp(smiles, radius=2, n_bits=1024):
    """Compute a Morgan fingerprint for one SMILES string as a NumPy array.

    Parameters
    ----------
    smiles : str
        SMILES string to parse with ``Chem.MolFromSmiles``.
    radius : int, default 2
        Morgan radius forwarded to ``GetMorganFingerprintAsBitVect``.
    n_bits : int, default 1024
        Forwarded as ``nBits``; presumably the length of the returned
        vector -- TODO confirm against the RDKit version in use.

    Returns
    -------
    numpy.ndarray
        ``np.array`` built from the RDKit bit vector.

    Raises
    ------
    ValueError
        If RDKit cannot parse ``smiles`` (``MolFromSmiles`` returned ``None``).
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # Without this guard the None is handed to the fingerprint call, which
        # fails with an opaque argument error that does not name the input.
        raise ValueError(f"Could not parse SMILES: {smiles!r}")

    bit_vect = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
    return np.array(bit_vect)


In [3]:
# Load the training table from disk.
clean_df = pd.read_csv("training_data.csv")

# Feature matrix: one fingerprint per entry of the "smiles" column, stacked
# along a new leading axis.
X = np.stack(clean_df["smiles"].apply(morgan_fp).tolist())

# Label vector: raw values of the "ACTIVITY" column.
y = clean_df["ACTIVITY"].values



In [None]:
# Fit UMAP on the fingerprint matrix and keep the embedded coordinates.
# random_state is pinned to 42.
# NOTE(review): the stored output of this cell mentions the jaccard metric,
# while the code below uses euclidean -- re-run to confirm which is intended.
reducer = umap.UMAP(
    random_state=42,
    metric="euclidean",
    min_dist=0.1,
    n_neighbors=50,
)
X_2d = reducer.fit_transform(X)



gradient function is not yet implemented for jaccard distance metric; inverse_transform will be unavailable


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



In [43]:
import joblib
# Persist the fitted UMAP reducer next to the notebook.
joblib.dump(value=reducer, filename="umap_model.pkl")

['umap_model.pkl']

In [44]:
import plotly.express as px


In [45]:
# Scatter of the two UMAP coordinates, colored by the activity label.
fig = px.scatter(
    title="Chemical space (UMAP of Morgan fingerprints)",
    labels={"color": "hERG activity"},
    color=y,
    x=X_2d[:, 0],
    y=X_2d[:, 1],
)
fig.show()


In [25]:
import numpy as np
from sklearn.manifold import TSNE
from sklearn import datasets
import matplotlib.pyplot as plt

In [37]:
# Fit a 2-component t-SNE on the same fingerprint matrix used for UMAP.
# random_state is pinned to 42.
tsne = TSNE(
    random_state=42,
    metric="euclidean",
    max_iter=1000,
    learning_rate=200,
    perplexity=100,
    n_components=2,
)
X_tsne = tsne.fit_transform(X)


In [38]:
import plotly.express as px


In [40]:
# Tidy frame for plotting: the first two t-SNE coordinates plus the label.
df_tsne = pd.DataFrame(X_tsne[:, :2], columns=["x", "y"]).assign(activity=y)


In [41]:
# Scatter of the t-SNE coordinates, colored by the activity label.
fig = px.scatter(
    df_tsne,
    title="Chemical space (t-SNE)",
    labels={"activity": "hERG activity"},
    color="activity",
    opacity=0.7,
    x="x",
    y="y",
)
fig.show()


In [None]:
import numpy as np
import pandas as pd
import joblib
import umap

from rdkit import Chem
from rdkit.Chem import Descriptors
from sklearn.preprocessing import StandardScaler

# from molfeat.trans.fp import MACCSFingerprint


In [4]:
from skfp.fingerprints import (
    ECFPFingerprint,
    MACCSFingerprint,
    AtomPairFingerprint,
    TopologicalTorsionFingerprint,
    RDKitFingerprint,
    AvalonFingerprint
)

ModuleNotFoundError: No module named 'skfp'

In [None]:
# Descriptor names used for the hybrid feature matrix. Both
# fit_umap_pipeline and transform_smiles_umap pass this list as `desc_cols`
# to build_hybrid_features, and compute_descriptors looks each name up in
# rdkit's Descriptors._descList. The order here is the column order of the
# scaled descriptor block, so it must stay the same between fitting and
# transforming.
FINAL_DESC_COLS = ['MaxAbsEStateIndex',
 'MinAbsEStateIndex',
 'MinEStateIndex',
 'qed',
 'SPS',
 'MolWt',
 'NumRadicalElectrons',
 'MaxPartialCharge',
 'MinPartialCharge',
 'FpDensityMorgan1',
 'AvgIpc',
 'BalabanJ',
 'Ipc',
 'PEOE_VSA1',
 'PEOE_VSA10',
 'PEOE_VSA11',
 'PEOE_VSA12',
 'PEOE_VSA13',
 'PEOE_VSA14',
 'PEOE_VSA2',
 'PEOE_VSA3',
 'PEOE_VSA4',
 'PEOE_VSA5',
 'PEOE_VSA6',
 'PEOE_VSA7',
 'PEOE_VSA8',
 'PEOE_VSA9',
 'SMR_VSA1',
 'SMR_VSA10',
 'SMR_VSA2',
 'SMR_VSA3',
 'SMR_VSA4',
 'SMR_VSA5',
 'SMR_VSA6',
 'SMR_VSA7',
 'SMR_VSA9',
 'SlogP_VSA1',
 'SlogP_VSA10',
 'SlogP_VSA11',
 'SlogP_VSA12',
 'SlogP_VSA2',
 'SlogP_VSA3',
 'SlogP_VSA4',
 'SlogP_VSA7',
 'SlogP_VSA8',
 'TPSA',
 'EState_VSA1',
 'EState_VSA11',
 'EState_VSA2',
 'EState_VSA3',
 'EState_VSA4',
 'EState_VSA5',
 'EState_VSA6',
 'EState_VSA7',
 'EState_VSA8',
 'EState_VSA9',
 'VSA_EState2',
 'VSA_EState3',
 'VSA_EState4',
 'VSA_EState5',
 'VSA_EState7',
 'VSA_EState8',
 'VSA_EState9',
 'FractionCSP3',
 'NHOHCount',
 'NumAliphaticCarbocycles',
 'NumAliphaticHeterocycles',
 'NumAromaticCarbocycles',
 'NumAromaticHeterocycles',
 'NumAromaticRings',
 'RingCount',
 'MolLogP',
 'fr_Al_COO',
 'fr_Al_OH',
 'fr_ArN',
 'fr_Ar_COO',
 'fr_Ar_NH',
 'fr_Ar_OH',
 'fr_C_O',
 'fr_C_S',
 'fr_HOCCN',
 'fr_Imine',
 'fr_NH1',
 'fr_NH2',
 'fr_N_O',
 'fr_Ndealkylation1',
 'fr_Ndealkylation2',
 'fr_SH',
 'fr_alkyl_carbamate',
 'fr_allylic_oxid',
 'fr_amidine',
 'fr_aniline',
 'fr_aryl_methyl',
 'fr_azo',
 'fr_barbitur',
 'fr_benzodiazepine',
 'fr_bicyclic',
 'fr_dihydropyridine',
 'fr_epoxide',
 'fr_ester',
 'fr_ether',
 'fr_furan',
 'fr_guanido',
 'fr_hdrzine',
 'fr_hdrzone',
 'fr_imidazole',
 'fr_imide',
 'fr_ketone',
 'fr_lactam',
 'fr_lactone',
 'fr_methoxy',
 'fr_morpholine',
 'fr_nitro',
 'fr_oxazole',
 'fr_oxime',
 'fr_para_hydroxylation',
 'fr_phos_acid',
 'fr_piperdine',
 'fr_piperzine',
 'fr_priamide',
 'fr_pyridine',
 'fr_quatN',
 'fr_sulfide',
 'fr_sulfonamd',
 'fr_sulfone',
 'fr_term_acetylene',
 'fr_tetrazole',
 'fr_thiazole',
 'fr_thiophene',
 'fr_unbrch_alkane',
 'fr_urea']


In [None]:
def compute_descriptors(smiles_list, descriptor_names):
    """Compute the requested RDKit descriptors for each SMILES string.

    Parameters
    ----------
    smiles_list : iterable
        SMILES strings; one output row is produced per entry, in order.
    descriptor_names : iterable of str
        Names looked up in ``Descriptors._descList``. Names not found there
        are skipped (a warning lists them), so the result may have fewer
        columns than requested.

    Returns
    -------
    pandas.DataFrame
        One column per resolved descriptor, in the requested order. Entries
        that are not strings or that RDKit cannot parse give an all-NaN row;
        a descriptor function that raises gives NaN for that single cell.
    """
    import warnings

    # Materialize so a single-pass iterable can be scanned more than once.
    descriptor_names = list(descriptor_names)

    available = dict(Descriptors._descList)
    unknown = [name for name in descriptor_names if name not in available]
    if unknown:
        # Dropping columns silently makes downstream column selection fail
        # far from the cause; surface the skipped names here.
        warnings.warn(
            f"Skipping descriptor names not provided by RDKit: {unknown}",
            stacklevel=2,
        )

    selected = [(name, available[name]) for name in descriptor_names if name in available]
    columns = [name for name, _ in selected]

    rows = []
    for smi in smiles_list:
        # Non-string entries (e.g. NaN from a missing cell) are treated like
        # unparsable SMILES instead of aborting the whole batch.
        mol = Chem.MolFromSmiles(smi) if isinstance(smi, str) else None
        if mol is None:
            rows.append([np.nan] * len(selected))
            continue

        row = []
        for _, fn in selected:
            try:
                row.append(fn(mol))
            except Exception:
                # Best effort: one failing descriptor must not discard the
                # remaining values computed for this molecule.
                row.append(np.nan)
        rows.append(row)

    return pd.DataFrame(rows, columns=columns)


In [None]:
def build_hybrid_features(
    smiles_list,
    maccs_gen,
    scaler,
    desc_cols,
    fit_scaler: bool = False
):
    """Build the combined MACCS + scaled-descriptor feature table.

    Parameters
    ----------
    smiles_list : iterable of str
        SMILES strings; materialized to a list because it is consumed twice
        (fingerprints, then descriptors).
    maccs_gen
        Object whose ``transform(smiles)`` returns a 2-D array-like; its
        columns are named ``MACCS_0``, ``MACCS_1``, ...
    scaler
        Object exposing ``fit_transform`` / ``transform`` for the descriptor
        block.
    desc_cols : iterable of str
        Descriptor names; defines the column order of the descriptor block.
    fit_scaler : bool, default False
        When true the scaler is fitted on this data (``fit_transform``);
        otherwise the already-fitted scaler is applied (``transform``).

    Returns
    -------
    pandas.DataFrame
        MACCS columns followed by the scaled descriptor columns.

    Raises
    ------
    KeyError
        If any name in ``desc_cols`` is absent from the computed descriptors.
    """
    # A generator would be exhausted by the fingerprint step and leave the
    # descriptor step with no rows; a tuple of names would be read by pandas
    # as a single column key. Lists avoid both problems.
    smiles_list = list(smiles_list)
    desc_cols = list(desc_cols)

    # MACCS
    maccs_bits = maccs_gen.transform(smiles_list)
    X_maccs = pd.DataFrame(
        maccs_bits,
        columns=[f"MACCS_{i}" for i in range(maccs_bits.shape[1])]
    )

    # descriptors
    X_desc = compute_descriptors(smiles_list, desc_cols)
    missing = [col for col in desc_cols if col not in X_desc.columns]
    if missing:
        raise KeyError(f"Descriptors could not be computed for: {missing}")
    X_desc = X_desc[desc_cols]

    if fit_scaler:
        X_desc_scaled = scaler.fit_transform(X_desc)
    else:
        X_desc_scaled = scaler.transform(X_desc)

    X_desc_scaled = pd.DataFrame(X_desc_scaled, columns=desc_cols)

    # Both frames carry a default RangeIndex, so the column-wise concat lines
    # rows up by position.
    return pd.concat([X_maccs, X_desc_scaled], axis=1)


In [None]:
def train_umap(
    X: pd.DataFrame,
    n_neighbors=50,
    min_dist=0.1,
    metric="euclidean",
    random_state=42
):
    """Fit a UMAP model on a feature table and return it with its embedding.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; its ``.values`` array is what UMAP is fitted on.
    n_neighbors, min_dist, metric, random_state
        Forwarded unchanged to ``umap.UMAP``.

    Returns
    -------
    tuple
        ``(fitted_model, embedding)`` where ``embedding`` is the result of
        ``fit_transform``.
    """
    umap_kwargs = {
        "n_neighbors": n_neighbors,
        "min_dist": min_dist,
        "metric": metric,
        "random_state": random_state,
    }
    model = umap.UMAP(**umap_kwargs)
    embedding = model.fit_transform(X.values)
    return model, embedding


In [None]:
def fit_umap_pipeline(
    smiles_list,
    output_dir="artifacts/umap"
):
    """Fit the hybrid-feature UMAP pipeline and save its artifacts.

    Builds MACCS + scaled-descriptor features (fitting a fresh
    ``StandardScaler``), trains UMAP with ``train_umap`` defaults, and writes
    the following into ``output_dir``: ``umap_model.pkl``,
    ``desc_scaler.pkl``, ``maccs_gen.pkl``, ``features.parquet`` and
    ``umap_coords.csv``.

    Parameters
    ----------
    smiles_list : iterable of str
        SMILES strings to fit on.
    output_dir : str, default "artifacts/umap"
        Directory for the artifacts; created if it does not exist.

    Returns
    -------
    tuple
        ``(reducer, X, coords_df)``: the fitted UMAP model, the feature
        table, and a frame with ``umap_x``, ``umap_y`` and ``smiles`` columns.
    """
    import os

    # A plain list keeps the later column assignment positional. A pandas
    # Series with a non-default index would be aligned on index and produce
    # NaN or misplaced SMILES in coords_df.
    smiles_list = list(smiles_list)

    # Create the target directory before the expensive fit so a bad path
    # fails immediately rather than after training.
    os.makedirs(output_dir, exist_ok=True)

    maccs_gen = MACCSFingerprint(n_jobs=-1)
    scaler = StandardScaler()

    X = build_hybrid_features(
        smiles_list,
        maccs_gen=maccs_gen,
        scaler=scaler,
        desc_cols=FINAL_DESC_COLS,
        fit_scaler=True
    )

    reducer, coords = train_umap(X)

    # save artifacts
    joblib.dump(reducer, f"{output_dir}/umap_model.pkl")
    joblib.dump(scaler, f"{output_dir}/desc_scaler.pkl")
    joblib.dump(maccs_gen, f"{output_dir}/maccs_gen.pkl")

    X.to_parquet(f"{output_dir}/features.parquet")

    coords_df = pd.DataFrame(coords, columns=["umap_x", "umap_y"])
    coords_df["smiles"] = smiles_list
    coords_df.to_csv(f"{output_dir}/umap_coords.csv", index=False)

    return reducer, X, coords_df


In [None]:
def transform_smiles_umap(
    smiles_list,
    artifacts_dir="artifacts/umap"
):
    """Project new SMILES with the saved UMAP pipeline artifacts.

    Loads the UMAP model, descriptor scaler and MACCS generator from
    ``artifacts_dir``, rebuilds the hybrid features without refitting the
    scaler, and applies the UMAP ``transform``.

    Parameters
    ----------
    smiles_list : iterable of str
        SMILES strings to embed.
    artifacts_dir : str, default "artifacts/umap"
        Directory holding ``umap_model.pkl``, ``desc_scaler.pkl`` and
        ``maccs_gen.pkl``.

    Returns
    -------
    tuple
        ``(features, coords)`` where ``coords`` is a frame with ``umap_x`` and
        ``umap_y`` columns.
    """
    # joblib.load unpickles its input: only point this at trusted artifacts.
    umap_model = joblib.load(f"{artifacts_dir}/umap_model.pkl")
    desc_scaler = joblib.load(f"{artifacts_dir}/desc_scaler.pkl")
    fingerprinter = joblib.load(f"{artifacts_dir}/maccs_gen.pkl")

    features = build_hybrid_features(
        smiles_list,
        maccs_gen=fingerprinter,
        scaler=desc_scaler,
        desc_cols=FINAL_DESC_COLS,
        fit_scaler=False
    )

    embedding = umap_model.transform(features.values)
    coords_df = pd.DataFrame(embedding, columns=["umap_x", "umap_y"])

    return features, coords_df
