# Clustering Products
Check whether there are any trends in the actives that are being produced. Specifically, in Chris Zhang's paper, building blocks with similar binding potency were clustered together. Does that assertion still hold for the current library?

In [22]:
import sys
import os
import numpy as np
from tqdm.auto import tqdm
import pandas as pd
import polars as pl
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs
import umap
import hdbscan

In [2]:
# Put the local PRISMS checkout on the import path (once) so that the
# `PRISMS.*` imports used by this notebook resolve.
project_root = '/Users/aakankschitnandkeolyar/Desktop/PRISMS'
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

In [3]:
from PRISMS.library_analysis.library_analysis_utils import compile_product_scores, compile_product_smiles

In [5]:
def read_smi_file(file_path):
    """
    Read a .smi file and create a dictionary with building block codes as keys and SMILES as items.

    Each non-blank line must hold exactly two whitespace-separated fields,
    ``<SMILES> <code>``. Blank or whitespace-only lines (for example a trailing
    newline at the end of the file) are ignored. If a code appears more than
    once, the last occurrence wins.

    :param file_path: The path to the .smi file.
    :return: A dictionary with building block codes as keys and SMILES as items.
    :raises ValueError: If a non-blank line does not contain exactly two fields.
    """
    building_block_dict = {}

    with open(file_path, 'r') as file:
        for line_number, line in enumerate(file, start=1):
            fields = line.split()
            if not fields:
                # Nothing to parse on an empty line; skipping it avoids an
                # opaque tuple-unpacking error.
                continue
            if len(fields) != 2:
                raise ValueError(
                    f"{file_path}:{line_number}: expected '<SMILES> <code>', "
                    f"got {line.rstrip()!r}"
                )
            smiles, code = fields
            building_block_dict[code] = smiles

    return building_block_dict

In [6]:
# Load product data.
# Generate the dictionary that maps product codes to product SMILES from the
# directory below.
# NOTE(review): compile_product_smiles comes from
# PRISMS.library_analysis.library_analysis_utils; presumably it returns a plain
# dict keyed by product code -- confirm against that module.
prod_SMILES_dir = "/Users/aakankschitnandkeolyar/Desktop/TS_Chem_Space/Thrombin/Linear_amide"
prod_smiles_dict = compile_product_smiles(prod_SMILES_dir)

In [16]:
# Flat list of every product SMILES, in the dictionary's iteration order.
prod_smiles = list(prod_smiles_dict.values())

In [14]:
def generate_rdkit_fingerprints(smiles_list, n_bits=2048):
    """
    Generate binary RDKit fingerprints from a list of SMILES strings.

    SMILES that RDKit cannot parse (``Chem.MolFromSmiles`` returns ``None``)
    are skipped, so the returned matrix can have fewer rows than
    ``smiles_list`` has entries and row ``i`` does not necessarily correspond
    to ``smiles_list[i]``.

    :param smiles_list: Iterable of SMILES strings.
    :param n_bits: Fingerprint length, passed to RDKit as ``fpSize``.
    :return: A 2-D numpy array with one row per parseable SMILES. When nothing
        could be parsed (including empty input) an empty array of shape
        ``(0, n_bits)`` is returned instead of raising.
    """
    fingerprints = []
    # Process one molecule at a time so the full list of Mol objects is never
    # held in memory alongside the fingerprint arrays.
    for smiles in smiles_list:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            continue
        fp = Chem.RDKFingerprint(mol, fpSize=n_bits)
        # Destination buffer for the bit vector; kept as the default float
        # dtype so the output dtype matches earlier behaviour.
        arr = np.zeros((n_bits,))
        DataStructs.ConvertToNumpyArray(fp, arr)
        fingerprints.append(arr)

    if not fingerprints:
        # np.vstack refuses an empty sequence; hand back a well-shaped empty
        # matrix so callers can check ``len(result)``.
        return np.zeros((0, n_bits))
    return np.vstack(fingerprints)

In [17]:
def analyze_molecular_space(
    smiles_list,
    umap_n_neighbors=15,
    umap_n_components=2,
    umap_min_dist=0.1,
    hdbscan_min_samples=5,
    hdbscan_min_cluster_size=5,
    tsne_perplexity=30,
    random_state=42,
):
    """
    Analyze molecular space using fingerprints, UMAP, HDBSCAN, and t-SNE.

    Fingerprints are computed once and fed to both UMAP and t-SNE. HDBSCAN
    clusters the UMAP embedding, not the raw fingerprints.

    Parameters:
    -----------
    smiles_list : list
        List of SMILES strings
    umap_n_neighbors : int
        Number of neighbors for UMAP
    umap_n_components : int
        Number of components for UMAP reduction
    umap_min_dist : float
        Minimum distance for UMAP
    hdbscan_min_samples : int
        Minimum samples for HDBSCAN
    hdbscan_min_cluster_size : int
        Minimum cluster size for HDBSCAN
    tsne_perplexity : float
        Perplexity parameter for t-SNE
    random_state : int
        Random seed for reproducibility (used by UMAP and t-SNE)

    Returns:
    --------
    umap_embedding : array
        UMAP coordinates, one row per fingerprint
    cluster_labels : array
        HDBSCAN label for each row of ``umap_embedding``
    tsne_embedding : array
        2-D t-SNE coordinates, one row per fingerprint

    Rows follow the fingerprint matrix, which omits SMILES that RDKit could
    not parse, so they may not line up one-to-one with ``smiles_list``.
    """
    # TSNE is not imported at module level in this file, so bring it into
    # scope here; without this the t-SNE step fails with NameError.
    from sklearn.manifold import TSNE

    # Generate fingerprints
    print("Generating fingerprints...")
    fps = generate_rdkit_fingerprints(smiles_list)

    # UMAP transformation
    print("Performing UMAP dimensionality reduction...")
    umap_reducer = umap.UMAP(
        n_neighbors=umap_n_neighbors,
        n_components=umap_n_components,
        min_dist=umap_min_dist,
        random_state=random_state,
    )
    umap_embedding = umap_reducer.fit_transform(fps)

    # HDBSCAN clustering (on the low-dimensional UMAP coordinates)
    print("Performing HDBSCAN clustering...")
    clusterer = hdbscan.HDBSCAN(
        min_samples=hdbscan_min_samples,
        min_cluster_size=hdbscan_min_cluster_size,
    )
    cluster_labels = clusterer.fit_predict(umap_embedding)

    # t-SNE transformation (on the original fingerprints, independent of UMAP)
    print("Performing t-SNE visualization...")
    tsne = TSNE(
        n_components=2,
        perplexity=tsne_perplexity,
        random_state=random_state,
    )
    tsne_embedding = tsne.fit_transform(fps)

    return umap_embedding, cluster_labels, tsne_embedding


In [18]:
def plot_molecular_space(umap_embedding, cluster_labels, tsne_embedding):
    """
    Draw side-by-side scatter plots of the UMAP and t-SNE embeddings.

    Both panels colour each point by its entry in ``cluster_labels`` and get
    their own colorbar.

    :param umap_embedding: Array whose first two columns are plotted in the left panel.
    :param cluster_labels: Per-point values used as the colour in both panels.
    :param tsne_embedding: Array whose first two columns are plotted in the right panel.
    :return: The matplotlib figure containing both panels.
    """
    fig, axes = plt.subplots(1, 2, figsize=(15, 6))

    # One entry per panel: (coordinates, title, x-axis label, y-axis label).
    panels = (
        (umap_embedding, 'UMAP Projection with HDBSCAN Clusters', 'UMAP 1', 'UMAP 2'),
        (tsne_embedding, 't-SNE Visualization with HDBSCAN Clusters', 't-SNE 1', 't-SNE 2'),
    )

    drawn = []
    for axis, (coords, title, x_label, y_label) in zip(axes, panels):
        points = axis.scatter(
            coords[:, 0],
            coords[:, 1],
            c=cluster_labels,
            cmap='tab20',
            s=50,
            alpha=0.7,
        )
        axis.set_title(title)
        axis.set_xlabel(x_label)
        axis.set_ylabel(y_label)
        drawn.append(points)

    # Colorbars are attached only after both scatters exist.
    for axis, points in zip(axes, drawn):
        plt.colorbar(points, ax=axis, label='Cluster')

    plt.tight_layout()
    return fig

In [24]:
# Run the full pipeline (fingerprints -> UMAP -> HDBSCAN -> t-SNE) on every
# product SMILES. The keyword values below equal the function's defaults and
# are spelled out only for visibility.
# NOTE(review): the recorded run of this cell ended with KeyboardInterrupt
# right after "Generating fingerprints..." -- the library may be too large to
# fingerprint interactively; consider running on a subsample first.
umap_embedding, cluster_labels, tsne_embedding = analyze_molecular_space(
    prod_smiles,
    umap_n_neighbors=15,
    umap_n_components=2,
    umap_min_dist=0.1,
    hdbscan_min_samples=5,
    hdbscan_min_cluster_size=5,
    tsne_perplexity=30
)

Generating fingerprints...


KeyboardInterrupt: 