In [3]:
! pip install pandas scikit-learn umap-learn hdbscan matplotlib


Collecting umap-learn
  Downloading umap_learn-0.5.7-py3-none-any.whl.metadata (21 kB)
Collecting hdbscan
  Downloading hdbscan-0.8.40-cp312-cp312-win_amd64.whl.metadata (15 kB)
Collecting pynndescent>=0.5 (from umap-learn)
  Downloading pynndescent-0.5.13-py3-none-any.whl.metadata (6.8 kB)
Downloading umap_learn-0.5.7-py3-none-any.whl (88 kB)
Downloading hdbscan-0.8.40-cp312-cp312-win_amd64.whl (726 kB)
   ---------------------------------------- 0.0/726.2 kB ? eta -:--:--
   --------------------------------------- 726.2/726.2 kB 15.0 MB/s eta 0:00:00
Downloading pynndescent-0.5.13-py3-none-any.whl (56 kB)
Installing collected packages: pynndescent, hdbscan, umap-learn
Successfully installed hdbscan-0.8.40 pynndescent-0.5.13 umap-learn-0.5.7


In [4]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import umap
import hdbscan
import matplotlib.pyplot as plt

In [7]:
# Load your data
df = pd.read_csv('dimension_reduction.csv')
df.columns

Index(['substance', 'GCMS breakdown product', 'active agent', 'adulterant',
       'amphetamine & derivatives', 'anabolic', 'analgesic', 'anesthetic',
       'anti-anxiety', 'anti-tussive',
       ...
       'bromazolam', '1,3-Diacetin', 'metonitazene', 'medetomidine',
       'bis sebacate', 'molecular_weight', 'XLogP', 'HBondDonorCount',
       'HBondAcceptorCount', 'tpsa'],
      dtype='object', length=107)

In [None]:
numerical_cols = ['num1', 'num2']
binary_cols = list(df.loc[:, 'adulterant':'weak opioid'].columns)  # already 0/1

In [2]:
# Preprocessing pipeline
preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), numerical_cols),
    ('cat', OneHotEncoder(sparse=False, handle_unknown='ignore'), categorical_cols)
])

# Fit and transform the data
X_processed = preprocessor.fit_transform(df)

# Dimensionality reduction with UMAP
reducer = umap.UMAP(n_components=2, random_state=42)
X_umap = reducer.fit_transform(X_processed)

# Clustering with HDBSCAN
clusterer = hdbscan.HDBSCAN(min_cluster_size=10)
labels = clusterer.fit_predict(X_umap)

# Plot
plt.figure(figsize=(10, 6))
plt.scatter(X_umap[:, 0], X_umap[:, 1], c=labels, cmap='Spectral', s=50)
plt.title('UMAP Projection with HDBSCAN Clustering')
plt.xlabel('UMAP 1')
plt.ylabel('UMAP 2')
plt.colorbar(label='Cluster')
plt.show()

NameError: name 'reduced' is not defined