In [None]:
import sqlite3
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# --- Configuration -----------------------------------------------------------
# Path to the SQLite database. It is relative, so it is resolved against the
# kernel's current working directory, not against this notebook's location.
DB_PATH = "../../datasets/CCSMLDatabase.db"
# Name of the table that is loaded in full into `df`.
TABLE = "master_clean"

# --- Load data ---------------------------------------------------------------
# Fail early with the resolved absolute path when the file is missing; sqlite3
# on its own only reports "unable to open database file" without saying where
# it looked.
db_file = Path(DB_PATH).resolve()
if not db_file.is_file():
    raise FileNotFoundError(
        f"SQLite database not found at {db_file} "
        f"(DB_PATH={DB_PATH!r} is resolved against the current working directory)"
    )

conn = sqlite3.connect(DB_PATH)
try:
    # TABLE is a notebook constant (not user input); SQL identifiers cannot be
    # bound as query parameters, so it is interpolated into the statement.
    df = pd.read_sql_query(f"SELECT * FROM {TABLE}", conn)
finally:
    # Release the connection even if the query raises.
    conn.close()

# --- Global matplotlib style settings ----------------------------------------
plt.rcParams.update({
    "font.family": "sans-serif",
    "font.sans-serif": ["Arial", "Helvetica", "DejaVu Sans"],  # Tried in order
    "axes.linewidth": 1.5,               # Thicker axis lines
    "axes.spines.top": False,            # Remove top spine
    "axes.spines.right": False,          # Remove right spine
    "xtick.major.width": 1.5,            # Match tick thickness to axis
    "ytick.major.width": 1.5,
    "xtick.direction": "out",            # Ticks point outside
    "ytick.direction": "out",
    "font.size": 10,
    "axes.labelsize": 11,
    "axes.labelweight": "normal",
})

# Number of rows loaded from TABLE.
n = df.shape[0]
# Last expression of the cell: display the column index of the loaded frame.
df.columns

OperationalError: unable to open database file

PCA/UMAP Analysis

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import umap

# Feature matrix: every numeric column of `df`. Non-numeric columns (such as
# the 'superclass' labels used for colouring below) are left out by the dtype
# filter.
# NOTE(review): any numeric identifier/metadata columns would be included
# here as features -- confirm against the table schema.
features = df.select_dtypes(include=[np.number]).columns.tolist()
X = df[features].values

# Pipeline: Scale -> PCA -> UMAP
X_scaled = StandardScaler().fit_transform(X)

# PCA accepts at most min(n_samples, n_features) components, so bound the
# target of 50 by both dimensions of X. A fixed random_state keeps the result
# reproducible when scikit-learn selects its randomized SVD solver.
pca = PCA(n_components=min(50, *X.shape), random_state=42)
X_pca = pca.fit_transform(X_scaled)

# UMAP projection of the PCA scores (2 output dimensions by default)
reducer = umap.UMAP(
    n_neighbors=15,
    min_dist=0.1,
    metric='euclidean',
    random_state=42
)
embedding = reducer.fit_transform(X_pca)

# Integer code per row for colouring; missing labels are grouped as 'Unknown'.
# `uniques[c]` is the label that belongs to code `c`.
codes, uniques = pd.factorize(df['superclass'].fillna('Unknown'))

fig, ax = plt.subplots(figsize=(8, 6), constrained_layout=True)
scatter = ax.scatter(
    embedding[:, 0],
    embedding[:, 1],
    c=codes,
    cmap='tab20',
    s=5,
    alpha=0.8,
    edgecolors='none'
)

# num=None yields exactly one handle per distinct code, in ascending code
# order (0, 1, ..., len(uniques) - 1). The default num="auto" switches to a
# tick locator once there are many distinct values, so the handles stop
# lining up with `uniques` and the legend ends up mislabelled.
handles, _ = scatter.legend_elements(num=None)
legend = ax.legend(
    handles,
    [str(label) for label in uniques],
    title="Superclass",
    loc="center left",
    bbox_to_anchor=(1, 0.5),
    frameon=False,
    markerscale=1.5
)

ax.set_title("UMAP Projection of Chemical Space", loc='left', fontweight='bold', pad=15)
# UMAP coordinates have no interpretable scale, so the ticks are hidden.
ax.set_xticks([])
ax.set_yticks([])
ax.set_xlabel('UMAP 1', fontweight='bold')
ax.set_ylabel('UMAP 2', fontweight='bold')

ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

plt.show()

3D Density Surface of the UMAP Embedding

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde
from mpl_toolkits.mplot3d import Axes3D

# --- Inputs: the two UMAP coordinates of every point --------------------------
x, y = embedding[:, 0], embedding[:, 1]

# --- Gaussian kernel density estimate over the embedding ----------------------
# Fit the KDE on the (2, n_points) stack of coordinates, then evaluate it on a
# regular 100 x 100 grid that spans the observed range of each axis.
k = gaussian_kde(np.vstack((x, y)))
xi, yi = np.mgrid[x.min():x.max():100j, y.min():y.max():100j]
grid_points = np.vstack((xi.ravel(), yi.ravel()))
zi = k(grid_points)  # flat array with one density value per grid node

# --- Figure: density drawn as a 3D surface ------------------------------------
fig, ax = plt.subplots(figsize=(10, 8), subplot_kw={"projection": "3d"})

# zi is flat; give it the grid's 2-D shape so it can be used as surface height.
surf = ax.plot_surface(
    xi,
    yi,
    zi.reshape(xi.shape),
    cmap='viridis',
    edgecolor='none',
    alpha=0.8,
    antialiased=True,
)

# Title and axis labels
ax.set_title('3D Density Surface of Chemical Space', fontweight='bold', pad=20)
ax.set_xlabel('UMAP 1')
ax.set_ylabel('UMAP 2')
ax.set_zlabel('Density')

# Camera angle for viewing the density peaks; background grid switched off.
ax.view_init(elev=30, azim=45)
ax.grid(False)
plt.show()