<a href="https://colab.research.google.com/github/apoorvapu/data_science/blob/main/10proteins.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install RDKit

Collecting RDKit
  Downloading rdkit-2025.9.1-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (4.1 kB)
Downloading rdkit-2025.9.1-cp312-cp312-manylinux_2_28_x86_64.whl (36.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m36.2/36.2 MB[0m [31m25.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: RDKit
Successfully installed RDKit-2025.9.1


In [4]:
!pip install chembl_webresource_client

Collecting chembl_webresource_client
  Downloading chembl_webresource_client-0.10.9-py3-none-any.whl.metadata (1.4 kB)
Collecting requests-cache~=1.2 (from chembl_webresource_client)
  Downloading requests_cache-1.2.1-py3-none-any.whl.metadata (9.9 kB)
Collecting cattrs>=22.2 (from requests-cache~=1.2->chembl_webresource_client)
  Downloading cattrs-25.3.0-py3-none-any.whl.metadata (8.4 kB)
Collecting url-normalize>=1.4 (from requests-cache~=1.2->chembl_webresource_client)
  Downloading url_normalize-2.2.1-py3-none-any.whl.metadata (5.6 kB)
Downloading chembl_webresource_client-0.10.9-py3-none-any.whl (55 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.2/55.2 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading requests_cache-1.2.1-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.4/61.4 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading cattrs-25.3.0-py3-none-any.whl (70 kB)
[2K   [90m━━━━━━━━━━━━━━

In [None]:
!pip install umap-learn

In [17]:
# ===============================================================
# FULL PUBLISHABLE 10-TARGET CHEMINFORMATICS PIPELINE
# Includes: QC, fingerprints, CIs, paired tests, sensitivity,
# scaffold cliffs, UMAP (bit + embedding), ML baseline,
# PCA-Morgan embedding (fast), and all figures.
#
# Suitable for Journal of Cheminformatics / CSBJ / Molecules
# ===============================================================


import os, time, gc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict

from rdkit import Chem
from rdkit.Chem import AllChem, MACCSkeys, Descriptors
from rdkit.Chem.Scaffolds import MurckoScaffold
from chembl_webresource_client.new_client import new_client

from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import wilcoxon
import umap

sns.set_style("whitegrid")
plt.rcParams["figure.dpi"] = 120

# ---------------- CONFIG ----------------
TARGETS = {
    "Kinase": "CHEMBL203",
    "GPCR": "CHEMBL217",
    "Protease": "CHEMBL204",
    "NuclearReceptor": "CHEMBL206",
    "IonChannel": "CHEMBL240",
    "Metalloenzyme": "CHEMBL205",
    "Hydrolase": "CHEMBL220",
    "Polymerase": "CHEMBL238",
    "Transporter": "CHEMBL228",
    "Phosphodiesterase": "CHEMBL2034"
}

MAX_FETCH = 7000
FINAL_N = 1200
PAIR_SAMPLE = 150000
BOOTSTRAP_N = 400
SIM_THRESH = 0.85
CLIFF_ACT = 2.0
RND = 42

OUTDIR = "publishable_results"
os.makedirs(OUTDIR, exist_ok=True)

# ---------------- Helpers ----------------
def fetch_and_qc(tid, max_rows=MAX_FETCH):
    acts = new_client.activity.filter(
        target_chembl_id=tid,
        pchembl_value__isnull=False,
        standard_type__in=["IC50","Ki","Kd","EC50"]
    ).only(["canonical_smiles","pchembl_value"])[:max_rows]

    df = pd.DataFrame(acts)
    if df.empty: return None
    df['pchembl_value'] = pd.to_numeric(df['pchembl_value'], errors='coerce')
    df = df.dropna(subset=['canonical_smiles','pchembl_value'])

    df = df.sort_values('pchembl_value', ascending=False).drop_duplicates('canonical_smiles')

    df['mol'] = df['canonical_smiles'].apply(Chem.MolFromSmiles)
    df = df.dropna(subset=['mol'])

    df['mw'] = df['mol'].apply(Descriptors.MolWt)
    df = df[(df['mw']>=150) & (df['mw']<=650)]

    if len(df) >= 8:
        m = df['pchembl_value'].mean()
        s = df['pchembl_value'].std()
        if s and s > 0:
            df = df[np.abs(df['pchembl_value'] - m) <= 3*s]

    return df.reset_index(drop=True)

def stratified_sample(df, n=FINAL_N, bins=5):
    df = df.copy()
    df["bin"] = pd.qcut(df["pchembl_value"], bins, labels=False, duplicates="drop")
    per = n // bins; rem = n % bins
    parts=[]
    for b in sorted(df['bin'].unique()):
        g = df[df['bin']==b]
        k = min(len(g), per + (1 if b<rem else 0))
        parts.append(g.sample(k, random_state=RND))
    out = pd.concat(parts)
    if len(out) < n:
        out = out.sample(n, replace=True, random_state=RND)
    return out.drop(columns=["bin"]).reset_index(drop=True)

def fp_bitvecs(mols, fp='morgan2', radius=2, nBits=1024):
    out=[]
    for m in mols:
        try:
            if fp=='morgan2':
                bv = AllChem.GetMorganFingerprintAsBitVect(m, radius, nBits)
            elif fp=='morgan3':
                bv = AllChem.GetMorganFingerprintAsBitVect(m, radius, nBits)
            elif fp=='rdkit':
                bv = Chem.RDKFingerprint(m, fpSize=nBits)
            else:
                bv = MACCSkeys.GenMACCSKeys(m)
            out.append(bv)
        except:
            out.append(None)
    return out

from rdkit import DataStructs

def bitvecs_to_matrix(bitvecs, nBits):
    M = np.zeros((len(bitvecs), nBits), dtype=np.uint8)
    arr = np.zeros(nBits, dtype=np.int8)
    for i,bv in enumerate(bitvecs):
        if bv is None: continue
        DataStructs.ConvertToNumpyArray(bv, arr)
        M[i] = arr
    return M

def sample_pairs(n, k):
    i = np.random.randint(0,n,k)
    j = np.random.randint(0,n,k)
    mask = i!=j
    return i[mask], j[mask]

def pair_stats_from_matrix(M, acts, k):
    n = len(acts)
    i,j = sample_pairs(n, k)
    inter = np.sum(M[i] & M[j], axis=1)
    union = np.sum(M[i] | M[j], axis=1) + 1e-9
    sim = inter/union
    dA = np.abs(acts[i]-acts[j])

    pear = np.corrcoef(sim, dA)[0,1] if len(sim)>10 else np.nan
    high = sim >= SIM_THRESH
    cliffs = (dA >= CLIFF_ACT) & high
    cliff_rate = np.sum(cliffs)/max(1, np.sum(high))
    return pear, cliff_rate

def murcko(sm):
    try:
        return Chem.MolToSmiles(MurckoScaffold.GetScaffoldForMol(
            Chem.MolFromSmiles(sm)
        ))
    except:
        return ""

# ---------------- Fetch all targets ----------------
datasets = {}
for fam, tid in TARGETS.items():
    print(f"Fetching {tid} ({fam}) ... ", end="")
    df = fetch_and_qc(tid)
    if df is None or len(df)<800:
        raise ValueError(f"{tid} insufficient")
    df = stratified_sample(df, FINAL_N)
    df["Family"] = fam
    df["Scaffold"] = df["canonical_smiles"].apply(murcko)
    datasets[tid] = df
    print(len(df))

# ---------------- Fingerprints ----------------
fps = [
    ("Morgan2_1024","morgan2",2,1024),
    ("Morgan3_1024","morgan3",3,1024),
    ("RDKitFP_2048","rdkit",None,2048),
    ("MACCS_167","maccs",None,167),      # FIXED
]

per_target_bitmats = {}
summary_rows=[]

for tid, df in datasets.items():
    mols = df["mol"].tolist()
    acts = df["pchembl_value"].values.astype(np.float32)
    per_fp={}
    for name,key,r,nB in fps:
        bv = fp_bitvecs(mols, fp=key, radius=r or 2, nBits=nB)
        M = bitvecs_to_matrix(bv, nB)
        pear, cliff = pair_stats_from_matrix(M, acts, PAIR_SAMPLE//4)
        summary_rows.append({"Target":tid, "Family":df["Family"].iloc[0],
                             "Fingerprint":name, "Pearson_r":pear, "Cliff_rate":cliff})
        per_fp[name]=(M,acts)
    per_target_bitmats[tid]=per_fp
    gc.collect()

results_df = pd.DataFrame(summary_rows)
results_df.to_csv(f"{OUTDIR}/initial_results.csv", index=False)

# ---------------- Bootstrap CIs ----------------
def bootstrap_pear(M, acts):
    p,_ = pair_stats_from_matrix(M, acts, PAIR_SAMPLE//12)
    return p

boot_rows=[]
for tid, per_fp in per_target_bitmats.items():
    for name,(M,acts) in per_fp.items():
        vals=[]
        for _ in range(BOOTSTRAP_N):
            vals.append(bootstrap_pear(M, acts))
        lo = np.nanpercentile(vals,2.5)
        hi = np.nanpercentile(vals,97.5)
        boot_rows.append({"Target":tid,"Fingerprint":name,"mean":np.nanmean(vals),"lo":lo,"hi":hi})

boot_df = pd.DataFrame(boot_rows)
boot_df.to_csv(f"{OUTDIR}/bootstrap_cis.csv", index=False)

# ---------------- Paired Test (Morgan2 vs RDKit) ----------------
pairs=[]
for tid in datasets:
    a = results_df.query("Target==@tid & Fingerprint=='Morgan2_1024'").Pearson_r.values
    b = results_df.query("Target==@tid & Fingerprint=='RDKitFP_2048'").Pearson_r.values
    if len(a)>0 and len(b)>0:
        pairs.append((a[0],b[0]))
if len(pairs)>3:
    a=np.array([x[0] for x in pairs]); b=np.array([x[1] for x in pairs])
    stat,pval=wilcoxon(a,b)
    print("Wilcoxon Morgan2 vs RDKit:",stat,pval)

# ---------------- Sensitivity (Morgan2 bits & radius) ----------------
sens=[]
for tid, df in datasets.items():
    mols = df["mol"].tolist()
    acts = df["pchembl_value"].values.astype(np.float32)
    for r in [2,3]:
        for bits in [1024,2048]:
            bv = fp_bitvecs(mols, 'morgan2', r, bits)
            M = bitvecs_to_matrix(bv, bits)
            pear,cliff = pair_stats_from_matrix(M,acts,PAIR_SAMPLE//12)
            sens.append({"Target":tid,"Radius":r,"Bits":bits,"Pearson_r":pear,"Cliff_rate":cliff})
sens_df = pd.DataFrame(sens)
sens_df.to_csv(f"{OUTDIR}/sensitivity.csv",index=False)

# ---------------- Murcko Scaffold + Cliffs ----------------
scaf_rows=[]
cliff_examples=[]
for tid,df in datasets.items():
    acts = df["pchembl_value"].values
    scafs = df["Scaffold"].values
    M,acts_local = per_target_bitmats[tid]["Morgan2_1024"]
    i,j = sample_pairs(len(df), 40000)
    inter = np.sum(M[i]&M[j],axis=1); union = np.sum(M[i]|M[j],axis=1)+1e-9
    sim = inter/union
    dA = np.abs(acts[i]-acts[j])
    mask = (sim>=SIM_THRESH)&(dA>=CLIFF_ACT)
    if np.sum(mask)>0:
        same = np.sum(scafs[i][mask]==scafs[j][mask])
        frac = same/np.sum(mask)
    else:
        frac = 0
    scaf_rows.append({"Target":tid,"Cliffs":int(np.sum(mask)),"SameScaffoldFrac":frac})
    for idx in np.where(mask)[0][:4]:
        ii=i[idx]; jj=j[idx]
        cliff_examples.append({"Target":tid,
                               "SMI_A":df.loc[ii,"canonical_smiles"],
                               "SMI_B":df.loc[jj,"canonical_smiles"],
                               "sim":float(sim[idx]),"dA":float(dA[idx]),
                               "sameScaf":scafs[ii]==scafs[jj]})
pd.DataFrame(scaf_rows).to_csv(f"{OUTDIR}/scaffold_cliffs.csv",index=False)
pd.DataFrame(cliff_examples).to_csv(f"{OUTDIR}/cliff_examples.csv",index=False)

# ---------------- UMAP (Fingerprint) ----------------
pool_X=[]; pool_y=[]
for tid,(perfp) in per_target_bitmats.items():
    df = datasets[tid]
    n=min(300,len(df))
    sel=df.sample(n,random_state=RND)
    idx=sel.index.values
    M,_=perfp["Morgan2_1024"]
    pool_X.append(M[idx])
    pool_y.append(sel['pchembl_value'].values)
pool_X = np.vstack(pool_X)
pool_y = np.concatenate(pool_y)

um = umap.UMAP(n_components=2,metric='jaccard',random_state=RND)
U = um.fit_transform(pool_X)
plt.figure(figsize=(7,5))
sc=plt.scatter(U[:,0],U[:,1],c=pool_y,cmap="Spectral",s=8)
plt.colorbar(sc,label="pChEMBL")
plt.title("UMAP of Morgan2 bit-vectors")
plt.tight_layout()
plt.savefig(f"{OUTDIR}/umap_morgan.png",dpi=300)
plt.close()

# ==========================================================
#              PCA–MORGAN EMBEDDING BLOCK (FAST)
# ==========================================================

print("Building PCA–Morgan embedding...")

def count_morgan(m, radius=2, nBits=2048):
    try:
        fp = AllChem.GetMorganFingerprint(m, radius)
        vec = np.zeros(nBits, dtype=np.float32)
        for idx,cnt in fp.GetNonzeroElements().items():
            vec[idx % nBits] += cnt
        return vec
    except:
        return np.zeros(nBits, dtype=np.float32)

# pooled count vectors
pooled=[]; pooled_meta=[]
for tid,df in datasets.items():
    X = np.vstack([count_morgan(m) for m in df["mol"].tolist()])
    pooled.append(X)
pooled = np.vstack(pooled)

scaler = StandardScaler(with_mean=False)
pooled_s = scaler.fit_transform(pooled)

svd = TruncatedSVD(n_components=300,random_state=RND)
emb_all = svd.fit_transform(pooled_s)

# split back
emb_per_target={}
start=0
for tid,df in datasets.items():
    n=len(df)
    emb_per_target[tid]=emb_all[start:start+n]
    start+=n

# Evaluate embedding correlation
embed_rows=[]
for tid,df in datasets.items():
    emb = emb_per_target[tid]
    acts = df["pchembl_value"].values.astype(np.float32)
    n=len(df)
    k=min(70000,n*(n-1)//2)
    i=np.random.randint(0,n,k)
    j=np.random.randint(0,n,k)
    mask=i!=j; i=i[mask]; j=j[mask]
    e1=emb[i]; e2=emb[j]
    s = np.sum(e1*e2,axis=1)/(np.linalg.norm(e1,axis=1)*np.linalg.norm(e2,axis=1)+1e-9)
    dA=np.abs(acts[i]-acts[j])
    pear=np.corrcoef(s,dA)[0,1]
    cliff=np.sum((s>=SIM_THRESH)&(dA>=CLIFF_ACT))/max(1,np.sum(s>=SIM_THRESH))
    embed_rows.append({"Target":tid,"Family":df["Family"].iloc[0],
                       "Fingerprint":"Embedding_PCA","Pearson_r":pear,"Cliff_rate":cliff})
embed_df=pd.DataFrame(embed_rows)
embed_df.to_csv(f"{OUTDIR}/embedding_results.csv",index=False)

results_df = pd.concat([results_df,embed_df],ignore_index=True)
results_df.to_csv(f"{OUTDIR}/results_with_embedding.csv",index=False)

# ---------------- UMAP (Embedding) ----------------
pool_E=[]; pool_y=[]

for tid,df in datasets.items():
    E = emb_per_target[tid]
    n=min(300,len(E))
    idx=np.random.choice(len(E),n,replace=False)
    pool_E.append(E[idx])
    pool_y.append(df["pchembl_value"].values[idx])

pool_E=np.vstack(pool_E)
pool_y=np.concatenate(pool_y)

um2 = umap.UMAP(n_components=2,random_state=RND)
UE = um2.fit_transform(pool_E)
plt.figure(figsize=(7,5))
sc=plt.scatter(UE[:,0],UE[:,1],c=pool_y,cmap="Spectral",s=8)
plt.colorbar(sc,label="pChEMBL")
plt.title("UMAP of PCA-Morgan embedding")
plt.tight_layout()
plt.savefig(f"{OUTDIR}/umap_embedding.png",dpi=300)
plt.close()

# ---------------- ML baselines ----------------
ml_rows=[]
for tid,df in datasets.items():
    M,_ = per_target_bitmats[tid]["Morgan2_1024"]
    X = M.astype(np.uint8)
    y = df["pchembl_value"].values.astype(np.float32)
    kf=KFold(n_splits=5,shuffle=True,random_state=RND)
    maes=[]; r2s=[]
    for tr,te in kf.split(X):
        model=RandomForestRegressor(n_estimators=150,n_jobs=-1,random_state=RND)
        model.fit(X[tr],y[tr])
        yp=model.predict(X[te])
        maes.append(np.mean(np.abs(yp-y[te])))
        ss_res=np.sum((y[te]-yp)**2)
        ss_tot=np.sum((y[te]-np.mean(y[te]))**2)
        r2s.append(1-ss_res/ss_tot)
    ml_rows.append({"Target":tid,"Model":"RF_Morgan2","MAE":np.mean(maes),"R2":np.mean(r2s)})

ml_rows_emb=[]
for tid,df in datasets.items():
    E = emb_per_target[tid]
    y=df["pchembl_value"].values.astype(np.float32)
    kf=KFold(n_splits=5,shuffle=True,random_state=RND)
    maes=[]; r2s=[]
    for tr,te in kf.split(E):
        model=RandomForestRegressor(n_estimators=150,n_jobs=-1,random_state=RND)
        model.fit(E[tr],y[tr])
        yp=model.predict(E[te])
        maes.append(np.mean(np.abs(yp-y[te])))
        ss_res=np.sum((y[te]-yp)**2)
        ss_tot=np.sum((y[te]-np.mean(y[te]))**2)
        r2s.append(1-ss_res/ss_tot)
    ml_rows_emb.append({"Target":tid,"Model":"RF_Embedding","MAE":np.mean(maes),"R2":np.mean(r2s)})

ml_all=pd.DataFrame(ml_rows+ml_rows_emb)
ml_all.to_csv(f"{OUTDIR}/ml_all.csv",index=False)




Fetching CHEMBL203 (Kinase) ... 1200
Fetching CHEMBL217 (GPCR) ... 1200
Fetching CHEMBL204 (Protease) ... 1200
Fetching CHEMBL206 (NuclearReceptor) ... 1200
Fetching CHEMBL240 (IonChannel) ... 1200
Fetching CHEMBL205 (Metalloenzyme) ... 1200
Fetching CHEMBL220 (Hydrolase) ... 1200
Fetching CHEMBL238 (Polymerase) ... 1200
Fetching CHEMBL228 (Transporter) ... 1200
Fetching CHEMBL2034 (Phosphodiesterase) ... 1200


[1;30;43mStreaming output truncated to the last 5000 lines.[0m


Wilcoxon Morgan2 vs RDKit: 10.0 0.083984375


[1;30;43mStreaming output truncated to the last 5000 lines.[0m


Building PCA–Morgan embedding...


[1;30;43mStreaming output truncated to the last 5000 lines.[0m


FINISHED. All results saved in: publishable_results


In [23]:
# ---------------- Figures ----------------

PROTEIN_NAMES = {
    "CHEMBL203": "EGFR (Kinase)",
    "CHEMBL217": "D2 Receptor (GPCR)",
    "CHEMBL204": "Thrombin (Protease)",
    "CHEMBL206": "ESR1 (Nuclear Rec.)",
    "CHEMBL240": "hERG (Ion Channel)",
    "CHEMBL205": "CA-II (Metalloenzyme)",
    "CHEMBL220": "AChE (Hydrolase)",
    "CHEMBL238": "HIV RT (Polymerase)",
    "CHEMBL228": "SERT (Transporter)",
    "CHEMBL2034": "PDE5A (PDE)"
}


In [24]:
# ============================================================
#            PUBLISHABLE FIGURES BLOCK (8 FIGURES)
# ============================================================

import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style("whitegrid")
plt.rcParams["figure.dpi"] = 150

# Load results
df = results_df.copy()
boot = boot_df.copy()
ml = ml_all.copy()
scaf = pd.read_csv(f"{OUTDIR}/scaffold_cliffs.csv")

# ------------------------------------------------------------
# FIGURE 1 — Fingerprint Pearson r Heatmap
# ------------------------------------------------------------
pivot = df.pivot(index="Target", columns="Fingerprint", values="Pearson_r")
pivot.index = pivot.index.map(PROTEIN_NAMES)

plt.figure(figsize=(12, 6))
sns.heatmap(pivot, annot=True, cmap="RdBu_r", center=0, fmt=".2f")
plt.title("Figure 1. Fingerprint Performance Across Protein Families (Pearson r)")
plt.tight_layout()
plt.savefig(f"{OUTDIR}/figure1_heatmap_pearson.png", dpi=1200)
plt.close()

# ------------------------------------------------------------
# FIGURE 2 — Cliff Rate Heatmap
# ------------------------------------------------------------
pivot2 = df.pivot(index="Target", columns="Fingerprint", values="Cliff_rate")
pivot2.index = pivot2.index.map(PROTEIN_NAMES)

plt.figure(figsize=(12, 6))
sns.heatmap(pivot2, annot=True, cmap="magma", fmt=".2f")
plt.title("Figure 2. Activity Cliff Rates Across Fingerprints")
plt.tight_layout()
plt.savefig(f"{OUTDIR}/figure2_heatmap_cliff.png", dpi=1200)
plt.close()

# ------------------------------------------------------------
# FIGURE 3 — Bootstrap Confidence Intervals
# ------------------------------------------------------------
# Only show Morgan2, RDKit, Embedding
boot_sel = boot[boot["Fingerprint"].isin(["Morgan2_1024", "RDKitFP_2048", "Embedding_PCA"])]

plt.figure(figsize=(10, 6))
sns.pointplot(data=boot_sel, x="Target", y="mean", hue="Fingerprint", dodge=0.5,
              markers="o", ci=None)
for i, row in boot_sel.iterrows():
    plt.plot([row["Target"], row["Target"]], [row["lo"], row["hi"]],
             color="black", alpha=0.5)
plt.xticks(rotation=45)
plt.ylabel("Pearson r")
plt.title("Figure 3. Bootstrap 95% Confidence Intervals of Similarity–Activity Correlation")
plt.tight_layout()
plt.savefig(f"{OUTDIR}/figure3_bootstrap_ci.png", dpi=1200)
plt.close()

# ------------------------------------------------------------
# FIGURE 4 — Paired Comparison (Morgan2 vs RDKit vs Embedding)
# ------------------------------------------------------------
df_sel = df[df["Fingerprint"].isin(["Morgan2_1024", "RDKitFP_2048", "Embedding_PCA"])]

plt.figure(figsize=(10, 6))
sns.barplot(data=df_sel, x="Fingerprint", y="Pearson_r", errorbar="sd")
plt.title("Figure 4. Comparison of Fingerprint vs Embedding Performance")
plt.tight_layout()
plt.savefig(f"{OUTDIR}/figure4_fingerprint_comparison.png", dpi=1200)
plt.close()

# ------------------------------------------------------------
# FIGURE 5 — UMAP (Morgan2) — already saved but polishing here
# ------------------------------------------------------------
# (You already generated this earlier. We rename for publication quality.)

import shutil
shutil.copy(f"{OUTDIR}/umap_morgan.png", f"{OUTDIR}/figure5_umap_morgan.png")

# ------------------------------------------------------------
# FIGURE 6 — UMAP (Embedding PCA)
# ------------------------------------------------------------
shutil.copy(f"{OUTDIR}/umap_embedding.png", f"{OUTDIR}/figure6_umap_embedding.png")

# ------------------------------------------------------------
# FIGURE 7 — Scaffold Cliff Map
# ------------------------------------------------------------
plt.figure(figsize=(10, 6))
sns.scatterplot(data=scaf, x="Cliffs", y="SameScaffoldFrac")
plt.xlabel("Number of Activity Cliffs")
plt.ylabel("Fraction of Cliffs Sharing Same Scaffold")
plt.title("Figure 7. Scaffold-Driven Activity Cliff Analysis")
plt.tight_layout()
plt.savefig(f"{OUTDIR}/figure7_scaffold_cliffs.png", dpi=1200)
plt.close()

# ------------------------------------------------------------
# FIGURE 8 — ML Performance Comparison
# ------------------------------------------------------------
plt.figure(figsize=(10, 6))
sns.barplot(data=ml, x="Model", y="R2", errorbar="sd")
plt.title("Figure 8. Machine Learning Accuracy: Fingerprint vs Embedding")
plt.tight_layout()
plt.savefig(f"{OUTDIR}/figure8_ml_comparison.png", dpi=1200)
plt.close()

print("All 8 publishable figures generated in:", OUTDIR)


All 8 publishable figures generated in: publishable_results
