<a href="https://colab.research.google.com/github/apoorvapu/data_science/blob/main/10proteins.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install RDKit

Collecting RDKit
  Downloading rdkit-2025.9.1-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (4.1 kB)
Downloading rdkit-2025.9.1-cp312-cp312-manylinux_2_28_x86_64.whl (36.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m36.2/36.2 MB[0m [31m25.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: RDKit
Successfully installed RDKit-2025.9.1


In [4]:
!pip install chembl_webresource_client

Collecting chembl_webresource_client
  Downloading chembl_webresource_client-0.10.9-py3-none-any.whl.metadata (1.4 kB)
Collecting requests-cache~=1.2 (from chembl_webresource_client)
  Downloading requests_cache-1.2.1-py3-none-any.whl.metadata (9.9 kB)
Collecting cattrs>=22.2 (from requests-cache~=1.2->chembl_webresource_client)
  Downloading cattrs-25.3.0-py3-none-any.whl.metadata (8.4 kB)
Collecting url-normalize>=1.4 (from requests-cache~=1.2->chembl_webresource_client)
  Downloading url_normalize-2.2.1-py3-none-any.whl.metadata (5.6 kB)
Downloading chembl_webresource_client-0.10.9-py3-none-any.whl (55 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.2/55.2 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading requests_cache-1.2.1-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.4/61.4 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading cattrs-25.3.0-py3-none-any.whl (70 kB)
[2K   [90m━━━━━━━━━━━━━━

In [14]:
# fast_10target_similarity.py
# Ultra-optimized pipeline for 10 high-volume ChEMBL targets (vectorized Tanimoto)
# Produces: similarity-vs-activity correlations, cliff rates, and ML baseline figures
#
# Requirements: rdkit, chembl_webresource_client, numpy, pandas, matplotlib, seaborn, scikit-learn

import os, time, gc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from rdkit import Chem
from rdkit.Chem import AllChem, MACCSkeys, Descriptors
from chembl_webresource_client.new_client import new_client
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold

sns.set_style("whitegrid")
plt.rcParams["figure.dpi"] = 120

# ---------------- CONFIG ----------------
TARGETS = {
    "Kinase": "CHEMBL203",           # EGFR
    "GPCR": "CHEMBL217",             # Dopamine D2
    "Protease": "CHEMBL204",         # Thrombin
    "NuclearReceptor": "CHEMBL206",  # ESR1
    "IonChannel": "CHEMBL240",       # hERG
    "Metalloenzyme": "CHEMBL205",    # CA-II
    "Hydrolase": "CHEMBL220",        # AChE
    "Polymerase": "CHEMBL238",       # HIV RT
    "Transporter": "CHEMBL228",      # SERT
    "Phosphodiesterase": "CHEMBL2034" # PDE5A
}

MAX_FETCH = 7000       # modest fetch per target
FINAL_N = 1200         # per-target sample size (balance speed & power)
PAIR_SAMPLE = 150000   # number of random pairs to sample for similarity-activity correlation
SIM_THRESH = 0.85
CLIFF_ACT = 2.0
OUT = "outputs_10targets_fast"
os.makedirs(OUT, exist_ok=True)
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# ---------------- helper functions ----------------
def fetch_and_qc(tid, max_rows=MAX_FETCH):
    acts = new_client.activity.filter(
        target_chembl_id=tid, pchembl_value__isnull=False,
        standard_type__in=["IC50","Ki","Kd","EC50"]
    ).only(["canonical_smiles","pchembl_value"])[:max_rows]
    df = pd.DataFrame(acts)
    if df.empty:
        return None
    df['pchembl_value'] = pd.to_numeric(df['pchembl_value'], errors='coerce')
    df = df.dropna(subset=['canonical_smiles','pchembl_value']).copy()
    df = df.sort_values('pchembl_value', ascending=False).drop_duplicates('canonical_smiles', keep='first').copy()
    df['mol'] = df['canonical_smiles'].apply(Chem.MolFromSmiles)
    df = df.dropna(subset=['mol']).copy()
    df['mw'] = df['mol'].apply(Descriptors.MolWt)
    df = df[(df['mw'] >= 150) & (df['mw'] <= 650)].copy()
    if len(df) >= 5:
        mean = df['pchembl_value'].mean(); sd = df['pchembl_value'].std()
        if pd.notnull(sd) and sd > 0:
            df = df[np.abs(df['pchembl_value'] - mean) <= 3*sd].copy()
    return df.reset_index(drop=True)

def stratified_sample(df, n=FINAL_N, bins=5):
    if len(df) <= n: return df.copy()
    df = df.copy(); df['bin'] = pd.qcut(df['pchembl_value'], q=bins, labels=False, duplicates='drop')
    per = n // bins; rem = n % bins
    parts=[]
    for b in sorted(df['bin'].unique()):
        g = df[df['bin'] == b]
        k = min(per + (1 if b < rem else 0), len(g))
        parts.append(g.sample(k, random_state=RANDOM_SEED))
    return pd.concat(parts).drop(columns=['bin']).reset_index(drop=True)

def fp_to_numpy_bitmatrix(bitvecs, nBits):
    """Convert list of RDKit ExplicitBitVect to boolean numpy array shape (n, nBits)."""
    n = len(bitvecs)
    M = np.zeros((n, nBits), dtype=np.uint8)
    for i, bv in enumerate(bitvecs):
        try:
            arr = np.zeros((nBits,), dtype=np.uint8)
            Chem.DataStructs.ConvertToNumpyArray(bv, arr)
            M[i] = arr
        except Exception:
            # leave as zeros if conversion fails
            pass
    return M

def compute_pair_stats_from_bitmatrix(bitmat, acts, pair_sample=PAIR_SAMPLE):
    """
    Compute sampled pair Tanimoto similarities and activity diffs using vectorized operations.
    Returns Pearson r and cliff rate.
    """
    n = bitmat.shape[0]
    # precompute bit counts
    bit_counts = bitmat.sum(axis=1).astype(np.int32)
    # sample pair indices (upper triangle)
    max_pairs = n*(n-1)//2
    k = min(pair_sample, max_pairs)
    # uniform random pairs: sample indices by rejection
    idx_i = np.random.randint(0, n, size=k)
    idx_j = np.random.randint(0, n, size=k)
    mask = idx_i != idx_j
    idx_i = idx_i[mask]; idx_j = idx_j[mask]
    # limit to exactly k pairs
    if len(idx_i) > k:
        idx_i = idx_i[:k]; idx_j = idx_j[:k]
    # compute intersections using vectorized dot (for each unique i compute with its j subset to reduce ops)
    sims = np.empty(len(idx_i), dtype=np.float32)
    # Optimize by grouping by unique i
    unique_i, positions = np.unique(idx_i, return_inverse=True)
    for ui in np.unique(unique_i):
        mask_ui = (idx_i == ui)
        js = idx_j[mask_ui]
        # intersections = bitmat[ui] & bitmat[js] -> dot product
        inter = (bitmat[js] & bitmat[ui]).sum(axis=1).astype(np.int32)
        union = bit_counts[ui] + bit_counts[js] - inter
        # avoid division by zero
        simvals = np.where(union>0, inter / union.astype(np.float32), 0.0)
        sims[mask_ui] = simvals
    # activity diffs
    act_diffs = np.abs(acts[idx_i] - acts[idx_j])
    # pearson
    if len(sims) < 10:
        pearson = np.nan
    else:
        pearson = np.corrcoef(sims, act_diffs)[0,1]
    # cliff rate among sampled pairs
    high_sim_mask = sims >= SIM_THRESH
    cliff_rate = np.sum((act_diffs >= CLIFF_ACT) & high_sim_mask) / max(1, np.sum(high_sim_mask))
    return pearson, cliff_rate, len(sims)

# ---------------- MAIN ----------------
start_time = time.time()
datasets = {}
print("Fetching and QC-ing targets (this will be done only once per target)...")
for fam, tid in TARGETS.items():
    print(f"  {fam}: {tid} ...", end='', flush=True)
    df = fetch_and_qc(tid, max_rows=MAX_FETCH)
    if df is None or df['canonical_smiles'].nunique() < 800:
        raise SystemExit(f"\nTarget {tid} returned insufficient usable compounds after QC.")
    df = stratified_sample(df, n=FINAL_N)
    datasets[tid] = df
    print(f" {len(df)} compounds (usable).")

# ---------------- compute fingerprints (bit matrices) ----------------
print("\nComputing fingerprints and bit matrices (vectorized)...")
fps_specs = {
    "Morgan2_1024": ("morgan2", 1024, lambda m: AllChem.GetMorganFingerprintAsBitVect(m,2,nBits=1024)),
    "Morgan3_1024": ("morgan3", 1024, lambda m: AllChem.GetMorganFingerprintAsBitVect(m,3,nBits=1024)),
    "RDKitFP_2048": ("rdkit", 2048, lambda m: Chem.RDKFingerprint(m, fpSize=2048)),
    "MACCS_166": ("maccs", 166, lambda m: MACCSkeys.GenMACCSKeys(m))
}

results = []
per_target_bitmats = {}  # keep for ML later (Morgan2)
for tid, df in datasets.items():
    mols = df['mol'].tolist()
    acts = df['pchembl_value'].values.astype(np.float32)
    per_fp_bitmat = {}
    for name, (label, nBits, fpfunc) in fps_specs.items():
        # compute RDKit bit vectors
        bitvecs = []
        for m in mols:
            try:
                bv = fpfunc(m)
                bitvecs.append(bv)
            except:
                bitvecs.append(None)
        M = fp_to_numpy_bitmatrix(bitvecs, nBits)
        per_fp_bitmat[name] = M
        # compute pair stats sampled
        pear, cliff, pairs_used = compute_pair_stats_from_bitmatrix(M, acts, pair_sample=PAIR_SAMPLE//len(fps_specs))
        results.append({
            "Target": tid,
            "Family": [k for k,v in TARGETS.items() if v==tid][0],
            "Fingerprint": name,
            "Pearson_r": float(pear) if not np.isnan(pear) else None,
            "Cliff_rate": float(cliff),
            "Pairs_used": int(pairs_used)
        })
    # store Morgan2 bit matrix for ML
    per_target_bitmats[tid] = (per_fp_bitmat["Morgan2_1024"], acts)
    gc.collect()

results_df = pd.DataFrame(results)

# ---------------- FIGURES ----------------
print("\nCreating figures...")
# Fig 1: heatmap target x fingerprint Pearson r
pivot = results_df.pivot_table(index='Target', columns='Fingerprint', values='Pearson_r')
plt.figure(figsize=(10,6))
sns.heatmap(pivot, annot=True, fmt=".2f", cmap="coolwarm", center=0)
plt.title("Pearson r (similarity vs |Δ pChEMBL|)")
plt.tight_layout()
plt.savefig(os.path.join(OUT, "fig1_heatmap.png"), dpi=300)
plt.close()

# Fig 2: family summary mean Pearson
fam_mean = results_df.groupby('Family')['Pearson_r'].mean().sort_values()
plt.figure(figsize=(8,5))
fam_mean.plot(kind='barh')
plt.xlabel("Mean Pearson r")
plt.title("Family-level mean Pearson r")
plt.tight_layout()
plt.savefig(os.path.join(OUT, "fig2_family_mean.png"), dpi=300)
plt.close()

# Fig 3: fingerprint comparison mean
fp_mean = results_df.groupby('Fingerprint')['Pearson_r'].mean().sort_values()
plt.figure(figsize=(6,4))
fp_mean.plot(kind='barh')
plt.xlabel("Mean Pearson r across targets")
plt.title("Fingerprint performance")
plt.tight_layout()
plt.savefig(os.path.join(OUT, "fig3_fp_compare.png"), dpi=300)
plt.close()

# Fig 4: cliff rates by family
cliff_by_family = results_df.groupby('Family')['Cliff_rate'].mean().sort_values()
plt.figure(figsize=(8,5))
(cliff_by_family*100).plot(kind='barh')
plt.xlabel("Activity cliff rate (%) among sampled high-similarity pairs")
plt.title("Cliff prevalence by family")
plt.tight_layout()
plt.savefig(os.path.join(OUT, "fig4_cliff_by_family.png"), dpi=300)
plt.close()

# Fig 5: ML benchmark (RandomForest on Morgan2)
print("Running a lightweight ML benchmark (5-fold RF) on Morgan2 bits...")
ml_rows=[]
for tid, (Xbits, acts) in per_target_bitmats.items():
    # use boolean->int matrix
    X = Xbits.astype(np.uint8)
    y = acts
    kf = KFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)
    maes=[]; r2s=[]
    for tr, te in kf.split(X):
        model = RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=RANDOM_SEED)
        model.fit(X[tr], y[tr])
        yp = model.predict(X[te])
        maes.append(np.mean(np.abs(yp - y[te])))
        ss_res = np.sum((y[te]-yp)**2); ss_tot = np.sum((y[te]-np.mean(y[te]))**2)
        r2s.append(1 - ss_res/ss_tot if ss_tot>0 else 0.0)
    ml_rows.append({"Target": tid, "MAE": np.mean(maes), "R2": np.mean(r2s)})

ml_df = pd.DataFrame(ml_rows).set_index('Target')
fig, ax = plt.subplots(1,2, figsize=(12,6))
ml_df['MAE'].sort_values().plot(kind='barh', ax=ax[0]); ax[0].set_title("RF MAE (5-fold)")
ml_df['R2'].sort_values().plot(kind='barh', ax=ax[1]); ax[1].set_title("RF R² (5-fold)")
plt.tight_layout()
plt.savefig(os.path.join(OUT, "fig5_ml_benchmark.png"), dpi=300)
plt.close()

# ---------------- SAVE SUMMARY ----------------
results_df.to_csv(os.path.join(OUT, "results_summary.csv"), index=False)
ml_df.to_csv(os.path.join(OUT, "ml_summary.csv"))

elapsed = time.time() - start_time
print(f"\nFinished in {elapsed:.1f} s. Outputs in folder: {OUT}")
print("\nSummary:")
print(results_df)


Fetching and QC-ing targets (this will be done only once per target)...
  Kinase: CHEMBL203 ... 1200 compounds (usable).
  GPCR: CHEMBL217 ... 1200 compounds (usable).
  Protease: CHEMBL204 ... 1200 compounds (usable).
  NuclearReceptor: CHEMBL206 ... 1200 compounds (usable).
  IonChannel: CHEMBL240 ... 1200 compounds (usable).
  Metalloenzyme: CHEMBL205 ... 1200 compounds (usable).
  Hydrolase: CHEMBL220 ... 1200 compounds (usable).
  Polymerase: CHEMBL238 ... 1200 compounds (usable).
  Transporter: CHEMBL228 ... 1200 compounds (usable).
  Phosphodiesterase: CHEMBL2034 ... 1200 compounds (usable).

Computing fingerprints and bit matrices (vectorized)...


[1;30;43mStreaming output truncated to the last 5000 lines.[0m



Creating figures...
Running a lightweight ML benchmark (5-fold RF) on Morgan2 bits...

Finished in 542.2 s. Outputs in folder: outputs_10targets_fast

Summary:
        Target             Family   Fingerprint  Pearson_r  Cliff_rate  \
0    CHEMBL203             Kinase  Morgan2_1024  -0.061836    0.000000   
1    CHEMBL203             Kinase  Morgan3_1024  -0.058074    0.000000   
2    CHEMBL203             Kinase  RDKitFP_2048  -0.060661    0.075000   
3    CHEMBL203             Kinase     MACCS_166        NaN    0.000000   
4    CHEMBL217               GPCR  Morgan2_1024  -0.048275    0.000000   
5    CHEMBL217               GPCR  Morgan3_1024  -0.063175    0.000000   
6    CHEMBL217               GPCR  RDKitFP_2048  -0.042861    0.020408   
7    CHEMBL217               GPCR     MACCS_166        NaN    0.000000   
8    CHEMBL204           Protease  Morgan2_1024  -0.060009    0.000000   
9    CHEMBL204           Protease  Morgan3_1024  -0.060610    0.250000   
10   CHEMBL204           