# DRD2 Similarity–Activity Analysis
This notebook retrieves Dopamine D2 Receptor (CHEMBL217) data from ChEMBL, cleans it, computes molecular fingerprints, and analyzes the relationship between molecular similarity and bioactivity (pChEMBL values).

In [None]:
# ================================
# FULL, CORRECTED DRD2 ANALYSIS (Colab-ready)
# ================================
!pip install chembl_webresource_client rdkit-pypi pandas matplotlib seaborn scikit-learn xgboost

from chembl_webresource_client.new_client import new_client
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
from rdkit import Chem
from rdkit.Chem import Draw, DataStructs
from rdkit.Chem import AllChem

# Prefer the rdFingerprintGenerator API (the non-deprecated way to build Morgan
# fingerprints); fall back to the legacy AllChem helper on RDKit builds that do
# not provide it.
use_morgan_generator = False
gen = None
try:
    from rdkit.Chem import rdFingerprintGenerator
    gen = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=2048)
    use_morgan_generator = True
    print("Using MorganGenerator for fingerprints.")
except (ImportError, AttributeError):
    print("MorganGenerator not available, using GetMorganFingerprintAsBitVect (fallback).")
    gen = None

# Generators are bound to one (radius, size) combination, so keep one per
# combination actually requested; the default combination reuses `gen`.
_morgan_generators = {(2, 2048): gen} if use_morgan_generator else {}


def _morgan_generator_for(radius, nBits):
    """Return a cached Morgan fingerprint generator for (radius, nBits)."""
    key = (radius, nBits)
    if key not in _morgan_generators:
        _morgan_generators[key] = rdFingerprintGenerator.GetMorganGenerator(
            radius=radius, fpSize=nBits
        )
    return _morgan_generators[key]


def smiles_to_bitvect(smiles, nBits=2048, radius=2):
    """Convert a SMILES string into a Morgan fingerprint bit vector.

    Args:
        smiles: SMILES string of the molecule.
        nBits: Length of the fingerprint bit vector.
        radius: Morgan radius (honoured on both the generator and fallback paths).

    Returns:
        The RDKit bit vector, or None when `smiles` is not a string, cannot be
        parsed by RDKit, or fingerprinting raises.
    """
    # Missing values (e.g. NaN from a DataFrame) cannot be parsed as SMILES.
    if not isinstance(smiles, str):
        return None
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None
    try:
        if use_morgan_generator:
            return _morgan_generator_for(radius, nBits).GetFingerprint(mol)
        return AllChem.GetMorganFingerprintAsBitVect(mol, radius=radius, nBits=nBits)
    except Exception:
        # Deliberate best-effort: callers drop rows whose fingerprint is None.
        return None

# 1) Download DRD2 data
# Query the ChEMBL activity endpoint for the target and load the result into a
# DataFrame, requesting only the fields listed below.
target_id = "CHEMBL217"
print("Downloading ChEMBL activities for", target_id)

activity_fields = [
    "molecule_chembl_id",
    "canonical_smiles",
    "standard_type",
    "standard_relation",
    "standard_value",
    "standard_units",
    "pchembl_value",
]
target_query = new_client.activity.filter(target_chembl_id=target_id)
activities = target_query.only(activity_fields)

df = pd.DataFrame(activities)
print("Total raw activity rows downloaded:", len(df))

# 2) Cleaning
# Keep only rows that have a SMILES string and a numeric pChEMBL value.
df = df[df["canonical_smiles"].notna()].reset_index(drop=True)
df["pchembl_value"] = pd.to_numeric(df["pchembl_value"], errors="coerce")
df = df[df["pchembl_value"].notna()].reset_index(drop=True)

# One row per structure: the first occurrence of each SMILES is retained.
df = df[~df["canonical_smiles"].duplicated()].reset_index(drop=True)

# Attach the parsed molecule and its fingerprint, discarding rows where either
# could not be produced.
df["mol"] = df["canonical_smiles"].map(Chem.MolFromSmiles)
df = df[df["mol"].notna()].reset_index(drop=True)
df["fp"] = df["canonical_smiles"].map(smiles_to_bitvect)
df = df[df["fp"].notna()].reset_index(drop=True)

df["pchembl_value"] = df["pchembl_value"].astype(float)
print("Unique molecules:", len(df))

# 3) Similarity sampling
# Draw up to 5000 distinct unordered molecule pairs by rejection sampling and
# record, for each pair, the Tanimoto similarity of the fingerprints and the
# absolute pChEMBL difference.
N = len(df)
n_samples = min(5000, (N * (N - 1)) // 2)
max_attempts = n_samples * 10  # hard cap so the loop always terminates

# Dedicated, seeded generator: the sampled pairs (and the CSV written from
# them) are reproducible, and the global `random` state is left untouched.
rng = random.Random(42)

# Pull the columns out once; rows are labelled 0..N-1 after reset_index, so
# list position and DataFrame label coincide.
fingerprints = df['fp'].tolist()
pchembl_values = df['pchembl_value'].tolist()

pairs = []
seen = set()
attempts = 0
while len(pairs) < n_samples and attempts < max_attempts:
    attempts += 1
    i, j = rng.randrange(N), rng.randrange(N)
    if i == j:
        continue
    key = (min(i, j), max(i, j))  # order-independent identity of the pair
    if key in seen:
        continue
    seen.add(key)
    pairs.append({
        'i': i,
        'j': j,
        'similarity': DataStructs.TanimotoSimilarity(fingerprints[i], fingerprints[j]),
        'activity_difference': abs(pchembl_values[i] - pchembl_values[j]),
    })

# Explicit columns keep the frame's schema intact even when nothing was sampled
# (fewer than two molecules), so later column lookups do not raise KeyError.
pairs_df = pd.DataFrame(pairs, columns=['i', 'j', 'similarity', 'activity_difference'])
print("Pairs sampled:", len(pairs_df))

# 4) Scatter: similarity vs activity difference
# Plot at most 3000 randomly chosen pairs to keep the figure readable.
n_points = min(len(pairs_df), 3000)
scatter_subset = pairs_df.sample(n_points)

plt.figure(figsize=(8, 6))
sns.scatterplot(
    data=scatter_subset,
    x='similarity',
    y='activity_difference',
    alpha=0.25,
    s=10,
)
plt.xlabel("Tanimoto similarity")
plt.ylabel("|Δ pChEMBL|")
plt.title("Similarity vs Activity Difference")
plt.grid(True)
plt.show()

# 5) Histogram of pChEMBL
# Distribution of the activity values across the cleaned molecules, with a
# kernel density estimate overlaid.
pchembl_series = df['pchembl_value']

plt.figure(figsize=(8, 5))
sns.histplot(pchembl_series, bins=30, kde=True)
plt.xlabel("pChEMBL")
plt.title("Distribution of pChEMBL")
plt.show()

# 6) Binned stats
# Group the sampled pairs into ten equal-width similarity bins on [0, 1] and
# summarise the activity difference within each bin.
bin_edges = np.linspace(0.0, 1.0, 11)
bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2.0
pairs_df['sim_bin'] = pd.cut(pairs_df['similarity'], bins=bin_edges, include_lowest=True, right=True)

# pd.cut labels its bins with rounded edges and, with include_lowest=True,
# widens the first one to (-0.001, 0.1]. Intervals rebuilt from the raw
# `bin_edges` therefore do not compare equal to those labels, and a lookup keyed
# on them silently yields NaN. Use the bin's positional code instead.
bin_codes = pairs_df['sim_bin'].cat.codes.to_numpy()  # -1 marks "no bin"
pairs_df['sim_bin_center'] = np.where(bin_codes >= 0, bin_centers[bin_codes], np.nan)

# Kept for any later cell that uses these names; keyed on the labels pd.cut
# actually produced so lookups succeed.
intervals = pairs_df['sim_bin'].cat.categories
center_map = dict(zip(intervals, bin_centers))

binned_stats = (
    pairs_df.groupby('sim_bin_center')['activity_difference']
    .agg(['mean', 'std', 'count'])
    .reset_index()
)
print(binned_stats.head())

plt.figure(figsize=(8,6))
plt.errorbar(binned_stats['sim_bin_center'], binned_stats['mean'],
             yerr=binned_stats['std'], fmt='o-', capsize=5)
plt.xlabel("Tanimoto similarity (bin centers)"); plt.ylabel("Avg |Δ pChEMBL|")
plt.title("Binned Activity Difference vs Similarity"); plt.grid(True); plt.show()

# 7) Example molecules grid
# Render up to nine randomly selected molecules in a 3-per-row image grid.
n_examples = min(9, len(df))
example_rows = df.sample(n_examples)
sample_mols = example_rows['mol'].tolist()

img = Draw.MolsToGridImage(
    sample_mols,
    molsPerRow=3,
    subImgSize=(200, 200),
)
display(img)

# 8) Most similar and least similar molecules
def _first_distinct_pair(sorted_pairs, molecules):
    """Return the first row of `sorted_pairs` whose molecules differ in SMILES.

    Args:
        sorted_pairs: Pair table with integer-like 'i' and 'j' columns holding
            row labels of `molecules`, already ordered by preference.
        molecules: Table with a 'canonical_smiles' column.

    Returns:
        The matching row as a Series, or None when no such pair exists
        (including when `sorted_pairs` is empty).
    """
    smiles = molecules['canonical_smiles']
    for idx, a, b in zip(sorted_pairs.index, sorted_pairs['i'], sorted_pairs['j']):
        if smiles.at[int(a)] != smiles.at[int(b)]:
            return sorted_pairs.loc[idx]
    return None


def _show_pair(pair_row, molecules):
    """Display the two molecules referenced by `pair_row` side by side."""
    # Extracting a row can turn the integer labels into floats/objects, so
    # cast back to int before using them as row labels.
    a, b = int(pair_row['i']), int(pair_row['j'])
    display(Draw.MolsToGridImage([molecules.at[a, 'mol'], molecules.at[b, 'mol']], molsPerRow=2))


pairs_df_sorted = pairs_df.sort_values('similarity', ascending=False).reset_index(drop=True)
most_sim_row = _first_distinct_pair(pairs_df_sorted, df)
if most_sim_row is None:
    print("No pair of distinct molecules available; skipping most-similar display.")
else:
    i, j = int(most_sim_row['i']), int(most_sim_row['j'])
    _show_pair(most_sim_row, df)

pairs_df_sorted_low = pairs_df.sort_values('similarity', ascending=True).reset_index(drop=True)
least_sim_row = _first_distinct_pair(pairs_df_sorted_low, df)
if least_sim_row is None:
    print("No pair of distinct molecules available; skipping least-similar display.")
else:
    i, j = int(least_sim_row['i']), int(least_sim_row['j'])
    _show_pair(least_sim_row, df)

# 9) Save outputs
# Write the cleaned molecule table (identifier, SMILES, activity) and the full
# sampled-pairs table to CSV files in the working directory.
export_columns = ['molecule_chembl_id', 'canonical_smiles', 'pchembl_value']
df_out = df[export_columns]

df_out.to_csv("DRD2_unique_molecules_pchembl.csv", index=False)
pairs_df.to_csv("DRD2_similarity_pairs_sampled.csv", index=False)

print("Saved output CSV files.")