<a href="https://colab.research.google.com/github/apoorvapu/data_science/blob/main/meeting1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# ================================
# STEP 1: Install dependencies
# ================================
!pip install chembl_webresource_client rdkit pandas matplotlib seaborn scikit-learn

import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from chembl_webresource_client.new_client import new_client
from rdkit import Chem
from rdkit.Chem import AllChem, Draw, Descriptors, DataStructs
from rdkit.Chem import rdFingerprintGenerator


Collecting chembl_webresource_client
  Downloading chembl_webresource_client-0.10.9-py3-none-any.whl.metadata (1.4 kB)
Collecting rdkit
  Downloading rdkit-2025.3.6-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (4.1 kB)
Collecting requests-cache~=1.2 (from chembl_webresource_client)
  Downloading requests_cache-1.2.1-py3-none-any.whl.metadata (9.9 kB)
Collecting cattrs>=22.2 (from requests-cache~=1.2->chembl_webresource_client)
  Downloading cattrs-25.2.0-py3-none-any.whl.metadata (8.4 kB)
Collecting url-normalize>=1.4 (from requests-cache~=1.2->chembl_webresource_client)
  Downloading url_normalize-2.2.1-py3-none-any.whl.metadata (5.6 kB)
Downloading chembl_webresource_client-0.10.9-py3-none-any.whl (55 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.2/55.2 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading rdkit-2025.3.6-cp312-cp312-manylinux_2_28_x86_64.whl (36.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m36.1/36.1 MB[0m [3

# See the following webpage:
https://www.ebi.ac.uk/chembl/explore/target/CHEMBL217#NameAndClassification

This page shows the target protein (the dopamine D2 receptor, CHEMBL217) together with the different known drugs for it.

In [None]:
# ================================
# STEP 2: Download DRD2 data
# ================================
# ChEMBL target identifier of the dopamine D2 receptor
target_id = "CHEMBL217"

# Restrict the download to the fields the rest of the notebook works with.
activity_fields = [
    "molecule_chembl_id",
    "canonical_smiles",
    "standard_type",
    "standard_relation",
    "standard_value",
    "standard_units",
    "pchembl_value",
]
activities = new_client.activity.filter(target_chembl_id=target_id).only(activity_fields)

# Load the query result into a DataFrame and report how many entries came back.
df = pd.DataFrame(activities)
print("Total raw entries:", len(df))


# View a few rows of the data we just downloaded:

In [None]:
# Preview the first rows of the downloaded table (displayed as the cell's last expression).
df.head()

In [None]:
# ================================
# STEP 3: Clean dataset
# ================================
# Coerce both activity columns to numbers. The values may arrive as strings;
# anything that cannot be parsed becomes NaN and is dropped below.
# Re-running this cell is safe: coercing already-numeric columns is a no-op.
df = df.assign(
    standard_value=pd.to_numeric(df["standard_value"], errors="coerce"),
    pchembl_value=pd.to_numeric(df["pchembl_value"], errors="coerce"),
)

# Keep only common binding-affinity activity types.
df = df[df["standard_type"].isin(["Ki", "Kd"])]

# Drop rows with a missing SMILES, standard value or pChEMBL value.
df = df.dropna(subset=["canonical_smiles", "standard_value", "pchembl_value"])

# Remove duplicate molecules LAST (keeping the first remaining occurrence).
# De-duplicating before filtering could keep an invalid or non-Ki/Kd record
# as the molecule's "first occurrence" and then discard the molecule entirely,
# even though a valid Ki/Kd measurement existed further down.
# reset_index gives a fresh frame with a contiguous 0..n-1 index, so later
# cells can add columns and look rows up by position-like labels safely.
df = df.drop_duplicates(subset="canonical_smiles").reset_index(drop=True)
print("Remaining molecules after cleaning:", len(df))


In [None]:
# Preview the first rows of the table after the cleaning step above.
df.head()

In [None]:
# ================================
# STEP 4: Compute Morgan fingerprints
# ================================
# Cache of Morgan fingerprint generators keyed by (radius, n_bits), so one
# generator is built per parameter set instead of once per molecule.
_MORGAN_GENERATORS = {}


def smiles_to_fingerprint(smiles, radius=2, n_bits=2048):
    """Return the Morgan bit-vector fingerprint of a SMILES string.

    Parameters
    ----------
    smiles : str
        SMILES representation of the molecule.
    radius : int, optional
        Morgan radius (default 2, as used throughout this notebook).
    n_bits : int, optional
        Length of the bit vector (default 2048).

    Returns
    -------
    The RDKit bit-vector fingerprint, or None when `smiles` is missing,
    empty, not a string, or cannot be parsed by RDKit.
    """
    # Missing values (None / NaN) and empty strings are treated as unparseable
    # instead of being handed to RDKit.
    if not isinstance(smiles, str) or not smiles:
        return None
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None
    # Use the fingerprint-generator API; AllChem.GetMorganFingerprintAsBitVect
    # is deprecated in recent RDKit releases.
    key = (radius, n_bits)
    if key not in _MORGAN_GENERATORS:
        _MORGAN_GENERATORS[key] = rdFingerprintGenerator.GetMorganGenerator(
            radius=radius, fpSize=n_bits
        )
    return _MORGAN_GENERATORS[key].GetFingerprint(mol)

# Build a new frame with .assign instead of setting the column in place:
# at this point `df` can still be a boolean-filtered slice of an earlier frame,
# and in-place column assignment on such a slice raises SettingWithCopyWarning.
df = df.assign(fingerprint=df["canonical_smiles"].apply(smiles_to_fingerprint))

# smiles_to_fingerprint returns None for SMILES that RDKit cannot parse;
# drop those rows and renumber so the index is contiguous again.
df = df.dropna(subset=["fingerprint"]).reset_index(drop=True)
print("Remaining molecules after fingerprinting:", len(df))

# Convert SMILES to RDKit Mol objects for visualization
df["mol"] = df["canonical_smiles"].apply(Chem.MolFromSmiles)


In [None]:
# ================================
# STEP 5: Sample pairs for similarity analysis
# ================================
n_samples = 5000
# Dedicated, seeded generator so every run samples the same pairs.
rng = random.Random(42)

# Compare activities on the pChEMBL (log) scale, which is what every
# downstream label and printout reports. Raw standard_value differences would
# be dominated by a few very weak binders. The column is coerced here so this
# cell works whether or not it was already made numeric earlier.
activity = pd.to_numeric(df["pchembl_value"], errors="coerce")

# Only molecules with a usable pChEMBL value can take part in a pair.
candidates = activity.dropna().index.tolist()
if len(candidates) < 2:
    raise ValueError("Need at least two molecules with a pChEMBL value to form a pair.")

# Never request more distinct pairs than exist, or the loop could not finish.
max_pairs = len(candidates) * (len(candidates) - 1) // 2
n_pairs = min(n_samples, max_pairs)

pairs = []
seen = set()  # unordered pairs already sampled, stored as (smaller, larger)
while len(pairs) < n_pairs:
    # rng.sample draws without replacement, so i and j always differ.
    i, j = sorted(rng.sample(candidates, 2))
    if (i, j) in seen:
        continue  # skip repeats so each pair is counted once
    seen.add((i, j))
    fp1, fp2 = df.loc[i, "fingerprint"], df.loc[j, "fingerprint"]
    sim = DataStructs.TanimotoSimilarity(fp1, fp2)
    act_diff = abs(activity.loc[i] - activity.loc[j])
    pairs.append((i, j, sim, act_diff))

# i and j are row labels of df; activity_difference is |Δ pChEMBL|.
pairs_df = pd.DataFrame(pairs, columns=["i", "j", "similarity", "activity_difference"])

pairs_df.head()

In [None]:
# ================================
# STEP 6: Scatter plot - similarity vs activity difference
# ================================
# One semi-transparent point per sampled pair, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    data=pairs_df,
    x="similarity",
    y="activity_difference",
    alpha=0.3,
    ax=ax,
)
ax.set_xlabel("Tanimoto Similarity")
ax.set_ylabel("Δ Activity (|Δ pChEMBL|)")
ax.set_title("Similarity vs Activity Difference (DRD2)")
plt.show()


In [None]:
# ================================
# STEP 7: Histogram of pChEMBL values
# ================================
# The axis label and title describe pChEMBL, so plot the pChEMBL column itself
# rather than the raw standard_value column. Values are coerced to numbers and
# missing entries skipped, so this cell does not depend on earlier conversion.
pchembl = pd.to_numeric(df["pchembl_value"], errors="coerce").dropna()

fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(pchembl, bins=30, kde=True, ax=ax)
ax.set_xlabel("pChEMBL Value")
ax.set_ylabel("Number of Molecules")
ax.set_title("Distribution of DRD2 pChEMBL Values")
plt.show()


In [None]:
# ================================
# STEP 8: Correlation analysis
# ================================
# Pearson correlation (the pandas default, spelled out here) between the
# similarity of each sampled pair and its activity difference.
corr = pairs_df["similarity"].corr(
    pairs_df["activity_difference"],
    method="pearson",
)
print(f"Correlation between similarity and activity difference: {corr:.3f}")


In [None]:

# ================================
# STEP 9: Show 2D images of random molecules
# ================================
# df.sample raises if asked for more rows than exist, so cap the request at
# the table size; random_state makes the same molecules appear on every run.
n_preview = min(9, len(df))
sample_mols = df.sample(n=n_preview, random_state=42)["mol"].tolist()
img = Draw.MolsToGridImage(sample_mols, molsPerRow=3, subImgSize=(200,200))
display(img)



In [None]:
# ================================
# STEP 10: Most similar pair
# ================================
# Row of pairs_df holding the highest Tanimoto similarity (first one on ties).
top_idx = pairs_df["similarity"].idxmax()
most_sim = pairs_df.loc[top_idx]
sim_val = most_sim["similarity"]
act_diff_val = most_sim["activity_difference"]

# "i" and "j" hold the df row labels of the two molecules in the pair.
mol1, mol2 = (df.loc[most_sim[side], "mol"] for side in ("i", "j"))

print(f"Most similar pair - Tanimoto: {sim_val:.3f}, ΔpChEMBL: {act_diff_val:.2f}")
pair_image = Draw.MolsToGridImage([mol1, mol2], molsPerRow=2, subImgSize=(250,250))
display(pair_image)


In [None]:
# ================================
# STEP 11: Least similar pair
# ================================
# Row of pairs_df holding the lowest Tanimoto similarity (first one on ties).
bottom_idx = pairs_df["similarity"].idxmin()
least_sim = pairs_df.loc[bottom_idx]
sim_val = least_sim["similarity"]
act_diff_val = least_sim["activity_difference"]

# "i" and "j" hold the df row labels of the two molecules in the pair.
mol1, mol2 = (df.loc[least_sim[side], "mol"] for side in ("i", "j"))

print(f"Least similar pair - Tanimoto: {sim_val:.3f}, ΔpChEMBL: {act_diff_val:.2f}")
pair_image = Draw.MolsToGridImage([mol1, mol2], molsPerRow=2, subImgSize=(250,250))
display(pair_image)


In [None]:


# ================================
# STEP 12: Save datasets for future use
# ================================
# Work on an export copy so the in-memory `df` keeps its RDKit objects.
# Written as-is, the "mol" and "fingerprint" columns would land in the CSV as
# object reprs ("<... object at 0x...>"), which cannot be loaded back.
# - "mol" is dropped: it can be rebuilt from canonical_smiles.
# - "fingerprint" is stored as a '0'/'1' bit string, which RDKit can turn back
#   into a bit vector with DataStructs.CreateFromBitString.
df_export = df.drop(columns=["mol"], errors="ignore")
if "fingerprint" in df_export.columns:
    df_export = df_export.assign(
        fingerprint=df_export["fingerprint"].apply(lambda fp: fp.ToBitString())
    )

df_export.to_csv("DRD2_activities_fingerprints.csv", index=False)
pairs_df.to_csv("DRD2_similarity_activity_pairs.csv", index=False)
print("Data saved to CSV files.")


In [None]:
# ================================
# STEP 13: Binned plot - average activity difference vs similarity bins
# ================================
# Define bins (numpy is imported as np with the other imports at the top)
bins = np.linspace(0, 1, 11)  # bin edges 0.0, 0.1, ..., 1.0

# include_lowest=True keeps pairs whose similarity is exactly 0.0. With the
# default right-closed intervals the first bin is (0.0, 0.1], so those pairs
# would get NaN and silently vanish from the statistics.
pairs_df['sim_bin'] = pd.cut(pairs_df['similarity'], bins, include_lowest=True)

# Compute mean and std of activity differences per bin.
# observed=True returns only bins that actually contain pairs, and states the
# categorical-groupby behaviour explicitly instead of relying on a default.
binned_stats = (
    pairs_df.groupby('sim_bin', observed=True)['activity_difference']
    .agg(['mean', 'std', 'count'])
    .reset_index()
)
binned_stats = binned_stats[binned_stats['count'] > 0]  # defensive: remove empty bins

# Bin centres for the x axis. include_lowest nudges the first interval's left
# edge slightly below 0, so clamp it back to 0 before taking the midpoint.
bin_centers = [
    (max(interval.left, 0.0) + interval.right) / 2
    for interval in binned_stats['sim_bin']
]

# A bin holding a single pair has an undefined (NaN) std; draw it with no
# error bar rather than passing NaN to matplotlib.
bin_spread = binned_stats['std'].fillna(0)

# Plot
plt.figure(figsize=(8,6))
plt.errorbar(
    x=bin_centers,
    y=binned_stats['mean'],
    yerr=bin_spread,
    fmt='o-', capsize=5
)
plt.xlabel("Tanimoto Similarity Bin")
plt.ylabel("Average Δ Activity (|Δ pChEMBL|)")
plt.title("Average Activity Difference vs Molecular Similarity (Binned)")
plt.grid(True)
plt.show()
