In [None]:
# ============================================================
# 📘 Pipeline explanation: From SMILES to ChEMBL ID and bioactivity
#
# This workflow performs the following steps:
# 1. Generates InChIKeys from SMILES using RDKit.
# 2. Queries ChEMBL (via the chembl_webresource_client) to retrieve:
#    - ChEMBL IDs for each compound.
#    - Associated bioactivity classes (if available).
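# 3. Writes results into a copy of the dataset saved as "<input name>_results"
#    (same extension as the input). The columns "InChIKey", "ChEMBL ID" and
#    "ChEMBL bioactivity 1" are added when missing; further
#    "ChEMBL bioactivity N" columns appear only when a compound has
#    multiple bioactivity entries.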
#
# ✅ Requirements and recommendations:
# - The input Excel file must contain a column named "Smiles".
# - It is highly recommended to include an "ID" column
#   (unique identifier for each compound).
# - If the database already contains calculated columns
#   ("InChIKey", "ChEMBL ID", "ChEMBL bioactivity..."), the script
#   will skip those entries. This allows updating existing datasets
#   without overwriting previous results.
#
# ⚡ Workflow robustness:
# - Safe to interrupt: results are progressively saved every N compounds 
#   and also if the process is manually stopped (e.g., Stop button in Jupyter).
#   Work can be resumed in the next execution.
# - Only the ChEMBL retrieval block (block 2) supports automatic resuming 
#   from partially completed runs using the same output file (_results.xlsx).
# - The input file used for the very first run should NOT contain
#   the text "_results" in its name.
#
# ------------------------------------------------------------
# Example 1: Minimal input database (only ID and Smiles)
# (first 5 rows)
#     ID          Smiles
#     CMPD001     CCO
#     CMPD002     C1CCO1
#     CMPD003     CCN(CC)C
#     CMPD004     COC
#     CMPD005     CC(=O)O
#
# ------------------------------------------------------------
# Example 2: Database that can be updated with this workflow
# (some columns already filled, others empty)
#     ID          Smiles        InChIKey                           ChEMBL ID     ChEMBL bioactivity 1   ChEMBL bioactivity 2
#     CMPD001     CCO           LFQSCWFLJHTTHZ-UHFFFAOYSA-N        CHEMBL25      GPCR ligand
#     CMPD002     C1CCO1        ZUXXECWHPGJHCN-UHFFFAOYSA-N        CHEMBL123     Enzyme inhibitor        Kinase
#     CMPD003     CCN(CC)C      KXKPIXYAXDZYMO-UHFFFAOYSA-N        not found     <empty>                 <empty>
#     CMPD004     COC           <empty>                           <empty>       <empty>                 <empty>
#     CMPD005     CC(=O)O       <empty>                           <empty>       <empty>                 <empty>
#
# In this second case, the workflow will:
# - Skip already completed values (e.g., row CMPD001).
# - Resume missing calculations for the empty fields.
# - Continue seamlessly from a specified compound ID if desired.
# ============================================================


In [None]:
# ============================================================
# 🔑 InChIKey generation and ChEMBL info retrieval
# ============================================================

from chembl_webresource_client.new_client import new_client
import pandas as pd
from rdkit import Chem
from rdkit.Chem import inchi
import os
import math
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

# -------------------- CONFIGURATION --------------------
# 💾 Progress is written to the results file after every `save_every` compounds.
save_every = 200

# 🔗 Upper bound on the number of worker threads given to ThreadPoolExecutor.
max_workers = 300

# 🔹 Note on processing speed:
#   In this cell `max_workers` only sizes the thread pool that computes the
#   InChIKeys. The ChEMBL look-ups themselves run one compound at a time in a
#   plain for-loop, so raising `max_workers` does not speed up the retrieval.

# 🔹 Optional compound ID from which the ChEMBL retrieval should start.
#    Write the ID in single quotes, e.g., 'LANaPDB13579'.
#    Leave it as None to start from the beginning (the name must stay defined).
start_id = None
# -------------------------------------------------------

# Input file
input_file = 'Replace here the name of the input file.xlsx'

# Output file: "<input name>_results<input extension>"
file_root, file_ext = os.path.splitext(input_file)
output_file = file_root + "_results" + file_ext

# -------------------- Load input or resume from existing output --------------------
# Resume from a previous results file when there is one; otherwise read the raw input.
if not os.path.exists(output_file):
    print(f"📂 No results file found. Starting from input file: {input_file}")
    df = pd.read_excel(input_file)
else:
    print(f"📂 Found existing results file: {output_file}")
    df = pd.read_excel(output_file)
    print("🔄 Resuming from last saved progress...")

# Helper column holding each row's position (internal use only)
if 'Index' not in df.columns:
    df['Index'] = range(len(df))

# -------------------- InChIKey calculation --------------------
# Make sure the target column exists before looking for gaps in it
if 'InChIKey' not in df.columns:
    df['InChIKey'] = ""

# Row labels whose InChIKey is still NaN or an empty string
missing_inchikey_idx = df.index[df['InChIKey'].isna() | df['InChIKey'].eq("")]

def calc_inchikey(idx):
    """Compute the InChIKey for one row of the module-level ``df``.

    Parameters
    ----------
    idx : hashable
        Row label in ``df``; the SMILES string is read from ``df.at[idx, 'Smiles']``.

    Returns
    -------
    tuple
        ``(idx, inchikey)``. The second element is the string ``"error"`` when
        the SMILES cell is missing or empty, when RDKit cannot parse it, when
        RDKit raises, or when no key is produced.
    """
    smiles = df.at[idx, 'Smiles']
    if pd.isna(smiles) or smiles == "":
        return idx, "error"
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return idx, "error"
        # An empty/None key would look like "still missing" on the next run,
        # so it is recorded as an explicit error instead.
        return idx, inchi.MolToInchiKey(mol) or "error"
    except Exception:
        # Best effort: one bad structure must not abort the whole batch.
        # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit through.
        return idx, "error"

if len(missing_inchikey_idx) > 0:
    # A column that is entirely empty in the spreadsheet is read back as
    # float64 (all NaN); cast it to object so string InChIKeys can be stored
    # without relying on pandas' deprecated silent upcasting.
    if df['InChIKey'].dtype != object:
        df['InChIKey'] = df['InChIKey'].astype(object)

    total_missing = len(missing_inchikey_idx)
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(calc_inchikey, idx) for idx in missing_inchikey_idx]
        # Futures finish in arbitrary order, so progress is counted by the number
        # of completed tasks rather than by the row label of the latest one.
        for done, future in enumerate(as_completed(futures), start=1):
            idx, ik = future.result()
            df.at[idx, 'InChIKey'] = ik
            print(f"\rProcessed {done}/{total_missing} InChIKeys", end="", flush=True)

    print("\n✅ InChIKey calculation complete.")
else:
    print("✅ All InChIKeys already calculated. Skipping step.")

# -------------------- ChEMBL retrieval --------------------
# Module-level alias for the imported ChEMBL client; get_chembl_info() reads it
# as `client.molecule`.
client = new_client

def get_chembl_info(inchi_key, ID):
    """Look up one compound in ChEMBL by its InChIKey.

    Parameters
    ----------
    inchi_key : str
        Standard InChIKey to search for. Missing values, blank strings and the
        ``"error"`` placeholder written by the InChIKey step cannot match any
        molecule, so they are reported as 'not found' without sending a request.
    ID : object
        Caller-side compound identifier; copied into the result unchanged.

    Returns
    -------
    pandas.DataFrame
        Exactly one row (index 0) with the columns 'ChEMBL ID',
        'ChEMBL bioactivity 1' ... 'ChEMBL bioactivity N', 'InChIKey' and 'ID'.
        'ChEMBL ID' is 'not found' when there is no hit (bioactivity 1 is then
        an empty string). For a hit, the 'indication_class' text is split on
        ';' into one column per entry; a hit without such text yields a single
        empty (None) bioactivity column.

    Notes
    -----
    Uses the module-level ``client``. Errors raised by the web-service call are
    not caught here.
    """
    chembl_id = 'not found'
    activities = [""]

    searchable = isinstance(inchi_key, str) and inchi_key.strip() not in ("", "error")
    if searchable:
        mol = client.molecule.filter(molecule_structures__standard_inchi_key=inchi_key).only(
            ['molecule_chembl_id', 'indication_class']
        )
        hits = pd.DataFrame.from_records(mol).iloc[:1] if mol else pd.DataFrame()

        if not hits.empty:
            first_hit = hits.iloc[0]
            chembl_id = first_hit['molecule_chembl_id']
            raw_classes = first_hit.get('indication_class')
            if isinstance(raw_classes, str):
                # Strip the blanks that surround each ';'-separated entry and
                # drop empty fragments (e.g. from a trailing ';').
                entries = [part.strip() for part in raw_classes.split(';')]
                activities = [entry for entry in entries if entry] or [None]
            else:
                activities = [None]

    record = {'ChEMBL ID': chembl_id}
    for position, activity in enumerate(activities, start=1):
        record[f'ChEMBL bioactivity {position}'] = activity
    record['InChIKey'] = inchi_key
    record['ID'] = ID
    return pd.DataFrame([record])

# -------------------- Prepare processing --------------------
# Counters and display settings used by the retrieval loop
start_time = time.time()
bar_length = 30
processed_this_run = 0
last_successful_id = None

# 🔹 Rows still lacking a ChEMBL ID (NaN or empty string); every row when the
#    column does not exist yet
if 'ChEMBL ID' not in df.columns:
    df_missing = df.copy()
else:
    df_missing = df.loc[df['ChEMBL ID'].isna() | df['ChEMBL ID'].eq("")]

# 🔹 Optionally begin at the row carrying `start_id`
if start_id is None:
    df_to_process = df_missing.sort_index()
elif start_id in df_missing['ID'].values:
    start_idx = df_missing.index[df_missing['ID'] == start_id][0]
    df_to_process = df_missing.loc[start_idx:].sort_index()
    print(f"⚡ Starting from ID '{start_id}' at index {start_idx}")
else:
    raise ValueError(f"❌ ID '{start_id}' not found among compounds with missing ChEMBL ID.")

total_compounds = len(df_to_process)
print(f"🔎 Compounds pending ChEMBL retrieval: {total_compounds}")

# -------------------- Main loop with interrupt protection --------------------
def _save_progress():
    """Write ``df`` to ``output_file`` without the helper 'Index' column.

    A copy is written, so ``df`` itself is not modified by saving.
    """
    df.drop(columns=['Index'], errors='ignore').to_excel(output_file, index=False)


def _store_result(row_label, column, value):
    """Store ``value`` in ``df`` at (row_label, column).

    The column is created as, or cast to, object dtype first. A column that is
    completely empty in the spreadsheet is read back as float64 (all NaN), and
    writing text into it relies on silent upcasting that pandas has deprecated.
    """
    if column not in df.columns:
        df[column] = pd.Series(index=df.index, dtype=object)
    elif df[column].dtype != object:
        df[column] = df[column].astype(object)
    df.at[row_label, column] = value


has_id_column = 'ID' in df.columns

try:
    for idx, row in df_to_process.iterrows():
        inchi_key = row['InChIKey']
        # Fall back to the row label when the dataset has no "ID" column
        ID = row['ID'] if has_id_column else f"Index {idx}"

        chembl_info_df = get_chembl_info(inchi_key, ID)

        if not chembl_info_df.empty:
            _store_result(idx, 'ChEMBL ID', chembl_info_df.at[0, 'ChEMBL ID'])
            for c in chembl_info_df.columns:
                if c.startswith('ChEMBL bioactivity'):
                    _store_result(idx, c, chembl_info_df.at[0, c])

        last_successful_id = ID
        processed_this_run += 1

        # Progress bar with a time estimate based on the average so far
        elapsed_time = time.time() - start_time
        avg_time_per = elapsed_time / processed_this_run
        remaining = total_compounds - processed_this_run
        remaining_secs = avg_time_per * remaining
        h, m, s = int(remaining_secs // 3600), int((remaining_secs % 3600) // 60), int(remaining_secs % 60)
        progress_ratio = processed_this_run / total_compounds
        filled_len = math.ceil(bar_length * progress_ratio)
        bar = '#' * filled_len + '-' * (bar_length - filled_len)

        print(
            f"\r[{bar}] {processed_this_run}/{total_compounds} "
            f"({progress_ratio*100:.1f}%) | Remaining: {remaining} | "
            f"Last successful ID: {last_successful_id} | Time left: {h}h {m}m {s}s",
            end="", flush=True
        )

        # Save intermediate results
        if processed_this_run % save_every == 0:
            _save_progress()

except KeyboardInterrupt:
    print(f"\n🛑 Interrupted by user. Saving progress up to ID: {last_successful_id}...")
    _save_progress()
    print(f"💾 Progress saved in {output_file}")
    raise

except Exception as exc:
    # Any other failure (e.g. a request error) would otherwise discard every
    # result obtained since the last periodic save.
    print(
        f"\n⚠️ Stopped by an error ({type(exc).__name__}: {exc}). "
        f"Saving progress up to ID: {last_successful_id}..."
    )
    _save_progress()
    print(f"💾 Progress saved in {output_file}")
    raise

# -------------------- Final save --------------------
_save_progress()
print(f"\n✅ Final file saved as: {output_file}")


In [None]:
# ============================================================
# 📊 Summary of ChEMBL IDs and Bioactivities
#
# This block reads the *_results.xlsx file generated previously,
# counts the number of compounds with a valid ChEMBL ID (excluding 'not found'),
# and counts the number of compounds with a value in each ChEMBL bioactivity column.
# ============================================================

import glob
import os

import pandas as pd

# Use the most recently modified *_results.xlsx file in the current folder
# (glob returns matches in arbitrary order, so the first hit is not the latest).
results_files = glob.glob('*_results.xlsx')
if not results_files:
    raise FileNotFoundError("No *_results.xlsx file found in the current folder.")
input_file = max(results_files, key=os.path.getmtime)
print(f"Using input file: {input_file}\n")

# Load the file
df = pd.read_excel(input_file)
total_compounds = len(df)


def _percentage(count):
    """Return `count` as a percentage of all compounds; 0.0 for an empty table."""
    return (count / total_compounds) * 100 if total_compounds else 0.0


def _filled(column):
    """Return a boolean mask that is True where a cell is neither NaN nor blank text."""
    return column.notna() & (column.astype(str).str.strip() != '')


# ------------------ ChEMBL ID presence ------------------
if 'ChEMBL ID' in df.columns:
    # Count only IDs that are not NaN, not empty, and not equal to 'not found'
    chembl_ids = df['ChEMBL ID']
    is_valid_id = _filled(chembl_ids) & (
        chembl_ids.astype(str).str.strip().str.lower() != 'not found'
    )
    chembl_id_count = int(is_valid_id.sum())

    print(f"Total compounds: {total_compounds}")
    print(f"Compounds with valid ChEMBL ID: {chembl_id_count} ({_percentage(chembl_id_count):.2f}%)\n")
else:
    print("No 'ChEMBL ID' column found in the input file.\n")

# ------------------ ChEMBL bioactivity counts ------------------
# str(col) keeps the filter working when a column label is not a string
bioactivity_cols = [col for col in df.columns if str(col).startswith('ChEMBL bioactivity')]
if bioactivity_cols:
    print(f"{'Bioactivity column':<25}{'Compounds with activity':<25}{'Percentage':<10}")
    for col in bioactivity_cols:
        # Blank cells are not counted, matching the ChEMBL ID check above
        count = int(_filled(df[col]).sum())
        percent = _percentage(count)
        print(f"{col:<25}{count:<25}{percent:.2f}%")
else:
    print("No 'ChEMBL bioactivity' columns found in the input file.")
