In [None]:
# This script retrieves IC50 bioactivity data from ChEMBL for the compounds tested against one or more targets.
# By Ashok K. Sharma
# Date: Aug-10-2024

In [None]:
# Import necessary libraries
from pathlib import Path

import pandas as pd
from chembl_webresource_client.new_client import new_client

In [24]:
# Single target: fetch the IC50 bioactivity data recorded in ChEMBL for one
# target and label every compound as active / intermediate / inactive.
target = new_client.target
target_query = target.search('glutamate carboxypeptidase II')
targets = pd.DataFrame.from_dict(target_query)

# Fail early with a readable message when the search returns nothing usable.
if targets.empty or 'organism' not in targets.columns or 'pref_name' not in targets.columns:
    raise ValueError("No match found for target 'glutamate carboxypeptidase II'. Please check the target name.")

# The human entry can sit on any row of the search results, so select it by
# organism and preferred name rather than by position.
# regex=False matches the name as literal text; na=False keeps rows with a
# missing pref_name out of the boolean mask.
is_human = targets['organism'] == 'Homo sapiens'
name_matches = targets['pref_name'].str.contains(
    'Glutamate carboxypeptidase II', case=False, na=False, regex=False
)
human_target_ids = targets.loc[is_human & name_matches, 'target_chembl_id']
if human_target_ids.empty:
    raise ValueError("No Homo sapiens entry found for target 'glutamate carboxypeptidase II'.")
selected_target = human_target_ids.iloc[0]

# Get the IC50 records of all compounds tested against this target.
activity = new_client.activity
res = activity.filter(target_chembl_id=selected_target).filter(standard_type="IC50")
df = pd.DataFrame.from_dict(res)
if df.empty:
    raise ValueError(f"No IC50 activity records found for target {selected_target}.")

# Handle missing data: drop compounds without an IC50 value or without a
# SMILES string. Both columns are checked on the same frame, so the filter
# never relies on pandas re-aligning a mask built from a different frame.
df2 = df.dropna(subset=['standard_value', 'canonical_smiles'])

# Keep only the first record for each distinct SMILES string.
df2_nr = df2.drop_duplicates(['canonical_smiles'])

# Keep the ChEMBL ID, canonical SMILES and IC50 value, and renumber the rows
# because the filtering above leaves gaps in the index.
selection = ['molecule_chembl_id', 'canonical_smiles', 'standard_value']
df3 = df2_nr[selection]
df3_reset = df3.reset_index(drop=True)

# Label compounds by IC50 (boundaries are inclusive):
#   active       : IC50 <= 1,000   (1 uM when the value is in nM)
#   inactive     : IC50 >= 10,000  (10 uM when the value is in nM)
#   intermediate : everything in between
# NOTE(review): the cutoffs assume standard_value is expressed in nM; the
# standard_units column is not checked here -- confirm before relying on labels.
ic50_values = pd.to_numeric(df3_reset['standard_value'])
bioactivity_class = (
    pd.Series('intermediate', index=df3_reset.index, name='class')
    .mask(ic50_values >= 10000, 'inactive')
    .mask(ic50_values <= 1000, 'active')
)
df4 = pd.concat([df3_reset, bioactivity_class], axis=1)
df4

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,class
0,CHEMBL51979,O=C(O)CCC(=O)NO,43000.0,inactive
1,CHEMBL47009,O=C(O)CCC(CP(=O)(O)O)C(=O)O,0.3,active
2,CHEMBL65043,O=C(O)CCC(CC(=O)NO)C(=O)O,220.0,active
3,CHEMBL63893,N[C@@H](CCC(=O)NO)C(=O)O,100000.0,inactive
4,CHEMBL65072,CC(=O)N[C@@H](CC(=O)O)C(=O)NO,100000.0,inactive
...,...,...,...,...
280,CHEMBL5205430,[N-]=[N+]=NCCCNC(=O)[C@H](Cc1ccc(O)c([N+](=O)[...,1.5,active
281,CHEMBL5270990,CC1(C)C(/C=C/C=C/C=C2/N(CCCCCC(=O)NCCCC[C@H](N...,19.2,active
282,CHEMBL5282876,CC(C)(C)[Si](F)(c1ccc(C(=O)NCC(NC(=O)CC(C(=O)O...,2.8,active
283,CHEMBL5277672,CC(C)(C)[Si](F)(c1ccc(C(=O)NC[C@@H](NC(=O)CC(C...,7.1,active


In [23]:
# Multiple targets: fetch and label IC50 bioactivity data for every target in
# a list given directly in the notebook.
# Define the list of targets you want to search for.
target_names = ['glutamate carboxypeptidase II', 'acetylcholinesterase']

# Per-target frames are collected here and concatenated once after the loop,
# which avoids re-copying the accumulated data on every iteration.
raw_frames = []
processed_frames = []

# Columns kept in the processed (curated) table.
selection = ['molecule_chembl_id', 'canonical_smiles', 'standard_value']

for target_name in target_names:
    # Search ChEMBL for the target.
    target = new_client.target
    target_query = target.search(target_name)
    targets = pd.DataFrame.from_dict(target_query)

    # Skip targets whose search returned nothing usable.
    if targets.empty or 'organism' not in targets.columns or 'pref_name' not in targets.columns:
        print(f"No match found for target '{target_name}'. Please check the target name.")
        continue

    # Pick the first human entry whose preferred name contains the query.
    # regex=False treats the name as literal text (names may contain regex
    # metacharacters such as parentheses); na=False ignores missing names.
    is_human = targets['organism'] == 'Homo sapiens'
    name_matches = targets['pref_name'].str.contains(target_name, case=False, na=False, regex=False)
    human_target_ids = targets.loc[is_human & name_matches, 'target_chembl_id']
    if human_target_ids.empty:
        # Search hits exist, but none is a human target with a matching name.
        print(f"No Homo sapiens match found for target '{target_name}'. Please check the target name.")
        continue
    selected_target = human_target_ids.iloc[0]

    # Get the IC50 records of all compounds tested against this target.
    activity = new_client.activity
    res = activity.filter(target_chembl_id=selected_target).filter(standard_type="IC50")
    df = pd.DataFrame.from_dict(res)
    if df.empty:
        print(f"No IC50 activity data found for target '{target_name}' ({selected_target}).")
        continue

    # Save the raw data tagged with the target name.
    df['target_name'] = target_name
    raw_frames.append(df)

    # Handle missing data: drop rows lacking an IC50 value or a SMILES string,
    # then keep the first record for each distinct SMILES string.
    df2 = df.dropna(subset=['standard_value', 'canonical_smiles'])
    df2_nr = df2.drop_duplicates(['canonical_smiles'])

    # Keep ChEMBL ID, canonical SMILES and IC50 value; renumber the rows.
    df3 = df2_nr[selection]
    df3_reset = df3.reset_index(drop=True)

    # Label compounds by IC50 (inclusive boundaries): <= 1,000 active,
    # >= 10,000 inactive, otherwise intermediate.
    # NOTE(review): assumes standard_value is in nM; standard_units is not
    # checked here -- confirm.
    ic50_values = pd.to_numeric(df3_reset['standard_value'])
    bioactivity_class = (
        pd.Series('intermediate', index=df3_reset.index, name='class')
        .mask(ic50_values >= 10000, 'inactive')
        .mask(ic50_values <= 1000, 'active')
    )
    df4 = pd.concat([df3_reset, bioactivity_class], axis=1)

    # Tag the processed rows with the target name.
    df4['target_name'] = target_name
    processed_frames.append(df4)

# final_raw_df holds the raw records for all targets; final_processed_df holds
# the curated, labelled records. Both are empty frames if every target was skipped.
final_raw_df = pd.concat(raw_frames, ignore_index=True) if raw_frames else pd.DataFrame()
final_processed_df = pd.concat(processed_frames, ignore_index=True) if processed_frames else pd.DataFrame()
final_processed_df

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,class,target_name
0,CHEMBL51979,O=C(O)CCC(=O)NO,43000.0,inactive,glutamate carboxypeptidase II
1,CHEMBL47009,O=C(O)CCC(CP(=O)(O)O)C(=O)O,0.3,active,glutamate carboxypeptidase II
2,CHEMBL65043,O=C(O)CCC(CC(=O)NO)C(=O)O,220.0,active,glutamate carboxypeptidase II
3,CHEMBL63893,N[C@@H](CCC(=O)NO)C(=O)O,100000.0,inactive,glutamate carboxypeptidase II
4,CHEMBL65072,CC(=O)N[C@@H](CC(=O)O)C(=O)NO,100000.0,inactive,glutamate carboxypeptidase II
...,...,...,...,...,...
6649,CHEMBL2238282,O=C(/C=C/c1ccc(N2CCCCC2)cc1)c1sccc1Cl,160.0,active,acetylcholinesterase
6650,CHEMBL4636881,CC(=O)Nc1c(F)cc(C(=O)N[C@H]2CC[C@H](O)CC2)cc1O...,7943.28,intermediate,acetylcholinesterase
6651,CHEMBL4635134,CNC(=O)c1cc(C(=O)NC2CC2)cn(Cc2ccccc2)c1=O,100000.0,inactive,acetylcholinesterase
6652,CHEMBL4639128,COCc1nc2cnc3cc(-c4c(C)noc4C)c(OC[C@H]4CCNC4)cc...,63095.73,inactive,acetylcholinesterase


In [29]:
# Multiple targets involved in different diseases.
# Example input: targets involved in IBD and Alzheimer's disease (AD).
# The CSV must provide a 'disease_names' and a 'target_names' column; each row
# pairs one disease with one target name to look up in ChEMBL.
file_path = '/Users/ashoksharma/Work/Python_bioinfo/IBD_target/input/Set1_Disease_Target.csv'  # Update the path accordingly
disease_target_df = pd.read_csv(file_path)

# Fail here, before any network calls are made, if the file does not have the
# columns the downstream loop reads.
required_columns = {'disease_names', 'target_names'}
missing_columns = required_columns - set(disease_target_df.columns)
if missing_columns:
    raise ValueError(f"{file_path} is missing required column(s): {sorted(missing_columns)}")

disease_target_df

Unnamed: 0,disease_names,target_names
0,IBD,glutamate carboxypeptidase II
1,Alzheimer's disease,acetylcholinesterase
2,IBD,TNF-alpha


In [28]:
# Fetch and label IC50 bioactivity data for every (disease, target) pair listed
# in disease_target_df.
# Rows are filtered with dropna on a single frame, so no misaligned boolean
# mask is built and no global warning filter is needed.

# Per-target frames are collected here and concatenated once after the loop.
raw_frames = []
processed_frames = []

# Columns kept in the processed (curated) table.
selection = ['molecule_chembl_id', 'canonical_smiles', 'standard_value']

# Walk through the disease/target pairs row by row.
for disease_name, target_name in zip(disease_target_df['disease_names'], disease_target_df['target_names']):
    # A blank cell in the input file cannot be searched; skip it.
    if pd.isna(target_name):
        print(f"Skipping row with missing target name (disease: {disease_name}).")
        continue

    # Search ChEMBL for the target.
    target = new_client.target
    target_query = target.search(target_name)
    targets = pd.DataFrame.from_dict(target_query)

    # Skip targets whose search returned nothing usable.
    if targets.empty or 'organism' not in targets.columns or 'pref_name' not in targets.columns:
        print(f"No match found for target '{target_name}'. Please check the target name.")
        continue

    # Keep the human entries whose preferred name contains the query.
    # regex=False treats the name as literal text; na=False ignores missing names.
    is_human = targets['organism'] == 'Homo sapiens'
    name_matches = targets['pref_name'].str.contains(target_name, case=False, na=False, regex=False)
    selected_target = targets[is_human & name_matches]
    if selected_target.empty:
        # Search hits exist, but none is a human target with a matching name.
        print(f"No Homo sapiens match found for target '{target_name}'. Please check the target name.")
        continue
    selected_target_id = selected_target['target_chembl_id'].iloc[0]

    # Get the IC50 records of all compounds tested against this target.
    activity = new_client.activity
    res = activity.filter(target_chembl_id=selected_target_id).filter(standard_type="IC50")
    df = pd.DataFrame.from_dict(res)
    if df.empty:
        print(f"No IC50 activity data found for target '{target_name}' ({selected_target_id}).")
        continue

    # Save the raw data tagged with the target and disease names.
    df['target_name'] = target_name
    df['disease_name'] = disease_name
    raw_frames.append(df)

    # Handle missing data: drop rows lacking an IC50 value or a SMILES string,
    # then keep the first record for each distinct SMILES string.
    df2 = df.dropna(subset=['standard_value', 'canonical_smiles'])
    df2_nr = df2.drop_duplicates(['canonical_smiles'])

    # Keep ChEMBL ID, canonical SMILES and IC50 value; renumber the rows.
    df3 = df2_nr[selection]
    df3_reset = df3.reset_index(drop=True)

    # Label compounds by IC50 (inclusive boundaries): <= 1,000 active,
    # >= 10,000 inactive, otherwise intermediate.
    # NOTE(review): assumes standard_value is in nM; standard_units is not
    # checked here -- confirm.
    ic50_values = pd.to_numeric(df3_reset['standard_value'])
    bioactivity_class = (
        pd.Series('intermediate', index=df3_reset.index, name='class')
        .mask(ic50_values >= 10000, 'inactive')
        .mask(ic50_values <= 1000, 'active')
    )
    df4 = pd.concat([df3_reset, bioactivity_class], axis=1)

    # Tag the processed rows with the target and disease names.
    df4['target_name'] = target_name
    df4['disease_name'] = disease_name
    processed_frames.append(df4)

# final_raw_df holds the raw records and final_processed_df the curated,
# labelled records, both tagged with disease and target names. Both are empty
# frames if every row was skipped.
final_raw_df = pd.concat(raw_frames, ignore_index=True) if raw_frames else pd.DataFrame()
final_processed_df = pd.concat(processed_frames, ignore_index=True) if processed_frames else pd.DataFrame()
final_processed_df

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,class,target_name,disease_name
0,CHEMBL51979,O=C(O)CCC(=O)NO,43000.0,inactive,glutamate carboxypeptidase II,IBD
1,CHEMBL47009,O=C(O)CCC(CP(=O)(O)O)C(=O)O,0.3,active,glutamate carboxypeptidase II,IBD
2,CHEMBL65043,O=C(O)CCC(CC(=O)NO)C(=O)O,220.0,active,glutamate carboxypeptidase II,IBD
3,CHEMBL63893,N[C@@H](CCC(=O)NO)C(=O)O,100000.0,inactive,glutamate carboxypeptidase II,IBD
4,CHEMBL65072,CC(=O)N[C@@H](CC(=O)O)C(=O)NO,100000.0,inactive,glutamate carboxypeptidase II,IBD
...,...,...,...,...,...,...
7610,CHEMBL5287859,Cc1coc2c1C(=O)c1c3c-2ccc2c3c(n1Cc1ccccc1)C(=O)...,6380.0,intermediate,TNF-alpha,IBD
7611,CHEMBL2391027,CCOC(=O)C1=C(C)N=c2s/c(=C\c3ccc(N4CCOCC4)cc3)c...,3310.0,intermediate,TNF-alpha,IBD
7612,CHEMBL5289737,C=CCOc1ccc(C2C(C(=O)OCC)=C(C)NC3S/C(=C\c4ccc(N...,1160.0,intermediate,TNF-alpha,IBD
7613,CHEMBL5282465,CCOC(=O)C1=C(C)NC2S/C(=C\c3ccc(N4CCOCC4)cc3)C(...,890.0,active,TNF-alpha,IBD


In [30]:
# Report the size of both result tables, then write them to CSV.
print(final_raw_df.shape)
print(final_processed_df.shape)

# Change the output location / file names to match your own input set.
output_dir = Path('/Users/ashoksharma/Work/Python_bioinfo/IBD_target/output')
# Create the folder first so the export does not fail on a missing directory
# after all the data has already been downloaded.
output_dir.mkdir(parents=True, exist_ok=True)

final_raw_df.to_csv(output_dir / 'Set1_bioactivity_data_Raw.csv', index=False)
final_processed_df.to_csv(output_dir / 'Set1_bioactivity_data_curated.csv', index=False)

(10563, 48)
(7615, 6)
