In [1]:
import os
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors, Lipinski
from chembl_webresource_client.new_client import new_client



In [2]:
# ChEMBL client endpoints reused by the cells below.
target, activity = new_client.target, new_client.activity

# Candidate genera; the trailing comment on each entry is its index in the list.
ntds = [
    'Cryptosporidium',  # 0
    'Entamoeba',        # 1
    'Giardia',          # 2
    'Leishmania',       # 3
    'Nematoda',         # 4
    'Plasmodium',       # 5
    'Schistosoma',      # 6
    'Toxoplasma',       # 7
    'Trachomatis',      # 8
    'Trypanosoma',      # 9
]

# Genus of the selected NTD, picked by position in `ntds`.
name = ntds[9]

# Search ChEMBL targets by genus name; keep both the query object and the
# result of `.all()` because later cells use each of them.
target_query = target.search(name)
targets = target_query.all()

print(name)

Trypanosoma


In [3]:
# Show how many entries the target search query reports (len() of the query object).
print(len(target_query))

75


In [4]:
# Dump the records obtained from `target_query.all()` for a quick visual check.
print(targets)

[{'cross_references': [], 'organism': 'Trypanosoma', 'pref_name': 'Trypanosoma', 'score': 16.0, 'species_group_flag': True, 'target_chembl_id': 'CHEMBL612883', 'target_components': [], 'target_type': 'ORGANISM', 'tax_id': 5690}, {'cross_references': [], 'organism': 'Trypanosoma brucei', 'pref_name': 'Trypanosoma brucei', 'score': 14.0, 'species_group_flag': False, 'target_chembl_id': 'CHEMBL612849', 'target_components': [], 'target_type': 'ORGANISM', 'tax_id': 5691}, {'cross_references': [], 'organism': 'Trypanosoma cruzi', 'pref_name': 'Trypanosoma cruzi', 'score': 14.0, 'species_group_flag': False, 'target_chembl_id': 'CHEMBL368', 'target_components': [], 'target_type': 'ORGANISM', 'tax_id': 5693}, {'cross_references': [], 'organism': 'Trypanosoma evansi', 'pref_name': 'Trypanosoma evansi', 'score': 14.0, 'species_group_flag': False, 'target_chembl_id': 'CHEMBL1075362', 'target_components': [], 'target_type': 'ORGANISM', 'tax_id': 5697}, '...(remaining elements truncated)...']


In [5]:
# Directory that receives the exported CSV files for the selected genus.
# `exist_ok=True` lets the cell be re-run without failing when the folder is already there.
output_folder = f'_{name}_Dataset (forreal)'
os.makedirs(output_folder, exist_ok=True)

In [6]:
def check_lipinski_rule(smiles):
    """Tell whether a SMILES string parses and satisfies all four cutoffs below.

    Criteria checked (all must hold):
      * hydrogen-bond donors    <= 5
      * hydrogen-bond acceptors <= 10
      * molecular weight        <= 500
      * rotatable bonds         <= 10

    NOTE(review): the classic rule of five limits logP (<= 5) and has no
    rotatable-bond criterion; confirm that this variant is intentional.

    Args:
        smiles: SMILES text. Any non-string value (None, a float NaN coming
            from a pandas column, ...) or a blank string is treated as invalid.

    Returns:
        bool: True only if RDKit returns a molecule for `smiles` and every
        criterion holds; False otherwise.
    """
    # Guard against non-string input: a float NaN is truthy, so the previous
    # `smiles and smiles.strip()` test raised AttributeError on it.
    if not isinstance(smiles, str) or not smiles.strip():
        return False

    mol = Chem.MolFromSmiles(smiles)
    if not mol:  # no usable molecule was produced for this SMILES
        return False

    return bool(
        Lipinski.NumHDonors(mol) <= 5
        and Lipinski.NumHAcceptors(mol) <= 10
        and Descriptors.MolWt(mol) <= 500
        and Lipinski.NumRotatableBonds(mol) <= 10
    )

In [7]:
# Gather IC50 bioactivities for every target found by the search, keep one row
# per SMILES, and export the combined table as a single CSV in `output_folder`.

# Fields requested from the activity endpoint and kept in the exported table.
bioactivity_columns = ['molecule_chembl_id', 'canonical_smiles', 'pchembl_value']

per_target_frames = []

for target_entry in targets:
    target_id = target_entry['target_chembl_id']

    bioactivities = activity.filter(
        target_chembl_id=target_id,
        type='IC50',
    ).only(*bioactivity_columns)

    if bioactivities:
        # NOTE(review): duplicates are dropped before rows with missing values,
        # so a compound whose first record has a missing field is removed even
        # if a later record is complete. Order kept as in the original; confirm.
        df_bioactivities = (
            pd.DataFrame(bioactivities)
            .drop_duplicates(subset=['canonical_smiles'])
            .dropna()
        )
    else:
        df_bioactivities = pd.DataFrame(columns=bioactivity_columns)

    # Report every target that contributes nothing, whether the query returned
    # no records or all of its records were removed by the cleaning step.
    if df_bioactivities.empty:
        print(f"No valid data for target {target_id}")
        continue

    # The former `standard_value` numeric conversion was removed: that field is
    # not among the requested ones and is discarded by this column selection,
    # so it was dead work at best and a KeyError at worst.
    df_bioactivities = df_bioactivities[bioactivity_columns]
    per_target_frames.append(df_bioactivities)
    print(len(df_bioactivities))

# Concatenate all per-target DataFrames. pd.concat raises ValueError on an
# empty list, so fall back to an empty table with the expected columns.
# The name `test_bioactivities_list` is kept for the final DataFrame so that
# any later cell relying on it keeps working.
if per_target_frames:
    test_bioactivities_list = pd.concat(per_target_frames, ignore_index=True)
else:
    test_bioactivities_list = pd.DataFrame(columns=bioactivity_columns)

# The same compound can be reported for several targets; keep its first row only.
test_bioactivities_list = test_bioactivities_list.drop_duplicates(subset=['canonical_smiles'])

csv_filename = os.path.join(output_folder, name + '_IC50.csv')
test_bioactivities_list.to_csv(csv_filename, index=False)

print(len(test_bioactivities_list))
print("Done :D")

4
2443
8639
62
18
70
No valid data for target CHEMBL5209658
No valid data for target CHEMBL4879389
5388
267
4515
109
231
732
No valid data for target CHEMBL2601
No valid data for target CHEMBL5069
187
No valid data for target CHEMBL3697
81
42
55
9
No valid data for target CHEMBL3328
17
19
No valid data for target CHEMBL5220
34
34
54
38
No valid data for target CHEMBL6134
No valid data for target CHEMBL5672
88
18
16
11
No valid data for target CHEMBL1075156
19
8
14
No valid data for target CHEMBL1667688
51
No valid data for target CHEMBL1741167
191
1
51
249
No valid data for target CHEMBL2146316
No valid data for target CHEMBL2189141
No valid data for target CHEMBL2362982
No valid data for target CHEMBL2366047
42
No valid data for target CHEMBL3286069
No valid data for target CHEMBL3337328
No valid data for target CHEMBL3758065
No valid data for target CHEMBL3826861
2
5
504
13
130
No valid data for target CHEMBL4105987
3
46
No valid data for target CHEMBL4295599
1
No valid data for targ

In [8]:
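# NOTE: disabled draft of a per-target export (one CSV per target id, filtered with check_lipinski_rule); kept for reference only.
# for target_entry in targets: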
#     test_bioactivities_list = []

#     target_id = target_entry['target_chembl_id']
    
#     # Get bioactivity data for the target
#     bioactivities = activity.filter(target_chembl_id=target_entry['target_chembl_id'], type='IC50').only(
#         'molecule_chembl_id', 
#         'canonical_smiles',  
#         'standard_value',
#         'standard_units',
#         'standard_type',
#         'pchembl_value',
#         'target_pref_name',
#         'bao_label'
#     )
#     df_bioactivities = pd.DataFrame(bioactivities)
#     df_bioactivities = df_bioactivities.drop_duplicates(subset=['canonical_smiles']).dropna()

#     if not df_bioactivities.empty:

#         df_bioactivities['passes_lipinski'] = df_bioactivities['canonical_smiles'].apply(check_lipinski_rule)
#         df_bioactivities = df_bioactivities[df_bioactivities['passes_lipinski']]

#         df_bioactivities['standard_value'] = pd.to_numeric(df_bioactivities['standard_value'], errors='coerce')
#         df_bioactivities = df_bioactivities.dropna(subset=['standard_value'])

        
#         df_bioactivities = df_bioactivities[[
#             'molecule_chembl_id', 
#             'canonical_smiles',  
#             'standard_value',
#             'standard_units',
#             'standard_type',
#             'pchembl_value',
#             'target_pref_name',
#             'bao_label', 
#             'passes_lipinski'
#         ]]
#         test_bioactivities_list.append(df_bioactivities)

#         test_bioactivities = pd.concat(test_bioactivities_list, ignore_index=True)
        
#         csv_filename = os.path.join(output_folder, f'{target_id}.csv')
#         test_bioactivities.to_csv(csv_filename, index=False)
#         # print(df_bioactivities.head())
#         # print(len(df_bioactivities))


#     else:
#         print(f"No valid data for target {target_id}")

# # Concatenate all DataFrames outside the loop


# print("Done :D")