In [1]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors, Lipinski
from chembl_webresource_client.new_client import new_client
import os


KeyboardInterrupt



In [None]:
# Category labels; each one doubles as the stem of an input CSV file.
# NOTE(review): presumably one ChEMBL export per category — confirm.
ntds = ['Antibiotic', 'Antifungal', 'Antiviral', 'Antiparasitic', 'Antihelminthic']

# Category handled by this run (also the CSV file name without extension).
name = ntds[4]
print(name)

# The input file is semicolon-separated and sits next to the notebook.
file_path = f'{name}.csv'
df = pd.read_csv(file_path, sep=';')

# Handle on the ChEMBL activity resource, used below to fetch bioactivities.
activity = new_client.activity

In [None]:
# Per-category results directory; creating it is a no-op if it already exists.
output_folder = f'{name}_Dataset (forreal)'
os.makedirs(output_folder, exist_ok=True)

In [None]:
# Request the IC50 activity records for every molecule listed in the
# 'ChEMBL ID' column of the input table.
chembl_ids = df['ChEMBL ID'].tolist()
bioactivities = activity.filter(molecule_chembl_id__in=chembl_ids, type='IC50')

In [None]:
# Column-oriented accumulator: one (initially empty) list per field that is
# collected from each bioactivity record. The closing brace was missing,
# which made this cell a SyntaxError.
data = {
    'molecule_chembl_id': [],
    'canonical_smiles': [],
    'pchembl_value': [],
}

In [None]:
# Copy the fields of interest from every bioactivity record into `data`;
# a field that is absent from a record is stored as an empty string.
fields = ('molecule_chembl_id', 'canonical_smiles', 'pchembl_value')
for entry in bioactivities:
    for field in fields:
        data[field].append(entry.get(field, ''))

In [None]:
# Turn the collected columns into a table (one row per bioactivity record).
df_bioactivities = pd.DataFrame.from_dict(data)

In [None]:

# Coerce pchembl_value to numbers (anything unparseable becomes NaN) and
# discard the rows left without a usable value.
numeric_pchembl = pd.to_numeric(df_bioactivities['pchembl_value'], errors='coerce')
df_bioactivities = (
    df_bioactivities
    .assign(pchembl_value=numeric_pchembl)
    .dropna(subset=['pchembl_value'])
)

# Row label of the largest pchembl_value within each canonical_smiles group
idx_to_keep = (
    df_bioactivities
    .groupby('canonical_smiles')['pchembl_value']
    .idxmax()
)

# One row per canonical_smiles: the one carrying its highest pchembl_value
df_filtered = df_bioactivities.loc[idx_to_keep]

print(len(df_filtered))


In [None]:
# Remove exact duplicate rows and rows lacking a SMILES string.
# The two steps are chained on purpose: previously the second statement
# started again from `df_filtered`, silently discarding the result of
# drop_duplicates().
# NOTE(review): dropna() only removes missing values (None/NaN); empty-string
# SMILES would still pass through — confirm whether those can occur.
df_bioactivities = (
    df_filtered
    .drop_duplicates()
    .dropna(subset=['canonical_smiles'])
)


In [None]:
# Write the cleaned table to <output_folder>/<name>_IC50.csv without the index column.
csv_file_name = f'{name}_IC50.csv'
csv_file_path = os.path.join(output_folder, csv_file_name)
df_bioactivities.to_csv(csv_file_path, index=False)
