In [None]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors, Lipinski
from chembl_webresource_client.new_client import new_client
import os

In [None]:
# Create ChEMBL client handles for the `target` and `activity` resources
target = new_client.target
activity = new_client.activity
# Search ChEMBL targets for the term 'bacteria'
target_query = target.search('bacteria')
# `targets_all` is the collection the bioactivity download loop iterates over
# NOTE(review): presumably `.all()` returns the complete, unfiltered result set
# of the search -- confirm against the ChEMBL client documentation
targets_all = target_query.all()
# Report the size of the search result
print(len(target_query))

In [None]:
# Create a folder to store the CSV files.
# The path is relative, so it is resolved against the kernel's current working directory.
output_folder = 'Bacteria_Dataset (forreal)'
# exist_ok=True lets this cell be re-run without failing when the folder already exists
os.makedirs(output_folder, exist_ok=True)

In [None]:
# Function to check Lipinski's Rule of Five
def check_lipinski_rule(smiles):
    """Return True when a SMILES string passes the drug-likeness filter.

    The filter is Lipinski's Rule of Five:

    * hydrogen-bond donors <= 5
    * hydrogen-bond acceptors <= 10
    * molecular weight <= 500
    * logP (RDKit's Crippen estimate) <= 5

    plus the rotatable-bond cap (<= 10) that this notebook already applied.
    The rotatable-bond limit is not part of Lipinski's rule; it is kept as an
    additional, stricter criterion.

    Parameters
    ----------
    smiles : str or None
        SMILES string to evaluate.

    Returns
    -------
    bool
        False for non-string, empty/whitespace-only or unparsable input and
        for molecules that violate any criterion; True otherwise.
    """
    # Non-string values (None, NaN floats coming out of pandas, numbers) have
    # no usable SMILES; reject them instead of failing on `.strip()`.
    if not isinstance(smiles, str) or not smiles.strip():
        return False

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:  # RDKit returns None for SMILES it cannot parse
        return False

    # Chained `and` short-circuits, so remaining descriptors are not computed
    # once one criterion has failed.
    return bool(
        Lipinski.NumHDonors(mol) <= 5
        and Lipinski.NumHAcceptors(mol) <= 10
        and Descriptors.MolWt(mol) <= 500
        and Descriptors.MolLogP(mol) <= 5
        and Lipinski.NumRotatableBonds(mol) <= 10
    )

In [None]:
# Download IC50 bioactivities for every target in `targets_all`, keep one row
# per SMILES that passes `check_lipinski_rule` and has a numeric
# `standard_value`, then write the combined table to a CSV file.

# Fields requested from the ChEMBL activity resource (defined once so the
# request and the exported column order cannot drift apart).
ACTIVITY_FIELDS = [
    'molecule_chembl_id',
    'canonical_smiles',
    'standard_value',
    'standard_units',
    'standard_type',
    'pchembl_value',
    'target_pref_name',
    'bao_label',
]
OUTPUT_COLUMNS = ACTIVITY_FIELDS + ['passes_lipinski']

# Per-target frames are collected in a list and concatenated once after the
# loop; concatenating inside the loop re-copies all accumulated rows on every
# iteration.
per_target_frames = []
n_rows_collected = 0

for target_entry in targets_all:

    target_id = target_entry['target_chembl_id']

    # Get bioactivity data for the target
    bioactivities = activity.filter(target_chembl_id=target_id, type='IC50').only(*ACTIVITY_FIELDS)
    df = pd.DataFrame(bioactivities)

    # Keep the first record per SMILES and drop rows with ANY missing field
    # (this includes rows without a pchembl_value).
    df_bioactivities = df.drop_duplicates(subset=['canonical_smiles']).dropna()

    if df_bioactivities.empty:
        print(f"No valid data for target {target_id}")
        continue

    df_bioactivities = df_bioactivities.assign(
        passes_lipinski=df_bioactivities['canonical_smiles'].apply(check_lipinski_rule)
    )
    # .copy() gives an independent frame, so the column assignment below does
    # not write into a filtered slice (SettingWithCopyWarning).
    df_bioactivities = df_bioactivities[df_bioactivities['passes_lipinski']].copy()

    # Non-numeric potency values become NaN and are dropped
    df_bioactivities['standard_value'] = pd.to_numeric(df_bioactivities['standard_value'], errors='coerce')
    df_bioactivities = df_bioactivities.dropna(subset=['standard_value'])

    df_bioactivities = df_bioactivities[OUTPUT_COLUMNS]

    if not df_bioactivities.empty:
        per_target_frames.append(df_bioactivities)
        n_rows_collected += len(df_bioactivities)

    # Running total of collected rows (before cross-target de-duplication)
    print(n_rows_collected)

if per_target_frames:
    # The same molecule can appear under several targets; keep its first occurrence
    test_bioactivities = (
        pd.concat(per_target_frames, ignore_index=True)
        .drop_duplicates(subset=['canonical_smiles'])
        .reset_index(drop=True)
    )
else:
    # Nothing collected: keep the expected columns so the CSV still has a header
    test_bioactivities = pd.DataFrame(columns=OUTPUT_COLUMNS)

csv_filename = os.path.join(output_folder, 'Bacteria_.csv')
test_bioactivities.to_csv(csv_filename, index=False)

print("Done :D")

