In [1]:
import pandas as pd
from src.smiles_processor import DataFrameSmilesProcessor, save_processed_df
from rdkit import RDLogger
# Turn off RDKit's error, info and warning log channels for this notebook.
for rdkit_log_channel in ('rdApp.error', 'rdApp.info', 'rdApp.warning'):
    RDLogger.DisableLog(rdkit_log_channel)

### Dataset-specific pre-processing

In [2]:
# Stokes 2020: tag every row with the target strain and turn the textual
# 'Activity' column into a 0/1 label (1 only where the value is exactly 'Active').
stokes_2020 = pd.read_csv('data/raw/stokes_2020.csv')
stokes_2020 = stokes_2020.assign(
    target='Escherichia coli BW25113',
    antimicrobial_activity=stokes_2020['Activity'].eq('Active').astype('int64'),
).drop(columns=['Name', 'Activity'])

# Swanson 2024: add the target strain, align column names with the other
# datasets ('SMILES', 'antimicrobial_activity') and drop the descriptor columns.
swanson_2024 = (
    pd.read_csv('data/raw/swanson_2024.csv')
    .assign(target='Acinetobacter baumannii ATCC 17978')
    .rename(columns={
        'smiles': 'SMILES',
        'antibiotic_activity': 'antimicrobial_activity',
    })
    .drop(columns=['clogp_6.5', 'mol_weight', 'clogp'])
)

# Swanson 2025: add the target strain and rename columns to the shared schema.
# 's_aureus_activity' is kept as the continuous readout, while
# 's_aureus_activity_true' becomes the 'antimicrobial_activity' label.
swanson_2025 = (
    pd.read_csv('data/raw/swanson_2025.csv')
    .assign(target='Staphylococcus aureus RN4220')
    .rename(columns={
        'smiles': 'SMILES',
        's_aureus_activity': 'antimicrobial_activity_continuous',
        's_aureus_activity_true': 'antimicrobial_activity',
    })
)

# Liu 2023: add the target strain and derive a binary label from the 'Mean' column.
# NOTE(review): 'Mean' is presumably the average of 'Rep 1' / 'Rep 2' — confirm.
liu_2023 = pd.read_csv('data/raw/liu_2023.csv').assign(
    target='Acinetobacter baumannii ATCC 17978'
)

# A compound is labelled active (1) when its 'Mean' value lies more than one
# standard deviation below the mean of the whole column.
mean_liu, std_liu = liu_2023['Mean'].mean(), liu_2023['Mean'].std()
threshold_liu = mean_liu - std_liu
liu_2023 = liu_2023.assign(
    antimicrobial_activity=liu_2023['Mean'].lt(threshold_liu).astype(int)
).drop(columns=['Name', 'Rep 1', 'Rep 2'])


# Number of leading rows in the raw Wong 2024 file that are labelled active.
# NOTE(review): the labelling below assumes the CSV lists all actives first —
# confirm against the raw data before relying on these labels.
WONG_2024_N_ACTIVE = 512

# Wong 2024: label the first WONG_2024_N_ACTIVE rows active (1) and the rest
# inactive (0), add the target strain and drop the replicate columns.
wong_2024 = pd.read_csv('data/raw/wong_2024.csv')
wong_2024['antimicrobial_activity'] = 0
# Select by position through the index, so the "first N rows" intent is explicit
# instead of relying on an inclusive label slice (`.loc[:511]`).
wong_2024.loc[wong_2024.index[:WONG_2024_N_ACTIVE], 'antimicrobial_activity'] = 1
wong_2024['target'] = 'Staphylococcus aureus RN4220'
wong_2024 = wong_2024.drop(columns=['R1_50uM', 'R2_50uM'])

### SMILES preparation
RDKit first parses and sanitizes each SMILES string (`MolFromSmiles`), which checks whether the molecule is chemically feasible.
Then we standardize the molecule with `rdMolStandardize`: cleanup (removing Hs, disconnecting metals, etc.), uncharging, and selecting the (usually) most stable tautomer via tautomer enumeration (`TautomerEnumerator`). (Refs: [RDKit Book, molecular sanitization](https://www.rdkit.org/docs/RDKit_Book.html#molecular-sanitization) and [a short blog post on molecular standardization](https://www.blopig.com/blog/2022/05/molecular-standardization/))


In [5]:
# Run every dataset through the same processor instance, in a fixed order;
# each frame keeps its SMILES strings in the 'SMILES' column.
processor = DataFrameSmilesProcessor()
stokes_2020, swanson_2024, swanson_2025, liu_2023, wong_2024 = [
    processor.process(frame, 'SMILES')
    for frame in (stokes_2020, swanson_2024, swanson_2025, liu_2023, wong_2024)
]

[32m2025-10-13 15:05:05.345[0m | [1mINFO    [0m | [36msrc.smiles_processor[0m:[36mprocess[0m:[36m18[0m - [1mStarting SMILES validation of 2335 entries.[0m


[32m2025-10-13 15:05:05.551[0m | [1mINFO    [0m | [36msrc.smiles_processor[0m:[36mprocess[0m:[36m23[0m - [1mNumber of invalid SMILES: 0[0m
[32m2025-10-13 15:05:05.552[0m | [32m[1mSUCCESS [0m | [36msrc.smiles_processor[0m:[36mprocess[0m:[36m27[0m - [32m[1mSMILES validation completed. 2335 valid entries retained.[0m
[32m2025-10-13 15:05:05.552[0m | [1mINFO    [0m | [36msrc.smiles_processor[0m:[36mprocess[0m:[36m30[0m - [1mStarting SMILES standardization of 2335 entries.[0m
[32m2025-10-13 15:05:45.544[0m | [1mINFO    [0m | [36msrc.smiles_processor[0m:[36mprocess[0m:[36m44[0m - [1mNumber of invalid SMILES after standardization: 0[0m
[32m2025-10-13 15:05:45.545[0m | [32m[1mSUCCESS [0m | [36msrc.smiles_processor[0m:[36mprocess[0m:[36m48[0m - [32m[1mSMILES standardization completed. 2335 valid entries retained.[0m
[32m2025-10-13 15:05:45.545[0m | [1mINFO    [0m | [36msrc.smiles_processor[0m:[36mprocess[0m:[36m53[0m - [

In [7]:
# Write each processed frame to "<dataset name>_processed.csv".
for name, df in {
    'stokes_2020': stokes_2020,
    'swanson_2024': swanson_2024,
    'swanson_2025': swanson_2025,
    'liu_2023': liu_2023,
    'wong_2024': wong_2024,
}.items():
    save_processed_df(df, f"{name}_processed.csv")