# Downselection Pipelines 

Throughout the paper, we used various funnels, each consisting of a combination of the following filters and their respective thresholds. The README of this GitHub repository explains how to obtain the scores needed for each filtering step.

* Antibiotic activity based on Chemprop scores (Staph. aureus, N. gonorrhoeae): ```ACTIVITY_THRESHOLD```
* Cytotoxicity based on Chemprop scores (IMR90, HSkMC, HepG2): ```TOX_THRESHOLD```
* PAINS and Brenk filters (binary, no threshold needed)
* Tanimoto similarity to 559 known antibiotics: ```TANSIM_THRESHOLD```
* Retrosynthetic accessibility (SA or RA) score: ```RA_THRESHOLD```

In [None]:
# Example cutoffs for the downselection funnel -- tune them for your own run.
ACTIVITY_THRESHOLD: float = 0.1  # activity step keeps rows scoring strictly above this
TOX_THRESHOLD: float = 0.8  # toxicity step keeps rows scoring strictly below this
TANSIM_THRESHOLD: float = 0.2  # similarity step keeps rows with max Tanimoto strictly below this
RA_THRESHOLD: float = 0.2  # cutoff intended for the retrosynthetic-accessibility (RA) score step

# Name of the column that holds the SMILES strings in the input tables.
SMILES_COL: str = "SMILES"

In [None]:
import pandas as pd

# Load the candidate molecules to be downselected.
# NOTE: the path below is a placeholder; point it at your own CSV file.
# The table needs the SMILES column named in SMILES_COL plus the score columns
# that the filtering steps below read.
smiles_df = pd.read_csv("/path/to/molecules_to_be_downselected.csv")

# Report the starting size so the effect of each later filter can be judged.
print(f"The original file has {len(smiles_df)} molecules.")

### 1) Filtering for Antibiotic Activity  

In [None]:
# Placeholder: set this to the column that holds the predicted activity score.
ACTIVITY_COL = "ACTIVITY_COLUMN"

# Keep only rows whose activity score is strictly greater than the cutoff;
# rows with a missing score compare as False and are dropped as well.
len_before = smiles_df.shape[0]
smiles_df = smiles_df.loc[smiles_df[ACTIVITY_COL].gt(ACTIVITY_THRESHOLD)]
len_after = smiles_df.shape[0]

print(f"removed: {len_before - len_after} molecules with activity below {ACTIVITY_THRESHOLD}")
print(f"remaining: {len_after} molecules")

### 2) Filter for Toxicity

In [None]:
# Placeholder: set this to the column that holds the predicted toxicity score.
TOX_COL = "TOXICITY_COLUMN"

# Keep only rows whose toxicity score is strictly less than the cutoff;
# rows with a missing score compare as False and are dropped as well.
len_before = smiles_df.shape[0]
smiles_df = smiles_df.loc[smiles_df[TOX_COL].lt(TOX_THRESHOLD)]
len_after = smiles_df.shape[0]

print(f"removed: {len_before - len_after} molecules with toxicity above {TOX_THRESHOLD}")
print(f"remaining: {len_after} molecules")

### 3) PAINS and Brenk

In [None]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import FilterCatalog
from rdkit.Chem.FilterCatalog import FilterCatalogParams

# Build two RDKit filter catalogs that the check_* helpers below query:
# one for PAINS alerts and one for Brenk alerts.
_catalog_ids = FilterCatalogParams.FilterCatalogs

# PAINS: the combined .PAINS entry is used rather than the separate A/B/C sets.
pains_params = FilterCatalogParams()
pains_params.AddCatalog(_catalog_ids.PAINS)
pains_catalog = FilterCatalog.FilterCatalog(pains_params)

# Brenk: RDKit's built-in Brenk catalog.
brenk_params = FilterCatalogParams()
brenk_params.AddCatalog(_catalog_ids.BRENK)
brenk_catalog = FilterCatalog.FilterCatalog(brenk_params)

def check_pains(smile):
    """Return 1 if the SMILES matches an entry in the PAINS catalog, else 0.

    A SMILES string that RDKit cannot parse is reported as 0 (not flagged).
    """
    parsed = Chem.MolFromSmiles(smile)
    if parsed is not None and pains_catalog.HasMatch(parsed):
        return 1
    return 0

def check_brenk(smile):
    """Return 1 if the SMILES matches an entry in the Brenk catalog, else 0.

    A SMILES string that RDKit cannot parse is reported as 0 (not flagged).
    """
    parsed = Chem.MolFromSmiles(smile)
    if parsed is not None and brenk_catalog.HasMatch(parsed):
        return 1
    return 0

# Flag every molecule: 1 = at least one alert matched, 0 = none.
# The SMILES column is selected by label through SMILES_COL; `.iloc` is
# position-based and raises when given a column name.
# `assign` returns a new DataFrame, so the flag columns are not written into
# a frame that was produced by an earlier boolean filter.
smiles_df = smiles_df.assign(
    has_PAINS=smiles_df[SMILES_COL].apply(check_pains),
    has_Brenk=smiles_df[SMILES_COL].apply(check_brenk),
)

# Keep only molecules that triggered neither a PAINS nor a Brenk alert.
# Unparseable SMILES are flagged 0 by the check_* helpers and therefore pass.
len_before = len(smiles_df)
is_clean = (smiles_df["has_PAINS"] == 0) & (smiles_df["has_Brenk"] == 0)
smiles_df = smiles_df[is_clean]
len_after = len(smiles_df)

print(f"removed: {len_before - len_after} molecules with PAINS or Brenk alerts")
print(f"remaining: {len_after} molecules")

### 4) Tanimoto Similarity to Known Antibiotics

In [None]:
import pandas as pd
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem

# Read the known antibiotics from CSV (the path is relative to the working directory).
# The file needs a SMILES column named as in SMILES_COL; it is read below to
# build the reference fingerprints.
known_abx = pd.read_csv("559_known_abx.csv")

def get_fingerprint(smile, radius=2, nBits=2048):
    """Return the Morgan bit-vector fingerprint for a SMILES string.

    Args:
        smile: SMILES string of the molecule.
        radius: Morgan fingerprint radius.
        nBits: Length of the bit vector.

    Returns:
        The fingerprint, or None when RDKit cannot parse the SMILES.
    """
    mol = Chem.MolFromSmiles(smile)
    if not mol:
        return None
    return AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=nBits)

# Reference fingerprints: one per known antibiotic, read from the SMILES_COL
# column (change SMILES_COL if your file names it differently).
known_abx["fp"] = known_abx[SMILES_COL].apply(get_fingerprint)
# Entries that could not be parsed are None; leave them out of the reference list.
known_fps = known_abx["fp"].dropna().tolist()

# Candidate fingerprints, kept next to each molecule in a helper column.
smiles_df["fp"] = smiles_df[SMILES_COL].apply(get_fingerprint)

def max_tanimoto(fp, known_fps):
    """Return the highest Tanimoto similarity between `fp` and `known_fps`.

    Returns 0.0 when `fp` is None or when there are no similarities to compare.
    """
    if fp is None:
        return 0.0
    similarities = DataStructs.BulkTanimotoSimilarity(fp, known_fps)
    return max(similarities, default=0.0)

# Add a column 'TanSim' holding, for each molecule, its maximum Tanimoto
# similarity to any of the known antibiotics.
smiles_df['TanSim'] = smiles_df['fp'].apply(lambda fp: max_tanimoto(fp, known_fps))

# Keep only molecules whose maximum similarity is strictly below the cutoff,
# i.e. those that are structurally dissimilar to every known antibiotic.
# Molecules without a fingerprint get TanSim 0.0 from max_tanimoto and pass.
len_before = len(smiles_df)
smiles_df = smiles_df[smiles_df['TanSim'] < TANSIM_THRESHOLD]
len_after = len(smiles_df)

print(f"removed: {len_before - len_after} molecules with Tanimoto similarity of at least {TANSIM_THRESHOLD} to known_abx")
# len_after is already an int count, so it is printed directly.
print(f"remaining: {len_after} molecules dissimilar to known_abx")

### 5) Filter by SA or RA Score

In [None]:
import pandas as pd
import rdkit
from rdkit import RDLogger
import sascorer

# Imported here so this step does not depend on an earlier cell having run.
from rdkit import Chem

# Suppress RDKit warnings
RDLogger.DisableLog("rdApp.*")

# Parse each SMILES into an RDKit molecule; sascorer is called on molecule objects.
mols = [Chem.MolFromSmiles(smile) for smile in smiles_df[SMILES_COL]]

# Compute SA scores. Unparseable SMILES (mol is None) get NaN instead of being
# passed to sascorer, so `sascores` stays positionally aligned with the rows
# of smiles_df as it is at this point.
sascores = [
    sascorer.calculateScore(mol) if mol is not None else float("nan")
    for mol in mols
]


import pandas as pd
import rdkit
from rdkit import RDLogger
from RAscore import RAscore_XGB  # This should now be found if the package structure is correct

# Suppress RDKit warnings
RDLogger.DisableLog("rdApp.*")

# Load the pretrained XGBoost RAscore model; no model path is passed, so the
# package's default model is used.
xgb_scorer = RAscore_XGB.RAScorerXGB()

len_before = len(smiles_df)

# Score every molecule from its SMILES string. `assign` returns a new
# DataFrame, so the column is not written into a previously filtered frame.
smiles_df = smiles_df.assign(RA_score=smiles_df[SMILES_COL].apply(xgb_scorer.predict))

# Keep molecules scoring strictly above the cutoff.
# NOTE(review): this assumes a higher RA score means "more likely to be
# synthesizable" -- confirm against the RAscore documentation.
smiles_df = smiles_df[smiles_df["RA_score"] > RA_THRESHOLD]
len_after = len(smiles_df)

print(f"removed: {len_before - len_after} molecules with RA score at or below {RA_THRESHOLD}")
print(f"remaining: {len_after} molecules")

## Save resulting dataframe

In [None]:
# Define where to save the file (placeholder -- replace with a real .csv path).
output_path = "path/to/save/file"

# The helper column 'fp' holds in-memory RDKit fingerprint objects, which have
# no useful text form in a CSV, so it is left out of the written copy.
# errors="ignore" keeps this working when the column is absent, and
# smiles_df itself is left unchanged.
smiles_df.drop(columns=["fp"], errors="ignore").to_csv(output_path, index=False)
print(f'Saved the downselected list of {len(smiles_df)} molecules.')