In [None]:
from typing import Optional

import pandas as pd
from rdkit.Chem import MolFromSmiles, MolToSmiles
from rdkit.Chem.SaltRemover import SaltRemover
from tqdm import tqdm

In [None]:
# Load the "SMILES" column of the input CSV as a plain Python list.
# The path uses forward slashes so it resolves on Windows, Linux and macOS;
# backslash separators are only understood on Windows.
smiles = pd.read_csv("data/smiles/smiles_119.csv", index_col=0)["SMILES"].to_list()
# Preview the first ten raw entries.
smiles[:10]

In [None]:
class SmilesCleaner:
    """Normalise SMILES strings by dropping chirality tags and stripping salts.

    Salt stripping is delegated to the ``salt_remover`` supplied at
    construction time; only its ``StripMol`` method is used.
    """

    def __init__(self, salt_remover: "SaltRemover"):
        """Store the remover whose ``StripMol`` method is applied to each molecule.

        The annotation is quoted so that defining the class does not require
        evaluating it.
        """
        self.salt_remover = salt_remover

    def clean_all_smiles(self, smiles: list[str]) -> list[str]:
        """Clean every entry of ``smiles`` and return the distinct results.

        Entries for which ``clean_smiles`` returns ``None`` or an empty string
        are dropped. Results are de-duplicated in first-occurrence order, so
        running the method twice on the same input yields the same list.
        """
        # A dict is used as an insertion-ordered set: a plain ``set`` would
        # make the output order (and any ``[:10]`` preview of it) arbitrary.
        unique = {}
        for raw in tqdm(smiles):
            cleaned = self.clean_smiles(raw)
            if cleaned:
                unique[cleaned] = None
        return list(unique)

    def clean_smiles(self, smiles: str) -> Optional[str]:
        """Clean one SMILES string: drop ``@`` chirality tags, then strip salts.

        Returns:
            The cleaned SMILES, or ``None`` when ``smiles`` is not a string
            (for example the NaN pandas yields for an empty CSV cell) or when
            ``remove_salts`` could not process it.
        """
        if not isinstance(smiles, str):
            # Without this guard ``.replace`` would raise AttributeError and
            # abort the whole batch on a single missing value.
            return None
        return self.remove_salts(self.remove_stereochemistry(smiles))

    @staticmethod
    def remove_stereochemistry(smiles: str) -> str:
        """Return ``smiles`` with every ``@`` character removed.

        This is a purely textual edit. Only ``@`` / ``@@`` chirality tags are
        affected; double-bond direction markers (slash and backslash) are left
        untouched.
        """
        return smiles.replace("@", "")

    def remove_salts(self, smiles: str) -> Optional[str]:
        """Parse ``smiles``, strip salts via ``self.salt_remover`` and re-serialise.

        Returns:
            The string produced by ``MolToSmiles`` for the stripped molecule
            (which may be empty if the remover strips every fragment), or
            ``None`` when the input cannot be parsed or processing fails.
        """
        try:
            mol = MolFromSmiles(smiles)
            if mol is None:
                # RDKit reports an unparsable SMILES by returning None rather
                # than raising, so check for it explicitly.
                return None
            return MolToSmiles(self.salt_remover.StripMol(mol))
        except Exception:
            # Deliberately best-effort: one problematic molecule must not stop
            # the batch. The caller drops entries reported as None.
            return None

In [None]:
# Build the cleaner around a SaltRemover constructed with no arguments.
smiles_cleaner = SmilesCleaner(salt_remover=SaltRemover())

In [None]:
# Run the full cleaning pass over the loaded SMILES list (shows a tqdm progress bar).
cleaned_smiles = smiles_cleaner.clean_all_smiles(smiles=smiles)

In [None]:
# Preview the first ten cleaned entries.
cleaned_smiles[0:10]