In [1]:
!pip install rdkit
!pip install selfies

Collecting rdkit
  Downloading rdkit-2023.9.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.9/34.9 MB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: rdkit
Successfully installed rdkit-2023.9.6
Collecting selfies
  Downloading selfies-2.1.1-py3-none-any.whl (35 kB)
Installing collected packages: selfies
Successfully installed selfies-2.1.1


In [None]:
from rdkit import Chem
from rdkit.Chem import RWMol, SanitizeMol, rdchem
import selfies as sf

def is_valence_ok(atom_element, existing_bonds, new_bond_order=1):
    max_valences = {'H': 1, 'O': 2, 'C': 4, 'Cl': 1}  # Added chlorine with a valence of 1
    return (existing_bonds + new_bond_order) <= max_valences.get(atom_element, 4)

def add_atom_and_bond(molecule, atom_index, element):
    atom = molecule.GetAtomWithIdx(atom_index)
    existing_bonds = sum([bond.GetBondTypeAsDouble() for bond in atom.GetBonds()])
    if not is_valence_ok(atom.GetSymbol(), existing_bonds):
        return None

    new_atom_index = molecule.AddAtom(Chem.Atom(element))
    molecule.AddBond(atom_index, new_atom_index, rdchem.BondType.SINGLE)
    try:
        Chem.SanitizeMol(molecule)
        return new_atom_index
    except:
        molecule.RemoveAtom(new_atom_index)  # Rollback if invalid
        return None

def generate_molecules(max_depth, current_molecule, current_depth=0, molecules=None):
    if molecules is None:
        molecules = []

    smiles = Chem.MolToSmiles(current_molecule)
    molecules.append(smiles)

    if current_depth <= max_depth:
        for atom_index in range(current_molecule.GetNumAtoms()):
            for element in ['C', 'O', 'H', 'Cl']:  # Added chlorine to the list of elements
                existing_bonds = sum([bond.GetBondTypeAsDouble() for bond in current_molecule.GetAtomWithIdx(atom_index).GetBonds()])
                if is_valence_ok(element, existing_bonds):
                    new_molecule = Chem.RWMol(current_molecule)
                    if add_atom_and_bond(new_molecule, atom_index, element) is not None:
                        generate_molecules(max_depth, new_molecule, current_depth + 1, molecules)

    return list(set(molecules))

if __name__ == '__main__':
    max_depth = 6
    init_molecule = Chem.RWMol()
    init_molecule.AddAtom(Chem.Atom('C'))  # Starting with a carbon atom
    generated_molecules = generate_molecules(max_depth, init_molecule)
    for smiles in generated_molecules:
        print(smiles)
    print(f'Generated {len(generated_molecules)} unique molecules.')
    file_name_smiles = f'smiles_depth_{max_depth}.txt'
    file_name_selfies = f'selfies_depth_{max_depth}.txt'

    with open(file_name_smiles, 'w') as f_smiles:
        for smiles in generated_molecules:
            f_smiles.write(smiles + '\n')

    selfies_list = [sf.encoder(smiles) for smiles in generated_molecules]  # Ensure smiles_to_selfies function is defined
    with open(file_name_selfies, 'w') as f_selfies:
        for selfies in selfies_list:
            f_selfies.write(selfies + '\n')


CC(C)C(O)C(C)Cl
CC(O)(CO)C(O)O
OCCCl
COOC(C)OO
[H]C(OC)C(C)(C)O
[H]C(C)C(CO)CO
[H]C(C)(CO)COO
CC(OO)C(O)OO
CCOC(C)OOO
OCCC(O)OCCl
CCOCOCCCl
[H]CCOOOCO
[H]COOCOCO
OCCCOOCO
COCCOOCO
CCC(C)(O)OOC
CCC(C)(CC)CCl
CC(OO)C(C)(C)O
CCCC(C)C
CCCCC(C)C
[H]C(C)(O)COOC
CC(Cl)C(C)OOO
CC(Cl)OCCOO
COCC(O)(Cl)CO
[H]C(C)COOO
CC(C)(CO)OCCl
COCCCOCCl
[H]COCCCCC
OOOCOOOO
[H]C(C)(C)OC(C)O
[H]C(CCO)OCC
COC(CO)OC
COOC(C)OOC
CC(C)CC(Cl)OO
CCCOCCCl
COCC(C)OCCl
[H]C(C)CC(C)OO
COCCOCOC
CC(O)(O)CCCO
[H]C(C)(C)C(C)O
CCOCOCCl
COC(C)OOCCl
CCC(O)CCl
COCOOO
OCC(CO)CO
[H]C(O)CC(C)O
CCCC(C)(O)CCl
[H]C(C)OOCCC
CCC(O)(Cl)C(C)C
COOCC(C)OC
CC(O)OCOOO
[H]CC(C)OC(C)C
CCOC(CC)OO
[H]COC(C)(C)OC
CCC(OC)OCCl
CCCCCCCC
CCCOOOCO
OOCC(Cl)OOO
[H]CC(CCC)OC
CCC(CO)CO
[H]C(CO)OOOC
[H]C(C)OOOCO
[H]CC(C)C(C)OO
CCOOC(O)CC
CCOCCOCC
COCOC
OCCOC(Cl)CO
[H]COCCOOO
CC(O)C(CO)CO
COC(C)C(C)(C)Cl
COOC(Cl)C(C)O
CCOOCOCCl
COC(C)(C)OCCl
COCC(C)Cl
[H]CCOOCO
COOC(Cl)C(C)C
[H]C(CO)C(C)OO
[H]C(C)OCOOO
[H]C(C)(C)OOOO
[H]COOOC
CCC(CCO)OC
[H]C(C)CC(C)OC
[H]CC(C