In [27]:
import pandas as pd
import numpy as np
import rdkit.Chem as Chem
import rdkit.Chem.MolStandardize.rdMolStandardize as rdMolStandardize
import rdkit.Chem.rdMolDescriptors as rdMolDescriptors
import rdkit.Chem.rdmolops as rdmolops
import multiprocessing as mp
import queue
import time
import selfies as sf

In [61]:
# Source activity table; `path` is reused further down to derive the output file name.
path = 'data/activity_data/d2_klek_100nM.parquet'
df = pd.read_parquet(path)
# Row count of the raw table, shown as the cell output.
df.shape[0]

10170

In [62]:
# Entries of the first (header-less) column of the alphabet file, collected
# into a set so that membership/subset checks are cheap.
alphabet = set(pd.read_csv('data/alphabet.txt', header=None)[0].tolist())

In [63]:
def process(chunk, return_list, uc):
    """Standardize the molecules of one chunk and keep those covered by ``alphabet``.

    For every row the SMILES string is parsed with RDKit, stereochemistry is
    removed, the molecule is passed through ``uc.uncharge`` and written back
    as SMILES, and that SMILES is encoded as SELFIES. A row is kept only if
    every SELFIES token is contained in the module-level ``alphabet``.

    Rows that cannot be processed (the SMILES is not a string, RDKit cannot
    parse it, or the SELFIES encoder rejects it) are dropped individually
    instead of raising, so one bad molecule no longer aborts the whole chunk.

    Parameters
    ----------
    chunk : pandas.DataFrame
        Frame with a ``smiles`` column. It is copied, never modified.
    return_list : list-like
        Container (a ``multiprocessing`` manager list in this notebook) to
        which the resulting frame is appended.
    uc : object
        Provides ``uncharge(mol)``; an ``rdMolStandardize.Uncharger`` here.

    Returns
    -------
    None
        The result is delivered through ``return_list``: the input columns
        with ``smiles`` replaced by the standardized SMILES, plus a
        ``selfies`` column.
    """
    chunk_copy = chunk.copy()

    def standardize(smiles):
        """Return ``(standardized_smiles, selfies)`` or ``(None, None)`` on failure."""
        if not isinstance(smiles, str):
            return None, None
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # RDKit signals an unparsable SMILES with None rather than raising.
            return None, None
        # Modifies `mol` in place; the return value is not used.
        rdmolops.RemoveStereochemistry(mol)
        mol = uc.uncharge(mol)
        standardized = Chem.MolToSmiles(mol)
        try:
            encoded = sf.encoder(smiles=standardized)
        except sf.EncoderError:
            return None, None
        if encoded is None:
            # NOTE(review): older selfies releases may report failure as None - confirm.
            return None, None
        return standardized, encoded

    pairs = [standardize(smiles) for smiles in chunk_copy['smiles']]
    chunk_copy['smiles'] = [standardized for standardized, _ in pairs]
    chunk_copy['selfies'] = [encoded for _, encoded in pairs]

    # Explicit bool array: indexes correctly even when the chunk is empty.
    keep = np.asarray(
        [
            encoded is not None and set(sf.split_selfies(encoded)).issubset(alphabet)
            for _, encoded in pairs
        ],
        dtype=bool,
    )
    # Boolean indexing yields a new frame, so no in-place edit of a slice is needed.
    return_list.append(chunk_copy[keep])
    return

In [64]:
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.info')
sf.set_semantic_constraints('hypervalent')

# multiprocessing
manager = mp.Manager()
return_list = manager.list()  # shared list; every worker appends its processed chunk
cpus = mp.cpu_count()
verbosity = 1
uc = rdMolStandardize.Uncharger()

if verbosity > 0:
    print("Number of workers:", cpus)
q = queue.Queue()

# prepare one process per chunk of `chunk_size` rows and add it to the queue
chunk_size = 1000
n_chunks = -(-len(df) // chunk_size)  # ceiling division; 0 chunks for an empty frame
chunks = [df.iloc[i * chunk_size:(i + 1) * chunk_size] for i in range(n_chunks)]
for chunk in chunks:
    q.put(mp.Process(target=process, args=(chunk, return_list, uc)))

if verbosity > 0:
    print('(mp) Processes in queue: ', q.qsize())

# handle the queue: keep at most `cpus` workers alive at any time
processes = []
while not q.empty():
    # Count only our own workers; mp.active_children() would also count the
    # Manager's server process and permanently occupy one slot.
    running = sum(worker.is_alive() for worker in processes)
    if running >= cpus:
        # Wait only when every slot is taken, and only briefly.
        time.sleep(0.1)
        continue
    proc = q.get()
    proc.start()
    processes.append(proc)
    if verbosity > 0 and q.qsize() % 5 == 0:
        print('(mp) Processes in queue: ', q.qsize())

if verbosity > 0:
    print("(mp) Queue handled successfully")

# complete the processes
for proc in processes:
    proc.join()

# A worker that died never appended its chunk; fail loudly instead of
# continuing with silently missing rows.
failed = [(worker.name, worker.exitcode) for worker in processes if worker.exitcode != 0]
if failed:
    raise RuntimeError(
        f"{len(failed)} of {len(processes)} worker processes failed "
        f"(name, exit code): {failed}"
    )

Number of workers: 8
(mp) Processes in queue:  11
(mp) Processes in queue:  10
(mp) Processes in queue:  5
(mp) Processes in queue:  0
(mp) Queue handled successfully


In [65]:
# Workers append to `return_list` in completion order, which varies between
# runs. Sorting by the index carried over from `df` makes the row order (and
# the file written from it) reproducible; the stable sort keeps the relative
# order of rows sharing an index label.
return_df = pd.concat(list(return_list)).sort_index(kind='stable')
len(return_df)

9391

In [66]:
# Rebinds `alphabet` to the set of tokens that occur in the kept SELFIES strings.
alphabet = sf.get_alphabet_from_selfies(
    return_df['selfies']
)

In [67]:
# Save the standardized table next to the input file, without the SELFIES column.
return_df.drop('selfies', axis='columns').to_parquet(
    path.replace('.parquet', '_standardized.parquet')
)

In [68]:
# Turn the token collection into a list so it can be sorted in place.
alphabet = [*alphabet]

In [69]:
# Order the tokens (in place, same list object) and show them.
alphabet[:] = sorted(alphabet)
print(alphabet)

['[#Branch1]', '[#Branch2]', '[#C]', '[#N]', '[=Branch1]', '[=Branch2]', '[=C]', '[=N]', '[=O]', '[=Ring1]', '[=Ring2]', '[=S]', '[Br]', '[Branch1]', '[Branch2]', '[C]', '[Cl]', '[F]', '[I]', '[N+1]', '[NH1]', '[N]', '[O]', '[P]', '[Ring1]', '[Ring2]', '[S]']
