In [1]:
import pandas as pd
import numpy as np
import rdkit.Chem as Chem
import rdkit.Chem.MolStandardize.rdMolStandardize as rdMolStandardize
import rdkit.Chem.rdMolDescriptors as rdMolDescriptors
import rdkit.Chem.rdmolops as rdmolops
import multiprocessing as mp
import queue
import time
import selfies as sf

In [24]:
# Switch selfies to its 'hypervalent' semantic-constraint preset before any
# encoding is done in this notebook.
sf.set_semantic_constraints('hypervalent')

# Read the dataset from the parquet file and display its row count.
path = 'data/train_data/big_dataset.parquet'
df = pd.read_parquet(path)
df.shape[0]

1222179

In [14]:
# Allowed SELFIES tokens: the first column of the header-less alphabet file,
# stored as a set for fast membership / subset checks.
alphabet = set(pd.read_csv('data/alphabet.txt', header=None)[0].tolist())

In [16]:
def _selfies_or_none(smiles):
    """Encode ``smiles`` as a SELFIES string, or return None if encoding fails."""
    try:
        return sf.encoder(smiles=smiles)
    except sf.EncoderError:
        return None


def process(chunk, return_list, uncharger):
    """Standardize the SMILES of one chunk and keep only alphabet-encodable rows.

    For every row the SMILES is parsed with RDKit, stereochemistry is removed,
    the molecule is neutralised with ``uncharger`` and written back as SMILES.
    Rows are then dropped when

    * the SMILES is not a string or cannot be parsed by RDKit,
    * the standardized SMILES cannot be encoded as SELFIES, or
    * the SELFIES string uses a token outside the module-level ``alphabet``.

    Dropping the unparsable / unencodable rows (instead of letting the
    exception propagate) keeps one bad molecule from killing the worker and
    losing the whole chunk.

    Parameters
    ----------
    chunk : pandas.DataFrame
        Frame with a ``'smiles'`` column. It is not modified.
    return_list : list-like
        Container with an ``append`` method (a ``multiprocessing.Manager``
        list when run in a worker); the processed frame is appended to it.
    uncharger : object
        Object exposing ``uncharge(mol)`` (e.g. ``rdMolStandardize.Uncharger``).

    Returns
    -------
    None
        The result is delivered through ``return_list``.
    """
    chunk_copy = chunk.copy()

    # Build molecules; MolFromSmiles returns None for SMILES it cannot parse.
    chunk_copy['mol'] = chunk_copy['smiles'].apply(
        lambda x: Chem.MolFromSmiles(x) if isinstance(x, str) else None
    )
    chunk_copy = chunk_copy[chunk_copy['mol'].notna()].copy()

    # Remove stereochemistry (RemoveStereochemistry mutates the molecule in place).
    for mol in chunk_copy['mol']:
        rdmolops.RemoveStereochemistry(mol)

    # Remove charges.
    chunk_copy['mol'] = chunk_copy['mol'].apply(uncharger.uncharge)

    # Back to SMILES.
    chunk_copy['smiles'] = chunk_copy['mol'].apply(lambda x: Chem.MolToSmiles(x))

    # Drop molecules that cannot be encoded with the current alphabet.
    chunk_copy['selfies'] = chunk_copy['smiles'].apply(_selfies_or_none)
    chunk_copy = chunk_copy[chunk_copy['selfies'].notna()].copy()
    chunk_copy['tokens'] = chunk_copy['selfies'].apply(lambda x: set(sf.split_selfies(x)))
    in_alphabet = chunk_copy['tokens'].apply(lambda x: x.issubset(alphabet)).astype(bool)
    chunk_copy = chunk_copy[in_alphabet]

    # Drop the helper columns (non-inplace: chunk_copy is a filtered slice here).
    chunk_copy = chunk_copy.drop(columns=['mol', 'tokens', 'selfies'])

    # Hand the result back through the shared list (multiprocessing).
    return_list.append(chunk_copy)
    return None

In [22]:
from rdkit import RDLogger

# Disable RDKit's 'rdApp.info' log channel.
RDLogger.DisableLog('rdApp.info')

# Multiprocessing set-up: workers report their results through a Manager list.
manager = mp.Manager()
return_list = manager.list()
cpus = mp.cpu_count()
verbosity = 1
uc = rdMolStandardize.Uncharger()

if verbosity > 0:
    print("Number of workers:", cpus)
q = queue.Queue()

# Prepare one (not yet started) process per chunk of rows and add it to the queue.
chunk_size = 100000
n_chunks = -(-len(df) // chunk_size)  # ceiling division; 0 for an empty frame
chunks = [df.iloc[i * chunk_size:(i + 1) * chunk_size] for i in range(n_chunks)]
for chunk in chunks:
    proc = mp.Process(target=process, args=(chunk, return_list, uc))
    q.put(proc)

if verbosity > 0:
    print('(mp) Processes in queue: ', q.qsize())

# Handle the queue: keep at most `cpus` workers alive at any time.
processes = []
while not q.empty():
    # Count only our own workers. mp.active_children() would also count the
    # Manager's server process and so leave one CPU slot permanently unused.
    running = sum(p.is_alive() for p in processes)
    if running < cpus:
        proc = q.get()
        proc.start()
        processes.append(proc)
        if verbosity > 0 and q.qsize() % 5 == 0:
            print('(mp) Processes in queue: ', q.qsize())
    else:
        # All slots are busy: wait before polling again.
        time.sleep(1)

if verbosity > 0:
    print("(mp) Queue handled successfully")

# Complete the processes.
for proc in processes:
    proc.join()

# A worker that died with an exception never appended its chunk; surface that
# instead of silently continuing with an incomplete return_list.
failed = [(proc.name, proc.exitcode) for proc in processes if proc.exitcode != 0]
if failed:
    raise RuntimeError(
        f"{len(failed)} of {len(processes)} worker processes failed "
        f"(name, exitcode): {failed}"
    )

Number of workers: 8
(mp) Processes in queue:  12
(mp) Processes in queue:  10
(mp) Processes in queue:  5
(mp) Processes in queue:  0
(mp) Queue handled successfully


In [23]:
# Merge the per-chunk frames into one frame with a fresh 0..n-1 index.
# A single concat avoids re-copying the growing frame once per chunk, and
# ignore_index=True resets the index even when only one chunk was returned.
# The chunks are concatenated in the order the workers appended them.
return_df = pd.concat(list(return_list), axis=0, ignore_index=True)
len(return_df)

1163393

In [None]:
# Write the standardized frame next to the input file, with a
# '_standardized' suffix inserted before the '.parquet' extension.
return_df.to_parquet(path.replace('.parquet', '_standardized.parquet'))

In [12]:
# Original author's note: this part is not important.

def gen_ECFP(chunk, return_list):
    """Recompute the ``'fps'`` column of ``chunk`` as Morgan fingerprints.

    The existing ``'fps'`` column is discarded (a KeyError is raised if it is
    missing), each SMILES is parsed with RDKit, and a 2048-bit Morgan
    fingerprint of radius 2 is stored per row as a plain Python list of bits.
    The resulting frame is appended to ``return_list``; ``chunk`` itself is
    left untouched and the function returns None.
    """
    frame = (
        chunk.copy()
        .drop(columns=['fps'])
        # 'mol' is only a scratch column in this routine and never survives it.
        .drop(columns=['mol'], errors='ignore')
    )
    molecules = frame['smiles'].apply(lambda smi: Chem.MolFromSmiles(smi))
    frame['fps'] = molecules.apply(
        lambda mol: list(
            rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048)
        )
    )
    return_list.append(frame)
    return None

In [18]:
# Rebind `alphabet` to the alphabet selfies derives from return_df['selfies'].
# NOTE(review): `process` drops the 'selfies' column from the frames it returns,
# so this lookup presumably relied on a return_df from an earlier kernel state;
# confirm before running the notebook top to bottom.
alphabet = sf.get_alphabet_from_selfies(return_df['selfies'])

In [19]:
# Write return_df to `path`.
# NOTE(review): in this notebook `path` is the file the input dataset is read
# from, so this overwrites the source data with return_df; confirm that is intended.
return_df.to_parquet(path)

In [20]:
# Materialise the alphabet as a list so it can be sorted in place.
alphabet = [*alphabet]

In [21]:
# Sort the token list in place and print it.
alphabet.sort()
print(alphabet)

['[#Branch1]', '[#Branch2]', '[#C]', '[#N]', '[=Branch1]', '[=Branch2]', '[=C]', '[=N]', '[=O]', '[=Ring1]', '[=Ring2]', '[=S]', '[Br]', '[Branch1]', '[Branch2]', '[C]', '[Cl]', '[F]', '[I]', '[N+1]', '[NH1]', '[N]', '[O]', '[P]', '[Ring1]', '[Ring2]', '[S]']
