In [1]:
import multiprocessing as mp
import queue
import random
import time
import pandas as pd
import rdkit.Chem.AllChem as AllChem
from src.utils.finger import sparse2dense
import numpy as np

In [2]:
def _smiles_to_fps(smiles):
    """Return the converted Morgan fingerprint for one SMILES, or None if it cannot be parsed."""
    # Missing values (None/NaN) are not valid input for the RDKit parser.
    if not isinstance(smiles, str):
        return None
    mol = AllChem.MolFromSmiles(smiles)
    # RDKit signals an unparsable SMILES by returning None instead of raising.
    if mol is None:
        return None
    bit_vect = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048)
    return sparse2dense(bit_vect, return_numpy=False)


def foo(chunk, return_list):
    """Fingerprint one chunk of SMILES and append the resulting frame to ``return_list``.

    Args:
        chunk: Anything ``pd.DataFrame`` can turn into a frame with a
            ``smiles`` column (the notebook passes a slice of the ``smiles``
            Series).
        return_list: Object with an ``append`` method that collects the
            per-chunk frames (the notebook passes a ``Manager().list()``).

    Returns:
        None. The result is delivered by appending a DataFrame with the
        columns ``smiles`` and ``fps`` to ``return_list``.

    Notes:
        Rows whose SMILES is missing or cannot be parsed get ``fps = None``
        rather than raising, so one bad molecule no longer kills the worker
        and drops the whole chunk.
    """
    df = pd.DataFrame(chunk)
    # One pass per row; no intermediate 'mol' column has to be created and dropped.
    df['fps'] = df['smiles'].apply(_smiles_to_fps)
    return_list.append(df)

In [3]:
# Only the `smiles` column is used below, so ask the parquet reader for just
# that column instead of loading the whole dataset into memory.
df = pd.read_parquet('data/train_data/combined_dataset.parquet', columns=['smiles'])['smiles']
n = 10000  # rows per chunk
# Split the Series into consecutive, positionally sliced chunks of at most n rows.
chunks = [df.iloc[i:i+n] for i in range(0, df.shape[0], n)]

In [4]:
random.seed(42)
# Multiprocessing support and queue handling.
# One worker process is created per chunk. Workers are started in FIFO order,
# throttled so that no more than `cpus` child processes are alive at once, and
# each worker appends its result frame to the shared `return_list`.
manager = mp.Manager()
return_list = manager.list()
cpus = mp.cpu_count()
print("Number of cpus: ", cpus)

q = queue.Queue()

for chunk in chunks:
    proc = mp.Process(target=foo, args=[chunk, return_list])
    q.put(proc)

# handle the queue
processes = []
while not q.empty():
    # active_children() also joins workers that have already finished,
    # which frees a slot for the next process.
    if len(mp.active_children()) < cpus:
        proc = q.get()
        proc.start()
        if q.qsize() % 5 == 0:
            print('(mp) Processes in queue: ', q.qsize())
        processes.append(proc)
    time.sleep(0.1)
print("(mp) Queue handled successfully")

# complete the processes
for proc in processes:
    proc.join()

# A worker that raised (or was killed) never appended its chunk; surface that
# instead of silently producing an incomplete dataset. The results of the
# successful workers remain available in `return_list`.
failed = [(proc.name, proc.exitcode) for proc in processes if proc.exitcode != 0]
if failed:
    raise RuntimeError(
        "(mp) %d of %d worker processes failed (name, exitcode): %s"
        % (len(failed), len(processes), failed)
    )

Number of cpus:  8
(mp) Processes in queue:  120
(mp) Processes in queue:  115
(mp) Processes in queue:  110
(mp) Processes in queue:  105
(mp) Processes in queue:  100
(mp) Processes in queue:  95
(mp) Processes in queue:  90
(mp) Processes in queue:  85
(mp) Processes in queue:  80
(mp) Processes in queue:  75
(mp) Processes in queue:  70
(mp) Processes in queue:  65
(mp) Processes in queue:  60
(mp) Processes in queue:  55
(mp) Processes in queue:  50
(mp) Processes in queue:  45
(mp) Processes in queue:  40
(mp) Processes in queue:  35
(mp) Processes in queue:  30
(mp) Processes in queue:  25
(mp) Processes in queue:  20
(mp) Processes in queue:  15
(mp) Processes in queue:  10
(mp) Processes in queue:  5
(mp) Processes in queue:  0
(mp) Queue handled successfully


In [5]:
# Collect the per-chunk frames and combine them with a single concat; calling
# concat inside the loop re-copies the accumulated frame on every iteration.
frames = list(return_list)
if frames:
    # Workers append in completion order, which varies between runs. A stable
    # sort on the index carried over from the input makes the row order
    # deterministic (and equal to the source order when that index is ascending).
    df_out = pd.concat(frames, axis=0).sort_index(kind="mergesort")
else:
    # Nothing was produced: keep the empty-frame result instead of letting
    # concat raise on an empty list.
    df_out = pd.DataFrame()

In [6]:
# Preview the first rows of the combined frame as a quick sanity check.
df_out.head()

Unnamed: 0,smiles,fps
0,Cc1ccc(C)n1-c1cccc(C(=O)O)c1,"[94, 389, 456, 650, 807, 875, 883, 935, 1039, ..."
1,Cc1nc2c(c(C)c1CC(=O)NCc1ccco1)c(=O)[nH]n2C,"[24, 41, 80, 92, 197, 252, 255, 314, 325, 333,..."
2,O=C(c1csnn1)N1CCC[C@@]2(CCN(c3ncccn3)C2)C1,"[18, 180, 216, 338, 369, 378, 395, 399, 407, 4..."
3,CCC(C(=O)NCc1ccco1)n1nc(C)c2c(C)n(-c3ccc(C)cc3...,"[1, 80, 92, 122, 197, 233, 235, 255, 294, 302,..."
4,O=S(=O)(NCc1ccc2c(c1)OCO2)c1c[nH]cn1,"[29, 80, 84, 255, 310, 350, 378, 441, 502, 625..."


In [7]:
# Persist the combined frame as Parquet; index=False leaves the index out of the file.
df_out.to_parquet('data/train_data/combined_dataset_ECFP.parquet', index=False)