In [1]:
import pandas as pd
import numpy as np
from src.utils.finger import sparse2dense
import rdkit.Chem as Chem
import rdkit.Chem.MolStandardize.rdMolStandardize as rdMolStandardize
import rdkit.Chem.rdMolDescriptors as rdMolDescriptors
import rdkit.Chem.rdmolops as rdmolops
import multiprocessing as mp
import queue
import time
import selfies as sf

In [130]:
# Switch selfies to its 'hypervalent' semantic-constraint preset.
sf.set_semantic_constraints('hypervalent')

# Activity dataset to process; `path` is reused further down to derive the
# name of the output file.
path = 'data/activity_data/d2_klek_100nM_std.parquet'
df = pd.read_parquet(path)

# Number of rows that were loaded.
df.shape[0]

9391

In [121]:
# Set of SELFIES tokens accepted by process(): the first column of
# data/alphabet.txt, read without a header row.
alphabet = set(
    pd.read_csv('data/alphabet.txt', header=None)[0].tolist()
)

In [131]:
from src.utils.finger import smiles2dense

def foo(chunk, return_list):
    """Fingerprint one chunk with ``smiles2dense`` and collect the result.

    Args:
        chunk: DataFrame with a ``smiles`` column. It is left untouched.
        return_list: list-like collector; a copy of ``chunk`` carrying an
            ``fps`` column (``smiles2dense`` applied to every SMILES) is
            appended to it.

    Returns:
        None. The result is delivered through ``return_list``.
    """
    fingerprints = chunk['smiles'].apply(smiles2dense)
    # assign() works on a copy, so the caller's frame is never mutated.
    return_list.append(chunk.assign(fps=fingerprints))

def get_ECFP(chunk, return_list):
    """Compute Morgan (radius 2, 2048-bit) fingerprints for one chunk.

    Each SMILES is parsed with RDKit, turned into a Morgan bit vector, expanded
    to a Python list and passed through the project helper ``sparse2dense``;
    the helper's result is stored via ``.tolist()`` in the ``fps`` column.
    NOTE(review): the exact output format is defined by ``sparse2dense`` in
    ``src.utils.finger`` -- confirm there.

    Args:
        chunk: DataFrame with a ``smiles`` column. It is left untouched.
        return_list: list-like collector that receives the fingerprinted copy.

    Returns:
        None. The result is delivered through ``return_list``.
    """
    def _morgan_bits(mol):
        bit_vect = rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048)
        return list(bit_vect)

    def _convert(bits):
        return sparse2dense(bits).tolist()

    frame = chunk.copy()
    # 'mol' is only a scratch column; it is removed again before returning.
    frame['mol'] = frame['smiles'].apply(Chem.MolFromSmiles)
    frame['fps'] = frame['mol'].apply(_morgan_bits)
    frame = frame.drop(columns=['mol'])
    frame['fps'] = frame['fps'].apply(_convert)
    return_list.append(frame)

In [123]:
def process(chunk, return_list, uncharger):
    """Standardise one chunk of molecules, filter it by ``alphabet`` and fingerprint it.

    Every SMILES is parsed with RDKit, stripped of stereochemistry, neutralised
    with ``uncharger.uncharge`` and written back as SMILES. A row is dropped when

    * its SMILES cannot be parsed (``Chem.MolFromSmiles`` returns ``None``),
    * the standardised SMILES cannot be encoded as SELFIES, or
    * its SELFIES uses a token outside the module-level ``alphabet``.

    The surviving rows get an ``fps`` column computed by ``smiles2dense``.

    Args:
        chunk: DataFrame with a ``smiles`` column. It is left untouched.
        return_list: list-like collector (shared between worker processes when
            used with multiprocessing) that receives the processed copy.
        uncharger: object exposing ``uncharge(mol)``, e.g. an
            ``rdMolStandardize.Uncharger``.

    Returns:
        None. The result is delivered through ``return_list``.
    """
    def _to_selfies(smiles):
        # A molecule that cannot be encoded is filtered out instead of
        # aborting the whole chunk.
        try:
            return sf.encoder(smiles=smiles)
        except sf.EncoderError:
            return None

    def _in_alphabet(selfies):
        return set(sf.split_selfies(selfies)).issubset(alphabet)

    chunk_copy = chunk.copy()
    # Build RDKit molecules; unparsable SMILES yield None and are dropped here,
    # because every later step would fail on them.
    chunk_copy['mol'] = chunk_copy['smiles'].apply(Chem.MolFromSmiles)
    chunk_copy = chunk_copy.loc[chunk_copy['mol'].notna()].copy()
    # Remove stereochemistry; the result of apply() is intentionally discarded,
    # the molecules already stored in the column are the ones being used afterwards.
    chunk_copy['mol'].apply(rdmolops.RemoveStereochemistry)
    # Remove charges.
    chunk_copy['mol'] = chunk_copy['mol'].apply(uncharger.uncharge)
    # Back to SMILES.
    chunk_copy['smiles'] = chunk_copy['mol'].apply(Chem.MolToSmiles)
    # Drop molecules that cannot be encoded with the allowed alphabet.
    chunk_copy['selfies'] = chunk_copy['smiles'].apply(_to_selfies)
    chunk_copy = chunk_copy.loc[chunk_copy['selfies'].notna()].copy()
    # astype(bool) keeps the mask boolean even when no rows are left.
    keep = chunk_copy['selfies'].apply(_in_alphabet).astype(bool)
    # Explicit copy so the assignment below does not act on a slice
    # (avoids pandas' SettingWithCopyWarning).
    chunk_copy = chunk_copy.loc[keep].copy()
    # Drop the helper columns.
    chunk_copy = chunk_copy.drop(columns=['mol', 'selfies'])
    chunk_copy['fps'] = chunk_copy['smiles'].apply(smiles2dense)
    # Hand the result back through the shared list (multiprocessing).
    return_list.append(chunk_copy)
    return None

In [134]:
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.info')  # silence RDKit's info-level log channel

# multiprocessing: workers hand their results back through a manager-backed list
manager = mp.Manager()
return_list = manager.list()
cpus = mp.cpu_count()
verbosity = 1
uc = rdMolStandardize.Uncharger()

if verbosity > 0:
    print("Number of workers:", cpus)
q = queue.Queue()

# split the frame into fixed-size chunks and queue one (not yet started)
# process per chunk
chunk_size = 1000
chunks = [df[start:start + chunk_size] for start in range(0, len(df), chunk_size)]
for chunk in chunks:
    q.put(mp.Process(target=get_ECFP, args=(chunk, return_list)))

if verbosity > 0:
    print('(mp) Processes in queue: ', q.qsize())

# handle the queue: start a process whenever fewer than `cpus` children are
# alive, and only wait when no slot is free
processes = []
while not q.empty():
    if len(mp.active_children()) < cpus:
        proc = q.get()
        proc.start()
        if verbosity > 0 and q.qsize() % 5 == 0:
            print('(mp) Processes in queue: ', q.qsize())
        processes.append(proc)
    else:
        time.sleep(1)
if verbosity > 0:
    print("(mp) Queue handled successfully")

# complete the processes
for proc in processes:
    proc.join()

# A worker that raised exits with a non-zero code and never appends its chunk;
# fail loudly instead of continuing with silently incomplete data.
failed = [proc.name for proc in processes if proc.exitcode != 0]
if failed:
    raise RuntimeError(
        f"{len(failed)} worker process(es) exited abnormally: {failed}; "
        "their chunks are missing from return_list"
    )

Number of workers: 8
(mp) Processes in queue:  10
(mp) Processes in queue:  5
(mp) Processes in queue:  0
(mp) Queue handled successfully


In [135]:
# Workers append their chunks in completion order, which can differ from run to
# run. Sorting by index gives a reproducible row order
# (assumes df's index reflects the source row order -- TODO confirm).
return_df = pd.concat(return_list).sort_index()
return_df.head()

Unnamed: 0,activity,Ki,fps,smiles
0,0,5247.0,"[74, 80, 216, 218, 231, 277, 310, 561, 567, 56...",CC1Cc2ccccc2N1C(=O)CN1CCN(CCc2ccc(Cl)cc2)CC1
1,0,12000.0,"[40, 43, 74, 80, 95, 216, 233, 378, 407, 442, ...",Cc1cccn2ncc(CN3CCN(c4ccc(Cl)cc4)CC3)c12
2,1,16.0,"[80, 119, 193, 296, 314, 328, 352, 463, 588, 6...",O=C1c2ccccc2C(=O)N1CCCCN1CC=C(c2c[nH]c3ccc(F)c...
3,0,403.0,"[1, 74, 80, 90, 227, 263, 378, 407, 448, 527, ...",OC(CCCN1CCN(c2ccccn2)CC1)c1ccc(F)cc1
4,0,1047.13,"[9, 29, 80, 82, 114, 226, 249, 371, 387, 389, ...",COc1ccccc1C1CC1CNCCC1CCC(NC(=O)c2ccccc2)CC1


In [136]:
# Write the fingerprinted frame next to the input file, with an '_ECFP'
# suffix in the file name and without the index.
return_df.to_parquet(
    path.replace('.parquet', '_ECFP.parquet'),
    index=False,
)

In [12]:
# not important

def gen_ECFP(chunk, return_list):
    """Replace the ``fps`` column of a chunk with Morgan-based fingerprints.

    The existing ``fps`` column is discarded, each SMILES is parsed with RDKit
    and turned into a Morgan bit vector (radius 2, 2048 bits), which is expanded
    to a list and passed through the project helper ``sparse2dense``.

    Args:
        chunk: DataFrame with ``smiles`` and ``fps`` columns. It is left
            untouched.
        return_list: list-like collector that receives the re-fingerprinted
            copy.

    Returns:
        None. The result is delivered through ``return_list``.
    """
    def _morgan(mol):
        return rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048)

    # drop() hands back a new frame, so the caller's chunk is not modified.
    frame = chunk.drop(columns=['fps'])
    frame['mol'] = frame['smiles'].apply(Chem.MolFromSmiles)
    bit_vects = frame['mol'].apply(_morgan)
    frame = frame.drop(columns=['mol'])
    frame['fps'] = bit_vects.apply(lambda bit_vect: list(bit_vect)).apply(sparse2dense)
    return_list.append(frame)
    return

In [18]:
# Rebuild `alphabet` from the SELFIES strings in return_df['selfies'].
# NOTE(review): process() drops the 'selfies' column and get_ECFP() never
# creates it, so this relies on a return_df produced by code that is not
# visible here -- confirm before re-running on a fresh kernel.
alphabet = sf.get_alphabet_from_selfies(return_df['selfies'])

In [19]:
# NOTE(review): if `path` still holds the value set in the load cell, this
# writes return_df over the very file df was read from -- confirm that
# overwriting the input dataset is intended.
return_df.to_parquet(path)

In [20]:
# Turn the alphabet into a list so the next cell can sort it in place.
alphabet = list(alphabet)

In [21]:
# Sort the tokens in place and show the resulting alphabet.
alphabet.sort()
print(alphabet)

['[#Branch1]', '[#Branch2]', '[#C]', '[#N]', '[=Branch1]', '[=Branch2]', '[=C]', '[=N]', '[=O]', '[=Ring1]', '[=Ring2]', '[=S]', '[Br]', '[Branch1]', '[Branch2]', '[C]', '[Cl]', '[F]', '[I]', '[N+1]', '[NH1]', '[N]', '[O]', '[P]', '[Ring1]', '[Ring2]', '[S]']
