In [9]:
import os
import copy

import numpy as np
import rdkit
from rdkit.Chem.rdmolfiles import SmilesMolSupplier
import h5py
from tqdm import tqdm


from Parameters import Parameters as C

from MolecularGraph import PreprocessingGraph

from tqdm import tqdm


In [2]:
with open('./data/test.smi') as smi_file:
    first_line = smi_file.readline()
    has_header = bool("SMILES" in first_line)
smi_file.close()

# read file
molecule_set = SmilesMolSupplier('./data/test.smi', sanitize=True, nameColumn=-1, titleLine=has_header)

number_of_molecule = len(molecule_set)
def get_graph(mol):
    
    if mol is not None:
        if not C.use_aromatic_bonds:
            rdkit.Chem.Kekulize(mol, clearAromaticFlags=True)
        molecular_graph = PreprocessingGraph(molecule=mol, constants=C)

        return molecular_graph


def calculate_reversing_decode_route_length(molecular_graph):
    
    return molecular_graph.get_n_edges() + 2

n_subgraphs = 0  
molecular_graph_generator = map(get_graph, molecule_set)


# for molecular_graph in tqdm(molecular_graph_generator , total=number_of_molecule):
#     n_SGs = calculate_reversing_decode_route_length(molecular_graph=molecular_graph)
#     n_subgraphs += n_SGs


In [3]:
n_subgraphs = 7336931

In [4]:
def get_dataset_dims():

    dims = {}
    dims["nodes"] = C.dim_nodes
    dims["edges"] = C.dim_edges
    dims["APDs"] = [np.prod(C.dim_f_add) + np.prod(C.dim_f_conn) + 1]
    return dims

In [5]:
dataset_names = ["nodes", "edges", "APDs"]
dims = get_dataset_dims()


def create_datasets(hdf_file, max_length, dataset_name_list, dims):
    ds = {}

    for ds_name in dataset_name_list:
        ds[ds_name] = hdf_file.create_dataset(ds_name,
                                              (max_length, *dims[ds_name]),
                                              chunks=True,
                                              dtype=np.dtype("int8"))

    return ds


In [6]:
def save_group(dataset_dict, data_subgraphs, data_APDs, n_SGs, init_idx):

    nodes = np.array([graph_tuple[0] for graph_tuple in data_subgraphs])
    edges = np.array([graph_tuple[1] for graph_tuple in data_subgraphs])
    APDs = np.array(data_APDs)

    end_idx = init_idx + n_SGs 

    dataset_dict["nodes"][init_idx:end_idx] = nodes
    dataset_dict["edges"][init_idx:end_idx] = edges
    dataset_dict["APDs"][init_idx:end_idx] = APDs

    return dataset_dict


def get_graph(mol):
    
    if mol is not None:
        if not C.use_aromatic_bonds:
            rdkit.Chem.Kekulize(mol, clearAromaticFlags=True)
        molecular_graph = PreprocessingGraph(molecule=mol, constants=C)

        return molecular_graph

def generate_decoding_states(molecular_graph, subgraph_idx):

    molecular_graph = copy.deepcopy(molecular_graph)

    if subgraph_idx != 0:
        
        for _ in range(1, subgraph_idx):
            molecular_graph.truncate_graph()
            
        decoding_APD = molecular_graph.get_decoding_APD()
        molecular_graph.truncate_graph()
        
        X, E = molecular_graph.get_graph_state()
        
    elif subgraph_idx == 0:
        
        decoding_APD = molecular_graph.get_final_decoding_APD()
        
        X, E = molecular_graph.get_graph_state()

    else:
        raise ValueError("`subgraph_idx` not a valid value.")

    decoding_graph = [X, E]
    return decoding_graph, decoding_APD
def group_subgraphs(init_idx, molecule, dataset_dict):
   
    data_subgraphs = []        
    data_APDs = []           
 
    molecular_graph_generator = get_graph(molecule)

    molecules_processed = 0 
    
    for graph in [molecular_graph_generator]:
      
        molecules_processed += 1

        n_SGs = calculate_reversing_decode_route_length(molecular_graph=graph)

        for new_SG_idx in range(n_SGs):  
            
            SG, APD = generate_decoding_states(molecular_graph=graph,
                                                   subgraph_idx=new_SG_idx)
            data_subgraphs.append(SG)
            data_APDs.append(APD)
          
    dataset_dict = save_group(dataset_dict=dataset_dict,
                              n_SGs=n_SGs,
                              data_subgraphs=data_subgraphs,
                              data_APDs=data_APDs,
                              init_idx=init_idx)

    len_data_subgraphs = len(data_subgraphs)
    return molecules_processed, dataset_dict, len_data_subgraphs

In [7]:
if os.path.exists(f"./train.h5.chunked"):
        print("Chunk File Already exist Removing Previous Chunk File")
        os.remove(f"./train.h5.chunked")
        
# with h5py.File(f"./train.h5.chunked", "a") as hdf_file:
#         print("Creating HDF File To Store APDs")
        
#         ds = create_datasets(hdf_file=hdf_file,
#                                 max_length=n_subgraphs,
#                                 dataset_name_list=dataset_names,
#                                 dims=dims)
        

#         dataset_size = 0  
#         for init_idx in tqdm(range(30, number_of_molecule)):
                
#                 (final_molecule_idx, ds, len_data_subgraphs) = group_subgraphs(init_idx=init_idx,
#                                                 molecule=molecule_set[init_idx],
#                                                 dataset_dict=ds,                                           
#                                                 )
            
#                 dataset_size += len_data_subgraphs
        # print("Start Looping Over Molecules")
        # for init_idx in tqdm(range(0, number_of_molecule)):

Chunk File Already exist Removing Previous Chunk File


In [9]:
from multiprocessing import Process, Manager
import threading


In [10]:
n_threads = 10

In [None]:

def process_range(start, end, hdf_file_path, molecule_set, dataset_dict , shared_counter):
    dataset_size = 0
    for init_idx in range(start, end):
        final_molecule_idx, ds, len_data_subgraphs = group_subgraphs(
            init_idx=init_idx,
            molecule=molecule_set[init_idx],
            dataset_dict=dataset_dict,
        )
        with shared_counter.get_lock():
            shared_counter.value += 1
        dataset_size += len_data_subgraphs
    # Additional processing if needed
    return dataset_size


def tqdm_thread(counter, total):
    pbar = tqdm(total=total)
    while True:
        pbar.n = counter.value
        pbar.refresh()
        if pbar.n >= total:
            break
    pbar.close()


# manager = Manager()
# shared_counter = manager.Value('i', 0)

# # Create and start tqdm thread
# tqdm_thread = threading.Thread(target=tqdm_thread, args=(shared_counter, end - start))
# tqdm_thread.start()


# with h5py.File(f"./train.h5.chunked", "a") as hdf_file:
#     print("Creating HDF File To Store APDs")
#     ds = create_datasets(hdf_file=hdf_file, max_length=n_subgraphs, dataset_name_list=dataset_names, dims=dims)

#     # Split the range into 4 parts
#     range_splits = [(i * number_of_molecule // n_threads, (i + 1) * number_of_molecule // n_threads) for i in range(n_threads)]

#     processes = []
#     for start, end in range_splits:
#         p = Process(target=process_range, args=(start, end, hdf_file.filename, molecule_set, ds))
#         processes.append(p)
#         p.start()

#     for p in processes:
#         p.join()
manager = Manager()
shared_counter = manager.Value('i', 0)

with h5py.File("./train.h5.chunked", "a") as hdf_file:
    print("Creating HDF File To Store APDs")
    ds = create_datasets(hdf_file=hdf_file, max_length=n_subgraphs, dataset_name_list=dataset_names, dims=dims)

    range_splits = [(i * number_of_molecule // n_threads, (i + 1) * number_of_molecule // n_threads) for i in range(n_threads)]

    processes = []
    for start, end in range_splits:
        p = Process(target=process_range, args=(start, end, hdf_file.filename, molecule_set, ds, shared_counter))
        processes.append(p)
        p.start()

    # Start the tqdm thread
    tqdm_thread_instance = threading.Thread(target=tqdm_thread, args=(shared_counter, number_of_molecule))
    tqdm_thread_instance.start()

    for p in processes:
        p.join()

    # Ensure the tqdm thread is also joined after all processes complete
    tqdm_thread_instance.join()


Creating HDF File To Store APDs


  0%|          | 15/29055 [00:34<18:50:39,  2.34s/it]

KeyboardInterrupt: 

  0%|          | 25/29055 [00:57<20:02:50,  2.49s/it]

In [7]:
from rdkit import Chem
i = 0
with Chem.MultithreadedSDMolSupplier('./data/test.smi') as sdSupl:
  for mol in sdSupl:
    
    
    if mol is not None:
      i += 1
# with Chem.MultithreadedSDMolSupplier('./data/test.smi') as sdSupl:
    

None


In [2]:
i

0

In [11]:

molecule_set = SmilesMolSupplier('./data/test.smi', sanitize=True, nameColumn=-1, titleLine=0)


In [12]:
molecule_set[0]

<rdkit.Chem.rdchem.Mol at 0x7fb16c6d93c0>