In [8]:
import json
from glob import glob
from rdkit import Chem
import os

# Generating GNN Input

{
"DrugID": {
	"EC_Full": "x.x.x.x", (string)
	"EC_First": x, (int)
	"EC_Second": x, (int)
	"Reaction_Center": x, (int: SMILES index of the reaction center)
}
}


In [9]:
# Notebook configuration.
#   OUTDIR       -- root folder that the extraction loop globs for result JSON files
#   outfile_name -- name of the JSON file the collected results are written to
OUTDIR, outfile_name = (
    "output/set1_subset2_products",
    "E-Microbial_gnn_input.json",
)

# Echo both settings as the cell output.
(OUTDIR, outfile_name)

('output/set1_subset2_products', 'E-Microbial_gnn_input.json')

In [10]:
# Build the GNN input table by walking OUTDIR/<drug folder>/<reaction folder>/*.json.
# Every usable EC number of every JSON file becomes one entry of results_dict,
# keyed "<drug name>_<running number>".
results_dict = {}
lengths_of_reaction_centers = []  # len() of the reaction-center field, one per JSON file
# The next two lists are initialised here but are not filled by this cell.
lengths_of_queries = []
lenghts_of_templates = []
drug_paths_visited = []   # every drug folder that was scanned
drugs_that_made_it = []   # drug folders that contributed at least one entry

# glob() yields matches in arbitrary order; sorting keeps the "<drug>_<n>"
# numbering identical between runs and machines.
for drug_folder_path in sorted(glob(f'{OUTDIR}/*')):
    drug_paths_visited.append(drug_folder_path)
    drug_reaction_count = 1  # numeric suffix of the next entry for this drug folder
    drug_made_it = False
    for reaction_folder_path in sorted(glob(f"{drug_folder_path}/*")):
        # Name of the reaction folder. basename() honours the platform's
        # native path separator, which split('/') does not.
        rxn_prox_id = os.path.basename(reaction_folder_path)

        for json_path in sorted(glob(f"{reaction_folder_path}/*.json")):
            # Context manager so the file handle is closed right away.
            with open(json_path) as json_file:
                raw_output = json.load(json_file)

            query_info = raw_output['QueryInformation'][0]
            drugID = query_info['name']
            drugSMILES = query_info['smiles']
            drugRxnCtr = query_info['reactionCenter(s)']
            lengths_of_reaction_centers.append(len(drugRxnCtr))

            template = raw_output['TemplateReaction'][0]
            rxn_names = template["ID"][0]

            product_smile = raw_output['GeneratedProduct'][0]['smiles']
            mol = Chem.MolFromSmiles(product_smile)
            if mol is None:
                # MolFromSmiles returns None for SMILES it cannot parse; fail
                # here with the offending file instead of deep inside RDKit.
                raise ValueError(
                    f"RDKit could not parse product SMILES {product_smile!r} "
                    f"from {json_path}"
                )
            product_key = Chem.inchi.MolToInchiKey(mol)

            for enzymeID in template['ec'][0]:
                ec_parts = enzymeID.split('.')
                # Skip EC numbers whose first or second level is missing or
                # given as the '-' placeholder.
                if len(ec_parts) < 2 or '-' in ec_parts[:2]:
                    continue
                ec_firstnum = int(ec_parts[0])
                ec_secondnum = int(ec_parts[1])

                composite_drugID = drugID + '_' + str(drug_reaction_count)
                results_dict[composite_drugID] = {
                    "Drug_Smiles": drugSMILES,
                    "Product_InChi_Key": product_key,
                    "EC_Full": enzymeID,
                    "EC_First": ec_firstnum,
                    "EC_Second": ec_secondnum,
                    "Reaction_Centers": drugRxnCtr,
                    "Proximal_Reaction_ID": rxn_prox_id,
                    "RR_Rule_IDs": rxn_names
                }

                drug_made_it = True
                drug_reaction_count += 1

    if drug_made_it:
        drugs_that_made_it.append(drug_folder_path)

# Cell output: the distinct reaction-center counts and the full result table.
set(lengths_of_reaction_centers), results_dict


({1},
 {'Balsalazide_1': {'Drug_Smiles': 'O=C(O)CCNC(=O)c1ccc(/N=N/c2ccc(O)c(C(=O)O)c2)cc1',
   'Product_InChi_Key': 'XXKMDOGUWRFXQT-OCEACIFDSA-N',
   'EC_Full': '3.2.1.21',
   'EC_First': 3,
   'EC_Second': 2,
   'Reaction_Centers': [19],
   'Proximal_Reaction_ID': 'R1269',
   'RR_Rule_IDs': ['RR-02-b7f3a66a9a179f00-02-F_r',
    'RR-02-b2e200dcd6d7d0ba-02-F_r',
    'RR-02-1caab062d13ab449-04-F_r',
    'RR-02-236fde7ecc4f5730-04-F_r',
    'RR-02-55c780b542a1f3be-06-F_r',
    'RR-02-6236b1816a8f42ea-06-F_r',
    'RR-02-133daf6aa404db4e-08-F_r',
    'RR-02-4807458e26c79280-08-F_r',
    'RR-02-896fc68c751da0cf-10-F_r',
    'RR-02-3ccef3d96e057576-10-F_r',
    'RR-02-117f5c406a0af338-12-F_r',
    'RR-02-c1ecfbb40cbf8b3d-12-F_r',
    'RR-02-74d71c53a82d30d7-14-F_r',
    'RR-02-1e54069d3bee913d-14-F_r',
    'RR-02-74d71c53a82d30d7-16-F_r',
    'RR-02-1e54069d3bee913d-16-F_r']},
  'Balsalazide_2': {'Drug_Smiles': 'O=C(O)CCNC(=O)c1ccc(/N=N/c2ccc(O)c(C(=O)O)c2)cc1',
   'Product_InChi_Key': 'XXK

In [11]:
# Write results_dict as JSON to the file named by outfile_name.
# The context manager closes the file even if json.dump raises.
with open(outfile_name, "w") as out_file:
    json.dump(results_dict, out_file, indent=6)