## Computes number of atoms and minimum bond edit distance for all products 

In [1]:
from collections import defaultdict
import numpy as np

import matplotlib.pyplot as plt
import pandas as pd

from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Atom, BondType

from tqdm.notebook import tqdm

import os

import SM_amats
import processing_utils
import multiprocessing as mp

In [2]:
# Ensure the directory that later cells read the smiles_with_all_dists_XX.csv
# files from exists. exist_ok=True replaces the check-then-create pattern, so
# there is no race between the test and the creation and re-running is a no-op.
os.makedirs("./data_files/smiles_with_all_dists", exist_ok=True)

### various utility functions

In [3]:
def canonize_smiles(s):
    """Round-trip the SMILES string `s` through an RDKit mol and return the SMILES RDKit writes back.

    NOTE(review): RDKit's MolFromSmiles presumably returns None for input it
    cannot parse, in which case MolToSmiles would raise - confirm if invalid
    SMILES can reach this function.
    """
    mol = Chem.MolFromSmiles(s)
    return Chem.MolToSmiles(mol)


# Characters that are counted as heavy atoms: upper- and lower-case C, N and O,
# kept as a sorted list of single characters.
alphabet = sorted("cnoCNO")

# A cheap heavy-atom count taken straight from the SMILES text, without building a mol object.
# Only valid for C/N/O systems - any other element would have to be added to `alphabet`.


def count_letters(s):
    """Return how many characters of the string `s` are members of `alphabet`."""
    return sum(1 for ch in s if ch in alphabet)

In [4]:
# can be pretty memory-intensive.
# Calls processing_utils.compile_smiles_dists once for each index 0-13. The return
# value is thrown away, so the call is made purely for its side effects.
# NOTE(review): presumably this is what writes the smiles_with_all_dists_XX.csv files
# read by later cells - confirm, and confirm that these 14 calls produce all 18 files
# (00-17) that those cells expect.

# NOTE(review): `result` is never used in the visible part of the notebook.
result = []

for i in tqdm(range(14)):
    _ = processing_utils.compile_smiles_dists(i)
    

  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

### build a defaultdict that collects, for each SMILES fragment, the minimum distance of every row in which it appears

In [None]:
# Maps each raw (not yet canonicalised) fragment SMILES to the list of row-wise
# minimum distances of all rows in which that fragment occurs.
sd = defaultdict(list)

for file_index in tqdm(range(18)):

    file_tag = f"{file_index:02d}"
    data = pd.read_csv(f"./data_files/smiles_with_all_dists/smiles_with_all_dists_{file_tag}.csv")

    # every column after the first one is treated as a distance column
    dist_columns = list(data)[1:]
    dist_array = np.array(data[dist_columns])

    # row-wise minimum over all distance columns
    data["min_dist_all"] = np.min(dist_array, axis=1)

    for smiles, min_dist in zip(data["smiles"], data["min_dist_all"]):
        # a row can hold several dot-separated fragments; each one is recorded on its own
        for ss in smiles.split("."):
            # only fragments with at least 4 counted C/N/O characters are kept
            if count_letters(ss) >= 4:
                sd[ss].append(min_dist)

    # drop the reference to the frame so it can be freed before the next file is loaded
    data = []

### canonize entries. Done here to minimize work duplication.

In [None]:
# less than 1 min
# Merge raw SMILES strings that share the same canonical form: every raw key
# contributes the smallest distance it collected to its canonical entry.
sd2 = defaultdict(list)
for raw_smiles, dists in tqdm(sd.items()):
    sd2[canonize_smiles(raw_smiles)].append(np.min(dists))
# the raw-string dictionary is no longer needed; drop the reference
sd = []

In [None]:
# One row per canonical SMILES: the overall minimum distance and the number of
# counted C/N/O characters in the SMILES string.
columns = {
    "smiles": sd2.keys(),
    "min_dist_all": list(map(min, sd2.values())),
    "natoms": list(map(count_letters, sd2.keys())),
}
data_raw = pd.DataFrame(data=columns)

data_raw.head()

In [None]:
# Save the per-SMILES summary table (smiles, min_dist_all, natoms) as CSV.
# No `index` argument is passed, so pandas' default index handling applies.
data_raw.to_csv("./data_files/smiles_min_dist_natoms.csv")

### we do the same, but only retrieve product systems that need at most 6 bond edits (min_dist_all <= 6)

In [None]:
# Ensure the directory that the filtered smiles_md6_XX.csv files are written to
# exists. exist_ok=True replaces the check-then-create pattern, so there is no
# race between the test and the creation and re-running is a no-op.
os.makedirs("./data_files/smiles_min_dist_6", exist_ok=True)

In [None]:
for file_index in tqdm(range(18)):

    file_tag = f"{file_index:02d}"
    data = pd.read_csv(f"./data_files/smiles_with_all_dists/smiles_with_all_dists_{file_tag}.csv")

    # every column after the first one is treated as a distance column
    dist_columns = list(data)[1:]
    dist_array = np.array(data[dist_columns])

    # row-wise minimum over all distance columns
    data["min_dist_all"] = np.min(dist_array, axis=1)
    # remember each row's position in the unfiltered file before rows are dropped
    data["local_index"] = data.index

    # keep rows whose minimum distance is at most 6 and renumber them from 0
    keep = data["min_dist_all"] <= 6
    data = data.loc[keep].reset_index(drop=True)

    data.to_csv(f"./data_files/smiles_min_dist_6/smiles_md6_{file_tag}.csv")

    # drop the reference to the frame so it can be freed before the next file is loaded
    data = []