## computes number of atoms and minimum bond edit distance 

In [None]:
from collections import defaultdict
import numpy as np

import matplotlib.pyplot as plt

import pandas as pd

from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Atom, BondType

from tqdm.notebook import tqdm
from tqdm.contrib.concurrent import process_map

### matrices of starting systems
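Naming: `ac` = acid, `am` = amine, and the trailing digit is the hybridization (e.g. `ac3` = sp3 acid, `am3` = sp3 amine, `ac2` = sp2 acid, `am2` = sp2 amine).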

In [None]:
def _with_double_bond(amat, i, j):
    """Return a copy of `amat` whose i-j entry (and its mirror j-i) is set to 2."""
    promoted = [list(row) for row in amat]
    promoted[i][j] = promoted[j][i] = 2
    return promoted


# Reference system: symmetric 8x8 matrix whose entries are bond orders
# (0 = no bond). The other three systems below differ from it only in the
# order of the 0-1 bond and/or the 2-3 bond.
ac3_am3 = [
    [0, 1, 0, 0, 0, 0, 0, 0],
    [1, 0, 0, 0, 0, 1, 0, 0],
    [0, 0, 0, 1, 0, 0, 0, 0],
    [0, 0, 1, 0, 1, 0, 0, 0],
    [0, 0, 0, 1, 0, 0, 2, 1],
    [0, 1, 0, 0, 0, 0, 0, 0],
    [0, 0, 0, 0, 2, 0, 0, 0],
    [0, 0, 0, 0, 1, 0, 0, 0],
]

# Same as ac3_am3 with a double bond between atoms 2 and 3.
ac2_am3 = _with_double_bond(ac3_am3, 2, 3)

# Same as ac3_am3 with a double bond between atoms 0 and 1.
ac3_am2 = _with_double_bond(ac3_am3, 0, 1)

# Both double bonds: atoms 0-1 and atoms 2-3.
ac2_am2 = _with_double_bond(ac2_am3, 0, 1)


various utility functions

In [None]:
# Atomic numbers of the atoms, one per adjacency-matrix row
# (6 = carbon, 7 = nitrogen, 8 = oxygen).
# change this for the different atoms
atoms = [6,6,6,6,6,7,8,8]

def molFromAdjMat(atoms, amat):
    """Creates a mol object from an adjacency matrix.

    Inputs:
    atoms: list of atomic numbers of atoms, by row
    amat: square adjacency matrix of bond orders with the same length as
        atoms. Only the upper triangle is read. Entries equal to 1, 2 or 3
        become single, double or triple bonds; any other value adds no bond.
    Output: mol object. If sanitization fails, a methane molecule
        (SMILES "C") is returned as a placeholder instead.
    """
    bond_types = {1: BondType.SINGLE, 2: BondType.DOUBLE, 3: BondType.TRIPLE}

    m = Chem.RWMol()
    # add in the separate atoms
    for a in atoms:
        m.AddAtom(Atom(a))

    side_len = len(amat)
    for r in range(side_len):
        # c > r: each bond is added once, from the upper triangle only
        for c in range(r + 1, side_len):
            bond_type = bond_types.get(amat[r][c])
            if bond_type is not None:
                m.AddBond(r, c, bond_type)

    try:
        Chem.SanitizeMol(m)
    except Exception:
        # Deliberate best-effort: structures that fail sanitization are
        # replaced by methane. Catching Exception (rather than a bare
        # except) lets KeyboardInterrupt / SystemExit propagate.
        m = Chem.MolFromSmiles("C")
    return m

In [None]:
def canonize_smiles(s):
    """Round-trip a SMILES string through an RDKit mol object.

    Input: s: SMILES string
    Output: the SMILES string that Chem.MolToSmiles writes for the parsed mol
    """
    mol = Chem.MolFromSmiles(s)
    return Chem.MolToSmiles(mol)


# Characters that count as an atom in a SMILES string: C, N and O in both
# upper and lower case, kept as a sorted list.
alphabet = sorted("cnoCNO")


# a quick way to get heavy atom count without going through the mol object
def count_letters(s):
    """Return how many characters of the string `s` appear in `alphabet`."""
    return sum(1 for char in s if char in alphabet)

In [None]:
def compile_smiles_dists(file_index):
    """Write a CSV of SMILES strings and per-system distances for one chunk.

    Loads the adjacency matrices in
    ./product_amats/pdt_amat_<tag>_int8.npy, converts each one to a SMILES
    string, and for each of the four hybridization combinations loads
    ./backup/dmats_<combo>_<tag>.npy and reduces every matrix in it to half
    the sum of its absolute entries. The table (column "smiles" plus one
    column per combination) is written to
    ./data_files/smiles_with_all_dists_<tag>.csv without an index column.

    Input: file_index: chunk index; zero-padded to two digits to form <tag>
    Output: None
    """
    atoms = [6, 6, 6, 6, 6, 7, 8, 8]
    hybrid_combos = ["ac2am2", "ac2am3", "ac3am2", "ac3am3"]

    file_tag = str(file_index).zfill(2)
    amats = np.load(f"./product_amats/pdt_amat_{file_tag}_int8.npy")

    # Convert one matrix at a time so the mol objects are not all held in
    # memory together.
    data_df_dict = {
        "smiles": [Chem.MolToSmiles(molFromAdjMat(atoms, amat)) for amat in amats]
    }

    for hc in hybrid_combos:
        dmats = np.load(f"./backup/dmats_{hc}_{file_tag}.npy")
        # Sum |entries| over every axis except the first (one value per
        # matrix). ndarray.sum widens small integer dtypes to the platform
        # integer, so the total cannot wrap around the way an element-by-
        # element Python sum in the array's own dtype could.
        # The division by 2 presumably accounts for each bond appearing
        # twice in a symmetric matrix -- TODO confirm.
        per_matrix_axes = tuple(range(1, dmats.ndim))
        data_df_dict[hc] = np.abs(dmats).sum(axis=per_matrix_axes) / 2

    out_df = pd.DataFrame(data=data_df_dict)
    out_df.to_csv(f"./data_files/smiles_with_all_dists_{file_tag}.csv", index=False)

In [None]:
# Run compile_smiles_dists for chunk indices 0-55 with at most 16 workers.
# compile_smiles_dists has no return statement, so `foo` only collects the
# per-chunk None results; the useful output is the CSV files it writes.
foo = process_map(compile_smiles_dists, range(56), max_workers = 16)

# NOTE(review): in the visible code `out_df` is only ever assigned inside
# compile_smiles_dists, so this looks like a leftover memory-release line --
# confirm before removing.
out_df = []

In [None]:
# sd maps each "."-separated SMILES fragment with at least 4 counted atoms
# to the list of min_dist_all values of every row the fragment appears in.
sd = defaultdict(list)
for file_index in tqdm(range(56)):
    file_tag = f"{file_index:02d}"
    data = pd.read_csv(f"./data_files/smiles_with_all_dists_{file_tag}.csv")

    # Row-wise minimum over every column after the first one.
    data["min_dist_all"] = data.iloc[:, 1:].to_numpy().min(axis=1)

    for smiles, min_dist in zip(data["smiles"], data["min_dist_all"]):
        for ss in smiles.split("."):
            if count_letters(ss) >= 4:
                sd[ss].append(min_dist)

    # Rebind so the dataframe can be freed before the next file is read.
    data = []

In [None]:
# Reduce each fragment's list of distances to its smallest value.
sd2 = {fragment: np.min(dists) for fragment, dists in sd.items()}

# Rebind so the per-fragment lists can be freed.
sd = []

data_raw = pd.DataFrame(
    data={"smiles": list(sd2), "min_dist_all": list(sd2.values())}
)

data_raw.head()

In [None]:
# Add the count_letters atom count of every fragment as a "natoms" column,
# then save the table (the dataframe index is written as well).
smiles_raw = data_raw["smiles"].tolist()
natoms = list(map(count_letters, smiles_raw))
data_raw["natoms"] = natoms

data_raw.to_csv("./data_files/smiles_min_dist_natoms.csv")

clean up if needed

In [None]:
# !rm ./data_files/smiles_with_all_dists*