# Generate 3D Conformers of Reaction Products and Filter by Score
## Use OpenEye Toolkits and Tanimoto Combo scoring
This code can be used to score products of reactions using Tanimoto combo and filter these based on their score. 

In [None]:
import openeye.oechem as oechem
import openeye.oeshape as oeshape
import openeye.oedepict as oedepict
import openeye.oemolprop as oemolprop
from IPython.display import display
import openeye.oeomega as oeomega
from openeye.oeshape import *


In [None]:
# Read every molecule from the input file and keep a copy of each one in `fragments`.
# Each molecule's title is rewritten to its SMILES followed by the contents of the
# "Aldehyde Reactant" and "Primary Amine Reactant" SD tags, so that a product found in
# a cluster later on can be traced back to the building blocks it was made from.

istream = oechem.oemolistream('<filename>')

fragments = []

for product in istream.GetOEGraphMols():
    smiles = oechem.OEMolToSmiles(product)
    aldehyde = oechem.OEGetSDData(product, "Aldehyde Reactant")
    amine = oechem.OEGetSDData(product, "Primary Amine Reactant")
    product.SetTitle("".join([smiles, "Aldehyde:", aldehyde, "   Amine:", amine]))
    # Store a copy so the list entry is independent of the object handed out by the stream.
    fragments.append(oechem.OEMol(product))


istream.close()

# Sanity check: number of molecules read, then the title of the first one.
print('\n'.join([str(len(fragments)), fragments[0].GetTitle()]))


In [None]:
# Smoke test: depict the first imported fragment inline to confirm that
# the molecules loaded above can be visualized.
import oenotebook as oenb

first_fragment = fragments[0]
oenb.draw_mol(first_fragment)

# Perform overlays using ROCS on a matrix of fragments

The matrix has one row and one column per fragment, so its size grows with the square of the number of fragments being compared. The maximum number of conformations per molecule can also be set, but this strongly affects the amount of processing time and memory needed.

overlay_dist contains the "reverse Tanimoto" score for every pair of molecules, computed as 2 minus the Tanimoto combo score of the pair (it is not a reciprocal). This gives a "distance" between two molecules that reflects their 3D similarity: the more similar the pair, the smaller the distance.

overlay_Confs contains the overlaid molecules; only the best overlay for each pair is saved. This can be changed to keep all conformations by using GetOverlayConfs instead of GetOverlayConf. Each molecule in this list is also tagged with the two fragments that were compared to produce it.


In [None]:
import numpy as np
import random

# Initialize the OEOmega conformer generator and cap the number of conformers per molecule.
# Strict stereo and strict atom typing are switched off to keep restrictions loose.
omega = oeomega.OEOmega()
omega.SetMaxConfs(20)
omega.SetStrictStereo(False)
omega.SetStrictAtomTypes(False)

istream = oechem.oemolistream('<filename>')

# NOTE: both '<filename>' placeholders must be replaced, and the output path
# must differ from the input path so the input is not overwritten.
ofs = oechem.oemolostream('<filename>')
ofs.SetFormat(oechem.OEFormat_SDF)

fragments_shuff = []

# Generate conformers for every molecule, keep a copy in memory and write it to the SDF file.
# The streams are closed even if something fails part-way, so the output written so far
# is flushed and the file handles are released before the next cell reuses `ofs`.
try:
    for oemol in istream.GetOEMols():
        # NOTE(review): the return value of omega() is ignored here, so a molecule for which
        # conformer generation fails is still kept and written out -- confirm this is intended.
        omega(oemol)
        fragments_shuff.append(oechem.OEMol(oemol))
        oechem.OEWriteMolecule(ofs, oemol)
finally:
    istream.close()
    ofs.close()

# Shuffle the fragments to break up any sorting on similarity in the input;
# this helps us test whether the clustering is working.
random.shuffle(fragments_shuff)


In [None]:
# Output file for the best overlay of every fragment pair (replace the placeholder path).
ofs = oechem.oemolostream('<filename>')
ofs.SetFormat(oechem.OEFormat_SDF)

# Uncomment if reading previously generated conformers in from a file
# fragments_shuff = []
# istream = oechem.oemolistream('<filename>')
# for oemol in istream.GetOEMols():
#     fragments_shuff.append(oechem.OEMol(oemol))
# istream.close()

# Initialize the pairwise distance matrix with the dimensions needed.
# Off-diagonal entries start at the maximum distance (2.0) so that a pair for which ROCS
# returns no overlay is treated as dissimilar rather than identical; the diagonal
# ("self" pairs) is 0.
dim = len(fragments_shuff)
overlay_dist = np.full((dim, dim), 2.0)
np.fill_diagonal(overlay_dist, 0.0)
overlay_Confs = []

# Set up ROCS to report only the single best hit. The options do not change between
# fragments, so they are built once.
options = oeshape.OEROCSOptions()
options.SetNumBestHits(1)
#options.SetConfsPerHit(10)

try:
    for i, frag in enumerate(fragments_shuff):
        # A fresh ROCS object per fragment: `frag` is the molecule added to it, and every
        # other fragment is then overlaid against it.
        rocs = oeshape.OEROCS(options)
        rocs.AddMolecule(frag)
        frag_smiles = oechem.OEMolToSmiles(frag)

        for j, frag2 in enumerate(fragments_shuff):
            if i == j:                                     # skip the diagonal, i.e. all the "self" pairs
                continue

            # Loop over results and output
            for res in rocs.Overlay(frag2):
                combo = res.GetTanimotoCombo()
                outmol = res.GetOverlayConf()              # GetOverlayConf gives just the best; GetOverlayConfs gives all
                oeshape.OERemoveColorAtoms(outmol)
                oechem.OEAddExplicitHydrogens(outmol)
                outmol.SetTitle(oechem.OEMolToSmiles(outmol))

                # Tag the overlay with its score and the two fragments that produced it.
                oechem.OEAddSDData(outmol, "Tanimoto", str(combo))
                oechem.OEAddSDData(outmol, "Fragment 1", frag_smiles)
                oechem.OEAddSDData(outmol, "Fragment 2", oechem.OEMolToSmiles(frag2))

                overlay_Confs.append(oechem.OEMol(outmol))

                # "Reverse" the score into a distance: 2 - TanimotoCombo, clamped to 0 if the
                # score comes back above 2.
                overlay_dist[i, j] = (2 - combo) if combo <= 2 else 0

                print(overlay_dist[i, j])
                oechem.OEWriteMolecule(ofs, outmol)
finally:
    # Always close the output stream so everything written so far reaches the file.
    ofs.close()
#print (overlay_dist)

In [None]:
# Spot check: depict one stored overlay (index 7) to confirm that both the
# title and the structure were saved in overlay_Confs.

spot_check_mol = overlay_Confs[7]
oenb.draw_mol(spot_check_mol)

# Using DBScan for clustering based on 3D structure
   This method does not require the number of clusters to be specified in advance. Instead, you control how tight a cluster must be through the `eps` (maximum neighbor distance) and `min_samples` parameters. The reverse Tanimoto scores calculated for each pair of molecules in the previous section are used as the measure of distance.

In [None]:
from sklearn.cluster import DBSCAN
from sklearn import metrics

# DBSCAN parameters:
#   eps          maximum distance between two samples for one to count as a neighbor of the other
#   min_samples  minimum number of samples (compounds) near a compound for it to be a core point
#   metric       "precomputed" means overlay_dist is used directly as the distance matrix

clustering = DBSCAN(eps=0.7, min_samples=2, metric="precomputed")

# Fit the clustering on the pairwise distances and pull one label per fragment.
db = clustering.fit(overlay_dist)
labels = db.labels_
#print(labels)

# Distinct labels; label -1 marks the "outliers"/noise points that belong to no cluster.
cluster_nrs = set(labels)

# Number of noise points, and number of real clusters (noise label excluded if present).
n_noise_ = list(labels).count(-1)
n_clusters_ = (len(cluster_nrs) - 1) if -1 in labels else len(cluster_nrs)

print('Estimated number of clusters: %d' % n_clusters_)
print('Total number of points: %d' % len(overlay_dist))
print('Estimated number of noise points: %d' % n_noise_)



# Group the fragments by cluster label.
# labels[k] is the cluster assigned to fragments_shuff[k], so one pass over the labels
# is enough to drop every fragment into the list for its cluster.

mols_by_cluster = {cluster_id: [] for cluster_id in cluster_nrs}

for position, cluster_id in enumerate(labels):
    mols_by_cluster[cluster_id].append(fragments_shuff[position])

# Report how many compounds ended up in each cluster.
for cluster_id in cluster_nrs:
    print("%d molecules in cluster %s" % (len(mols_by_cluster[cluster_id]), cluster_id))


In [None]:
# Sanity check: print the first member of cluster 0 to confirm that
# mols_by_cluster holds the expected molecules.

first_cluster_members = mols_by_cluster[0]
print(first_cluster_members[0])

## In-Notebook Depiction

In [None]:
import oenotebook as oenb

# Depict the second member of cluster 1 inline.
cluster_one_members = mols_by_cluster[1]
oenb.draw_mol(cluster_one_members[1])

## Out-of-notebook depiction (PDFs)
This writes the molecules to PDF files, one file per cluster. First, any existing folder named cluster_pdfs is deleted and a new, empty one is created; this prevents stale or duplicate copies of the files from accumulating in the folder.

In [None]:
from openeye import oedepict
import shutil
import os

# Start from an empty output folder so files from an earlier run do not linger.
if os.path.isdir('cluster_pdfs'):
    shutil.rmtree('cluster_pdfs')
os.mkdir('cluster_pdfs')

# Layout settings shared by every report.
suppress_h = True
rows = 10
cols = 3

for label in mols_by_cluster:
    # Skip the noise "cluster". The labels produced by the clustering step are integers,
    # so comparing against the string '-1' directly never matched; comparing the string
    # form handles integer and string labels alike.
    if str(label) == '-1':
        continue

    # One multi-page report per cluster, laid out as a rows x cols grid of cells.
    ropts = oedepict.OEReportOptions(rows, cols)
    ropts.SetHeaderHeight(25)
    ropts.SetFooterHeight(25)
    ropts.SetCellGap(2)
    ropts.SetPageMargins(10)
    report = oedepict.OEReport(ropts)

    # Size the 2D depictions to the report cells and choose the drawing style.
    cellwidth, cellheight = report.GetCellWidth(), report.GetCellHeight()
    opts = oedepict.OE2DMolDisplayOptions(cellwidth, cellheight, oedepict.OEScale_Default * 0.5)
    opts.SetAromaticStyle(oedepict.OEAromaticStyle_Circle)
    pen = oedepict.OEPen(oechem.OEBlack, oechem.OEBlack, oedepict.OEFill_On, 1.0)
    opts.SetDefaultBondPen(pen)
    itf = oechem.OEInterface()
    oedepict.OESetup2DMolDisplayOptions(opts, itf)

    # Render every member of the cluster into its own cell.
    for mol in mols_by_cluster[label]:
        cell = report.NewCell()
        # Work on a copy so preparing the depiction does not modify the stored molecule.
        mol_copy = oechem.OEMol(mol)
        oedepict.OEPrepareDepiction(mol_copy, False, suppress_h)
        disp = oedepict.OE2DMolDisplay(mol_copy, opts)

        oedepict.OERenderMolecule(cell, disp)

    oedepict.OEWriteReport("cluster_pdfs/cluster%s.pdf" % label, report)

# Sort based on frequency of reactants 
## Find which reactants appear most frequently

In [None]:
from collections import Counter


# For each reactant: the number of clusters in which it appears at least once.
common_alds = {}
common_amines = {}


for label in mols_by_cluster:

    # Reactants of every molecule in this cluster (one entry per molecule).
    alds = []
    ams = []

    for mol in mols_by_cluster[label]:
        alds.append(oechem.OEGetSDData(mol, "Aldehyde Reactant"))
        # The molecule titles are built from the "Primary Amine Reactant" tag elsewhere in
        # this notebook, so fall back to that tag when "Amine Reactant" is empty.
        # NOTE(review): confirm which tag name the input file actually uses.
        am = oechem.OEGetSDData(mol, "Amine Reactant")
        if not am:
            am = oechem.OEGetSDData(mol, "Primary Amine Reactant")
        ams.append(am)

    # Frequency of each reactant within this cluster.
    ald_counts = Counter(alds)
    am_counts = Counter(ams)

    # Each distinct reactant counts exactly once per cluster, however many molecules in the
    # cluster share it. (The previous counter-based check could add more than one per cluster.)
    for ald in ald_counts:
        common_alds[ald] = common_alds.get(ald, 0) + 1
    for am in am_counts:
        common_amines[am] = common_amines.get(am, 0) + 1

    print("\nCluster No." + str(label))
    print ("\nAldehydes")
    print (ald_counts)
    print ("\nSecondary Amines")
    print(am_counts)

# get frequency of reactants appearing between clusters
print ("\n Aldehyde Appearance in clusters:")
print (common_alds)
print("\n Amine Appearance in clusters:")
print (common_amines)

        