In [1]:
import argparse
import logging
import os
import torch
from torch.utils.data import DataLoader, random_split
import json
import rdkit.Chem as Chem
#from rxntorch.models.reactivity_network import ReactivityNet as RxnNet, ReactivityTrainer as RxnTrainer
#from rxntorch.utils import collate_fn
import rdkit
from rxntorch.containers.reaction import Rxn
from rxntorch.containers.molecule import Mol
from rxntorch.containers.dataset import RxnGraphDataset as RxnGD
from rxntorch.utils import collate_fn
import warnings
warnings.filterwarnings("ignore")
from rdkit.Chem import AllChem
from rdkit.Chem import Draw
from collections import defaultdict

In [2]:
import re
import glob


def get_atom_mapping_doyle(mol_dir):
    
    smiles_mapping=defaultdict(dict) 
    
    for mol_fn in glob.glob(mol_dir):
        atoms , labels, atom_mapping =[], [], []
        if True:
            
            m = Chem.MolFromMolFile(mol_fn)
            smiles=Chem.MolToSmiles(m)
            mol_lines=open(mol_fn,'r').readlines()

            for i,line in enumerate(mol_lines[4:]):
                l=re.sub(' +', ' ', line.strip('\n'))
                l2=l.split(' ')
                if len(l2)==17:
                    atom=l2[4]
                    if atom !='' and '*' not in atom and 'H' not in atom:
                        atoms.append(atom)
                if "atom_labels" in line:
                    labels=[i.strip('*') for i in mol_lines[i+1+4].strip('\n').split(' ') if ((i!='') and ('H' not in i))]
            if len(atoms)==len(labels):
                for i in range(len(atoms)):
                    atom_mapping.append((atoms[i],labels[i]))            
            else:
                print('Atoms and lables don\'t match')
                atom_mapping=[]

            if smiles not in smiles_mapping:
                smiles_mapping[mol_fn.split('/')[-1].split('.')[0]]=atom_mapping
            else:
                print("key exsists")
    return smiles_mapping



In [3]:
smiles_mapping=get_atom_mapping_doyle("data/doyle_reaction_mols/*.mol")

In [None]:
weights, volumes,surface_areas=[],[],[]
vs, ovalities,hardnesses=[],[],[]
dipole_moments, electroes, HOMOs, LUMOs = [],[],[],[]
charges,shifts = [],[]

            weights.append(weight)
            vs.append(volume)
            surface_areas.append(surface_area)
            ovalities.append(ovality)
            hardnesses.append(hardness)
            dipole_moments.append(dipole_moment)
            electroes.append(electronegativity)
            HOMOs.append(HOMO)
            LUMOs.append(LUMO)

In [34]:
import math
from sklearn.preprocessing import LabelEncoder
import numpy as np

rows=set()
file_name = 'doyle_reactions_data.json'
path = 'data/'
mol_path='doyle_reaction_mols'

data_dict=defaultdict(lambda: defaultdict(float))
rxns = []
max_nbonds = 10   
smiles_mapping = get_atom_mapping_doyle(mol_path+'/*.mol') 

with open(os.path.join(path, file_name)) as datafile:
    data = json.load(datafile)
    for line in data:
        plate, row, col = line["plate"], line["row"], line["col"]
        #if plate==1 and row==2 and col==1:
            #print(line)
        name="-".join(map(str,[plate, row, col]))
        product=line['product']
        reactants=line['reactants']
        r_yield=line['yield']
        rxn = Rxn(product,reactants,r_yield)
        mol_reactants=rxn.reactants
        for mol_idx in range(len(mol_reactants)):
            current_molecule = mol_reactants[mol_idx]
            category = current_molecule.category
            #print(category)
            vib_modes = current_molecule.vib_modes

            atoms =current_molecule.atoms
            all_attributes=current_molecule.get_attributes().squeeze().tolist()
            weight,volume,surface_area,ovality,hardness,dipole_moment,electronegativity,HOMO,LUMO=all_attributes
            data_dict[name][category] = current_molecule.name
            data_dict[name][category +'_molecular_weight'] = round(float(weight),5)
            data_dict[name][category +'_molecular_volume'] = round(float(volume),5)
            data_dict[name][category +'_surface_area'] = round(float(surface_area),5)
            data_dict[name][category +'_ovality'] = round(float(ovality),5)
            data_dict[name][category +'_hardness'] = round(float(hardness),5)
            data_dict[name][category +'_dipole_moment'] = round(float(dipole_moment),5)
            data_dict[name][category +'_electronegativity'] = round(float(electronegativity),5)
            data_dict[name][category +'_E_HOMO'] = round(float(HOMO),5)
            data_dict[name][category +'_E_LOMO'] = round(float(LUMO),5)
            for n in range(len(vib_modes)):
                data_dict[name][category + '_V'+str(n)+'_frequency'] = round(float(vib_modes[n][0]),5)
                data_dict[name][category + '_V'+str(n)+'_intensity'] = round(float(vib_modes[n][1]),5)
            
            for atom in atoms:
                if 'partial_charge' in atom:
                    data_dict[name][category+'_.'+atom['name']+'_electrostatic_charge']=round(float(atom['partial_charge']),5)

                if 'nmr_shift' in atom:
                    data_dict[name][category+'_.'+atom['name']+'_NMR_shift']= round(float(atom['nmr_shift']),5)
        if isfloat(r_yield):                
            data_dict[name]['yield'] = round(float(r_yield),5)      
        else:
            del data_dict[name]
            rows.add(str(row)+'-'+str(col))
print("reactions with problematic yield: ",len(rows))

reactions with problematic yield:  9


In [19]:
def isfloat(value):
    try:
        float(value)
        return True
    except ValueError:
        return False

In [31]:
import pandas as pd
df=pd.DataFrame.from_dict(data_dict, orient='index')
df=df.reset_index()
df.to_csv('data/doyle_data.csv')



In [25]:
len(data_dict)

4599

In [41]:
am_df=pd.read_csv('data/amination.csv')

In [46]:
list(am_df.columns)

['additive',
 'additive_.C3_NMR_shift',
 'additive_.C3_electrostatic_charge',
 'additive_.C4_NMR_shift',
 'additive_.C4_electrostatic_charge',
 'additive_.C5_NMR_shift',
 'additive_.C5_electrostatic_charge',
 'additive_.N1_electrostatic_charge',
 'additive_.O1_electrostatic_charge',
 'additive_E_HOMO',
 'additive_E_LUMO',
 'additive_V1_frequency',
 'additive_V1_intensity',
 'additive_dipole_moment',
 'additive_electronegativity',
 'additive_hardness',
 'additive_molecular_volume',
 'additive_molecular_weight',
 'additive_ovality',
 'additive_surface_area',
 'aryl_halide',
 'aryl_halide_.C1_NMR_shift',
 'aryl_halide_.C1_electrostatic_charge',
 'aryl_halide_.C2_NMR_shift',
 'aryl_halide_.C2_electrostatic_charge',
 'aryl_halide_.C3_NMR_shift',
 'aryl_halide_.C3_electrostatic_charge',
 'aryl_halide_.C4_NMR_shift',
 'aryl_halide_.C4_electrostatic_charge',
 'aryl_halide_.H2_NMR_shift',
 'aryl_halide_.H2_electrostatic_charge',
 'aryl_halide_.H3_NMR_shift',
 'aryl_halide_.H3_electrostatic_char

In [53]:
am_df[(am_df['aryl_halide']=='3-iodopyridine') & (am_df['base']=='MTBD') & (am_df['ligand']=='XPhos')]

Unnamed: 0,additive,additive_.C3_NMR_shift,additive_.C3_electrostatic_charge,additive_.C4_NMR_shift,additive_.C4_electrostatic_charge,additive_.C5_NMR_shift,additive_.C5_electrostatic_charge,additive_.N1_electrostatic_charge,additive_.O1_electrostatic_charge,additive_E_HOMO,...,ligand_V6_intensity,ligand_V7_frequency,ligand_V7_intensity,ligand_V8_frequency,ligand_V8_intensity,ligand_V9_frequency,ligand_V9_intensity,ligand_dipole_moment,yield,plate
44,5-phenylisoxazole,143.12,0.223,93.06,-0.447,162.34,0.292,-0.334,-0.057,-0.2317,...,4.414,3026.561,16.577,3043.097,18.145,3064.344,38.21,1.212924,17.378304,plate1
89,ethyl-3-methylisoxazole-5-carboxylate,153.74,0.719,103.98,-0.533,155.9,0.187,-0.364,-0.087,-0.2736,...,4.414,3026.561,16.577,3043.097,18.145,3064.344,38.21,1.212924,46.0142,plate1
134,ethyl-5-methylisoxazole-3-carboxylate,150.81,0.279,99.02,-0.545,163.54,0.602,-0.322,-0.096,-0.264,...,4.414,3026.561,16.577,3043.097,18.145,3064.344,38.21,1.212924,47.33481,plate1
584,4-phenylisoxazole,140.71,0.187,117.52,-0.168,146.61,0.062,-0.313,-0.067,-0.2332,...,4.414,3026.561,16.577,3043.097,18.145,3064.344,38.21,1.212924,25.939748,plate1
629,3-phenylisoxazole,154.33,0.439,95.8,-0.451,150.97,0.113,-0.332,-0.081,-0.2382,...,4.414,3026.561,16.577,3043.097,18.145,3064.344,38.21,1.212924,53.364639,plate1
674,3-methylisoxazole,151.95,0.723,99.77,-0.493,149.94,0.045,-0.384,-0.077,-0.2612,...,4.414,3026.561,16.577,3043.097,18.145,3064.344,38.21,1.212924,49.096822,plate1
1124,5-methylisoxazole,142.24,0.152,96.34,-0.505,161.08,0.606,-0.321,-0.112,-0.2538,...,4.414,3026.561,16.577,3043.097,18.145,3064.344,38.21,1.212924,29.068447,plate2
1169,benzo[c]isoxazole,148.88,0.501,113.25,0.053,147.73,-0.145,-0.394,-0.009,-0.2233,...,4.414,3026.561,16.577,3043.097,18.145,3064.344,38.21,1.212924,5.085671,plate2
1214,"3,5-dimethylisoxazole",152.73,0.66,98.89,-0.644,161.03,0.555,-0.398,-0.128,-0.2472,...,4.414,3026.561,16.577,3043.097,18.145,3064.344,38.21,1.212924,47.66949,plate2
1259,methyl-isoxazole-5-carboxylate,143.07,0.225,103.17,-0.384,154.86,0.19,-0.29,-0.059,-0.2806,...,4.414,3026.561,16.577,3043.097,18.145,3064.344,38.21,1.212924,12.53742,plate2


In [40]:
data_dict['1-1-47']

defaultdict(float,
            {'aryl_halide': '3-iodopyridine',
             'aryl_halide_molecular_weight': 204.994,
             'aryl_halide_molecular_volume': 118.02,
             'aryl_halide_surface_area': 136.84,
             'aryl_halide_ovality': 1.176,
             'aryl_halide_hardness': 0.11,
             'aryl_halide_dipole_moment': 1.989,
             'aryl_halide_electronegativity': 0.14,
             'aryl_halide_E_HOMO': -0.25,
             'aryl_halide_E_LOMO': -0.04,
             'aryl_halide_V1_frequency': 715.898,
             'aryl_halide_V1_intensity': 20.803,
             'aryl_halide_V2_frequency': 1291.42,
             'aryl_halide_V2_intensity': 0.104,
             'aryl_halide_V3_frequency': 1619.152,
             'aryl_halide_V3_intensity': 11.53,
             'aryl_halide_.C1_electrostatic_charge': -0.557,
             'aryl_halide_.C1_NMR_shift': -332.15,
             'aryl_halide_.C2_electrostatic_charge': 0.392,
             'aryl_halide_.C2_NMR_shift'

In [66]:
df.shape,am_df.shape

((4608, 137), (3960, 126))

In [67]:
for i in list(df.columns):
    if i not in list(am_df.columns):
        print(i)

aryl_halide_E_LOMO
aryl_halide_V0_frequency
aryl_halide_V0_intensity
base_E_LOMO
base_V0_frequency
base_V0_intensity
base_V1_frequency
base_V1_intensity
base_V2_frequency
base_V2_intensity
ligand_molecular_weight
ligand_molecular_volume
ligand_surface_area
ligand_ovality
ligand_hardness
ligand_electronegativity
ligand_E_HOMO
ligand_E_LOMO
ligand_V0_frequency
ligand_V0_intensity
additive_E_LOMO
additive_V0_frequency
additive_V0_intensity


In [63]:
for i in list(am_df.columns):
    if i not in list(df.columns):
        print(i)

additive_.C4_NMR_shift
additive_.C4_electrostatic_charge
additive_E_LUMO
additive_V1_frequency
additive_V1_intensity
aryl_halide_E_LUMO
aryl_halide_V3_frequency
aryl_halide_V3_intensity
base_E_LUMO
ligand_V10_frequency
ligand_V10_intensity
plate


In [46]:
def norm_vals(array):
    mean=np.mean(array)
    std=np.std(array)
    return mean,std

In [10]:
len(HOMOs),len(set(HOMOs))
len(vs),len(set(vs))

(17952, 42)

ModuleNotFoundError: No module named 'matplotlib'