Conda environment: pygeo

In [None]:
import numpy as np
import pandas as pd
import random,math,os,copy
random.seed(0)
import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem.rdDistGeom import GetMoleculeBoundsMatrix

# Pytorch and Pytorch Geometric
import torch
from torch import nn
import torch.nn.functional as Fun
import torch_geometric
from torch_geometric.data import Data
from torch_geometric.nn import GraphConv
from torch_geometric.loader import DataLoader
from torch_geometric.nn import global_mean_pool
from torch.nn import Linear
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error


# 1) Functions
## 1.1) Dataset

In [None]:
def onehot(x,ls:list)->list:
    """Encode x against the reference list ls as 0/1 flags.

    The result has the same length as ls; position i holds 1 when x == ls[i]
    and 0 otherwise. If x equals none of the entries, every flag is 0."""
    flags = []
    for candidate in ls:
        flags.append(int(x == candidate))
    return flags

def bondfeatures(bond:rdkit.Chem.rdchem.Bond,dmatrix:np.ndarray,featuredic:dict={'bondtype':True,
                'conjugated':True,
                'inring':True,
                'stereochemistry':True,
                'bondlength':True}
               )->np.ndarray:
    """Given a bond, distance matrix, and dictionary of desired features, return a numpy array of bond features.

    bond = RDKit bond to featurise
    dmatrix = square matrix indexed by RDKit atom index (0-based). The [i][j] and [j][i] entries of the
        two bonded atoms are averaged to obtain the bond-length feature.
    featuredic = flags selecting which features are appended; every key of the default must be present

    Enabled features are appended in this order: bond type one-hot (4), conjugated flag (1),
    in-ring flag (1), stereochemistry one-hot (4), scaled bond length (1).

    NOTE: the stereochemistry one-hot used to compare the RDKit enum directly against strings (always
    all zeros) and the bond length used to be read at index-1 (wrong atom pair, wrapping around for
    atom 0). Both are fixed here, so models trained on the old feature values should be retrained.
    """
    bond_features = []
    if featuredic['bondtype']: ### onehot for bond order
        bond_features += onehot(bond.GetBondType(),[Chem.rdchem.BondType.SINGLE, Chem.rdchem.BondType.DOUBLE, Chem.rdchem.BondType.TRIPLE, Chem.rdchem.BondType.AROMATIC])
    if featuredic['conjugated']: ### 1/0 if bond is conjugated
        bond_features += [int(bond.GetIsConjugated())]
    if featuredic['inring']: ### 1/0 if bond is in a ring
        bond_features += [int(bond.IsInRing())]
    if featuredic['stereochemistry']: ### onehot for bond stereochemistry
        ### str() turns the RDKit enum into its name so that it can match the string labels
        bond_features += onehot(str(bond.GetStereo()),["STEREOZ", "STEREOE", "STEREOANY", "STEREONONE"])
    if featuredic['bondlength']: ### Mean of the A-B and B-A entries of the distance matrix
        ### RDKit atom indices are 0-based and index the matrix directly (no -1 offset)
        begin_idx, end_idx = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
        avgbond = np.mean([dmatrix[begin_idx][end_idx],dmatrix[end_idx][begin_idx]])
        ### bmin / bmax are module-level values defined outside this function
        ### (presumably the min/max bond length used for min-max scaling -- confirm)
        bond_features += [(avgbond-bmin)/(bmax-bmin)]

    return np.array(bond_features)

def atomfeatures2(atom:rdkit.Chem.rdchem.Atom, 
                 featuredic:dict = {'atomtypeonehot':True,
                 'useHs':False,
                 'heavyatomValence':True,
                 'formalcharge':True,
                 'hybridization':True,
                 'ringmembership':True,
                 'aromatic':True,
                 'electronegativity':True,
                 'atompolarizability':True,
                 'gasteiger':True,
                 'mass':True,
                 'vdw':True,
                 'covalent':True,
                 'chirality':True, 
                 'hydrogencount':True},
                 atomls:list = ['C','N','O','S','F','Si','P','Cl','Br','Unknown']
                      )->np.ndarray: 
    """Given an atom, dictionary of desired features, and atom types, produce a feature vector.

    atom = RDKit atom to featurise. When 'gasteiger' is enabled the atom must already carry the
        '_GasteigerCharge' property.
    featuredic = flags selecting which features are appended; every key of the default must be present
    atomls = element symbols used for the atom-type one-hot. 'H' is prepended when featuredic['useHs']
        is set. Symbols that are not listed are mapped to 'Unknown' when that entry is present.

    The per-element lookups (enchart, polchart, masschart, vdwchart, covchart) and the charge scaling
    bounds (cmin, cmax) are module-level names defined outside this function.

    NOTE: the hybridization, chirality and hydrogen-count one-hots used to be compared against the wrong
    value (integer enum / atom symbol) and were therefore always all zeros. They now encode the real
    property, so models trained on the old feature values should be retrained.
    """
    symbol = str(atom.GetSymbol())
    atom_feature_vector = []
    if featuredic['atomtypeonehot']: ### Atom type. ex: H:0, C:1, F:0, etc.
        if featuredic['useHs']:
            atomls = ['H'] + atomls ### builds a new list, the default argument is not mutated
        label = symbol
        if label not in atomls and 'Unknown' in atomls:
            label = 'Unknown'
        atom_feature_vector += onehot(label, atomls)
    
    if featuredic['heavyatomValence']: ### Atom valency (number of bonded neighbours in the graph)
        atom_feature_vector += [int(atom.GetDegree())]
    
    if featuredic['formalcharge']: ### Formal charge
        atom_feature_vector += [atom.GetFormalCharge()]

    if featuredic['hybridization']: ### SPx hybridization
        hybridizations = ["SP", "SP2", "SP3", "OTHER"]
        hybridization = str(atom.GetHybridization()) ### enum name, e.g. 'SP2'
        if hybridization not in hybridizations:
            hybridization = "OTHER"
        atom_feature_vector += onehot(hybridization, hybridizations)

    if featuredic['ringmembership']: ### Whether atom is part of a ring
        atom_feature_vector += [int(atom.IsInRing())]
    
    if featuredic['aromatic']: ### Whether atom is aromatic
        atom_feature_vector += [int(atom.GetIsAromatic())]
    
    if featuredic['electronegativity']: ### Electronegativity (comes from PubChem data)
        atom_feature_vector += [enchart[symbol]]
    
    if featuredic['atompolarizability']: ### Atom polarizability (comes from PubChem data)
        atom_feature_vector += [polchart[symbol]]
    
    if featuredic['gasteiger']: ### Gasteiger partial charges, scaled with cmin/cmax
        atom_feature_vector += [(atom.GetDoubleProp('_GasteigerCharge')-cmin)/(cmax-cmin)]
    
    if featuredic['mass']: ### Atomic Mass
        atom_feature_vector += [masschart[symbol]]
    
    if featuredic['vdw']: ### Van der Waals radius
        atom_feature_vector += [vdwchart[symbol]]
        
    if featuredic['covalent']: ### Covalent radius 
        atom_feature_vector += [covchart[symbol]]
                                        
    if featuredic['chirality']: ### Chirality
        chiral_tags = ["CHI_UNSPECIFIED", "CHI_TETRAHEDRAL_CW", "CHI_TETRAHEDRAL_CCW", "CHI_OTHER"]
        chiral_tag = str(atom.GetChiralTag()) ### enum name, e.g. 'CHI_TETRAHEDRAL_CW'
        if chiral_tag not in chiral_tags:
            chiral_tag = "CHI_OTHER"
        atom_feature_vector += onehot(chiral_tag, chiral_tags)
    
    if featuredic['hydrogencount']: ### How many hydrogens are bonded
        ### includeNeighbors=True also counts hydrogens that are explicit atoms in the graph
        n_hydrogens = atom.GetTotalNumHs(includeNeighbors=True)
        atom_feature_vector += onehot(n_hydrogens if n_hydrogens <= 3 else "3+", [0, 1, 2, 3, "3+"])
    return np.array(atom_feature_vector)

def create_pytorchgeometric_dataset2(x_smiles:list, y:list, pattern_smiles:list,afdic:dict,bfdic:dict,AddHs:bool=False,
                                     neighbours:int=2)->list:
    """Create a list of pytorch geometric data objects.
    x_smiles = list of molecule SMILES
    y = calculated barrier height
    pattern_smiles = divinyl ketone bonding patterns (multiple are needed to account for aromaticity), optional method of
        reducing the graph size. A single SMILES string is accepted and treated as a one-element list.
        None or an empty list keeps every atom of the molecule.
    afdic = atom feature dictionary
    bfdic = bond feature dictionary
    AddHs = whether hydrogen atoms should be considered when generating atom features
    neighbours = When using pattern_smiles to reduce the graph size, how many neighbouring atoms to the divinyl ketone
        reaction centre (pattern_smiles) should be included in the graph.

    Returns a list with one Data object per molecule (x, edge_index, edge_attr, y). Every bond between two
    selected atoms contributes two directed edges carrying the same feature vector.

    Raises ValueError when a SMILES string cannot be parsed or when none of the patterns matches a molecule.
    """
    ### A bare string would otherwise be iterated character by character below
    if isinstance(pattern_smiles, str):
        pattern_smiles = [pattern_smiles]

    ### Generate RDKit mol objects for the patterns if necessary
    pattern_mols = []
    for p_smiles in (pattern_smiles or []):
        p_mol = Chem.MolFromSmiles(p_smiles)
        if p_mol is None:
            raise ValueError(f"Could not parse pattern SMILES: {p_smiles!r}")
        pattern_mols.append(p_mol)
    data_list = []

    for (smiles, y_val) in zip(x_smiles, y):

        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            raise ValueError(f"Could not parse molecule SMILES: {smiles!r}")

        if pattern_mols:
            # Find matching pattern in the molecule. Several patterns are needed to account for aromaticity
            matches = ()
            for p_mol in pattern_mols:
                matches = mol.GetSubstructMatches(p_mol)
                if matches:
                    break
            if not matches:
                raise ValueError(f"None of the reaction centre patterns matched molecule: {smiles!r}")

            # Atoms in the subgraph: first match of the pattern + `neighbours` shells of bonded atoms
            selected = set(matches[0])
            for _ in range(neighbours):
                shell = set()
                for idx in selected:
                    shell.update(nbr.GetIdx() for nbr in mol.GetAtomWithIdx(idx).GetNeighbors())
                selected |= shell
        else:
            selected = set(range(mol.GetNumAtoms()))
        ### Sorted so that the node ordering does not depend on set iteration order
        atom_indices = sorted(selected)

        ### Prep charges and bond distances
        ### NOTE(review): the node selection above happens before hydrogens are added, so atoms added
        ### by AddHs never become nodes; they only influence the features -- confirm this is intended
        if AddHs:
            mol = Chem.AddHs(mol)
        AllChem.ComputeGasteigerCharges(mol)
        dmatrix = GetMoleculeBoundsMatrix(mol)

        ### Automatic feature dimensions
        n_nodes = len(atom_indices)
        n_node_features = len(atomfeatures2(mol.GetAtomWithIdx(0),afdic))

        ### Atom index dictionary for correctly mapping atom and bond features
        atom_map = {old_idx: new_idx for new_idx, old_idx in enumerate(atom_indices)}

        # Node feature matrix
        X = np.zeros((n_nodes, n_node_features))
        for old_idx, new_idx in atom_map.items():
            X[new_idx, :] = atomfeatures2(mol.GetAtomWithIdx(old_idx),afdic)
        X = torch.tensor(X, dtype=torch.float)

        # Edge index and features (both directions share one feature vector)
        edges = []
        edge_features = []
        for bond in mol.GetBonds():
            start, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
            if start in atom_map and end in atom_map:
                features = bondfeatures(bond,dmatrix,bfdic)
                edges.append((atom_map[start], atom_map[end]))
                edges.append((atom_map[end], atom_map[start]))
                edge_features.append(features)
                edge_features.append(features)

        # Edge index tensor; the reshape keeps the (2, n_edges) layout even when there are no edges
        E = torch.tensor(edges, dtype=torch.long).reshape(-1, 2).t().contiguous()

        # Edge feature tensor
        EF = torch.tensor(np.array(edge_features), dtype=torch.float)

        # Label tensor
        y_tensor = torch.tensor([y_val], dtype=torch.float)

        # Construct PyTorch Geometric data object
        data_list.append(Data(x=X, edge_index=E, edge_attr=EF, y=y_tensor))

    return data_list



## 1.2) Training, Validation, Testing, and K-Folds

In [None]:
def training(loader, model, loss, optimizer,device:str='cuda'):
    """Train for one epoch
    loader = dataloader
    model = GNN model 
    loss = loss function
    optimizer = optimizer function
    device = device the batches are moved to before the forward pass

    Returns (mean loss over the batches as a detached tensor, MAE over all samples of the epoch, model)."""
    model.train()
    device = torch.device(device) ### resolved once instead of once per batch
    n_batches = len(loader)

    current_loss = 0
    ### To calculate MAE (for ease of analysis), keep track of ground truth and predictions
    groundtruth = []
    predictions = []
    for d in loader: 
        d = d.to(device)
        optimizer.zero_grad()
        d.x = d.x.float() ### convert all x features to float

        out = model(d) ### make prediction
        target = torch.reshape(d.y, (len(d.y), 1)) ### column vector matching the model output

        l = loss(out, target) ### calculate loss using model output and ground truth
        l.backward()
        optimizer.step()

        ### Move the predictions and ground truth to cpu rather than keep on GPU
        predictions += list(out.cpu().detach().numpy()) ### add to list of predictions
        groundtruth += list(target.cpu().detach().numpy()) ### add to list of ground truths

        ### In case of multiple batches, the loss must be added together. Detached so that the running
        ### total does not keep the autograd graph of every batch referenced until the end of the epoch.
        current_loss += l.detach() / n_batches
    currentMAE = mean_absolute_error(groundtruth,predictions) ### calculate MAE for ease of analysis
    return current_loss, currentMAE, model

def validation(loader, model, loss):
    """Validation
    loader = dataloader
    model = GNN model 
    loss = loss function

    Switches the model to evaluation mode and evaluates it without tracking gradients.
    Returns (mean loss over the batches as a tensor, MAE over all samples)."""
    
    model.eval() ### evaluation mode

    ### Evaluate on the device the model lives on; fall back to 'cuda' for a model without parameters
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cuda')
    n_batches = len(loader)
    
    val_loss = 0
    ### To calculate MAE (for ease of analysis), keep track of ground truth and predictions
    groundtruth = []
    predictions = []
    with torch.no_grad(): ### no autograd graph is needed for validation
        for d in loader:
            d = d.to(device)
            out = model(d)
            target = torch.reshape(d.y, (len(d.y), 1))
            val_loss += loss(out, target) / n_batches
            
            predictions += list(out.cpu().detach().numpy()) ### add to list of predictions
            groundtruth += list(target.cpu().numpy()) ### add to list of ground truths
        
    valMAE = mean_absolute_error(groundtruth,predictions) ### calculate MAE
    return val_loss,valMAE

@torch.no_grad()
def testing(loader, model,loss):
    """Testing
    loader = dataloader
    model = GNN model 
    loss = loss function"""
    
    test_loss = 0
    test_target = np.empty((0))
    test_y_target = np.empty((0))
    for d in loader:
        device = torch.device('cuda')
        d = d.to(device)
        out = model(d)
        l = loss(out, torch.reshape(d.y, (len(d.y), 1)))
        test_loss += l / len(loader)

        # save prediction vs ground truth values for later use
        test_target = np.concatenate((test_target, out.cpu().detach().numpy()[:, 0]))
        test_y_target = np.concatenate((test_y_target, d.y.detach().cpu().numpy()))

    return test_loss, test_target, test_y_target

def train_epochs(epochs, model, train_loader, val_loader, test_loader, path,printresults=None,lr=0.001, weight_decay=5e-4,loss = torch.nn.MSELoss()):
    """Training over all epochs
    epochs = number of epochs
    model = GNN model
    train_loader = data loader for training set
    val_loader = data loader for validation set
    test_loader = data loader for testing set
    path = path to save best model
    printresults = print results every x epochs
    lr = learning rate
    weight_decay = decay rate in optimizer
    loss = loss function

    After every epoch the model is evaluated on the validation and testing sets. A checkpoint is written
    whenever the validation loss or the validation MAE reaches a new minimum; the file name is built from
    path, the epoch, the validation loss and the validation MAE.

    Returns per-epoch arrays (train_loss, val_loss, test_loss, train_MAE, val_MAE, test_MAE) followed by
    the path of the best-validation-loss checkpoint and the path of the best-validation-MAE checkpoint."""
    
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

    train_loss = np.empty(epochs)
    val_loss = np.empty(epochs)
    test_loss = np.empty(epochs)
    train_MAE = np.empty(epochs)
    val_MAE = np.empty(epochs)
    test_MAE = np.empty(epochs)
    best_loss = math.inf
    best_mae = math.inf
    
    ### These are temporary filenames that are overwritten as soon as a checkpoint is saved
    newpath='nonexistantfile'
    newpathMAE = 'nonexistantfile'
    stem = path.split('.pt')[0]

    for epoch in range(epochs):
        epoch_loss, epochMAE, model = training(train_loader, model, loss, optimizer)
        ### validation() switches the model to evaluation mode before the testing set is scored
        v_loss,valMAE = validation(val_loader, model, loss)
        ### Testing loss and MAE are computed on the testing set (not the training set)
        t_loss, test_target, test_y_target = testing(test_loader,model,loss)
        testMAE = mean_absolute_error(test_y_target, test_target) 

        epoch_loss_value = float(epoch_loss)
        v_loss_value = float(v_loss)
        checkpoint = f"""{stem}_{epoch}_{round(v_loss_value,5)}_{round(float(valMAE),5)}.pt"""
        
        ### If model has a lower loss, save it
        if v_loss_value < best_loss:
            ### Both trackers may point at the same file; only delete it when the MAE tracker
            ### does not still need it
            if newpath != newpathMAE and os.path.exists(newpath):
                os.remove(newpath)
            newpath = checkpoint
            torch.save(model.state_dict(), newpath)
            best_loss = v_loss_value
        ### If model has a lower MAE, save it
        if valMAE < best_mae:
            if newpathMAE != newpath and os.path.exists(newpathMAE):
                os.remove(newpathMAE)
            newpathMAE = checkpoint
            torch.save(model.state_dict(), newpathMAE)
            best_mae = valMAE

        ### Record loss
        train_loss[epoch] = epoch_loss_value
        val_loss[epoch] = v_loss_value
        test_loss[epoch] = float(t_loss)
        ### Record MAE
        train_MAE[epoch] = epochMAE
        val_MAE[epoch] = valMAE
        test_MAE[epoch] = testMAE
        
        ### Print current train and val loss as well as train, val, testing MAE
        if printresults is not None:
            if epoch % printresults == 0 or epoch== epochs-1:
                print(f"Epoch: {str(epoch)}, Train loss: {round(epoch_loss_value,2)}, Val loss: {round(v_loss_value,2)}, Train MAE: {epochMAE:.2f}, Val MAE: {valMAE:.2f}, Test MAE: {testMAE:.2f}")

    ### In case one of the two checkpoints was never written (e.g. the metric never improved on its
    ### initial value), fall back to the other one
    if not os.path.exists(newpath):
        newpath = newpathMAE
    if not os.path.exists(newpathMAE):
        newpathMAE = newpath
    ### Return losses, MAEs, and paths to best models
    return train_loss, val_loss, test_loss, train_MAE, val_MAE, test_MAE,newpath,newpathMAE

def kfoldsGNN(datavalues,model,afdic,bfdic,path=None,pretraining=None,k_folds = 5,epochs = 2000,
              loss = torch.nn.MSELoss(),manseed=0,pattern = 'C=CC(C=C)=O',batch_size=128,dim_h=128,
              printresults=None,AddHs=False,neighbours=2,lr=0.001, weight_decay=5e-4):
    """
    Train and evaluate one model per fold of a k-fold split. The held-out part of every fold is
    split in half: the first half is used for validation, the second half for testing.

    datavalues = list of lists/tuples containing [SMILES, barrier height]
    model = GNN model (class/callable, instantiated per fold as model(n_node_features, dim_h=dim_h))
    afdic = Dictionary detailing which atom/node features to use
    bfdic = Dictionary detailing which bond/edge features to use
    path = List of model paths for each fold
    pretraining = List of pretrained model paths (one per fold) to load before training
    k_folds = Number of k_folds to use
    epochs = Number of epochs to use
    loss = Loss function to use
    manseed = Random seed for PyTorch
    pattern = SMILES pattern for reaction centre (used in reaction centre based GNNs)
    batch_size = size of each batch in dataloaders
    dim_h = number of dimensions for each hidden layer
    printresults = print results every x epochs
    AddHs = Whether to add hydrogens to the structures when creating graphs
    neighbours = Number of neighbours to reaction centre to consider (used in reaction centre based GNNs)
    lr = Learning rate
    weight_decay = Weight decay to use in optimizer

    Returns a dictionary {fold: [train loss per epoch, val loss per epoch, train MAE per epoch,
    val MAE per epoch, best-model train MAE, best-model valid MAE, best-model test MAE, test loss,
    test MAE per epoch, valid predictions, valid ground truth, test predictions, test ground truth]}.
    """
    results = {}
    torch.manual_seed(manseed)

    ### K-folds split dataset
    kfold = KFold(n_splits=k_folds,shuffle=True,random_state=0)

    ### Generate graph data
    data_list = create_pytorchgeometric_dataset2([x[0] for x in datavalues], [x[1] for x in datavalues], pattern,afdic,bfdic,AddHs=AddHs,neighbours=neighbours)
    
    ### Provide some default pathnames in current directory
    if path is None:
        path = ["GNN_model.pt"]*k_folds
    device = torch.device('cuda')
    for fold, (train_ids, test_ids) in enumerate(kfold.split(data_list)):
        if printresults:
            print(f'FOLD {fold}')
            print('--------------------------------')
        ### Split testing set to form validation set (first half) and testing set (second half)
        half = int(len(test_ids)/2)
        valid_ids = test_ids[:half]
        test_ids = test_ids[half:]
        
        # Sample elements randomly from a given list of ids, no replacement.
        train_subsampler = torch.utils.data.SubsetRandomSampler(train_ids)
        valid_subsampler = torch.utils.data.SubsetRandomSampler(valid_ids)
        test_subsampler = torch.utils.data.SubsetRandomSampler(test_ids)

        ### Create data loaders for the training, validation, and testing sets
        train_loader = DataLoader(dataset = data_list, batch_size = batch_size, sampler=train_subsampler)
        valid_loader = DataLoader(dataset = data_list, batch_size = batch_size, sampler=valid_subsampler)    
        test_loader = DataLoader(dataset = data_list, batch_size = batch_size, sampler=test_subsampler)    
        
        modeltouse = model(data_list[0]['x'].shape[1],dim_h=dim_h).to(device)

        if pretraining is not None:
            ### load_state_dict does not return the module, so nothing is chained onto it;
            ### the model is already on the device
            modeltouse.load_state_dict(torch.load(pretraining[fold],weights_only=True,map_location=device))
            print(f'Loaded model: {pretraining[fold]}')
        gcn_train_loss, gcn_val_loss,gcn_test_loss, gcn_train_MAE, gcn_val_MAE,gcn_test_MAE,newpath,newpathMAE = train_epochs(epochs, modeltouse, train_loader, valid_loader, test_loader, path[fold], printresults=printresults, lr=lr, weight_decay=weight_decay,loss=loss)
        
        ### Get Best Model
        modeltouse.load_state_dict(torch.load(newpathMAE,weights_only=True,map_location=device))
        ### Get loss using best model
        train_loss, train_target, train_y_target = testing(train_loader,modeltouse,loss)
        valid_loss, valid_target, valid_y_target = testing(valid_loader,modeltouse,loss)
        test_loss, test_target, test_y_target = testing(test_loader,modeltouse,loss)
        ### Get MAE using best model
        trainmae = mean_absolute_error(train_y_target, train_target)
        validmae = mean_absolute_error(valid_y_target, valid_target)
        testmae = mean_absolute_error(test_y_target, test_target)
        ### Print results for best model according to validation set
        print(f'Fold {fold}, Best Validation epoch: Train MAE = {round(trainmae,2)}, Valid MAE = {round(validmae,2)}, Test MAE = {round(testmae,2)}') 
        
        ### Record results for each set. *_target holds the predictions, *_y_target the ground truth
        results[fold] = [gcn_train_loss, gcn_val_loss, gcn_train_MAE, gcn_val_MAE,trainmae,validmae,testmae,gcn_test_loss,gcn_test_MAE, valid_target, valid_y_target, test_target, test_y_target]

    print('==')
    print(f'Best Validation Model Train MAE: {np.mean([x[4] for x in results.values()]):.2f} ± {np.std([x[4] for x in results.values()]):.2f}')
    print(f'Best Validation Model Valid MAE: {np.mean([x[5] for x in results.values()]):.2f} ± {np.std([x[5] for x in results.values()]):.2f}')
    print(f'Best Validation Model Test MAE: {np.mean([x[6] for x in results.values()]):.2f} ± {np.std([x[6] for x in results.values()]):.2f}')
    print('==\n\n')
    return results


def test_new_molecule(trainvalid_data,test_data,model,afdic,bfdic,path=None,pretraining=None,k_folds = 5,epochs = 2000,
              loss = torch.nn.MSELoss(),manseed=0,pattern = 'C=CC(C=C)=O',batch_size=128,dim_h=128,
              printresults=None,AddHs=False,neighbours=2,lr=0.001, weight_decay=5e-4):
    """
    Predict barrier heights for new molecules with one model per fold and print the per-molecule
    average over the folds.

    trainvalid_data = list of lists/tuples containing [SMILES, barrier height] used for training and validating model
                        Use the entire dataset from the manuscript for this
    test_data = list of lists/tuples containing [SMILES, barrier height]. This is the new unknown data
    model = GNN model (class/callable, instantiated per fold as model(n_node_features, dim_h=dim_h))
    afdic = Dictionary detailing which atom/node features to use
    bfdic = Dictionary detailing which bond/edge features to use
    path = List of model paths for each fold
    pretraining = List of pretrained model paths (one per fold); when given, no training is performed
    k_folds = Number of k_folds to use
    epochs = Number of epochs to use
    loss = Loss function to use
    manseed = Random seed for PyTorch
    pattern = SMILES pattern for reaction centre (used to restrict graph to reaction centre)
    batch_size = size of each batch in dataloaders
    dim_h = number of dimensions for each hidden layer
    printresults = print results every x epochs
    AddHs = Whether to add hydrogens to the structures when creating graphs
    neighbours = Number of neighbours to reaction centre to consider (used in reaction centre based GNNs)
    lr = Learning rate
    weight_decay = Weight decay to use in optimizer

    Returns a dictionary {SMILES: [prediction of fold 0, prediction of fold 1, ...]}. Entries of
    test_data that share the same SMILES share one list.
    """
    results = {x[0]:[] for x in test_data}
    torch.manual_seed(manseed)

    ### K-folds split dataset
    kfold = KFold(n_splits=k_folds,shuffle=True,random_state=0)

    ### Generate graph data
    data_list = create_pytorchgeometric_dataset2([x[0] for x in trainvalid_data], [x[1] for x in trainvalid_data], pattern,afdic,bfdic,AddHs=AddHs,neighbours=neighbours)
    test_data_list = create_pytorchgeometric_dataset2([x[0] for x in test_data], [x[1] for x in test_data], pattern,afdic,bfdic,AddHs=AddHs,neighbours=neighbours)
    
    ### Provide some default pathnames in current directory
    if path is None:
        path = ["GNN_model.pt"]*k_folds
    device = torch.device('cuda')
    for fold, (train_ids, valid_ids) in enumerate(kfold.split(data_list)):
        # Sample elements randomly from a given list of ids, no replacement.
        train_subsampler = torch.utils.data.SubsetRandomSampler(train_ids)
        valid_subsampler = torch.utils.data.SubsetRandomSampler(valid_ids)
        
        ### Create data loaders for the training, validation, and testing sets.
        ### The testing loader is not shuffled so that predictions keep the order of test_data
        train_loader = DataLoader(dataset = data_list, batch_size = batch_size, sampler=train_subsampler)
        valid_loader = DataLoader(dataset = data_list, batch_size = batch_size, sampler=valid_subsampler)    
        test_loader = DataLoader(dataset = test_data_list, batch_size = batch_size)    
        
        modeltouse = model(data_list[0]['x'].shape[1],dim_h=dim_h).to(device)

        if pretraining is None: ### Train models
            ### Only the path of the best-validation-MAE checkpoint (last return value) is needed here
            *_, newpathMAE = train_epochs(epochs, modeltouse, train_loader, valid_loader, test_loader, path[fold], printresults=printresults, lr=lr, weight_decay=weight_decay,loss=loss)
            print('Best model:',newpathMAE)
            ### Get Best Model
            modeltouse.load_state_dict(torch.load(newpathMAE,weights_only=True))
        else: ### Use pretrained models
            modeltouse.load_state_dict(torch.load(pretraining[fold],weights_only=True))
            print(f'Loaded model: {pretraining[fold]}')
        
        ### First array returned by testing() holds the predictions
        test_loss, test_target, test_y_target = testing(test_loader,modeltouse,loss)
        for n,x in enumerate(test_target):
            results[test_data[n][0]] += [x]
    
    print('Average after K-Folds')
    for k,v in results.items():
        print(k,round(np.mean(v),1))
    return results



# 2) Training and Testing
## 2.1) Data and Features

In [None]:
### ReactionID_SMILES : [lowest_BH, U_to_TS_BH]
## U_to_TS_BH is not necessary for training but is provided for completeness

dataset = {'1_C=CC(C=C)=O':[15.8,14.8],
           '2_C=CC(C(C)=C)=O':[13.9,12.8],
           '3_C=C(C)C(C(C)=C)=O':[9.7,9.7],
           '4_C=C(C)C(/C(C)=C/C)=O':[10.6,10.6],
           '5_O=C(/C(C)=C/C)/C(C)=C/C':[11.9,11.9],
           '6_O=C(/C(C)=C/C)/C(C)=C(C)/C':[11.1,11.1],
           '7_O=C(/C(C)=C(C)/C)/C(C)=C(C)/C':[13.4,11.9],
           '8_CC(C(C1=CC=CC=C1)=O)=C':[21,21],
           '9_C/C(C)=C(C(C2=CC=CC=C2)=O)/C':[20.5,20.5],
           '10_O=C(C3=CC=CC=C3)C4=CC=CC=C4':[30.5,30.5],
           '11_C=C(C)C(C(OC)=C)=O':[11.4,11.4],
           '12_C=C(C)C(C(C(OC)=O)=C)=O':[13.6,13.6],
           '13_C=C(C)C(/C(OC)=C/C(OC)=O)=O':[9.3,9.3],
           '14_C=C(C(OC)=O)C(C(OC)=C)=O':[11.9,11.9],
           '15_O=C(/C(C)=C(C)/C)/C(C)=C(C)/C1=CC=CC=C1':[12.7,12.7],
           '16_O=C(/C(C)=C(C)/C)/C(C)=C(C2=CC=CC=C2)/C':[15.2,15.1],
           '17_C=CC(/C=C/C3=CC=CC=C3)=O':[21.3,19.4],
           '18_C=CC(/C=C\C4=CC=CC=C4)=O':[22.2,20.0],
           '19_C=CC(/C(C(OC)=O)=C/OC)=O':[17.3,17.3],
           '20_O=C(/C(C)=C(C)/OC)/C(C)=C(C)\C':[16.3,15.5],
           '21_O=C(/C(OC)=C(C)/C(OC)=O)/C(C)=C(C)\C':[10.7,10.4],
           '22_O=C(/C(OC)=C(C)/C)/C(C)=C(C)\C':[12.0,9.8],
           '23_O=C(/C(C(OC)=O)=C(C)/C)/C(C)=C(C)\C':[15.6,15.1],
           '24_O=C(/C(OC)=C(C(OC)=O)/C)/C(C)=C(C)\C':[9.2,8.8],
           '25_O=C(/C(C)=C\C)/C(C)=C\C':[15.9,15.9],
           '26_O=C(/C=C(C)\C)/C=C(C)\C':[30.2,27.2],
           '27_O=C(/C=C\C)/C=C\C':[27.1,24.8],
           '28_O=C(/C=C/C)/C=C/C':[21.9,20.1],
           '29_C=CC(C(OC)=C)=O':[15.2,12.2],
           '30_C=CC(C(C(OC)=O)=C)=O':[18.9,17.3],
           '31_C=CC(/C(C)=C(C)/C)=O':[16.1,14.3],
           '32_C=CC(/C=C(C)/C)=O':[24.4,21.1],
           '33_O=C(/C(C)=C(C)/C)/C(C)=C(CC)/C':[12.5,11.5],
           '34_O=C(/C(C)=C(C)/C)/C(C)=C(C(C)C)/C':[14.3,13.2],
           '35_O=C(/C(C)=C(C)/C)C1=C(C)CCCC1':[14.8,14.4],
           '36_O=C(/C(C)=C(C)/C)C1=C(C(C)C)CCCC1':[15.2,14.3],
           '37_C=C(C)C(C(SC)=C)=O':[7.5,7.5],
           '38_C=C(C)C(C(C#N)=C)=O':[11.3,11.2],
           '39_O=C(/C(C)=C(C#N)/C)/C(C)=C(C)/O':[18.6,18.2],
           '40_O=C(/C(C)=C(C#N)/C)/C(C)=C(C)/C':[15.8,15.8],
           '41_O=C(/C(OC)=C(C)/C)/C(C(OC)=O)=C(C)\C':[12.0,10.5],
           '42_O=C(/C(C)=C(C)/C#N)/C(C)=C(C)/C':[16.2,15.7],
           '43_O=C(/C(SC)=C(C)/C)/C(C)=C(C)\C':[11.2,9.5],
           '44_O=C(/C(SC)=C(C(OC)=O)/C)/C(C)=C(C)\C':[10.2,8.5],
           '45_C/C(C)=C(C(C1=CC=CC=C1)=O)/SC':[20.5,20.5],
           '46_C=C(C#N)C(C(OC)=C)=O':[11.8,11.8],
           '47_O=C(/C(C)=C(C)/CC)C1=C(C)CCCC1':[13.9,13.8],
           '48_O=C(/C(C)=C(C)/CC)C1=C(C(C)C)CCCC1':[15.0,14.0],
           '49_O=C(/C(C)=C(C)/COC1=CC(OC)=CC(OC)=C1)/C(C)=C(C)/C2=CC=CC=C2':[13.5,13.5],
           '50_O=C(/C(C)=C(C)/COC1=CC(OC)=CC(OC)=C1)/C(C)=C(C2=CC=CC=C2)/C':[15.6,15.6],
           '51_O=C(/C(C)=C(COC1=CC(OC)=CC(OC)=C1)/C)/C(C)=C(C)/C2=CC=CC=C2':[10.4,8.7],
           '57_O=C(/C(C)=C(C)\C)C1=CCCC1':[14.7,14.7],
           '58_C=CC(C1=CCCC1)=O':[18.3,16.5],
           '59_O=C(/C(C)=C(C)\C)C1=C(C)CCCO1':[14.7,14.5],
           '60_O=C(C=C)C1=CCCCO1':[15.1,12.6],
           '61_C=CC(C1=CNC=C1)=O':[23.1,22.0],
           '62_C=CC(C1=CC=CN1)=O':[27.0,24.8],
           '63_O=C(C1=CNC=C1)C2=CNC=C2':[33.1,32.7],
           '64_C=CC(/C=C/C1=COC=C1)=O':[22.5,20.6],
           '65_O=C(/C(C)=C(C)/C1=COC=C1)/C(C)=C(C)/C':[11.4,11.4],
           '66_C=CC(C(C1=CC=CO1)=C)=O':[10.5,10.1],
           '67_C=CC(C(C1=COC=C1)=C)=O':[11.3,10.7],
           '68_C=CC(/C=C/[Si](C)(C)C)=O':[15.4,13.9],
           '69_C=CC(C(P(OC)(OC)=O)=C)=O':[19.6,19.6],
           '70_O=C(/C=C(C)/C1=CC=CC=N1)/C=C(C)\C':[28.0,25.7],
           '71_O=C(/C=C(C)/C1=CC=NC=C1)/C=C(C)\C':[28.7,25.9],
           '72_O=C(/C=C(C)/C1=CC=CN=C1)/C=C(C)\C':[28.8,26.0],
           '73_C=CC(C=C=C)=O':[15.8,12.6],
           '75_C=CC(C([Si](C)(C)C)=C)=O':[12.9,12.9],
           '76_C=CC(/C=C/P(OC)(OC)=O)=O':[18.4,16.6],
           '77_O=C(C1=CCCC1)C2=CCCC2':[17.7,17.7],
           '78_C=CC(C1=COC=C1)=O':[20.1,19.4],
           '79_C=CC(C1=CC=CO1)=O':[28.3,25.7],
           '80_O=C(/C(C)=C(C)/C)C1=COC=C1':[16.1,16.1],
           '81_O=C(C1=CC=NC=C1)C2=CC=NC=C2':[36.0,36.0],
           '82_O=C(C1=CC=NC=N1)C2=CC=NC=N2':[41.2,37.8],
           '83_C=CC(C1=CC(OC)=CC(OC)=C1)=O':[17.9,17.5],
           '84_O=C(/C(C)=C(C)/C)C1=CC(OC)=CC(OC)=C1':[13.5,13.5],
           '85_C=CC(C(C1NCOC1=O)=C)=O':[25.1,24.6],
           '86_O=C(/C(C1NCOC1=O)=C(C)/C)/C(C)=C(C)/C':[23.9,16.4],
           '87_C=CC(/C=C\C(C1=CC=CC=C1)C2=CC=CC=C2)=O':[19.4,16.1],
           '89_O=C(/C(C)=C(C)\C)/C(C(C1=CC=CC=C1)C2=CC=CC=C2)=C(C)\C':[13.0,10.2],
           '90_C=C(C1CCC1)C(C(C2CCC2)=C)=O':[10.8,10.8],
           '92_C=CC(/C=C/C1=NC(C=CC=C2)=C2S1)=O':[20.7,18.3],
           '93_C=CC(C1=CC=NC=C1)=O':[22.3,22.0],
           '94_O=C(C(SC)=C)C1=CC=CC=C1':[19.2,19.2],
           '95_C=CC(/C=C/C1=CC=NC=C1)=O':[20.4,18.4],
           '96_C=CC(/C=C/C1=CC=C(Br)C=C1)=O':[20.7,19.3],
           '97_O=C(/C(C)=C(C)/C)C1=C(C)CCC1':[18.9,18.6],
           '98_O=C(C1=CC=CC=C1)C2=CC=C(C(F)(F)F)C=C2':[38.4,38.4],
           '101_C=CC(C(C1CCC1)=C)=O':[13.5,13.3],
           '102_O=C(/C(C)=C(C)/C)/C(C)=C(C)/C1CCC1':[13.9,13.9],
           '103_C=CC(C1=CC=CS1)=O':[24.7,24.0],
           '104_C=CC(/C=C/C(C1=CC=CC=C1)C2=CC=CC=C2)=O':[18.9,17.6],
           '105_O=C(/C(C)=C(C)/C)C1=C(C)C=CS1':[24.5,23.5],
           '106_O=C(C1=CC=CO1)C2=CC=CO2':[40.7,39.2],
           '107_O=C(/C=C/C#N)/C=C/C#N':[23.5,22.1],
           '108_C=C(C#C)C(C(C#C)=C)=O':[10.4,10.4],
           '109_C=C(C)C(C(N)=C)=O':[14.3,7.4],
           '110_C=C(C)C(C(NC)=C)=O':[14.8,6.7],
           '113_C=C(F)C(C(F)=C)=O':[15.1,13.8],
           '114_O=C(/C=C/F)/C=C/F':[21.2,19.8],
           '115_O=C(/C=C\F)/C=C\F':[41.4,37.7],
           '116_C=C(Br)C(C(Br)=C)=O':[10.6,10.6],
           '117_O=C(/C=C/Br)/C=C/Br':[23.2,21.6],
           '118_O=C(/C=C\Br)/C=C\Br':[39.0,35.2],
           '119_C=C(C)C(C(C1NCOC1=O)=C)=O':[21.7,21.7],
           '120_C=CC(C1=CC=NC=N1)=O':[21.1,21.1],
           '121_C=C(C)C(C(C)=C=C)=O':[7.8,7.8],
           '122_O=C(/C(C#C)=C(C)\C)/C(C#C)=C(C)/C':[18.1,16.3],
           '123_C=C(C)C(C1=CNC=C1)=O':[22.8,22.8],
           '124_C=C(C)C(C1=CC=CN1)=O':[22.4,22.4],
           '125_C=C(C)C(C1=COC=C1)=O':[17.0,17.0],
           '126_C=C(C)C(C1=CC=CO1)=O':[23.0,23.0],
           '127_O=C(/C(C)=C(C)\C)C1=CC=CO1':[21.9,21.7],
           '128_O=C(/C(C)=C(C)\C)C1=CC=CN1':[21.0,20.9],
           '129_O=C(/C(C)=C(C)/C)C1=CC=CS1':[21.1,20.5],
           '130_C=C(C)C(C1=CC=CS1)=O':[21.3,21.3],
           '131_C=C(C#N)C(C(SC)=C)=O':[8.3,8.3],
           '132_C=C(C)C(C([Si](C)(C)C)=C)=O':[11.9,11.9],
           '133_O=C(/C(C)=C/C1=COC=C1)/C(C)=C/C':[11.6,11.6],
           '134_O=C(/C(C1=CC=CO1)=C(C)\C)/C(C)=C(C)/C':[10.1,8.9],
           '135_O=C(/C(C1=COC=C1)=C(C)/C)/C(C)=C(C)/C':[11.8,11.8],
           '136_O=C(C1=CC=NC=N1)/C(C)=C(C)\C':[23.7,23.7],
           '137_O=C(/C(SC)=C(C)/C1=COC=C1)/C(C)=C(C)/C':[9.3,9.3],
           '138_C=C(C)C(C1=CC(OC)=CC(OC)=C1)=O':[15.5,15.5],
           '139_C=C(SC)C(C1=CC(OC)=CC(OC)=C1)=O':[15.5,15.5],
           '141_C=C(C)C(/C(C)=C\C(C1=CC=CC=C1)C2=CC=CC=C2)=O':[10.5,10.5],
           '142_O=C(/C(SC)=C(C)/C#N)/C(C)=C(C)/C':[10.6,9.4],
           '143_O=C(/C(SC)=C(C)/C)/C(C)=C(C)/C#N':[13.6,13.6],
           '144_C=C(SC)C(C(SC)=C)=O':[7.4,7.4],
           '147_O=C(/C(SC)=C/C)/C(SC)=C/C':[8.0,8.0],
           '148_C=C(OC)C(C(OC)=C)=O':[11.7,10.8],
           '149_O=C(/C(OC)=C/C)/C(OC)=C/C':[10.9,9.6],
           '151_O=C(/C(F)=C(C)/C)/C(F)=C(C)/C':[19.4,18.3],
           '152_O=C(/C(C)=C(C)/F)/C(C)=C(C)/F':[13.7,13.7],
           '153_O=C(/C(C)=C(C)\F)/C(C)=C(C)\F':[23.4,23.4],
           '154_O=C(/C(Br)=C(C)/C)/C(Br)=C(C)/C':[14.0,14.0],
           '155_O=C(/C(C)=C(C)/Br)/C(C)=C(C)/Br':[12.9,12.4],
           '156_O=C(/C(C)=C(C)\Br)/C(C)=C(C)\Br':[18.3,18.3],
           '157_O=C(/C(C)=C(C)/C)/C(F)=C(C)/C':[16.0,15.9],
           '158_C=C(F)C(C(C)=C)=O':[11.7,11.7],
           '159_C=C(Br)C(C(C)=C)=O':[10.2,10.2],
           '160_O=C(/C(C)=C(C)/C)/C(Br)=C(C)/C':[13.9,13.4],
           '161_C=C(C#C)C(C=C)=O':[14.6,13.4],
           '162_C=C(C#C)C(C(C#N)=C)=O':[12.7,12.7],
           '163_C=C(C)C(C(C1=CC=CO1)=C)=O':[8.8,8.8],
           '164_C=C(C)C(C(C1=COC=C1)=C)=O':[9.1,9.1],
           '165_O=C(/C=C/C#C)/C=C/C#C':[24.4,22.4],
           '166_O=C(/C(C)=C/C#C)/C(C)=C/C#C':[14.7,14.7],
           '167_O=C(/C(C#N)=C(C#N)/C#N)/C(C#N)=C(C#N)/C#N':[34.4,32.3],
           '168_O=C(/C(C)=C(C)/SC)/C(C)=C(C)/SC':[19.0,19.0],
           '169_O=C(/C=C/SC)/C=C/SC':[30.5,28.8],
           '170_C=C([Si](C)(C)C)C(C([Si](C)(C)C)=C)=O':[12.0,12.0],
           '171_C=CC(/C=C/OC)=O':[23.3,21.8],
           '172_C=CC(/C=C/C(OC)=O)=O':[19.5,16.7],
           '173_C=CC(/C=C/SC)=O':[25.2,22.2],
           '174_C=C(C)C(/C(SC)=C/C#N)=O':[9.1,9.1],
           '175_C=C(C)C(/C(C#N)=C/SC)=O':[15.1,15.1],
           '176_C=C(C)C(/C(SC)=C/F)=O':[9.2,9.2],
           '177_C=C(C)C(/C(F)=C/SC)=O':[14.4,14.2],
           '179_C=C(C)C(/C(Br)=C/SC)=O':[13.6,10.0],
           '180_O=C(/C(SC)=C(C)/C)/C(C)=C(C)/F':[11.8,11.5],
           '181_O=C(/C(SC)=C(C)\C)/C(C)=C(C)/Br':[14.6,13.9],
           '182_O=C(/C(SC)=C(C)/F)/C(C)=C(C)/C':[9.8,9.0],
           '183_O=C(/C(SC)=C(Br)\C)/C(C)=C(C)/C':[10.1,9.2],
           '184_O=C(/C(C)=C(C(F)(F)F)\C)/C(C)=C(C)/C':[14.2,13.5],
           '186_O=C(/C(SC)=C(C(F)(F)F)\C)/C(C)=C(C)/C':[11.9,10.5],
           '187_O=C(/C(SC)=C(C(F)(F)F)\C)/C(SC)=C(C)/C(F)(F)F':[10.4,8.6],
           '189_O=C(/C(C(F)(F)F)=C(C)\C)/C(C(F)(F)F)=C(C)/C':[23.9,21.6],
           '190_O=C(/C(C(F)(F)F)=C/C)/C(C(F)(F)F)=C/C':[27.1,27.1],
           '191_O=C(/C(C1=CC=CC=C1)=C(C)\C)/C(C2=CC=CC=C2)=C(C)/C':[12.4,10.0],
           '192_C=C(C1=CC=CC=C1)C(C(C2=CC=CC=C2)=C)=O':[9.9,9.9],
           '193_O=C(/C(C1=CC=CC=C1)=C(C#N)\C)/C(C2=CC=CC=C2)=C(C)/C#N':[14.5,11.6],
           '195_C=C(SC)C(C(C1=CC=CO1)=C)=O':[5.9,5.9],
           '196_C=C(SC)C(C(C1=COC=C1)=C)=O':[6.5,6.5],
           '197_O=C(/C(O)=C(C)\C)/C(C)=C(C)/C':[11.8,11.8],
           '198_C=C(C)C(C(O)=C)=O':[10.6,10.6],
           '199_C/C(C(C1=CC=CC=C1)=O)=C\C':[22.9,22.9],
           '200_C/C(C(C1=CC=CC=C1)=O)=C\C#N':[24.7,24.7],
           '201_O=C(/C(SC)=C/C#N)C1=CC=CC=C1':[19.5,19.5],
           '202_C=C(SC)C(/C=C/C#N)=O':[11.9,10.5],
           '203_C=C(C#N)C(/C=C/SC)=O':[25.2,24.4],
           '204_O=C(/C(C#N)=C(C)/C)/C(C#N)=C(C)/C':[29.2,26.4],
           '205_O=C(/C(SC)=C/C#N)/C(SC)=C/C#N':[8.4,8.4],
           '206_O=C(/C(C#C)=C/C#N)/C(C#C)=C/C#N':[15.7,15.7],
           '207_C=C(SC)C(C(SC)=C=C)=O':[4.7,4.7],
           '208_O=C(/C(SC)=C/C(F)(F)F)/C(SC)=C/C(F)(F)F':[8.5,8.5],
           '210_O=C(/C(C#N)=C\C#N)/C(C#N)=C\C#N':[28.6,27.4],
           '213_C=C(C1=CC=CO1)C(C(C2=CC=CO2)=C)=O':[5.7,5.7],
           '214_C=C(C)C(C(C#N)=C=C)=O':[8.8,8.8],
           '216_O=C(C(C)=C=C)/C(C)=C/C#N':[10.7,10.7],
           '217_C/C(C(C1=CC=CC=C1)=O)=C\C(F)(F)F':[22.3,20.4],
           '218_C/C(C(C1=CC=CC=C1)=O)=C\F':[22.0,22.0],
           '220_O=C(C=C=C)/C=C/Br':[18.6,15.1],
           '221_O=C(C=C=C)/C=C/C#N':[18.1,15.0],
           '222_O=C(C=C=C)/C=C/F':[17.4,14.7],
           '223_O=C(/C(C1=CC=CC=C1)=C\C)/C(C2=CC=CC=C2)=C\C':[14.9,13.2],
           '224_O=C(/C(C#C)=C\C#N)/C(C#C)=C\C#N':[24.4,20.1],
           '225_C=C(O)C(C(O)=C)=O':[10.5,9.7],
           '226_O=C(/C(SC)=C\C)/C(SC)=C\C':[10.9,10.7],
           '227_O=C(/C=C\C(F)(F)F)C=CC(F)(F)F':[20.1,18.0],
           '228_O=C(/C=C/C(F)(F)F)/C=C/C(F)(F)F':[23.3,21.2],
           '229_O=C(C=C=C)/C=C\C#N':[22.6,19.5],
           '230_O=C(C=C=C)/C=C\F':[28.0,24.6],
           '232_C=C(C)C(C(SC)=C=C)=O':[7.9,7.9],
           '233_C=CC(C(SC)=C=C)=O':[11.2,9.1],
           '234_O=C(C(SC)=C=C)/C=C/C#N':[12.9,10.2],
           '235_O=C(C(SC)=C=C)/C=C\C#N':[15.8,13.9],
           '236_O=C(C(SC)=C=C)/C(C)=C\C#N':[12.6,12.6],
           '237_C=CC(/C(C1=COC=C1)=C/C#N)=O':[13.2,12.7],
           '238_C=C(C1=COC=C1)C(/C=C/C#N)=O':[13.9,12.4],
           '239_C=C(C1=COC=C1)C(/C=C\C#N)=O':[15.6,14.9],
           '240_C=C(SC)C(/C(C)=C\F)=O':[23.0,14.0],
           '241_C=C(F)C(/C(C)=C\SC)=O':[13.8,8.8],
           '242_C=C(C)C(/C(SC)=C\C#N)=O':[12.0,11.8],
           '243_C=C(C)C(/C(C#N)=C\SC)=O':[22.0,17.8],
           '244_C=C(C)C(/C(SC)=C\F)=O':[15.9,15.9],
           '246_C=C(SC)C(/C(C)=C\C#N)=O':[11.6,11.6],
           '247_C=C(C#N)C(/C(C)=C\SC)=O':[22.0,19.0],
           '248_C=C(SC)C(C1=CC=CS1)=O':[19.8,19.8],
           '249_O=C(/C(C)=C\C#N)C1=CC=CS1':[25.8,25.8],
           '250_O=C(/C(C)=C/C#N)C1=CC=CS1':[24.7,24.7],
           '251_O=C(/C(SC)=C\C#N)C1=CC=CS1':[23.8,23.8],
           '252_O=C(/C(SC)=C/C#N)C1=CC=CS1':[21.5,21.5],
           '253_O=C(/C=C\C#N)C1=CC=CO1':[33.2,30.5],
           '254_O=C(/C=C/C#N)C1=CC=CO1':[33.0,29.3],
           '255_O=C(/C=C\F)C1=CC=CN1':[32.4,29.0],
           '256_O=C(/C=C/F)C1=CC=CN1':[24.5,23.9],
           '257_O=C(/C=C\C#N)/C=C\C#N':[31.2,29.4],
           '258_O=C(/C(C)=C(C(F)(F)F)/C(F)(F)F)/C(C)=C(C(F)(F)F)/C(F)(F)F':[19.1,17.3],
           '259_O=C(/C(C(F)(F)F)=C(C(F)(F)F)/C(F)(F)F)/C(C(F)(F)F)=C(C(F)(F)F)/C(F)(F)F':[25.6,19.1],
           '261_C=CC(/C=C/F)=O':[18.7,17.2],
           '262_C=CC(/C(SC)=C/F)=O':[12.3,10.9],
           '263_C=C(SC)C(/C=C/F)=O':[10.4,10.1],
           '264_C=CC(C(SC)=C)=O':[10.4,9.8],
           '265_C=CC(/C(SC)=C/Br)=O':[13.3,11.9],
           '267_C=CC(/C=C/Br)=O':[20.0,18.4],
           '268_C=CC(/C(SC)=C\F)=O':[20.0,18.4],
           '269_C=C(SC)C(/C=C\F)=O':[18.9,17.3],
           '270_C=CC(/C=C\F)=O':[29.1,26.4],
           '271_C=CC(/C(SC)=C\Br)=O':[17.3,15.2],
           '272_C=C(SC)C(/C=C\Br)=O':[18.5,15.2],
           '273_C=CC(/C=C\Br)=O':[27.6,23.8],
           '274_O=C(/C(C1=COC=C1)=C/C#N)/C(C2=COC=C2)=C/C#N':[9.8,9.8],
           '297_O=C(/C=C\C#N)C1=CC=CS1':[29.2,28.5],
           '298_O=C(/C=C/C#N)C1=CC=CS1':[28.6,27.3],
           '299_C=C(C(F)(F)F)C(C(C(F)(F)F)=C)=O':[15.8,15.8],
           '300_C=C(C#N)C(C(C#N)=C)=O':[13.9,13.9],
           '301_C=CC(/C=C/C#N)=O':[20.1,18.0],
           '302_C=CC(/C=C/C)=O':[19.2,17.5],
           '303_O=C(/C(C)=C(C)/C(F)(F)F)/C(C)=C(C)/C(F)(F)F':[15.4,15.4],
           '304_O=C(/C(C)=C\F)/C(C)=C\F':[25.2,25.2],
           '305_O=C(/C(C)=C\Br)/C(C)=C\Br':[23.9,23.4],
           '307_C=C(C)C(C(N1CCOC1=O)=C)=O':[6.9,6.9],
           '309_O=C(/C(F)=C(C)/F)/C(F)=C(C)/F':[17.4,15.5],
           '310_O=C(/C(F)=C(F)/C)/C(F)=C(F)/C':[26.9,26.5],
           '311_O=C(/C(F)=C(F)/F)/C(F)=C(F)/F':[26.7,25.0],
           '312_O=C(/C(Br)=C(C)/Br)/C(Br)=C(C)/Br':[14.0,12.9],
           '313_O=C(/C(Br)=C(Br)/C)/C(Br)=C(Br)/C':[21.1,21.1],
           '316_C=C(Cl)C(C(Cl)=C)=O':[10.4,10.4],
           '317_O=C(/C=C/Cl)/C=C/Cl':[23.0,21.3],
           '318_O=C(/C=C\Cl)/C=C\Cl':[40.2,36.6],
           '319_O=C(/C(Cl)=C(C)/C)/C(Cl)=C(C)/C':[14.7,14.2],
           '320_O=C(/C(C)=C(C)/Cl)/C(C)=C(C)/Cl':[13.9,13.9],
           '321_O=C(/C(C)=C(C)\Cl)/C(C)=C(C)\Cl':[20.3,19.0],
           '322_O=C(/C(C)=C(C)/C)/C(Cl)=C(C)/C':[13.4,13.0],
           '323_C=C(Cl)C(C(C)=C)=O':[10.1,10.1],
           '324_C=CC(/C=C/Cl)=O':[19.7,18.0],
           '326_O=C(/C(C)=C/F)/C(C)=C/F':[12.3,12.3],
           '327_O=C(/C(C)=C/Br)/C(C)=C/Br':[13.3,13.3],
           '328_O=C(/C(C)=C/Cl)/C(C)=C/Cl':[13.1,13.1]
           
          }

### Create Dataset
### Re-key the raw entries: the 'ID_SMILES' label becomes an integer ID, and the SMILES
### string is moved to the front of the value list, giving {ID: [SMILES, value, value]}.
_rekeyed = {}
for _label, _values in dataset.items():
    _parts = _label.split('_')
    _rekeyed[int(_parts[0])] = [_parts[1]] + _values
dataset = _rekeyed

### Keep only the entries that carry at least one value besides the SMILES string.
dataset_clean = dict(filter(lambda _item: len(_item[1]) > 1, dataset.items()))


#### Create various dictionaries for different properties that can be used as atom/node features
#### Every chart maps an element symbol to a value rescaled to [0, 1] by min-max normalisation;
#### the raw minimum/maximum of each chart stay available as <name>min / <name>max.
def _minmax_scale(chart):
    """Return (scaled_chart, lo, hi) where scaled_chart[k] = (chart[k]-lo)/(hi-lo)."""
    lo, hi = min(chart.values()), max(chart.values())
    return {k: (v - lo) / (hi - lo) for k, v in chart.items()}, lo, hi


### Electronegativity per element
enchart, enmin, enmax = _minmax_scale(
    {'C':2.55,'H':2.2,'N':3.04,'O':3.44,'F':3.98,'Si':1.9,'P':2.19,'Cl':3.16,'Br':2.96,'I':2.66,'S':2.58})

### Atomic polarizability per element
polchart, polmin, polmax = _minmax_scale(
    {'C':11.3,'H':4.5,'N':7.4,'O':5.3,'F':3.74,'Si':37.3,'P':25,'Cl':14.6,'Br':21,'I':32.9,'S':19.4})

### Atomic mass per element
### Fluorine was listed as 18.00; its standard atomic weight is 18.998, i.e. 19.00 at the
### two-decimal precision used for the other elements. Only the scaled value for F changes
### (the chart minimum and maximum are H and I).
masschart, massmin, massmax = _minmax_scale(
    {'C':12.01,'H':1.01,'N':14.01,'O':16.00,'F':19.00,'Si':28.09,'P':30.98,'Cl':35.45,'Br':79.90,'I':126.90,'S':32.07})

### Van der Waals and covalent radii are read from RDKit's periodic table for the same elements
_ptable = Chem.GetPeriodicTable()

vdwchart, vdwmin, vdwmax = _minmax_scale(
    {x: _ptable.GetRvdw(_ptable.GetAtomicNumber(x)) for x in masschart})

covchart, covmin, covmax = _minmax_scale(
    {x: _ptable.GetRcovalent(_ptable.GetAtomicNumber(x)) for x in masschart})

### Range of Gasteiger partial charges (cmin, cmax).
### NOTE(review): cmin/cmax are reassigned on every pass, so once the loop finishes they hold the
### range of the LAST molecule in dataset_clean only, not of the whole dataset. Behaviour is left
### untouched here because previously trained models may rely on these exact values - confirm
### the intent before switching to a dataset-wide minimum/maximum.
for k, v in dataset_clean.items():
    tempmol = Chem.MolFromSmiles(v[0])
    AllChem.ComputeGasteigerCharges(tempmol)
    charges = [atom.GetDoubleProp('_GasteigerCharge') for atom in tempmol.GetAtoms()]
    cmin = min(charges)
    cmax = max(charges)

### Range of bond lengths (bmin, bmax), taken from the distance-bounds matrix.
### For a bond between atoms i and j, the entries [i][j] and [j][i] are averaged
### (use average bond length from min and max values).
### NOTE(review): same pattern as above - bmin/bmax end up describing the last molecule only.
for k, v in dataset_clean.items():
    tempmol = Chem.MolFromSmiles(v[0])
    dmatrix = GetMoleculeBoundsMatrix(tempmol)
    bondlens = [
        np.mean([dmatrix[i][j], dmatrix[j][i]])
        for i, j in ((bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()) for bond in tempmol.GetBonds())
    ]
    bmin = min(bondlens)
    bmax = max(bondlens)


## 2.2) Model and Training/Testing

In [None]:
class GraphConv3(torch.nn.Module):
    """Graph network made of three GraphConv layers, a mean-pool read-out and a linear head.

    Args:
        inputdimensions: size of the node feature vector fed to the first layer.
        dim_h: hidden width shared by the three convolutional layers.
    """

    def __init__(self, inputdimensions,dim_h):
        super().__init__()
        # Layers are created in the same order and under the same attribute names as before.
        self.conv1 = GraphConv(inputdimensions, dim_h)
        self.conv2 = GraphConv(dim_h, dim_h)
        self.conv3 = GraphConv(dim_h, dim_h)
        self.lin = Linear(dim_h, 1)

    def forward(self, data):
        """Return the output of the linear head for the batch in ``data``.

        ``data`` must expose ``x`` (node features), ``edge_index`` and ``batch``.
        """
        edges = data.edge_index

        # Two convolution + ReLU stages, then a third convolution without activation.
        hidden = Fun.relu(self.conv1(data.x, edges))
        hidden = Fun.relu(self.conv2(hidden, edges))
        hidden = self.conv3(hidden, edges)

        # Average the node embeddings according to the data.batch assignment.
        pooled = global_mean_pool(hidden, data.batch)

        # Dropout is only active while the module is in training mode.
        pooled = Fun.dropout(pooled, p=0.5, training=self.training)
        return self.lin(pooled)

### Model class and training hyper-parameters passed to the training/testing helpers below
newmodel=GraphConv3
epochs=2000
dimh = 120
batch_size = 32
printresults=100
AddHs=False
### Substructure patterns (double-bond / aromatic-bond variants of the same motif) and the
### neighbour setting that is passed alongside them
pattern = ['C=CC(C=C)=O','C:CC(C:C)=O','C=CC(C:C)=O','C:CC(C=C)=O']
pattneigh = 1
lr=0.0001
weight_decay=5e-4

### Per-fold checkpoint paths.
### NOTE(review): path and pathNoPatt are currently identical, so a run using one will overwrite
### the checkpoints of a run using the other - confirm whether distinct file names were intended.
path = [f"pretrained_GNN_models/GNN_model_fold{n}.pt" for n in range(5)]
pathNoPatt = [f"pretrained_GNN_models/GNN_model_fold{n}.pt" for n in range(5)]

### Make sure the checkpoint directories exist. exist_ok=True replaces the separate
### "exists?" check followed by mkdir, and does not fail if the directory is already there.
for _ckpt_dir in {os.path.dirname(path[0]), os.path.dirname(pathNoPatt[0])}:
    os.makedirs(_ckpt_dir, exist_ok=True)


### bfdic and afdic are useful for experimenting with different feature combinations
### bfdic: bond (edge) feature switches
bfdic={'bondtype':True,
                'conjugated':True,
                'inring':True,
                'stereochemistry':False,
                'bondlength':False}

### afdic: atom (node) feature switches; 'useHs' follows the AddHs setting above
afdic = {'atomtypeonehot':True,
                 'useHs':AddHs,
                 'heavyatomValence':False,
                 'formalcharge':False,
                 'hybridization':False,
                 'ringmembership':True,
                 'aromatic':True,
                 'electronegativity':True,
                 'atompolarizability':False,
                 'gasteiger':True,
                 'mass':False,
                 'vdw':False,
                 'covalent':False,
                 'chirality':False, 
                 'hydrogencount':False}

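# ### Run model using graph of reaction centre and nearest neighbours (Used in manuscript) 
# ### NOTE(review): this description and the "entire molecule" description further down look swapped relative to the calls they sit above - the call directly below passes pattern=None, while the call assigned to kfoldsresults_restrictedReactionCentre passes pattern=pattern. Confirm which configuration was used in the manuscript.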
# kfoldsresults = kfoldsGNN(list(dataset_clean.values()),model=newmodel,afdic=afdic,bfdic=bfdic,k_folds = 5,epochs = epochs,loss = torch.nn.MSELoss(),manseed=0,pattern =None,batch_size=batch_size,dim_h=dimh,printresults=printresults,AddHs=AddHs,neighbours=pattneigh,lr=lr,weight_decay=weight_decay,path=pathNoPatt)

# ### Run model using entire molecule as graph (Not used but can be beneficial for extremely large molecules)
# kfoldsresults_restrictedReactionCentre = kfoldsGNN(list(dataset_clean.values()),model=newmodel,afdic=afdic,bfdic=bfdic,k_folds = 5,epochs = epochs,loss = torch.nn.MSELoss(),manseed=0,pattern =pattern,batch_size=batch_size,dim_h=dimh,printresults=printresults,AddHs=AddHs,neighbours=pattneigh,lr=lr,weight_decay=weight_decay,path=path)




# 3) Testing on New Structures

In [None]:
### New unknown data in the format ['SMILES',BH]:
## Since these will only be tested, the barrier heights can be any number, ex: 1000
testdata = [['C=C(SC)C(C(SC)=C)=O',1000],['C(C(C1=CC=CC=C1)=O)=C',1000]]
###

### Train new models
### (testingpath is defined further down, just before each live call, so the calls below
###  no longer depend on un-commenting this section)
# kfoldsresults = test_new_molecule(list(dataset_clean.values()),testdata,model=newmodel,afdic=afdic,bfdic=bfdic,k_folds = 5,
#                                   epochs = epochs,loss = torch.nn.MSELoss(),manseed=0,pattern =None,batch_size=batch_size,
#                                   dim_h=dimh,printresults=printresults,AddHs=AddHs,neighbours=pattneigh,lr=lr,
#                                   weight_decay=weight_decay,path=testingpath)
# kfoldsresults = test_new_molecule(list(dataset_clean.values()),testdata,model=newmodel,afdic=afdic,bfdic=bfdic,k_folds = 5,
#                                   epochs = epochs,loss = torch.nn.MSELoss(),manseed=0,pattern =pattern,batch_size=batch_size,
#                                   dim_h=dimh,printresults=printresults,AddHs=AddHs,neighbours=pattneigh,lr=lr,
#                                   weight_decay=weight_decay,path=testingpath)



### The best paths of the best models will be printed during training and you can add these to the pretrainpaths list

### --- Models trained with pattern=None ---
### testingpath used to be defined only inside the commented-out training section, which made
### the call below fail with a NameError; it is now defined right before it is used.
testingpath = [f"pretrained_GNN_models/GNN_272trainvalid_fold{n}.pt" for n in range(5)]
pretrainpaths = ['pretrained_GNN_models/GNN_272trainvalid_fold0_1452_20.77485_3.17891.pt',
                 'pretrained_GNN_models/GNN_272trainvalid_fold1_1804_16.99289_3.05668.pt',
                 'pretrained_GNN_models/GNN_272trainvalid_fold2_946_12.59959_2.28863.pt',
                 'pretrained_GNN_models/GNN_272trainvalid_fold3_1832_18.58084_3.07982.pt',
                 'pretrained_GNN_models/GNN_272trainvalid_fold4_1651_19.17843_2.82847.pt']
kfoldsresults = test_new_molecule(list(dataset_clean.values()),testdata,model=newmodel,afdic=afdic,bfdic=bfdic,k_folds = 5,
                                  epochs = epochs,loss = torch.nn.MSELoss(),manseed=0,pattern =None,batch_size=batch_size,
                                  dim_h=dimh,printresults=printresults,AddHs=AddHs,neighbours=pattneigh,lr=lr,
                                  weight_decay=weight_decay,path=testingpath,pretraining=pretrainpaths)
### Keep the pattern=None results reachable; kfoldsresults is reassigned by the next call.
kfoldsresults_noPattern = kfoldsresults

### --- Models trained on the restricted graph (pattern=pattern) ---
testingpath = [f"pretrained_GNN_models/GNNrestrictedgraph_272trainvalid_fold{n}.pt" for n in range(5)]
pretrainpaths = ['pretrained_GNN_models/GNNrestrictedgraph_272trainvalid_fold0_1712_14.8546_2.76991.pt',
                 'pretrained_GNN_models/GNNrestrictedgraph_272trainvalid_fold1_1543_16.17973_3.03558.pt',
                 'pretrained_GNN_models/GNNrestrictedgraph_272trainvalid_fold2_948_9.77825_2.10556.pt',
                 'pretrained_GNN_models/GNNrestrictedgraph_272trainvalid_fold3_1487_19.30715_2.89396.pt',
                 'pretrained_GNN_models/GNNrestrictedgraph_272trainvalid_fold4_1940_16.4872_2.78486.pt']
### pattern=pattern (previously pattern=None): the training recipe for the GNNrestrictedgraph
### checkpoints passes pattern=pattern, so the test graphs are built the same way here.
kfoldsresults = test_new_molecule(list(dataset_clean.values()),testdata,model=newmodel,afdic=afdic,bfdic=bfdic,k_folds = 5,
                                  epochs = epochs,loss = torch.nn.MSELoss(),manseed=0,pattern =pattern,batch_size=batch_size,
                                  dim_h=dimh,printresults=printresults,AddHs=AddHs,neighbours=pattneigh,lr=lr,
                                  weight_decay=weight_decay,path=testingpath,pretraining=pretrainpaths)