In [1]:
from rdkit.Chem import AllChem
from rdkit import DataStructs
import rdkit.Chem as Chem

from pandas import Series,DataFrame
import pandas as pd

import matplotlib.pyplot as plt
from matplotlib.patches import *
from matplotlib.ticker import MultipleLocator, FormatStrFormatter 
from matplotlib.axes._axes import _log as matplotlib_axes_logger
matplotlib_axes_logger.setLevel('ERROR')

import seaborn as sns
import warnings

import numpy as np
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Dense, Input, concatenate
from tensorflow.keras.losses import MeanSquaredError,MSE
from tensorflow.keras.metrics import MeanAbsoluteError
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from tensorflow.keras.models import load_model

import spektral
from spektral.data import *
from spektral.datasets.mnist import MNIST
from spektral.layers import GCNConv,GlobalSumPool,ECCConv

import csv
import time
import umap 
import random
from sklearn.metrics import r2_score
from sklearn import linear_model
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the molecule table used throughout the notebook and display it as the
# cell output so its columns can be inspected.
data = pd.read_csv('data/UMAP data/data.csv')
data

Unnamed: 0,Pubchem ID,SMILES,Rings,label,UMAP1,UMAP2
0,T1,CCn1nc(C(=O)O)c(=O)c2cc3c(cc21)OCO3,3,Test,-1.984654,2.643590
1,T2,Cc1nccn1CC1CCc2c(c3cccc(O)c3n2C)C1=O,4,Test,0.837187,0.481715
2,T3,C/C=C/c1ccc2c(c1)OCO2,2,Test,-3.754383,5.027275
3,T4,CCCCCCCC/C=C\CCCCCCCCCCCCCCCCCCCC(=O)O,0,Test,5.537216,3.269935
4,T5,C=CC(=O)O[C@H](COCCCCCCCCCCCCCCCC)COP(=O)([O-]...,0,Test,6.193696,2.684782
...,...,...,...,...,...,...
3991,T3992,Cc1ncc([N+](=O)[O-])n1CC(O)CCl,1,Train,-2.600688,5.090298
3992,T3993,CCC(C)[C@H](NC(=O)[C@@H](N)C[C@@H](C)O)C(=O)N[...,1,Train,8.672349,1.232099
3993,T3994,Cc1cc2c(cc1S(N)(=O)=O)S(=O)(=O)CCC2,2,Train,-1.083534,3.973907
3994,T3995,COc1ccc(Cc2nccc3cc(OC)c(OC)cc23)cc1OC,3,Train,1.851467,-0.283766


In [3]:
# Restore the saved model from disk. The Spektral layer classes are passed by
# name through `custom_objects` so that Keras can resolve them while
# deserializing the HDF5 file.
ECC_Model = load_model(
    'model/model.h5',
    custom_objects=dict(ECCConv=ECCConv, GlobalSumPool=GlobalSumPool),
)

class MyDataset(Dataset):
    """Graph dataset assembled from parallel, per-molecule sequences.

    Parameters
    ----------
    features : indexable
        Entry ``i`` becomes the ``x`` argument of graph ``i``.
    adj : sized, indexable
        Entry ``i`` becomes the ``a`` argument of graph ``i``. Its length
        determines how many graphs are created.
    edge_features : indexable
        Entry ``i`` becomes the ``e`` argument of graph ``i``.
    ccs : indexable
        Entry ``i`` is converted with ``float`` and becomes the ``y``
        argument of graph ``i``.
    **kwargs
        Forwarded unchanged to ``Dataset.__init__``.
    """

    def __init__(self, features, adj, edge_features, ccs, **kwargs):
        # The attributes are stored before delegating to the base class.
        # NOTE(review): presumably because the base constructor calls
        # read(), which needs them -- confirm against the Spektral docs.
        self.features, self.adj = features, adj
        self.edge_features, self.ccs = edge_features, ccs
        super().__init__(**kwargs)

    def read(self):
        """Return a list with one ``Graph`` per entry of ``self.adj``."""
        graphs = []
        for idx in range(len(self.adj)):
            graphs.append(
                Graph(
                    x=self.features[idx],
                    a=self.adj[idx],
                    e=self.edge_features[idx],
                    y=float(self.ccs[idx]),
                )
            )
        return graphs

In [4]:
# NOTE(review): Constructed_graph_dataset is presumably provided by this
# wildcard import -- confirm, and prefer an explicit import once verified.
from UMAPDataset import *

# Turn the molecule table into the three per-molecule sequences that
# MyDataset consumes.
features, adj, edge_features = Constructed_graph_dataset(data)

# Every graph gets a dummy target of 0; the extraction loop further down
# unpacks the target but never uses it.
ccs = [0] * len(adj)

DataSet = MyDataset(features, adj, edge_features, ccs)
print(DataSet)

Constructs: 100%|███████████████████████████████████▉| 3995/3996 [00:08<00:00, 448.52it/s]

MyDataset(n_graphs=3996)





In [5]:
# Calculate the molecular vector.
#
# A sub-model is cut out of the loaded network: it takes every input of
# ECC_Model except the first one and stops at the layer named
# 'GlobalSumPool', whose output is used as the per-molecule vector.
ECC_model_layer = Model(
    inputs=ECC_Model.input[1:],
    outputs=ECC_Model.get_layer('GlobalSumPool').output,
)

# One graph per batch, a single pass, no shuffling: the collected vectors
# therefore follow the order of DataSet.
loader_te = BatchLoader(DataSet, batch_size=1, epochs=1, shuffle=False)

mols_nodes_f = []
for inputs, target in loader_te:
    # `target` is the dummy label and is intentionally ignored.
    predictions = ECC_model_layer(inputs, training=False)
    # batch_size is 1, so element 0 is the only vector in the batch.
    mols_nodes_f.append(predictions[0].numpy())

np.save("data/UMAP data/data_molvec.npy", mols_nodes_f)

In [7]:
# Reload the molecular vectors from the file written by the extraction cell.
# The previous path ("data/data_molvec.npy") did not match the location that
# cell saves to ("data/UMAP data/data_molvec.npy"), so a fresh
# Restart & Run All would fail or silently read a stale file.
mols_nodes_f = np.load("data/UMAP data/data_molvec.npy")

# Dimensionality reduction of high-dimensional molecular vectors to 2 dimensions using UMAP
# NOTE(review): no random_state is passed, so the embedding is not expected
# to be bit-for-bit reproducible between runs -- consider fixing a seed if
# the saved coordinates must be regenerated exactly.
reducer = umap.UMAP(
    n_neighbors=60,
    min_dist=0.0,
    n_epochs=5000,
    learning_rate=0.01,
    n_components=2,
    set_op_mix_ratio=1,
    metric='euclidean',
)
embedding = reducer.fit_transform(mols_nodes_f)

# Persist the 2-D coordinates next to the other UMAP artefacts.
np.save("data/UMAP data/data-UMAP-EUC-60.npy", embedding)