In [1]:
import numpy as np
import tensorflow as tf

from tensorflow.keras import Model
from tensorflow.keras.layers import Dense, Input, concatenate, BatchNormalization
from tensorflow.keras.losses import MeanSquaredError,MSE
from tensorflow.keras.metrics import MeanAbsoluteError
from tensorflow.keras.optimizers import Adam,RMSprop
from tensorflow.keras.regularizers import l2
from tensorflow.keras.models import load_model

from spektral.data import MixedLoader,Dataset,DisjointLoader,Graph,BatchLoader
from spektral.datasets.mnist import MNIST
from spektral.layers import GCNConv,GlobalSumPool,ECCConv,CrystalConv,GlobalMaxPool
from spektral.layers import GlobalAvgPool,GlobalAttnSumPool,GlobalAttentionPool
from spektral.layers.ops import sp_matrix_to_sp_tensor
from spektral.data import Graph
from spektral.data import Dataset

import rdkit.Chem as Chem
from rdkit.Chem import AllChem
import csv

import warnings
warnings.filterwarnings("ignore")

In [2]:
Index, X_smiles, M_adducts, CCS = [],[],[],[]
f = csv.reader(open('data/Attribute importance data/data.csv','r', encoding='gbk',errors='ignore'))

for i in f:
    Index.append(i[0])
    X_smiles.append(i[1])
    M_adducts.append(i[2])
    CCS.append(float(i[3]))

GCN_smiles, GCN_adducts, GCN_Index, GCN_CCS = [],[],[],[]
for i in range(len(X_smiles)):
    try:
        GCN_smiles.append(X_smiles[i])
        GCN_adducts.append(M_adducts[i])
        GCN_Index.append(Index[i])
        GCN_CCS.append(CCS[i])
    except:
        ;

smiles, ccs, adduct, Coordinate = [], [], [], []
for i in range(len(GCN_Index)):
    MOL = Chem.MolFromSmiles(GCN_smiles[i])
    atoms = [atom.GetSymbol() for atom in MOL.GetAtoms()]
    one_atom = []
    f = csv.reader(open('data/Attribute importance data/Coordinate data/'+str(GCN_Index[i])+'.csv','r'))
    files = [i for i in f]
    for j in range(len(atoms)):
        one_atom.append([float(iii) for iii in files[j+1][2:]])
    Coordinate.append(one_atom)
    smiles.append([GCN_smiles[i]])
    ccs.append([GCN_CCS[i]])
    adduct.append(GCN_adducts[i])
###########################################################################################
Max_Coor =  15.615155868453662 
Min_Coor = -15.475082312818216
for i in range(len(Coordinate)):
    Coordinate[i] = (np.array((Coordinate[i])) - Min_Coor) / (Max_Coor - Min_Coor)

Atom_radius = {'N' :71, 'Se':116, 'F':64, 'Co':111, 'O':63,'As':121,
               'Br':114,'Cl':99,  'S':103,'C' :75, 'P':111, 'I':133,'H':32}
Atom_radius_list = [Atom_radius[i] for i in Atom_radius]
Max_radius, Min_radius = np.max(Atom_radius_list), np.min(Atom_radius_list)
for i in Atom_radius:
    Atom_radius[i] = (Atom_radius[i] - Min_radius) / (Max_radius-Min_radius)
    
Atom_mass = {'N':14.00674,'Se':78.96,'F':18.9984032,'Co':58.933195,'As':74.92160,
             'O':15.9994,'Br':79.904,'Cl':35.453,'S':32.065,'C':12.0107,
             'P':30.973762,'I':126.90447,'H':1.00794}
Atom_mass_list = [Atom_mass[i] for i in Atom_mass]

Max_mass, Min_mass = np.max(Atom_mass_list), np.min(Atom_mass_list)
for i in Atom_mass:
    Atom_mass[i] = (Atom_mass[i] - Min_mass) / (Max_mass-Min_mass)

All_Atoms = ['As', 'Br', 'C', 'Cl', 'F', 'I', 'N', 'O', 'P', 'S', 'Se']
###########################################################################################

def convertToGraph(smi_lst):
    adj,adj_norm, features, edge_features = [], [], [], []
    maxNumAtoms = 50 
    NodeNumFeatures, EdgeNumFeatures, INDEX = 0, 4, -1
    for smi in smi_lst:
        INDEX += 1
        iMol = Chem.MolFromSmiles(smi[0]) 
        maxNumAtoms = iMol.GetNumAtoms() 
        iAdjTmp = Chem.rdmolops.GetAdjacencyMatrix(iMol)
        
        one_edge_features = edge_feature(iMol)
        edge_features.append(one_edge_features)
        
        iFeature = np.zeros((maxNumAtoms, NodeNumFeatures))
        iFeatureTmp = []
        for atom in iMol.GetAtoms():
            iFeatureTmp.append(atom_feature(atom,INDEX))
        features.append(np.array(iFeatureTmp))
        adj.append(iAdjTmp)
            
    features = np.asarray(features)
    edge_features = np.asarray(edge_features)
    return adj, features, edge_features

def atom_feature(atom,INDEX):
    return np.array(
        one_of_k_encoding_unk(atom.GetSymbol() ,All_Atoms) +
        one_of_k_encoding_unk(atom.GetDegree(), [0, 1, 2, 3, 4]) +
        [Atom_radius[atom.GetSymbol()],Atom_mass[atom.GetSymbol()]] +
        one_of_k_encoding_unk(atom.IsInRing(), [0, 1]) +
        list(Coordinate[INDEX][atom.GetIdx()])
    )

def one_of_k_encoding_unk(x, allowable_set):
    if x not in allowable_set:
        x = allowable_set[-1]
    return list(map(lambda s: x == s, allowable_set))

def edge_feature(iMol):
    # 获得分子的邻接矩阵
    iAdjTmp = Chem.rdmolops.GetAdjacencyMatrix(iMol)
    Edge_feature = []
    count = 0
    for bond in iMol.GetBonds():
        count += 1
        bond_feature = np.array(
            one_of_k_encoding_unk(bond.GetBondTypeAsDouble(),[1,1.5,2,3])
        )
        Edge_feature.append(bond_feature)
        Edge_feature.append(bond_feature)
    Edge_feature = np.array(Edge_feature)
    Edge_feature = Edge_feature.astype(np.float)
    return Edge_feature

adj, features, edge_features = convertToGraph(smiles)

In [3]:
class MyDataset(Dataset):
    def __init__(self, features, adj, edge_features, ccs, **kwargs):
        self.features = features
        self.adj = adj
        self.edge_features = edge_features
        self.ccs = ccs
        super().__init__(**kwargs)
        
    def read(self):
        return [Graph(x = self.features[i], 
                      a = self.adj[i], 
                      e = self.edge_features[i],
                      y = float(self.ccs[i][0])) for i in range(len(self.adj))]
    
DataSet = MyDataset(features, adj, edge_features, ccs)
print(DataSet)
adduct_SET = ['[M+H]+', '[M+Na]+', '[M-H]-']
adduct_SET.sort()
print(adduct_SET)

dataset_te = DataSet
adduct_te  = adduct

MyDataset(n_graphs=10)
['[M+H]+', '[M+Na]+', '[M-H]-']


In [4]:
import spektral
import umap
ECC_model = load_model('model/model.h5',
                       custom_objects = {"ECCConv": spektral.layers.ECCConv,
                                         "GlobalSumPool": spektral.layers.GlobalSumPool})

In [5]:
# 2 : Coordinates
# 3 : Elemental symbols
# 4 : Degree
# 5 : Atomic radius
# 6 : Atomic volume
# 7 : Is on the ring

def convertToGraph_2(smi_lst, SWITCH):
    adj,adj_norm, features, edge_features = [], [], [], []
    NodeNumFeatures, EdgeNumFeatures, INDEX = 0, 4, -1
    for smi in smi_lst:
        INDEX += 1
        iMol = Chem.MolFromSmiles(smi[0]) # Convert Smiles strings to mol objects    
        maxNumAtoms = iMol.GetNumAtoms()
        iAdjTmp = Chem.rdmolops.GetAdjacencyMatrix(iMol) # Obtain the adjacency matrix of mol
        # Characterization of structural chemical bonds
        one_edge_features = edge_feature(iMol)
        edge_features.append(one_edge_features)
        # Constructing node feature data
        iFeature = np.zeros((maxNumAtoms, NodeNumFeatures))
        iFeatureTmp = []
        for atom in iMol.GetAtoms():
            if SWITCH ==   1:
                iFeatureTmp.append(atom_feature_2(atom,INDEX))
            elif SWITCH == 2:
                iFeatureTmp.append(atom_feature_3(atom,INDEX))
            elif SWITCH == 3:
                iFeatureTmp.append(atom_feature_4(atom,INDEX))
            elif SWITCH == 4:
                iFeatureTmp.append(atom_feature_5(atom,INDEX))
            elif SWITCH == 5:
                iFeatureTmp.append(atom_feature_6(atom,INDEX))
            elif SWITCH == 6:
                iFeatureTmp.append(atom_feature_7(atom,INDEX))
        features.append(np.array(iFeatureTmp))
        adj.append(iAdjTmp)
            
    features = np.asarray(features)
    edge_features = np.asarray(edge_features)
    return adj, features, edge_features

'''Coordinates'''
def atom_feature_2(atom,INDEX):
    return np.array(
        one_of_k_encoding_unk(atom.GetSymbol() ,All_Atoms) +
        one_of_k_encoding_unk(atom.GetDegree(), [0, 1, 2, 3, 4]) +
        [Atom_radius[atom.GetSymbol()],Atom_mass[atom.GetSymbol()]] +
        one_of_k_encoding_unk(atom.IsInRing(), [0, 1]) +
        [0,0,0]
    )

'''Elemental symbols'''
def atom_feature_3(atom,INDEX):
    return np.array(
        [0,0,0,0,0,0,0,0,0,0,0] + 
        one_of_k_encoding_unk(atom.GetDegree(), [0, 1, 2, 3, 4]) +
        [Atom_radius[atom.GetSymbol()],Atom_mass[atom.GetSymbol()]] +
        one_of_k_encoding_unk(atom.IsInRing(), [0, 1]) +
        list(Coordinate[INDEX][atom.GetIdx()])
    )
'''Degree'''
def atom_feature_4(atom,INDEX):
    return np.array(
        one_of_k_encoding_unk(atom.GetSymbol() ,All_Atoms) +
        [0,0,0,0,0] + 
        [Atom_radius[atom.GetSymbol()],Atom_mass[atom.GetSymbol()]] +
        one_of_k_encoding_unk(atom.IsInRing(), [0, 1]) +
        list(Coordinate[INDEX][atom.GetIdx()])
    )
'''Atomic radius'''
def atom_feature_5(atom,INDEX):
    return np.array(
        one_of_k_encoding_unk(atom.GetSymbol() ,All_Atoms) +
        one_of_k_encoding_unk(atom.GetDegree(), [0, 1, 2, 3, 4]) +
        [0,Atom_mass[atom.GetSymbol()]] +
        one_of_k_encoding_unk(atom.IsInRing(), [0, 1]) +
        list(Coordinate[INDEX][atom.GetIdx()])
    )
'''Atomic volume'''
def atom_feature_6(atom,INDEX):
    return np.array(
        one_of_k_encoding_unk(atom.GetSymbol() ,All_Atoms) +
        one_of_k_encoding_unk(atom.GetDegree(), [0, 1, 2, 3, 4]) +
        [Atom_radius[atom.GetSymbol()],0] +
        one_of_k_encoding_unk(atom.IsInRing(), [0, 1]) +
        list(Coordinate[INDEX][atom.GetIdx()])
    )
'''Is on the ring'''
def atom_feature_7(atom,INDEX):
    return np.array(
        one_of_k_encoding_unk(atom.GetSymbol() ,All_Atoms) +
        one_of_k_encoding_unk(atom.GetDegree(), [0, 1, 2, 3, 4]) +
        [Atom_radius[atom.GetSymbol()],Atom_mass[atom.GetSymbol()]] +
        list([0,0]) +
        list(Coordinate[INDEX][atom.GetIdx()])
    )

In [6]:
adj_2, features_2, edge_features_2 = convertToGraph_2(smiles, 1)
adj_3, features_3, edge_features_3 = convertToGraph_2(smiles, 2)
adj_4, features_4, edge_features_4 = convertToGraph_2(smiles, 3)
adj_5, features_5, edge_features_5 = convertToGraph_2(smiles, 4)
adj_6, features_6, edge_features_6 = convertToGraph_2(smiles, 5)
adj_7, features_7, edge_features_7 = convertToGraph_2(smiles, 6)

DataSet_2 = MyDataset(features_2, adj_2, edge_features_2, ccs)
DataSet_3 = MyDataset(features_3, adj_3, edge_features_3, ccs)
DataSet_4 = MyDataset(features_4, adj_4, edge_features_4, ccs)
DataSet_5 = MyDataset(features_5, adj_5, edge_features_5, ccs)
DataSet_6 = MyDataset(features_6, adj_6, edge_features_6, ccs)
DataSet_7 = MyDataset(features_7, adj_7, edge_features_7, ccs)

dataset_te_2 = DataSet_2
dataset_te_3 = DataSet_3
dataset_te_4 = DataSet_4
dataset_te_5 = DataSet_5
dataset_te_6 = DataSet_6
dataset_te_7 = DataSet_7

In [7]:
def Fun(dataset_te,X):
    loader_te = BatchLoader(dataset_te,batch_size=1,epochs=1,shuffle=False); loader_te_data = (); ltd_index = 0;
    for i in loader_te.load():
        adduct_one_hot = [one_of_k_encoding_unk(adduct_te[ltd_index+ltd_index_i],adduct_SET) for ltd_index_i in range(len(i[1]))]
        adduct_one_hot = np.array(adduct_one_hot)
        one_sample = ((adduct_one_hot,i[0][0],i[0][1],i[0][2]),i[1])
        loader_te_data += (one_sample,)
        ltd_index += len(i[1])
    loader_te_data = (i for i in loader_te_data)
    for batch in loader_te_data:
        inputs, target = batch
        predictions = ECC_model(inputs, training=False)
        predictions = np.array(predictions[0])
        X.append(predictions[0])
    return np.array(X)

In [8]:
Target = np.array([i[0] for i in ccs])
A = Fun(dataset_te,[])
B = Fun(dataset_te_2,[])
C = Fun(dataset_te_3,[])
D = Fun(dataset_te_4,[])
E = Fun(dataset_te_5,[])
F = Fun(dataset_te_6,[])
G = Fun(dataset_te_7,[])

In [9]:
A2 = abs(Target-A)/Target*100.
B2 = abs(Target-B)/Target*100.
C2 = abs(Target-C)/Target*100.
D2 = abs(Target-D)/Target*100.
E2 = abs(Target-E)/Target*100.
F2 = abs(Target-F)/Target*100.
G2 = abs(Target-G)/Target*100.

In [10]:
from sklearn.metrics import r2_score
R2_Score = r2_score(A,Target)
print(R2_Score,np.median(A2),'\n')

ALL_ARE = [np.mean(B2)-np.mean(A2),
           np.mean(C2)-np.mean(A2),
           np.mean(D2)-np.mean(A2),
           np.mean(E2)-np.mean(A2),
           np.mean(F2)-np.mean(A2),
           np.mean(G2)-np.mean(A2)]

ALL_ARE_P = [i/np.sum(ALL_ARE) for i in ALL_ARE]
print(ALL_ARE_P)
print(ALL_ARE)

0.9966394827944038 0.4945458276011555 

[0.2286734655417619, 0.2120133110483818, 0.17139201078671926, 0.08007338174632164, 0.011328149511868692, 0.29651968136494666]
[7.629376953609119, 7.073533719090296, 5.718259676657221, 2.6715387019022674, 0.3779481917944938, 9.892973012577597]
