In [1]:
from pysmiles import read_smiles
import networkx as nx
from spektral.data import Dataset, DisjointLoader, Graph
import numpy as np
import random
import pandas as pd

2023-12-04 18:36:55.403111: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
pp = pd.read_csv('../data/test.csv')
pp = pp[["name", "p_np", "smile"]]
pp.head()

Unnamed: 0,name,p_np,smile
0,Propanolol,1,[Cl].CC(C)NCC(O)COc1cccc2ccccc12
1,Terbutylchlorambucil,1,C(=O)(OC(C)(C)C)CCCc1ccc(cc1)N(CCCl)CCCl
2,40730,1,c12c3c(N4CCN(C)CC4)c(F)cc1c(c(C(O)=O)cn2C(C)CO...
3,24,1,C1CCN(CC1)Cc1cccc(c1)OCCCNC(=O)C
4,cloxacillin,1,Cc1onc(c2ccccc2Cl)c1C(=O)N[C@H]3[C@H]4SC(C)(C)...


In [3]:
import ase

dtypes = np.dtype([('symbol', str), ('atomic mass', float)])
data = np.empty(0, dtype=dtypes)

elements = pd.DataFrame(data)
elements['symbol'] = ase.data.chemical_symbols
elements['atomic mass'] = ase.data.atomic_masses
elements.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119 entries, 0 to 118
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   symbol       119 non-null    object 
 1   atomic mass  119 non-null    float64
dtypes: float64(1), object(1)
memory usage: 2.0+ KB


In [4]:
tempList = []

##this takes forever i'm sorry

for index, row in pp.iterrows():
    smiles = row["smile"]
    mol = read_smiles(smiles)
    nx.convert_node_labels_to_integers(mol)
    nodevec = list(mol.nodes(data='element'))

    nodevecList = []
    for i in nodevec: 
        symbol = i[1]
        atomicNum = elements.query(f'symbol == "{symbol}"').index[0]
        mass = elements.query(f'symbol == "{symbol}"')['atomic mass'].values[0]
        nodevecList.append([atomicNum, mass])
    nodeFeatures = np.array(nodevecList)
    adj = nx.to_pandas_adjacency(mol).to_numpy().astype(int)
  
    g = Graph(a = adj, x = nodeFeatures, y = int(row["p_np"]))
    tempList.append(g)


Atom "[C@H]" contains stereochemical information that will be discarded.
Atom "[C@H]" contains stereochemical information that will be discarded.
Atom "[C@@H]" contains stereochemical information that will be discarded.
Atom "[C@@H]" contains stereochemical information that will be discarded.
Atom "[C@H]" contains stereochemical information that will be discarded.
Atom "[C@H]" contains stereochemical information that will be discarded.
Atom "[C@H]" contains stereochemical information that will be discarded.
Atom "[C@@H]" contains stereochemical information that will be discarded.
Atom "[C@H]" contains stereochemical information that will be discarded.
Atom "[C@@]" contains stereochemical information that will be discarded.
Atom "[C@]" contains stereochemical information that will be discarded.
E/Z stereochemical information, which is specified by "\", will be discarded
E/Z stereochemical information, which is specified by "/", will be discarded
Atom "[C@@H]" contains stereochemical inf

In [5]:
pp["graph"] = tempList

graphs = pp["graph"]
graphs


0       Graph(n_nodes=20, n_node_features=2, n_edge_fe...
1       Graph(n_nodes=23, n_node_features=2, n_edge_fe...
2       Graph(n_nodes=26, n_node_features=2, n_edge_fe...
3       Graph(n_nodes=21, n_node_features=2, n_edge_fe...
4       Graph(n_nodes=29, n_node_features=2, n_edge_fe...
                              ...                        
2032    Graph(n_nodes=17, n_node_features=2, n_edge_fe...
2033    Graph(n_nodes=27, n_node_features=2, n_edge_fe...
2034    Graph(n_nodes=24, n_node_features=2, n_edge_fe...
2035    Graph(n_nodes=28, n_node_features=2, n_edge_fe...
2036    Graph(n_nodes=21, n_node_features=2, n_edge_fe...
Name: graph, Length: 2037, dtype: object

In [6]:
class MyDataset(Dataset):
    """
    load our data
    """
    def __init__(self, graphs, **kwargs):
        self.graphs = graphs
        super().__init__(**kwargs)
    

        
    def read(self):
        # We must return a list of Graph objects
        return self.graphs


In [7]:
dataset = MyDataset(graphs = graphs)

In [8]:
dataset.filter(lambda g: g.n_nodes < 45)

In [9]:
max_degree = dataset.map(lambda g: g.a.sum(-1).max(), reduce=max)
max_degree

4

In [10]:
from spektral.transforms import Degree

dataset.apply(Degree(max_degree))

In [11]:
from spektral.transforms import GCNFilter

dataset.apply(GCNFilter())

In [52]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Dropout
from spektral.layers import GCNConv, GlobalSumPool

class MyFirstGNN(Model):

    def __init__(self, n_hidden, n_labels):
        super().__init__()
        self.graph_conv = GCNConv(n_hidden)
        self.pool = GlobalSumPool()
        self.dropout = Dropout(0.5)
        self.dense1 = Dense(n_hidden, 'relu')
        self.dense2 = Dense(n_hidden, 'relu')
        self.dense = Dense(n_labels, 'sigmoid')

    def call(self, inputs):
        out = self.graph_conv(inputs)
        out = self.dropout(out)
        out = self.pool(out)
        #out = self.dense1(out)
        #out = self.dropout(out)
        #out = self.dense2(out)
        #out = self.dropout(out)
        out = self.dense(out)

        return out


In [53]:
from keras.optimizers import Adam

model = MyFirstGNN(500, dataset.n_labels)
opt = Adam(learning_rate=.025)
model.compile(optimizer=opt, loss='binary_crossentropy', metrics=['accuracy'])

In [54]:
from spektral.data import BatchLoader

np.random.shuffle(dataset)
split = int(0.8 * len(dataset))
data_tr, data_te = dataset[:split], dataset[split:]

loader = BatchLoader(data_tr, batch_size=32)

  np.random.shuffle(dataset)


In [55]:
model.fit(loader.load(), steps_per_epoch=loader.steps_per_epoch, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x15f05af40>

In [56]:
loader = BatchLoader(data_te, batch_size=32)
loss = model.evaluate(loader.load(), steps=loader.steps_per_epoch)

print('Test loss: {}'.format(loss))


Test loss: [0.5247650742530823, 0.7690355181694031]


In [57]:
pred = model.predict(loader.load(), steps=loader.steps_per_epoch).reshape(-1)
for i in pred:
    if str(i) != '0.52067524':
        print(i)



0.9476737
0.9362106
0.9502916
0.86188287
0.9424036
0.9269051
0.9126001
0.94481075
0.9418304
0.90890723
0.9425839
0.7823354
0.37321132
0.9168576
0.89482445
0.9393724
0.8924009
0.60507804
0.95314664
0.93871385
0.9536186
0.9388702
0.9129683
0.9565379
0.9154721
0.73671913
0.89699656
0.9663193
0.93435663
0.87479234
0.9329652
0.70954275
0.948064
0.8822198
0.9373057
0.9194401
0.8717949
0.8654113
0.91394264
0.89581996
0.6642432
0.7501613
0.9527953
0.8536152
0.8169333
0.97004837
0.8789388
0.8782235
0.9440743
0.8551624
0.8760422
0.9526779
0.87534326
0.9057485
0.5002635
0.8042237
0.9592382
0.9600373
0.9076673
0.92234486
0.96830994
0.94171053
0.938805
0.8430559
0.8173743
0.9652982
0.9117282
0.9013201
0.9530813
0.9263965
0.8891565
0.86864865
0.8440865
0.9578165
0.93345517
0.8191735
0.92009556
0.8690109
0.9417677
0.95778614
0.9361764
0.954689
0.9122313
0.9194422
0.9447523
0.97257614
0.91267306
0.93222344
0.9013201
0.84115744
0.8270615
0.7497828
0.2912554
0.890495
0.96730435
0.91316503
0.9449443
0.96

In [45]:
from spektral.models import GNNExplainer

tempGraph = loader.dataset[0]

explain = GNNExplainer(model)
explain.explain_node(tempGraph.x, tempGraph.a)

n_hops was automatically inferred to be 4


TypeError: Input must be a SparseTensor.