In [45]:
from pysmiles import read_smiles
import networkx as nx
from spektral.data import Dataset, DisjointLoader, Graph
import numpy as np
import random
import pandas as pd

In [44]:
pp = pd.read_csv('../data/test.csv')
pp = pp[["name", "p_np", "smile"]]
pp.head()

Unnamed: 0,name,p_np,smile
0,Propanolol,1,[Cl].CC(C)NCC(O)COc1cccc2ccccc12
1,Terbutylchlorambucil,1,C(=O)(OC(C)(C)C)CCCc1ccc(cc1)N(CCCl)CCCl
2,40730,1,c12c3c(N4CCN(C)CC4)c(F)cc1c(c(C(O)=O)cn2C(C)CO...
3,24,1,C1CCN(CC1)Cc1cccc(c1)OCCCNC(=O)C
4,cloxacillin,1,Cc1onc(c2ccccc2Cl)c1C(=O)N[C@H]3[C@H]4SC(C)(C)...


In [95]:
import ase

dtypes = np.dtype([('symbol', str), ('atomic mass', float)])
data = np.empty(0, dtype=dtypes)

elements = pd.DataFrame(data)
elements['symbol'] = ase.data.chemical_symbols
elements['atomic mass'] = ase.data.atomic_masses
elements.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119 entries, 0 to 118
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   symbol       119 non-null    object 
 1   atomic mass  119 non-null    float64
dtypes: float64(1), object(1)
memory usage: 2.0+ KB


In [122]:
tempList = []

##this takes forever i'm sorry

for index, row in pp.iterrows():
    smiles = row["smile"]
    mol = read_smiles(smiles)
    nx.convert_node_labels_to_integers(mol)
    nodevec = list(mol.nodes(data='element'))

    nodevecList = []
    for i in nodevec: 
        symbol = i[1]
        atomicNum = elements.query(f'symbol == "{symbol}"').index[0]
        mass = elements.query(f'symbol == "{symbol}"')['atomic mass'].values[0]
        nodevecList.append([atomicNum, mass])
    nodeFeatures = np.array(nodevecList)
    adj = nx.to_pandas_adjacency(mol).to_numpy().astype(int)
  
    g = Graph(a = adj, x = nodeFeatures, y = int(row["p_np"]))
    tempList.append(g)


Atom "[C@H]" contains stereochemical information that will be discarded.
Atom "[C@H]" contains stereochemical information that will be discarded.
Atom "[C@@H]" contains stereochemical information that will be discarded.
Atom "[C@@H]" contains stereochemical information that will be discarded.
Atom "[C@H]" contains stereochemical information that will be discarded.
Atom "[C@H]" contains stereochemical information that will be discarded.
Atom "[C@H]" contains stereochemical information that will be discarded.
Atom "[C@@H]" contains stereochemical information that will be discarded.
Atom "[C@H]" contains stereochemical information that will be discarded.
Atom "[C@@]" contains stereochemical information that will be discarded.
Atom "[C@]" contains stereochemical information that will be discarded.
E/Z stereochemical information, which is specified by "\", will be discarded
E/Z stereochemical information, which is specified by "/", will be discarded
Atom "[C@@H]" contains stereochemical inf

In [124]:
pp["graph"] = tempList

graphs = pp["graph"]
graphs


0       Graph(n_nodes=20, n_node_features=2, n_edge_fe...
1       Graph(n_nodes=23, n_node_features=2, n_edge_fe...
2       Graph(n_nodes=26, n_node_features=2, n_edge_fe...
3       Graph(n_nodes=21, n_node_features=2, n_edge_fe...
4       Graph(n_nodes=29, n_node_features=2, n_edge_fe...
                              ...                        
2032    Graph(n_nodes=17, n_node_features=2, n_edge_fe...
2033    Graph(n_nodes=27, n_node_features=2, n_edge_fe...
2034    Graph(n_nodes=24, n_node_features=2, n_edge_fe...
2035    Graph(n_nodes=28, n_node_features=2, n_edge_fe...
2036    Graph(n_nodes=21, n_node_features=2, n_edge_fe...
Name: graph, Length: 2037, dtype: object

In [125]:
class MyDataset(Dataset):
    """
    load our data
    """
    def __init__(self, graphs, **kwargs):
        self.graphs = graphs
        super().__init__(**kwargs)
    

        
    def read(self):
        # We must return a list of Graph objects
        return self.graphs


In [126]:
dataset = MyDataset(graphs = graphs)

In [127]:
dataset.filter(lambda g: g.n_nodes < 45)

In [128]:
max_degree = dataset.map(lambda g: g.a.sum(-1).max(), reduce=max)
max_degree

4

In [129]:
from spektral.transforms import Degree

dataset.apply(Degree(max_degree))

In [130]:
from spektral.transforms import GCNFilter

dataset.apply(GCNFilter())

In [131]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Dropout
from spektral.layers import GCNConv, GlobalSumPool

class MyFirstGNN(Model):

    def __init__(self, n_hidden, n_labels):
        super().__init__()
        self.graph_conv = GCNConv(n_hidden)
        self.pool = GlobalSumPool()
        self.dropout = Dropout(0.5)
        self.dense1 = Dense(n_hidden, 'relu')
        self.dense2 = Dense(n_hidden, 'relu')
        self.dense = Dense(n_labels, 'sigmoid')

    def call(self, inputs):
        out = self.graph_conv(inputs)
        out = self.dropout(out)
        out = self.pool(out)
        out = self.dense1(out)
        out = self.dropout(out)
        out = self.dense2(out)
        out = self.dropout(out)
        out = self.dense(out)

        return out


In [132]:
from keras.optimizers import Adam

model = MyFirstGNN(1000, dataset.n_labels)
opt = Adam(learning_rate=.025)
model.compile(optimizer=opt, loss='binary_crossentropy', metrics=['accuracy'])

In [133]:
from spektral.data import BatchLoader

#np.random.shuffle(dataset)
split = int(0.8 * len(dataset))
data_tr, data_te = dataset[:split], dataset[split:]

loader = BatchLoader(data_tr, batch_size=32)

In [134]:
model.fit(loader.load(), steps_per_epoch=loader.steps_per_epoch, epochs=30)

Epoch 1/30


  np.random.shuffle(a)


Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.src.callbacks.History at 0x164b9cfa0>

In [135]:
loader = BatchLoader(data_te, batch_size=32)
loss = model.evaluate(loader.load(), steps=loader.steps_per_epoch)

print('Test loss: {}'.format(loss))


Test loss: [0.32085925340652466, 1.0]


In [208]:
pred = model.predict(loader.load(), steps=loader.steps_per_epoch).reshape(-1)
for i in pred:
    if str(i) != '0.52067524':
        print(i)



2023-12-04 14:40:38.817722: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int32
	 [[{{node Placeholder/_0}}]]


