In [1]:
from pysmiles import read_smiles
import networkx as nx
import rdkit
from rdkit.Chem import MolFromSmiles
from rdkit import Chem
from spektral.data import Dataset, DisjointLoader, Graph
import numpy as np
import random
import pandas as pd

2023-12-04 22:39:14.526772: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
pp = pd.read_csv('../data/split.csv', sep = ";")
pp = pp[["p_np", "smile"]]
pp.head()

Unnamed: 0,p_np,smile
0,0,CC(C)(Oc1ccc(Cl)cc1)C(O)=O
1,1,Cn1c2CCC(Cn3ccnc3C)C(=O)c2c4ccccc14
2,1,C1=CC=CC3=C1C(C2=CC=CC=C2)(SC3(C)C)CCCNC
3,0,CC(=O)OCC1=C(N2[C@H](SC1)[C@H](NC(=O)CSc3ccncc...
4,1,CN(C)CCCN1c2ccccc2Sc3ccc(cc13)C(F)(F)F


In [3]:
permitted_list = ['C','N','O','S','F','Si','P','Cl','Br','Mg','Na','Ca','Fe', 'I']

def one_hot_encoding(x, permitted_list):
    """
    Maps input elements x which are not in the permitted list to the last element
    of the permitted list.
    """
    if x not in permitted_list:
        x = permitted_list[-1]
    binary_encoding = [int(boolean_value) for boolean_value in list(map(lambda s: x == s, permitted_list))]
    return binary_encoding
    
def getAtomFeatures(atom):
     # compute atom features
    
    atom_type_enc = one_hot_encoding(str(atom.GetSymbol()), permitted_list)
    
    n_heavy_neighbors_enc = one_hot_encoding(int(atom.GetDegree()), [0, 1, 2, 3, 4, "MoreThanFour"])
    
    formal_charge_enc = one_hot_encoding(int(atom.GetFormalCharge()), [-3, -2, -1, 0, 1, 2, 3, "Extreme"])
    
    hybridisation_type_enc = one_hot_encoding(str(atom.GetHybridization()), ["S", "SP", "SP2", "SP3", "SP3D", "SP3D2", "OTHER"])
    
    is_in_a_ring_enc = [int(atom.IsInRing())]
    
    is_aromatic_enc = [int(atom.GetIsAromatic())]
    
    atomic_mass_scaled = [float((atom.GetMass() - 10.812)/116.092)]
    
    vdw_radius_scaled = [float((Chem.GetPeriodicTable().GetRvdw(atom.GetAtomicNum()) - 1.5)/0.6)]
    
    covalent_radius_scaled = [float((Chem.GetPeriodicTable().GetRcovalent(atom.GetAtomicNum()) - 0.64)/0.76)]

    atom_features = np.concatenate([atom_type_enc, n_heavy_neighbors_enc, formal_charge_enc, hybridisation_type_enc, is_in_a_ring_enc, is_aromatic_enc, atomic_mass_scaled, vdw_radius_scaled, covalent_radius_scaled])
    return atom_features

def getNodeFeatures(nodevec):
    return np.array([getAtomFeatures(atom) for atom in nodevec.GetAtoms()])
        
    

In [4]:
tempList = []

##this takes forever i'm sorry

for index, row in pp.iterrows():
    smile = row["smile"]
    mol = MolFromSmiles(smile)
    #nx.convert_node_labels_to_integers(mol)
    #nodevec = list(mol.nodes(data='element'))
    molFeatures = getNodeFeatures(mol)
    adj = rdkit.Chem.rdmolops.GetAdjacencyMatrix(mol)
    
    g = Graph(a = adj, x = molFeatures, y = int(row["p_np"]))
    tempList.append(g)




In [5]:
pp["graph"] = tempList

graphs = pp["graph"]
graphs


0      Graph(n_nodes=14, n_node_features=40, n_edge_f...
1      Graph(n_nodes=22, n_node_features=40, n_edge_f...
2      Graph(n_nodes=22, n_node_features=40, n_edge_f...
3      Graph(n_nodes=28, n_node_features=40, n_edge_f...
4      Graph(n_nodes=24, n_node_features=40, n_edge_f...
                             ...                        
847    Graph(n_nodes=20, n_node_features=40, n_edge_f...
848    Graph(n_nodes=15, n_node_features=40, n_edge_f...
849    Graph(n_nodes=36, n_node_features=40, n_edge_f...
850    Graph(n_nodes=20, n_node_features=40, n_edge_f...
851    Graph(n_nodes=33, n_node_features=40, n_edge_f...
Name: graph, Length: 852, dtype: object

In [6]:
class MyDataset(Dataset):
    """
    load our data
    """
    def __init__(self, graphs, **kwargs):
        self.graphs = graphs
        super().__init__(**kwargs)
    

        
    def read(self):
        # We must return a list of Graph objects
        return self.graphs


In [7]:
dataset = MyDataset(graphs = graphs)

In [8]:
dataset.filter(lambda g: g.n_nodes < 45)

In [9]:
max_degree = dataset.map(lambda g: g.a.sum(-1).max(), reduce=max)
max_degree

4

In [10]:
from spektral.transforms import Degree

dataset.apply(Degree(max_degree))

In [11]:
from spektral.transforms import GCNFilter

dataset.apply(GCNFilter())

In [12]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Dropout
from spektral.layers import GCNConv, GlobalSumPool

class MyFirstGNN(Model):

    def __init__(self, n_hidden, n_labels):
        super().__init__()
        self.graph_conv = GCNConv(n_hidden)
        self.pool = GlobalSumPool()
        self.dropout = Dropout(0.3)
        self.dense1 = Dense(n_hidden, 'sigmoid')
        self.dense2 = Dense(n_hidden/2, 'sigmoid')
        self.dense = Dense(n_labels, 'sigmoid')

    def call(self, inputs):
        out = self.graph_conv(inputs)
        out = self.dropout(out)
        out = self.pool(out)
        out = self.dropout(out)
        out = self.dense1(out)
        out = self.dropout(out)
        out = self.dense2(out)
        out = self.dropout(out)
        out = self.dense(out)

        return out


In [13]:
from keras.optimizers import Adam

model = MyFirstGNN(100, dataset.n_labels)
opt = Adam(learning_rate=5e-5)
model.compile(optimizer=opt, loss='binary_crossentropy', metrics=['accuracy'])

In [14]:
from spektral.data import BatchLoader

np.random.shuffle(dataset)
split = int(0.8 * len(dataset))
data_tr, data_te = dataset[:split], dataset[split:]

loader = BatchLoader(data_tr)

  np.random.shuffle(dataset)


In [15]:
model.fit(loader.load(), steps_per_epoch=loader.steps_per_epoch, epochs=300)

  np.random.shuffle(a)
2023-12-04 22:39:19.974897: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int32
	 [[{{node Placeholder/_0}}]]


Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300
Epoch 57/300
Epoch 58/300
Epoch 59/300
Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoch 65/300
Epoch 66/300
Epoch 67/300
Epoch 68/300
Epoch 69/300
Epoch 70/300
Epoch 71/300
Epoch 72/300
Epoch 73/300
Epoch 74/300
Epoch 75/300
Epoch 76/300
Epoch 77/300
Epoch 78

<keras.callbacks.History at 0x7f7f43a3ce50>

In [17]:
loader2 = BatchLoader(data_te)
loss = model.evaluate(loader2.load(), steps=loader2.steps_per_epoch)


#print('Test loss: {}'.format(loss))



2023-12-04 22:48:05.563222: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int32
	 [[{{node Placeholder/_0}}]]


