In [1]:
""" This is graph autoencoder.
The autoencoder is trained taking as input many graphs.
The learned task of the autoencoder is to reconstruct a given molecule. """

import pandas as pd

# Read in the data
drug_data=pd.read_csv("BBBP.csv")

#sum all values equal to 1 in p_np column of drug_data
total_unitary_pnp = sum(drug_data.p_np==1)
print ("Total number of unitary p_np values: ", total_unitary_pnp)

#sum all values equal to 0 in p_np column of drug_data
total_zero_pnp = sum(drug_data.p_np==0)
print ("Total number of zero p_np values: ", total_zero_pnp)

# Print the total number of p_np values
print ("Total number of p_np values: ", len(drug_data.p_np))

# Calculate the percentage of unitary p_np values
print ("Percentage of unitary p_np values: ", round(total_unitary_pnp*100/len(drug_data.p_np),2), "%")

#calculate the percentage of zero p_np values
print ("Percentage of zero p_np values: ", round (total_zero_pnp*100/len(drug_data.p_np),2), "%")



Total number of unitary p_np values:  1567
Total number of zero p_np values:  483
Total number of p_np values:  2050
Percentage of unitary p_np values:  76.44 %
Percentage of zero p_np values:  23.56 %


In [2]:
from rdkit import Chem
from rdkit import RDLogger
from utils import *
from matplotlib import colors
from rdkit.Chem.Draw import MolToImage

# check the data frame
drug_data.head()

#extract smiles from the data frame and check the first 10 smiles
smiles = drug_data['smiles']
smiles.head()

#convert smiles to mols and disable warnings
RDLogger.DisableLog('rdApp.*')
mols = [Chem.MolFromSmiles(s) for s in smiles]
mols
print (type(mols[0]))

#resizing the drug_data to 
end_of_array = 20
smiles=smiles[:end_of_array]
print(smiles)

# Create a graph representation of the first molecule
for i in range(0, end_of_array):
    name = drug_data['name'][i]
    img = get_image(mols[i],None , name)

IndentationError: unindent does not match any outer indentation level (utils.py, line 55)

In [None]:
from torch_geometric.data import DataLoader
import networkx as nx
from torch_geometric.nn import GAE
from torch_geometric.utils import train_test_split_edges

# Load the smiles and create the graph representation
smiles = drug_data['smiles'].to_list()
# Resize the array
smiles = smiles[:end_of_array]

# Load the labels we don't want to use them for training the autoencoder
labels = drug_data['p_np'].to_list()
# Resize the array
labels = labels[:end_of_array]


# Create a list of PyTorch Geometric Data objects
data_list = create_pytorch_geometric_graph_data_list_from_smiles_and_labels(smiles, labels)
print ("First object in the data_list: " + str(data_list[0].x))

First object in the data_list: tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [1., 0., 0.,  ..., 1., 0., 0.],
        [1., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [1., 0., 0.,  ..., 0., 0., 0.],
        [1., 0., 0.,  ..., 0., 0., 0.],
        [1., 0., 0.,  ..., 0., 0., 0.]])


In [None]:
# Split the data into training and test sets
import random
random.shuffle(data_list)
train = data_list[:int(len(data_list)*0.8)] #train set
test = data_list[int(len(data_list)*0.2):] #val set

In [None]:
from torch_geometric.loader import DataLoader
from torch_geometric.utils import scatter

print ("Number of graphs in the training set: " + str(len(train)))
print ("Number of graphs in the test set: " + str(len(test)))
print ("Number of nodes in the first graph in the training set: " + str(train[0].x.shape[0]))
loader = DataLoader(train, batch_size=32, shuffle=True)
# batch = next(iter(loader))
# print(batch)
for data in loader:
    print(data)
    print(data.num_graphs)
    x = scatter(data.x, data.batch, dim=0, reduce='mean')
    print(x.size())



Number of graphs in the training set: 16
Number of graphs in the test set: 16
Number of nodes in the first graph in the training set: 49
DataBatch(x=[430, 79], edge_index=[2, 926], edge_attr=[926, 10], y=[16], batch=[430], ptr=[17])
16
torch.Size([16, 79])


In [3]:
from torch_geometric.loader import DataLoader

# Define the model encoder
class GCNEncoder(torch.nn.Module):
    def __init__(self, in_channels, out_channels):
        super(GCNEncoder, self).__init__()
        self.conv1 = GCNConv(in_channels, 2 * out_channels, cached=True) # cached only for transductive learning
        self.conv2 = GCNConv(2 * out_channels, out_channels, cached=True) # cached only for transductive learning

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        return self.conv2(x, edge_index)
    
# parameters
out_channels = 2
num_features = loader.ba   .num_features
epochs = 100

# model
model = GAE(GCNEncoder(num_features, out_channels))

# move to GPU (if available)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
x = loader.x.to(device)
train_pos_edge_index = loader.train_pos_edge_index.to(device)

# inizialize the optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

NameError: name 'torch' is not defined

In [None]:

    
# class GAE(nn.Module):
#     def __init__(self, in_channels, out_channels):
#         super(GAE, self).__init__()
#         self.conv1 = GCNConv(in_channels, 2*out_channels)
#         self.conv2 = GCNConv(2*out_channels, out_channels)
#         self.conv3 = GCNConv(out_channels, in_channels)
        
#     def forward(self, data):
#         x, edge_index = data.x, data.edge_index
        
#         x = self.conv1(x, edge_index)
#         x = torch.relu(x)
#         x = self.conv2(x, edge_index)
#         x = torch.relu(x)
#         x = self.conv3(x, edge_index)
#         return x

In [None]:
from torch.nn import Sequential as Seq, Linear, ReLU, CrossEntropyLoss
# parameters
out_channels = 2
num_features = data_list[0].x.shape[1] #input to the encoder
print("num_features (used as input): " + str(num_features))
print("out channels: " + str(out_channels))
epochs = 100
model = GAE(GCNEncoder(num_features, out_channels))
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') #use CUDA if available
model = model.to(device) #create network and send to the device memory
print(model.parameters())
optimizer = torch.optim.Adam(model.parameters(), lr=0.001) #use Adam optimizer
CSE = CrossEntropyLoss() #define loss


num_features (used as input): 79
out channels: 2
<generator object Module.parameters at 0x29250f530>


In [None]:
#train model
from tqdm import tqdm
model.train() #set model to training mode
for epoch in range(2): #run for epochs of training
    sum_loss = 0 #used to compute average loss in an epoch
    num_correct = 0
    random.shuffle(train) #shuffle the training data each epoch
    for d in tqdm(train): #go over each training point
        data = d.to(device) #send data to device
        print("data: " + str(data))
        optimizer.zero_grad() #zero gradients
        out = model(d) #evaluate data point
        # if torch.argmax(out) == torch.argmax(data.y): #if prediction is correct, increment counter for accuracy calculation
        #     num_correct += 1
        print("out: " + str(out))
        loss = CSE(torch.reshape(out, [1, 3]), torch.reshape(torch.argmax(data.y),[1])) #compute mean squared error loss
        sum_loss += float(loss) #add loss value to aggregate loss
        loss.backward() #compute gradients
        optimizer.step() #apply optimization
    print('Epoch: {:03d}, Average loss: {:.5f}, Accuracy: {:.5f}'.format(epoch, sum_loss/len(train), num_correct/len(train)))

  0%|          | 0/16 [00:00<?, ?it/s]

data: Data(x=[3, 79], edge_index=[2, 4], edge_attr=[4, 10], y=[1])





NotImplementedError: Module [GAE] is missing the required "forward" function