In [16]:
import torch
import torch.nn as nn
import torch.nn.functional as functional
import sklearn.metrics as metrics
from rdkit import Chem
from rdkit.Chem import rdchem as utils

import numpy as np
from numpy import exp
from numpy.random import normal


In [36]:
# Folder holding the SDF geometry files for reactants (r), transition
# states (ts) and products (p).
base_folder = r'spare_data/'

# Paths for the training split.
train_r_file = base_folder + 'train_reactants.sdf'
train_ts_file = base_folder + 'train_ts.sdf'
train_p_file = base_folder + 'train_products.sdf'

# Paths for the test split.
test_r_file = base_folder + 'test_reactants.sdf'
test_ts_file = base_folder + 'test_ts.sdf'
test_p_file = base_folder + 'test_products.sdf'

# Each supplier is drained into a list straight away so the molecules can be
# indexed and iterated more than once. Hydrogens are kept and sanitization is
# skipped (removeHs=False, sanitize=False).
train_r = [mol for mol in Chem.ForwardSDMolSupplier(train_r_file, removeHs=False, sanitize=False)]
train_ts = [mol for mol in Chem.ForwardSDMolSupplier(train_ts_file, removeHs=False, sanitize=False)]
train_p = [mol for mol in Chem.ForwardSDMolSupplier(train_p_file, removeHs=False, sanitize=False)]

test_r = [mol for mol in Chem.ForwardSDMolSupplier(test_r_file, removeHs=False, sanitize=False)]
test_ts = [mol for mol in Chem.ForwardSDMolSupplier(test_ts_file, removeHs=False, sanitize=False)]
test_p = [mol for mol in Chem.ForwardSDMolSupplier(test_p_file, removeHs=False, sanitize=False)]


In [42]:
# Sanity check for restricting the data to a small subset for testing:
# the slice below takes the first 100 training reactants (not 30).

len(train_r[0:100])

100

In [71]:
# Figuring out padding: find the largest molecule (in atoms) in the training reactants.
# NOTE: this bare expression is not the last statement of the cell, so its value
# is only displayed if the `print` below is removed or it is wrapped in print().
max(mol.GetNumAtoms() for mol in train_r) # = 21, same for ts, p
# min(mol.GetNumAtoms() for mol in train_r) # = 4
# need to get more

# Candidate 100-molecule subsets for quick experiments (currently disabled).
# train_r_small = train_r[0:100]
# train_ts_small = train_ts[0:100]
# train_p_small = train_p[0:100]

print(torch.__version__)
# TODO: do AE (presumably an autoencoder) first,
# then get to grips with PTG (presumably PyTorch Geometric).


21

In [56]:
# Inspect the `z` field of the graph at index 9 (per the dataset notes, atomic numbers).
# NOTE: `train_r_data` is only created in a later cell of this notebook; on a
# fresh kernel that cell must run before this one.
train_r_data[9].z

tensor([6, 7, 6, 7, 7, 6, 6, 1, 1, 1, 1, 1, 1, 1])

In [2]:
from ts_vae.data_processors.grambow_processor import ReactionDataset

# Fields on each graph:
#   x          = atom features
#   edge_attr  = bond types (4 types, one-hot)
#   pos        = (x, y, z) coordinates
#   edge_index = graph connectivity
#   atomic numbers of the atoms in the molecule -- originally noted here as `y`,
#   but the keys listed for train_r_data[0] elsewhere in this notebook show the
#   field as `z` and no `y` (TODO confirm).

base_path = r'data/'
train_r_data = ReactionDataset(base_path, geo_file = 'train_r') 

# Target architecture: Embedding(GNN(MLP))

In [77]:
# Display the dataset repr (the recorded output shows the number of graphs it holds).
train_r_data

ReactionDataset(6739)

In [76]:
import torch
import torch.nn as nn
from torch.nn import ReLU

class MLP(nn.Module):
    """Standard feed-forward multilayer perceptron.

    Architecture: ``num_hidden`` fully connected layers of width ``input_dim``,
    each followed by ``activation``, then one final linear projection to
    ``output_dim`` with no activation.

    Reminders:
        - An MLP with 1 input layer, 1 output layer and no activation is just
          a linear layer.
        - DNN = NN with >1 hidden layer. An MLP with >1 layer is a DNN.
        - MLPs are a subset of DNNs. DNNs can have loops, MLPs are always
          feed-forward.
        - So generally, MLP = FFNN with FC layers and non-linear activation.

    Args:
        input_dim: Size of the last dimension of the input; also the width of
            every hidden layer.
        output_dim: Size of the last dimension of the output.
        activation: Non-linearity applied after each hidden layer. May be a
            module class (e.g. ``nn.ReLU``, the default), a module instance
            (e.g. ``nn.Tanh()``) or a plain callable (e.g. ``torch.relu``).
        num_hidden: Number of hidden (input_dim -> input_dim) layers.
    """

    def __init__(self, input_dim, output_dim, activation=ReLU, num_hidden=2):
        # may need to add batchnorm and dropout here later
        super().__init__()
        # fc_layers = fully connected layers; the comprehension only builds the
        # hidden layers, so the output projection is appended separately.
        fc_layers = [nn.Linear(input_dim, input_dim) for _ in range(num_hidden)]
        fc_layers.append(nn.Linear(input_dim, output_dim))
        # ModuleList (not Sequential) because forward() interleaves the
        # activation between layers by hand.
        self.fc_layers = nn.ModuleList(fc_layers)
        self.num_hidden = num_hidden
        # A module *class* must be instantiated first: calling ``ReLU(y)`` would
        # construct a new module (with ``y`` as its ``inplace`` flag) rather
        # than apply the activation to ``y``.
        self.activation = activation() if isinstance(activation, type) else activation

    def forward(self, x):
        """Apply the hidden layers with activation, then the output projection."""
        y = x
        for hidden_idx in range(self.num_hidden):
            y = self.fc_layers[hidden_idx](y)
            y = self.activation(y)
        # Final projection: no activation so the caller decides what to apply.
        y = self.fc_layers[self.num_hidden](y)
        return y

In [None]:
################

In [80]:
import torch_geometric.nn as pyg_nn

# standard GCN, generalise to taken any convolution
class MoleculeGCN(nn.Module):
    """Graph convolutional network over molecular graphs.

    The network stacks ``1 + hidden_layers`` graph convolutions (ReLU, dropout
    and, between layers, LayerNorm), followed by a two-layer linear head that
    produces log-probabilities over ``output_dim`` classes.

    Args:
        dataset: Dataset the model is built for; stored on ``self.dataset`` but
            not otherwise used by the model.
        input_dim: Number of input node features.
        hidden_dim: Width of every convolution and of the first head layer.
        output_dim: Number of output classes.
        hidden_layers: Number of hidden_dim -> hidden_dim convolutions added
            after the first input_dim -> hidden_dim convolution.
        dropout: Dropout probability used after each convolution and in the head.
        task: ``'node'`` for per-node predictions or ``'graph_classification'``
            for one prediction per graph (nodes are mean-pooled per graph).

    Raises:
        RuntimeError: If ``task`` is not one of the supported values.
    """

    def __init__(self, dataset, input_dim, hidden_dim, output_dim, hidden_layers=2, dropout=0.25, task='node'):
        super().__init__()

        # make sure we are performing a possible task
        if task not in ('node', 'graph_classification'):
            raise RuntimeError('Unknown task.')
        self.task = task

        # atm, doing convs, could generalise to any layer combination after
        self.convs = nn.ModuleList()
        self.convs.append(self.build_conv_model(input_dim, hidden_dim))
        self.layer_norms = nn.ModuleList()
        for _ in range(hidden_layers):
            self.convs.append(self.build_conv_model(hidden_dim, hidden_dim))
            self.layer_norms.append(nn.LayerNorm(hidden_dim))

        # linear layers post message passing
        self.dropout = dropout
        self.num_layers = 1 + hidden_layers
        self.post_mp = nn.Sequential(  # should generalise this for number of hidden_layers
            nn.Linear(hidden_dim, hidden_dim), nn.Dropout(self.dropout),
            nn.Linear(hidden_dim, output_dim)
        )

        # kept for reference only; the model itself does not read it
        self.dataset = dataset

    def build_conv_model(self, input_dim, hidden_dim):
        """Return one graph-convolution layer mapping input_dim -> hidden_dim.

        Override this to swap in a different convolution type.
        """
        return pyg_nn.GCNConv(input_dim, hidden_dim)

    def build_model(self, input_dim, hidden_dim):
        """Backward-compatible alias for :meth:`build_conv_model`."""
        return self.build_conv_model(input_dim, hidden_dim)

    def forward(self, data):
        """Run message passing and the prediction head.

        Args:
            data: Batch object exposing ``x`` (node feature matrix,
                [num_nodes, num_node_features]), ``edge_index`` (sparse
                adjacency list), ``batch`` (graph index of every node),
                ``num_node_features`` and ``num_nodes``.

        Returns:
            Tuple ``(embedding, log_probs)``: the raw output of the last
            convolution (before ReLU/dropout) and the log-softmax predictions.
        """
        # batch: graphs have different numbers of nodes (unlike, say, images),
        # so this records which graph each node belongs to.
        x, edge_index, batch = data.x, data.edge_index, data.batch

        # check: if no features, use constant feature
        if data.num_node_features == 0:
            x = torch.ones(data.num_nodes, 1)

        # execute convolutions over each layer
        embedding = x
        for i in range(self.num_layers):
            x = self.convs[i](x, edge_index)
            embedding = x
            x = nn.functional.relu(x)
            # `training` flag because dropout differs at train time vs test time
            x = nn.functional.dropout(x, p=self.dropout, training=self.training)
            # layer_norms has one entry per hidden layer, i.e. none after the last conv
            if i != self.num_layers - 1:
                x = self.layer_norms[i](x)

        # The head runs once, after ALL convolutions (not inside the loop).
        if self.task == 'graph_classification':
            # pool node representations into one vector per graph
            x = pyg_nn.global_mean_pool(x, batch)
        x = self.post_mp(x)

        return embedding, nn.functional.log_softmax(x, dim=1)

    def loss(self, pred, label):
        """Negative log-likelihood loss for log-softmax predictions."""
        return nn.functional.nll_loss(pred, label)




In [81]:
# Number of target classes the dataset reports (recorded output: 0).
# NOTE(review): presumably 0 because the graphs carry no class labels yet --
# confirm before using the classification-style training defined in this notebook.
train_r_data.num_classes

# loose note: could have classes for each type of reaction, uni vs bimolecular, etc.

0

In [83]:
import torch.optim as optim
from tensorboardX import SummaryWriter

def train(geom_dataset, task, writer):
    """Train a MoleculeGCN on ``geom_dataset`` and log metrics to ``writer``.

    The first 80% of the dataset is used for training and the remaining 20%
    for testing; the split is by position, with no shuffling.

    Args:
        geom_dataset: Sliceable dataset of graphs exposing ``num_node_features``.
        task: Task name forwarded to ``MoleculeGCN`` (``'node'`` or
            ``'graph_classification'``). For ``'node'``, predictions and labels
            are restricted to ``batch.train_mask`` when the batch has one.
        writer: Summary writer providing ``add_scalar(tag, value, step)``.

    Returns:
        The trained model.
    """
    # DataLoader was never imported in this notebook. It lives in
    # torch_geometric.loader in recent PyG releases and in
    # torch_geometric.data in older ones.
    try:
        from torch_geometric.loader import DataLoader
    except ImportError:
        from torch_geometric.data import DataLoader

    num_geometries = len(geom_dataset)
    split_idx = int(num_geometries * 0.8)
    train_loader = DataLoader(geom_dataset[:split_idx], batch_size=20)
    test_loader = DataLoader(geom_dataset[split_idx:], batch_size=20)

    # build model and optimiser
    model = MoleculeGCN(
        dataset=geom_dataset,
        # at least one input feature: the model feeds a constant feature when there are none
        input_dim=max(geom_dataset.num_node_features, 1),
        hidden_dim=5,
        output_dim=2,
        hidden_layers=1,
        task=task,
    )
    opt = optim.Adam(model.parameters(), lr=0.01)

    # train
    for epoch in range(3):
        total_loss = 0
        model.train()
        for batch in train_loader:
            opt.zero_grad()
            embedding, pred = model(batch)
            # NOTE(review): the graph keys shown elsewhere in this notebook are
            # x/edge_index/edge_attr/pos/z/idx -- confirm the dataset really
            # provides `y` labels before relying on this.
            label = batch.y
            if task == 'node':
                # Node masks are optional: the train/test split above is already
                # done per graph, so without a mask every node is used.
                train_mask = getattr(batch, 'train_mask', None)
                if train_mask is not None:
                    pred = pred[train_mask]
                    label = label[train_mask]
            loss = model.loss(pred, label)
            loss.backward()
            opt.step()
            # weight the batch-mean loss by the number of graphs in the batch
            total_loss += loss.item() * batch.num_graphs
        total_loss /= len(train_loader.dataset)
        writer.add_scalar("loss", total_loss, epoch)

        # currently printing for all epochs
        test_acc = test(test_loader, model)
        print("Epoch {}. Loss: {:.4f}. Test accuracy: {:.4f}".format(epoch, total_loss, test_acc))
        writer.add_scalar("Test accuracy", test_acc, epoch)

    return model



In [85]:
def test(loader, model, is_validation=False):
    model.eval()
    correct = 0
    for data in loader:
        with torch.no_grad():
            embedding, pred = model(data)
            pred = pred.argmax(dim=1)
            label = data.y
        if model.task == 'node':
            mask = data.val_mask if is_validation else data.test_mask
            # node classification: only evaluate nodes in test set
            pred = pred[mask]
            label = data.y[mask]
    
    if model.task == 'graph':
        total = len(loader.dataset)
    else:
        total = 0
        for data in loader.dataset:
            total += torch.sum(data.test_mask).item()
        return correct / total

In [97]:
# Launch TensorBoard in the background, reading event files from ./log and
# listening on port 6006 on all interfaces (0.0.0.0).
get_ipython().system_raw(
    'tensorboard --logdir {} --host 0.0.0.0 --port 6006 &'
    .format("./log")
)
# Start an ngrok HTTP tunnel to port 6006 in the background (expects an
# `ngrok` binary in the current working directory).
# NOTE(review): this appears to expose the TensorBoard instance through a
# public URL -- confirm that is intended before running on shared machines.
get_ipython().system_raw('./ngrok http 6006 &')
# Ask the local ngrok API (port 4040) for its tunnels and print the public URL
# of the first one.
!curl -s http://localhost:4040/api/tunnels | python3 -c \
    "import sys, json; print(json.load(sys.stdin)['tunnels'][0]['public_url'])"

In [None]:
# `datetime` was used below without being imported anywhere in the notebook.
from datetime import datetime

from ts_vae.data_processors.grambow_processor import ReactionDataset

# One timestamped log directory per run so runs do not overwrite each other.
writer = SummaryWriter("./log/" + datetime.now().strftime("%Y%m%d-%H%M%S"))

# Load the training reactant graphs and train a node-level model on them.
base_path = r'data/'
train_r_dataset = ReactionDataset(base_path, geo_file = 'train_r')
task = 'node'
model = train(train_r_dataset, task, writer)


In [None]:
###################

In [None]:
import torch
import torch.nn.functional as F

from torch.nn import Linear, ReLU
# from torch_geometric.nn import Sequential

# need to put MLP into this
class StandardGNN(nn.Module):
    """Minimal two-layer network over node features (placeholder for a GNN).

    Maps ``dataset.num_features`` inputs to 6 hidden units (ReLU) and then to
    3 outputs.

    Args:
        dataset: Object exposing ``num_features`` (number of node features).
        input_dim: Currently unused; layer sizes are hard-coded.
        hidden_dim: Currently unused; layer sizes are hard-coded.
        output_dim: Currently unused; layer sizes are hard-coded.

    TODO: need to put MLP into this.
    """

    def __init__(self, dataset, input_dim, hidden_dim, output_dim):
        super().__init__()
        # num_features = node features
        self.l1 = Linear(dataset.num_features, 6)
        self.l2 = Linear(6, 3)

    def forward(self, x):
        """Return the 3-dimensional output for node features ``x``."""
        # Use the functional ReLU: `ReLU(x)` would construct a module instead
        # of applying the activation.
        x = F.relu(self.l1(x))
        # Previously forward() stopped here and returned None, leaving l2 unused.
        return self.l2(x)

In [None]:
import torch_geometric.nn as pyg_nn

class MoleculeEmbedder(nn.Module):
    """Work-in-progress molecule embedder combining an MLP with graph convolutions.

    The class previously defined ``__init__`` twice; Python keeps only the last
    definition, so the earlier draft (``self.gnn = StandardGNN(...)``) was dead
    code and has been removed. The surviving constructor is kept, with its
    signature unchanged.

    Args:
        dataset: Dataset the embedder is built for (currently unused).
        input_dim: Number of input node features.
        hidden_dim: Width of the graph convolutions.
        output_dim: Output size of the MLP and of the post-message-passing head.
        activation: Activation passed through to ``MLP``.
        num_hidden: Number of hidden layers, used both for the MLP and for the
            number of hidden_dim -> hidden_dim convolutions.
        dropout: Dropout probability for the post-message-passing head.

    TODO: decide how the MLP, the convolutions and a possible StandardGNN
    component fit together (target design: Embedding(GNN(MLP))).
    """

    def __init__(self, dataset, input_dim, hidden_dim, output_dim, activation=ReLU, num_hidden=2, dropout=0.25):
        # was `super(MoleculeGCNEmbedder, self).__init()`: wrong class name and
        # a misspelled `__init__`
        super().__init__()

        # pass in input_dim, output_dim, activation, num_hidden
        self.mlp1 = MLP(input_dim, output_dim, activation, num_hidden)

        self.convs = nn.ModuleList()
        self.convs.append(self.build_conv_model(input_dim, hidden_dim))
        self.lns = nn.ModuleList()
        # was `range(hidden_layers)`, a name that does not exist in this
        # signature; `num_hidden` is the only layer-count parameter available
        for _ in range(num_hidden):
            self.convs.append(self.build_conv_model(hidden_dim, hidden_dim))
            self.lns.append(nn.LayerNorm(hidden_dim))

        # linear layers post message passing
        self.dropout = dropout
        self.num_layers = 1 + num_hidden
        self.post_mp = nn.Sequential(  # should generalise this for number of hidden layers
            nn.Linear(hidden_dim, hidden_dim), nn.Dropout(self.dropout),
            nn.Linear(hidden_dim, output_dim)
        )

    def build_conv_model(self, input_dim, hidden_dim):
        """Return one graph-convolution layer mapping input_dim -> hidden_dim."""
        # currently doing simple node prediction
        return pyg_nn.GCNConv(input_dim, hidden_dim)

    def build_model(self, input_dim, hidden_dim):
        """Backward-compatible alias for :meth:`build_conv_model`."""
        return self.build_conv_model(input_dim, hidden_dim)

    def forward(self, data):
        """Unfinished: only unpacks the batch and currently returns None.

        TODO: apply the MLP / convolutions / head and return the embedding.
        """
        x, edge_index, batch = data.x, data.edge_index, data.batch
        # if there are no node features, use a constant feature
        if data.num_node_features == 0:
            x = torch.ones(data.num_nodes, 1)


        



In [None]:
# note: each graph has atom features, edge features (actually just bond type), connectivity, and coordinates. currently, the y=list of atomic numbers present
# may need to define easier dataset for TS prediction with R-P together with y=TS coords
# like Gregor RL paper, place atoms 

# note: can define data class for params e.g. 
# @dataclass
# class GNNParams:
#   input_dim: int
#   output_dim: int 
#   ... (hidden_sizes, dropout, batchnorm, activation) 

# could also have enum for different representations

In [73]:
# Each graph also exposes: num_nodes; num_features/num_node_features; num_edges; num_edge_features.
# List the attribute keys stored on the first graph of the dataset.

train_r_data[0].keys

['x', 'edge_index', 'edge_attr', 'pos', 'z', 'idx']

In [None]:
# notes
# Sketch of an encoder network -- NOT runnable as written:
#   - `latent_space_dim`, `data_dim` and `encoder` are not defined anywhere in
#     the visible notebook;
#   - the literal `...` inside nn.Sequential is a placeholder for layers that
#     still need to be written.
# The encoder emits two values per latent dimension (presumably mean and
# log-variance of a VAE posterior -- TODO confirm).
num_latent_params = 2 * latent_space_dim
network = nn.Sequential(nn.Linear(data_dim, 300), ..., nn.Linear(400, num_latent_params))
encoder(network)