In [6]:
# atom features, bond type, graph connectivity, (x,y,z) coordinates  
#   - when we encode the graph, we're doing it through atom features, bond types, and connectivitity (i.e. which atoms are connected to each other and how?)
#   - the coordinate-based representation is particularly useful 
#   - for reaction centre, find adjacency matrix differences then map to 3D matrix

# convert MLP to GNN by swapping torch.nn.linear with PyG's GNN operators e.g. GCN layer

In [1]:
import torch
import torch.nn as nn
from torch_geometric.nn import GCNConv, GAE
from torch_geometric.utils import train_test_split_edges

#import sys
#sys.path.insert(0, "Users/rmhavij/3d-reactions/") # azure again
from ts_vae.data_processors.grambow_processor import ReactionDataset

In [4]:
# normal:
base_path = r'data/'
# azure base_path = r'Users/rmhavij/3d-reactions/data/'
r_dataset = ReactionDataset(base_path, geo_file = 'train_r') 

data = r_dataset.data
data.train_mask = data.val_mask = data.test_mask = data.y = None
data = train_test_split_edges(data = data, val_ratio = 0, test_ratio = 0.2)

In [5]:
class LinearEncoder(nn.Module):
    def __init__(self, in_channels, out_channels):
        super(LinearEncoder, self).__init__()
        # use single GC to get embeddings for nodes here
        self.conv = GCNConv(in_channels, out_channels)
    
    def forward(self, x, edge_index):
        # no relu for linearity
        return self.conv(x, edge_index)
    


In [6]:
num_node_fs = r_dataset.data.num_node_features # = 11
out_channels = 2

# build model and optimiser
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GAE(LinearEncoder(num_node_fs, out_channels))
model = model.to(device)
x = data.x.to(device)
train_pos_edge_index = data.train_pos_edge_index.to(device)
opt = torch.optim.Adam(model.parameters(), lr=0.01)


In [7]:
def train():
    model.train() # sets training flag and params (doesn't actually train model!)
    opt.zero_grad()
    z = model.encode(x, train_pos_edge_index)
    loss = model.recon_loss(z, train_pos_edge_index)
    loss.backward()
    opt.step()
    return float(loss)

def test(pos_edge_index, neg_edge_index):
    model.eval()
    with torch.no_grad():
        z = model.encode(x, train_pos_edge_index)
    return model.test(z, pos_edge_index, neg_edge_index)

epochs = 10
for epoch in range(1, epochs + 1):
    loss = train()
    auc, ap = test(data.test_pos_edge_index, data.test_neg_edge_index)
    print('Epoch: {:03d}, AUC: {:.4f}, AP: {:.4f}'.format(epoch, auc, ap))

Epoch: 001, AUC: 0.7729, AP: 0.6303
Epoch: 002, AUC: 0.7727, AP: 0.6303
Epoch: 003, AUC: 0.7730, AP: 0.6306
Epoch: 004, AUC: 0.7732, AP: 0.6309
Epoch: 005, AUC: 0.7728, AP: 0.6305
Epoch: 006, AUC: 0.7729, AP: 0.6306
Epoch: 007, AUC: 0.7735, AP: 0.6312
Epoch: 008, AUC: 0.7737, AP: 0.6315
Epoch: 009, AUC: 0.7743, AP: 0.6321
Epoch: 010, AUC: 0.7747, AP: 0.6327


In [20]:
# build models and optimiser
# base_path = r'Users/rmhavij/3d-reactions/data/'
base_path = r'data/'
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
val_ratio = 0
test_ratio = 0.2 # TODO: combine train and test .sdf files for each geom then split (as works better with PyG)
r_latent_dim = p_latent_dim = ts_latent_dim = 2 # fine for now. may have to include more later.

# reactant data
r_dataset = ReactionDataset(base_path, geo_file = 'train_r') 
r_data = r_dataset.data
r_data.train_mask = r_data.val_mask = r_data.test_mask = r_data.y = None
r_data = train_test_split_edges(data = r_data, val_ratio = val_ratio, test_ratio = test_ratio)
r_x = r_data.x.to(device)
r_pos_edge_index = r_data.train_pos_edge_index.to(device)
# reactant encoder
r_num_node_fs = r_data.num_node_features # = 11
r_encoder = GAE(LinearEncoder(r_num_node_fs, r_latent_dim))
r_encoder = r_encoder.to(device)
r_opt = torch.optim.Adam(r_encoder.parameters(), lr = 0.01)

# product data
p_dataset = ReactionDataset(base_path, geo_file = 'train_p') 
p_data = p_dataset.data
p_data.train_mask = p_data.val_mask = p_data.test_mask = p_data.y = None
p_data = train_test_split_edges(data = p_data, val_ratio = val_ratio, test_ratio = test_ratio)
p_x = p_data.x.to(device)
p_pos_edge_index = p_data.train_pos_edge_index.to(device)
# product encoder
p_num_node_fs = r_data.num_node_features # = 11
p_encoder = GAE(LinearEncoder(p_num_node_fs, p_latent_dim))
p_encoder = p_encoder.to(device)
p_opt = torch.optim.Adam(p_encoder.parameters(), lr = 0.01)

# ts data
ts_dataset = ReactionDataset(base_path, geo_file = 'train_ts') 
ts_data = ts_dataset.data
ts_data.train_mask = ts_data.val_mask = ts_data.test_mask = ts_data.y = None
ts_data = train_test_split_edges(data = ts_data, val_ratio = val_ratio, test_ratio = test_ratio)
ts_x = ts_data.x.to(device)
ts_pos_edge_index = ts_data.train_pos_edge_index.to(device)
# ts decoder
ts_num_node_fs = ts_data.num_node_features # = 11
ts_decoder = TSDecoder(ts_latent_dim, ts_num_node_fs)
ts_decoder = ts_decoder.to(device)
ts_opt = torch.optim.Adam(ts_encoder.parameters(), lr = 0.01)

# note: I have GAEs for LinearEncoder here. not for ts decoder.

TypeError: __init__() missing 1 required positional argument: 'ts_data'

In [34]:
class TSDecoder(torch.nn.Module):
    """ Take TS embedding (i.e. combined R-P embedding) and decode to TS geometry. """
    # ref: right now, just using InnerProductDecoder

    def __init__(self, latent_dim, geometry_dim):
        super(TSDecoder, self).__init__()
        self.ts_data = ts_data

    def forward(self, ts_z, ts_edge_index, sigmoid=True):
        """ Decode TS embedding into edge probabilities for the given node-pairs of TS edge_index. """
        value = (ts_z[ts_edge_index[0]] * ts_z[ts_edge_index[1]]).sum(dim=1)
        return torch.sigmoid(value) if sigmoid else value

    def forward_all(self, z, sigmoid=True):
        """ Decode latent embeddings into probabilistic adjacency matrix. """
        adj = torch.matmul(z, z.t())
        return torch.sigmoid(adj) if sigmoid else adj

class MolEncoder(nn.Module):
    """ Takes in geometry data and creates embedding. Used for reactants OR products, not both. """
    # based off LinearEncoder

    def __init__(self, in_channels, out_channels, geometry_data):
        super(MolEncoder, self).__init__()
        self.geometry_data = geometry_data
        # single GC to get embeddings for nodes here
        self.conv = GCNConv(in_channels, out_channels)
    
    def forward(self, x, edge_index):
        # no relu for linearity
        return self.conv(x, edge_index)

    # TODO: try these funcs; may need to pass in other args later; add these to decoder and TSVAE too
    def save(self, path):
        """ Save model to the path specified. """
        torch.save(self.state_dict(), path)
    
    def load(self, path):
        """ Load model from path specified. """
        model_weights = torch.load(path)
        self.load_state_dict(model_weights)


In [27]:
from sklearn.metrics import roc_auc_score, average_precision_score
from torch_geometric.utils import (negative_sampling, remove_self_loops, add_self_loops)

EPS = 1e-15

class TSGAE(nn.Module):
    """ Takes in reactant encoder, product encoder, transition state decoder.
        Creates embeddings for reactants and products. 
        Combines these reactant and product embeddings to create a transition state.
        TODO: do I pass in data here?
    """

    def __init__(self, r_encoder, p_encoder, ts_decoder, r_data, p_data, ts_data):
        super(TSGAE, self).__init__()
        self.r_encoder = r_encoder
        self.p_encoder = p_encoder
        self.ts_decoder = ts_decoder
        self.r_data = r_data
        self.p_data = p_data
        self.ts_data = ts_data
        TSGAE.reset_parameters(self)

    def reset_parameters(self):
        self.reset_model()
        self.reset_data()

    def reset_model(self):
        reset(self.r_encoder)
        reset(self.p_encoder)
        reset(self.ts_decoder)

    def reset_data(self):
        reset(self.r_data)
        reset(self.p_data)
        reset(self.ts_data)

    def encode_reactant(self):
        return self.r_encoder()

    def encode_product(self):
        return self.p_encoder()

    def combine_reactant_and_product(self, r_z, p_z):
        """ Encode reactant and product, then combine their embeddings to get TS embedding.
            Each encoder produces an embedding for each node of the input molecule.
            TODO: different combination methods (e.g. concat z vectors so have dim=2d, multiply?) 
            TODO: confused here about whether I return encoder or z from encoder?
            TODO: take in data?
        """
        ### first go
        # encode reactant and product
        #r_z = self.r_encoder(*args, **kwargs)
        #p_z = self.p_encoder(*args, **kwargs)
        # trying first with simpler linear combination
        #ts_z = r_z + p_z
        
        ### second go
        ts_z = r_z + p_z
        return ts_z
        
    def decode(self, generated_ts_latent, ts_data):
        """ Runs the TS decoder to decode to TS (as probabilistic adjacency matrix) and computes edge probabilities. 
            TODO: decode to actual TS geometry. Can start with NL-WLS from MIT, then add in coordinate features, etc.
        """
        return self.ts_decoder(generated_ts_latent, ts_data)

    def ts_construction_loss(self, ts_z, ts_pos_edge_index, ts_neg_edge_index = None):
        """ Compute BCE for positive edges and, optionally, for negative sampled edges. 
            If negative edges not given, uses negative sampling to calculate.
            TODO: more specific loss func?
        """
        pos_loss = - torch.log(self.ts_decoder(ts_z, ts_pos_edge_index, sigmoid = True) + EPS).mean()
        
        # don't include self-loops in neg samples
        ts_pos_edge_index, _ = remove_self_loops(ts_pos_edge_index)
        ts_pos_edge_index, _ = add_self_loops(ts_pos_edge_index)
        if ts_neg_edge_index is None:
            ts_neg_edge_index = negative_sampling(ts_pos_edge_index, ts_z.size(0))
        neg_loss = - torch.log(1 - self.ts_decoder(ts_z, ts_neg_edge_index, sigmoid = True) + EPS).mean()

        return pos_loss + neg_loss

    def test(self, ts_z, ts_pos_edge_index, ts_neg_edge_index):
        """ Compute ROC-AUC and average precision (AP) scores. 
            TODO: what is y here?
        """
        pos_y = ts_z.new_ones(ts_pos_edge_index.size(1))
        neg_y = ts_z.new_zeros(ts_neg_edge_index.size(1))
        y = torch.cat([pos_y, neg_y], dim = 0)

        pos_pred = self.ts_decoder(ts_z, ts_pos_edge_index, sigmoid = True)
        neg_pred = self.ts_decoder(ts_z, ts_neg_edge_index, sigmoid = True)
        pred = torch.cat([pos_pred, neg_pred], dim = 0)

        y, pred = y.detach().cpu().numpy(), pred.detach().cpu().numpy()

        return roc_auc_score(y, pred), average_precision_score(y, pred)

def reset(nn):
    def _reset(item):
        if hasattr(item, 'reset_parameters'):
            item.reset_parameters()

    if nn is not None:
        if hasattr(nn, 'children') and len(list(nn.children())) > 0:
            for item in nn.children():
                _reset(item)
        else:
            _reset(nn)

In [32]:
# TODO: should I just have to encoders rather than GAEs for reactant and product here?
#       am I trying to learn the latent space for each and combine or combine them in a way to create TS?

# build models and optimiser
# base_path = r'Users/rmhavij/3d-reactions/data/'
base_path = r'data/'
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
val_ratio = 0
test_ratio = 0.2 # TODO: combine train and test .sdf files for each geom then split (as works better with PyG)
r_latent_dim = p_latent_dim = ts_latent_dim = 2 # fine for now. may have to include more later.

# reactant data
r_dataset = ReactionDataset(base_path, geo_file = 'train_r') 
r_data = r_dataset.data
r_data.train_mask = r_data.val_mask = r_data.test_mask = r_data.y = None
r_data = train_test_split_edges(data = r_data, val_ratio = val_ratio, test_ratio = test_ratio)
r_x = r_data.x.to(device)
r_train_pos_edge_index = r_data.train_pos_edge_index.to(device)
# reactant encoder
r_num_node_fs = r_data.num_node_features # = 11
r_encoder = GAE(LinearEncoder(r_num_node_fs, r_latent_dim))
r_opt = torch.optim.Adam(r_encoder.parameters(), lr = 0.01)

# product data
p_dataset = ReactionDataset(base_path, geo_file = 'train_p') 
p_data = p_dataset.data
p_data.train_mask = p_data.val_mask = p_data.test_mask = p_data.y = None
p_data = train_test_split_edges(data = p_data, val_ratio = val_ratio, test_ratio = test_ratio)
p_x = p_data.x.to(device)
p_train_pos_edge_index = p_data.train_pos_edge_index.to(device)
# product encoder
p_num_node_fs = r_data.num_node_features # = 11
p_encoder = GAE(LinearEncoder(p_num_node_fs, p_latent_dim))
p_opt = torch.optim.Adam(p_encoder.parameters(), lr = 0.01)

# ts data
ts_dataset = ReactionDataset(base_path, geo_file = 'train_ts') 
ts_data = ts_dataset.data
ts_data.train_mask = ts_data.val_mask = ts_data.test_mask = ts_data.y = None
ts_data = train_test_split_edges(data = ts_data, val_ratio = val_ratio, test_ratio = test_ratio)
ts_x = ts_data.x.to(device)
ts_train_pos_edge_index = ts_data.train_pos_edge_index.to(device)
# ts decoder (no opt)
ts_num_node_fs = ts_data.num_node_features # = 11
ts_decoder = TSDecoder(ts_latent_dim, ts_num_node_fs)

# ts gae
ts_gae = TSGAE(r_encoder = r_encoder, p_encoder = p_encoder, ts_decoder = ts_decoder,
                r_data = r_data, p_data = p_data, ts_data = ts_data)
ts_gae = ts_gae.to(device)
gae_opt = torch.optim.Adam(model.parameters(), lr=0.01)
    

In [30]:
def train_individual_encoder(encoder, opt, train_pos_edge_index, data):
    """ Use this for reactant or product encoder training. """
    encoder.train()
    opt.zero_grad()
    z = encoder.encode(data, train_pos_edge_index)
    loss = encoder.recon_loss(z, train_pos_edge_index)
    loss.backward()
    opt.step()
    return z, float(loss)

def train_ts_gae(ts_gae, gae_opt, ts_train_pos_edge_index):
    """ Train TSGAE.
        Training this model trains the individual R and P encoders.
    """
    ts_gae.train() 
    gae_opt.zero_grad()
    ts_z = ts_gae.combine_reactant_and_product()
    gae_loss = ts_gae.ts_construction_loss(ts_z, ts_train_pos_edge_index)
    gae_loss.backward()
    gae_opt.step()
    # r_z, r_loss = train_individual_encoder(r_encoder, r_opt, r_pos_edge_index, r_data)
    # p_z, p_loss = train_individual_encoder(p_encoder, p_opt, p_pos_edge_index, p_data)
    return float(gae_loss)

In [33]:
def test_ts_gae(ts_gae, ts_train_pos_edge_index, ts_test_pos_edge_index, ts_test_neg_edge_index):
    ts_gae.eval()
    with torch.no_grad():
        ts_z = ts_gae.combine_reactant_and_product(x, ts_train_pos_edge_index)
    return ts_gae.test(ts_z, ts_test_pos_edge_index, ts_test_neg_edge_index)

epochs = 10
for epoch in range(1, epochs + 1):
    loss = train_ts_gae(ts_gae, gae_opt, ts_train_pos_edge_index)
    auc, ap = test(ts_data.test_pos_edge_index, ts_data.test_neg_edge_index)
    print('Epoch: {:03d}, AUC: {:.4f}, AP: {:.4f}'.format(epoch, auc, ap))

NotImplementedError: 

In [None]:
def train_individual_geometry(geometry_encoder, opt, pos_edge_indices):
    # use this on reactant or product
    # TODO: where is this meant to be???
    geometry_encoder.train()
    opt.zero_grad()
    z = geometry_encoder.encode(x, pos_edge_indices)
    loss = geometry_encoder.recon_loss(z, pos_edge_indices)
    loss.backward()
    opt.step()
    return z, float(loss)

def train_reaction(ts_gae, ts_opt, ts_pos_edge_index):
    """ Train reactant and product together then decode to TS. """
    ts_gae.train() # training this model should train the individual encoders
    ts_opt.zero_grad()
    ts_embedding = ts_gae.combine_reactant_and_product() # atm, pass in r and p data to this func
    ts_loss = ts_gae.recon_loss(ts_embedding, ts_pos_edge_index)
    ts_loss.backward()
    ts_opt.step()
    return float(loss)

def test_reaction(ts_gae, ts_test_pos_edges, ts_test_neg_edges):
    ts_gae.eval()
    with torch.no_grad():
        ts_embedding = model.encode(ts_data, ts_train_pos_edges)
    return ts_gae.test(ts_embedding, ts_test_pos_edges, ts_test_neg_edges)

    return

In [None]:
# lucky's work
# PairFeatures: a manual MP I think. it has to be otherwise what he's doing isn't a GNN at all.

# set edges
#   iterate:
#       compute features (i.e. MP) -> MLP(features) -> update edges
#       compute features (i.e. MP) -> MLP(MLP(edges)) -> update vertices