Go over notes and build up from simpler models

1. R AE 
2. R-P AE 
3. R encoder and TS decoder 
4. R-P encoder, TS decoder

TODO:
- Have an visualise function where you can plot embeddings e.g. umap, pca, tsne and interpolate between rs and ps

In [1]:
import torch
import torch.nn as nn
from torch_geometric.nn import GCNConv, GAE
from torch_geometric.utils import train_test_split_edges
from ts_vae.data_processors.grambow_processor import ReactionDataset, ConcatReactionDataset, Temp

## Molecule Autoencoder

### Set up

In [6]:
def reset(nn):
    def _reset(item):
        if hasattr(item, 'reset_parameters'):
            item.reset_parameters()

    if nn is not None:
        if hasattr(nn, 'children') and len(list(nn.children())) > 0:
            for item in nn.children():
                _reset(item)
        else:
            _reset(nn)

In [4]:
from sklearn.metrics import roc_auc_score, average_precision_score
from torch_geometric.utils import (negative_sampling, remove_self_loops, add_self_loops)
EPS = 1e-15

class GAE(nn.Module):
    def __init__(self, encoder, decoder = None):
        super(GAE, self).__init__()
        self.encoder = encoder
        self.decoder = InnerProductDecoder()
        GAE.reset_parameters(self)

    def reset_parameters(self):
        reset(self.encoder)
        reset(self.decoder) 

    def encode(self, *args, **kwargs):
        return self.encoder(*args, **kwargs)

    def decode(self, *args, **kwargs):
        return self.decoder(*args, **kwargs)

    def recon_loss(self, z, pos_edge_index, neg_edge_index = None):
        """ BCE for input on its reconstruction. """
        pos_loss = -torch.log(self.decoder(z, pos_edge_index, sigmoid = True) + EPS).mean()

        # no self-loops in negative samples
        pos_edge_index, _ = remove_self_loops(pos_edge_index)
        pos_edge_index, _ = add_self_loops(pos_edge_index)
        if neg_edge_index is None:
            neg_edge_index = negative_sampling(pos_edge_index, z.size(0))
        neg_loss = -torch.log(1 - self.decoder(z, neg_edge_index, sigmoid = True) + EPS).mean()

        return pos_loss + neg_loss

    def test(self, z, pos_edge_index, neg_edge_index):
        pos_y = z.new_ones(pos_edge_index.size(1))
        neg_y = z.new_zeros(neg_edge_index.size(1))
        y = torch.cat([pos_y, neg_y], dim=0)

        pos_pred = self.decoder(z, pos_edge_index, sigmoid=True)
        neg_pred = self.decoder(z, neg_edge_index, sigmoid=True)
        pred = torch.cat([pos_pred, neg_pred], dim=0)

        y, pred = y.detach().cpu().numpy(), pred.detach().cpu().numpy()

        return roc_auc_score(y, pred), average_precision_score(y, pred)

class MolEncoder(nn.Module):
    def __init__(self, in_channels, out_channels):
        super(MolEncoder, self).__init__()
        self.conv = GCNConv(in_channels, out_channels)

    def forward(self, x, edge_index):
        return self.conv(x, edge_index)

class InnerProductDecoder(nn.Module):
    def forward(self, z, edge_index, sigmoid = True):
        value = (z[edge_index[0]] * z[edge_index[1]]).sum(dim = 1)
        return torch.sigmoid(value) if sigmoid else value
    
    def forward_all(self, z, sigmoid = True):
        adj = torch.matmul(z, z.t())
        return torch.sigmoid(adj) if sigmoid else adj


### Training

In [5]:
def train_gae(gae, opt, x, train_pos_edge_index):
    gae.train()
    opt.zero_grad()
    z = gae.encode(x, train_pos_edge_index)
    loss = gae.recon_loss(z, train_pos_edge_index)
    loss.backward()
    opt.step()
    return float(loss)

def test_gae(gae, x, train_pos_edge_index, test_pos_edge_index, test_neg_edge_index):
    gae.eval()
    with torch.no_grad():
        z = gae.encode(x, train_pos_edge_index)
    return gae.test(z, test_pos_edge_index, test_neg_edge_index)

### Reactant Autoencoder

In [7]:
# model data
base_path = r'data/'
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
val_ratio = 0
test_ratio = 0.2

# reactant data
r_dataset = ReactionDataset(base_path, geo_file = 'train_r') 
r_data = r_dataset.data
r_data.train_mask = r_data.val_mask = r_data.test_mask = r_data.y = None
r_data = train_test_split_edges(data = r_data, val_ratio = val_ratio, test_ratio = test_ratio)
r_x = r_data.x.to(device)
r_train_pos_edge_index = r_data.train_pos_edge_index.to(device)

# reactant encoder
r_num_node_fs = r_data.num_node_features
r_latent_dim = 2
r_ae = GAE(MolEncoder(r_num_node_fs, r_latent_dim))
r_opt = torch.optim.Adam(r_ae.parameters(), lr = 0.01)

In [8]:
r_ae.reset_parameters()

epochs = 100
for epoch in range(1, epochs + 1):
    loss = train_gae(r_ae, r_opt, r_x, r_data.train_pos_edge_index)
    auc, ap = test_gae(r_ae, r_x, r_data.train_pos_edge_index, r_data.test_pos_edge_index, r_data.test_neg_edge_index)
    if epoch % 10 == 0:
        print('Epoch: {:03d}, AUC: {:.4f}, AP: {:.4f}'.format(epoch, auc, ap))

Epoch: 010, AUC: 0.8173, AP: 0.6778
Epoch: 020, AUC: 0.8166, AP: 0.6770
Epoch: 030, AUC: 0.8162, AP: 0.6764
Epoch: 040, AUC: 0.7617, AP: 0.6447
Epoch: 050, AUC: 0.6425, AP: 0.5646
Epoch: 060, AUC: 0.5798, AP: 0.5157
Epoch: 070, AUC: 0.5806, AP: 0.5166
Epoch: 080, AUC: 0.5808, AP: 0.5169
Epoch: 090, AUC: 0.6277, AP: 0.5545
Epoch: 100, AUC: 0.6814, AP: 0.5949


### Product Autoencoder

In [12]:
# model data
base_path = r'data/'
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
val_ratio = 0
test_ratio = 0.2

# product data
p_dataset = ReactionDataset(base_path, geo_file = 'train_p') 
p_data = p_dataset.data
p_data.train_mask = p_data.val_mask = p_data.test_mask = p_data.y = None
p_data = train_test_split_edges(data = p_data, val_ratio = val_ratio, test_ratio = test_ratio)
p_x = p_data.x.to(device)
p_train_pos_edge_index = p_data.train_pos_edge_index.to(device)

# product encoder
p_num_node_fs = p_data.num_node_features
p_latent_dim = 2
p_ae = GAE(MolEncoder(p_num_node_fs, p_latent_dim))
p_opt = torch.optim.Adam(p_ae.parameters(), lr = 0.01)

In [13]:
p_ae.reset_parameters()

epochs = 100
for epoch in range(1, epochs + 1):
    loss = train_gae(p_ae, p_opt, p_x, p_data.train_pos_edge_index)
    auc, ap = test_gae(p_ae, p_x, p_data.train_pos_edge_index, p_data.test_pos_edge_index, p_data.test_neg_edge_index)
    if epoch % 10 == 0:
        print('Epoch: {:03d}, AUC: {:.4f}, AP: {:.4f}'.format(epoch, auc, ap))

Epoch: 010, AUC: 0.7871, AP: 0.6643
Epoch: 020, AUC: 0.7860, AP: 0.6698
Epoch: 030, AUC: 0.7876, AP: 0.6959
Epoch: 040, AUC: 0.8174, AP: 0.7411
Epoch: 050, AUC: 0.8711, AP: 0.7817
Epoch: 060, AUC: 0.8898, AP: 0.7884
Epoch: 070, AUC: 0.8851, AP: 0.7721
Epoch: 080, AUC: 0.8831, AP: 0.7654
Epoch: 090, AUC: 0.8754, AP: 0.7582
Epoch: 100, AUC: 0.8780, AP: 0.7616


## Same AE architecture but with reactant and product data concatenated

### Hacky way
Create train_rp_50.sdf file with first 50 reactants and products.


In [10]:
# model data
base_path = r'data/'
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
val_ratio = 0
test_ratio = 0.2 # 50 reactants and 50 products. will end up training on 50 reactants + 30 products and testing on 20 products

# rp data
rp_dataset = ReactionDataset(base_path, geo_file = 'train_rp_50')
rp_data = rp_dataset.data
rp_data.train_mask = rp_data.val_mask = rp_data.test_mask = rp_data.y = None
rp_data = train_test_split_edges(data = rp_data, val_ratio = val_ratio, test_ratio = test_ratio)
rp_x = rp_data.x.to(device)
rp_train_pos_edge_index = rp_data.train_pos_edge_index.to(device)

# rp autoencoder
rp_num_node_fs = rp_data.num_node_features
rp_latent_dim = 2
rp_ae = GAE(MolEncoder(rp_num_node_fs, rp_latent_dim))
rp_opt = torch.optim.Adam(rp_ae.parameters(), lr = 0.01)

In [15]:
rp_ae.reset_parameters()

epochs = 100
for epoch in range(1, epochs + 1):
    loss = train_gae(rp_ae, rp_opt, rp_x, rp_data.train_pos_edge_index)
    auc, ap = test_gae(rp_ae, rp_x, rp_data.train_pos_edge_index, rp_data.test_pos_edge_index, rp_data.test_neg_edge_index)
    if epoch % 10 == 0:
        print('Epoch: {:03d}, AUC: {:.4f}, AP: {:.4f}'.format(epoch, auc, ap))

Epoch: 010, AUC: 0.6097, AP: 0.5554
Epoch: 020, AUC: 0.3769, AP: 0.4165
Epoch: 030, AUC: 0.5066, AP: 0.4703
Epoch: 040, AUC: 0.6515, AP: 0.5624
Epoch: 050, AUC: 0.7327, AP: 0.6283
Epoch: 060, AUC: 0.8057, AP: 0.6766
Epoch: 070, AUC: 0.8086, AP: 0.6799
Epoch: 080, AUC: 0.8160, AP: 0.6874
Epoch: 090, AUC: 0.8062, AP: 0.6833
Epoch: 100, AUC: 0.8261, AP: 0.6953


### Create ConcatReactionDataset

i.e. not hacky, create a concatenated dataset at source.

Key words for network: dual, double, siamese, twin networks.
- Siamese/twin: same weights while working in tandem on two different input vectors to compute comparable output vectors.

Key words for training: simultaneous training.

In [6]:
# model data
base_path = r'data/'
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
val_ratio = 0
test_ratio = 0.2

# concat rp data
concat_rp_dataset = ConcatReactionDataset(base_path)
concat_rp_data = concat_rp_dataset.data
concat_rp_data.train_mask = concat_rp_data.val_mask = concat_rp_data.test_mask = concat_rp_data.y = None
concat_rp_data = train_test_split_edges(data = concat_rp_data, val_ratio = val_ratio, test_ratio = test_ratio)
concat_rp_x = concat_rp_data.x.to(device)
concat_rp_train_pos_edge_index = concat_rp_data.train_pos_edge_index.to(device)

# concat rp encoder
concat_rp_num_node_fs = concat_rp_data.num_node_features
concat_rp_latent_dim = 2
concat_rp_ae = GAE(MolEncoder(concat_rp_num_node_fs, concat_rp_latent_dim))
concat_rp_opt = torch.optim.Adam(concat_rp_ae.parameters(), lr = 0.01)

In [7]:
concat_rp_ae.reset_parameters()

epochs = 100
for epoch in range(1, epochs + 1):
    loss = train_gae(concat_rp_ae, concat_rp_opt, concat_rp_x, concat_rp_data.train_pos_edge_index)
    auc, ap = test_gae(concat_rp_ae, concat_rp_x, concat_rp_data.train_pos_edge_index, concat_rp_data.test_pos_edge_index, concat_rp_data.test_neg_edge_index)
    if epoch % 10 == 0:
        print('Epoch: {:03d}, AUC: {:.4f}, AP: {:.4f}'.format(epoch, auc, ap))

Epoch: 010, AUC: 0.7733, AP: 0.6341
Epoch: 020, AUC: 0.6032, AP: 0.5291
Epoch: 030, AUC: 0.4524, AP: 0.4456
Epoch: 040, AUC: 0.4638, AP: 0.4493
Epoch: 050, AUC: 0.5354, AP: 0.4963
Epoch: 060, AUC: 0.6539, AP: 0.5780
Epoch: 070, AUC: 0.7542, AP: 0.6320
Epoch: 080, AUC: 0.7680, AP: 0.6445
Epoch: 090, AUC: 0.7813, AP: 0.6524
Epoch: 100, AUC: 0.7971, AP: 0.6696


In [None]:
# rp data
rp_dataset = ReactionDataset(base_path, geo_file = 'train_rp_50')
rp_data = rp_dataset.data
rp_data.train_mask = rp_data.val_mask = rp_data.test_mask = rp_data.y = None
rp_data = train_test_split_edges(data = rp_data, val_ratio = val_ratio, test_ratio = test_ratio)
rp_x = rp_data.x.to(device)
rp_train_pos_edge_index = rp_data.train_pos_edge_index.to(device)

# rp autoencoder
rp_num_node_fs = rp_data.num_node_features
rp_latent_dim = 2
rp_ae = GAE(MolEncoder(rp_num_node_fs, rp_latent_dim))
rp_opt = torch.optim.Adam(rp_ae.parameters(), lr = 0.01)

## Reactant-Product Autoencoder

Train reactant and product autoencoders simultaneously and decode to either (i) $z_{RP}$ or (ii) $(z_R, z_P)$

### One GAE on (R, P) tuple $\rightarrow z_{RP} \rightarrow$ reactant

In [12]:
EPS = 1e-15

class DualGAE(nn.Module):
    
    def __init__(self, r_encoder, p_encoder, decoder = None):
        super(DualGAE, self).__init__()
        self.r_encoder = r_encoder
        self.p_encoder = p_encoder
        self.decoder = InnerProductDecoder() if decoder is None else decoder
        DualGAE.reset_parameters(self)
    
    def reset_parameters(self):
        reset(self.r_encoder)
        reset(self.p_encoder)
        reset(self.decoder)
    
    def encode(self, r_x, p_x, r_train_pos_edge_index, p_train_pos_edge_index):
        z_r = self.r_encoder(r_x, r_train_pos_edge_index)
        z_p = self.p_encoder(p_x, p_train_pos_edge_index)
        return self.combine_r_and_p(z_r, z_p)

    def combine_r_and_p(self, r, p):
        return r + p
    
    def decode(self, *args, **kwargs):
        return self.decoder(*args, **kwargs)

    def recon_loss(self, z, pos_edge_index, neg_edge_index = None):
        """ BCE for input on its reconstruction. 
            TODO: recon both as a tuple?
        """
        pos_loss = - torch.log(self.decoder(z, pos_edge_index, sigmoid = True) + EPS).mean()

        # no self-loops in negative samples
        pos_edge_index, _ = remove_self_loops(pos_edge_index)
        pos_edge_index, _ = add_self_loops(pos_edge_index)
        if neg_edge_index is None:
            neg_edge_index = negative_sampling(pos_edge_index, z.size(0))
        neg_loss = -torch.log(1 - self.decoder(z, neg_edge_index, sigmoid = True) + EPS).mean()

        return pos_loss + neg_loss

    def test(self, z, pos_edge_index, neg_edge_index):
        pos_y = z.new_ones(pos_edge_index.size(1))
        neg_y = z.new_zeros(neg_edge_index.size(1))
        y = torch.cat([pos_y, neg_y], dim=0)

        pos_pred = self.decoder(z, pos_edge_index, sigmoid=True)
        neg_pred = self.decoder(z, neg_edge_index, sigmoid=True)
        pred = torch.cat([pos_pred, neg_pred], dim=0)

        y, pred = y.detach().cpu().numpy(), pred.detach().cpu().numpy()

        return roc_auc_score(y, pred), average_precision_score(y, pred)

class MolEncoder(nn.Module):
    def __init__(self, in_channels, out_channels):
        super(MolEncoder, self).__init__()
        self.conv = GCNConv(in_channels, out_channels)

    def forward(self, x, edge_index):
        z = self.conv(x, edge_index)
        return z

class InnerProductDecoder(nn.Module):
    def forward(self, z, edge_index, sigmoid = True):
        value = (z[edge_index[0]] * z[edge_index[1]]).sum(dim = 1)
        return torch.sigmoid(value) if sigmoid else value
    
    def forward_all(self, z, sigmoid = True):
        adj = torch.matmul(z, z.t())
        return torch.sigmoid(adj) if sigmoid else adj


In [16]:
# model data
base_path = r'data/'
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
val_ratio = 0
test_ratio = 0.2

# reactant dataset
r_dataset = ReactionDataset(base_path, geo_file = 'train_r')
r_data = r_dataset.data
r_data.train_mask = r_data.val_mask = r_data.test_mask = r_data.y = None
r_data = train_test_split_edges(data = r_data, val_ratio = val_ratio, test_ratio = test_ratio)
r_x = r_data.x.to(device)
r_train_pos_edge_index = r_data.train_pos_edge_index.to(device)

# product dataset
p_dataset = ReactionDataset(base_path, geo_file = 'train_p')  
p_data = p_dataset.data
p_data.train_mask = p_data.val_mask = p_data.test_mask = p_data.y = None
p_data = train_test_split_edges(data = p_data, val_ratio = val_ratio, test_ratio = test_ratio)
p_x = p_data.x.to(device)
p_train_pos_edge_index = p_data.train_pos_edge_index.to(device)

# dual gae
dgae = DualGAE(r_encoder = MolEncoder(p_data.num_node_features, 2), p_encoder = MolEncoder(p_data.num_node_features, 2))
dgae_opt = torch.optim.Adam(dgae.parameters(), lr = 0.01) 

In [17]:
def train_dgae(dgae, opt, r_x, p_x, r_train_pos_edge_index, p_train_pos_edge_index):
    dgae.train()
    opt.zero_grad()
    z_rp = dgae.encode(r_x, p_x, r_train_pos_edge_index, p_train_pos_edge_index)
    loss = dgae.recon_loss(z_rp, r_train_pos_edge_index)
    loss.backward()
    opt.step()
    return float(loss)

# the test indices here will be for reactant
def test_dgae(dgae, r_x, p_x, r_train_pos_edge_index, p_train_pos_edge_index, test_pos_edge_index, test_neg_edge_index):
    dgae.eval()
    with torch.no_grad():
        z_rp = dgae.encoder(r_x, p_x, r_train_pos_edge_index, p_train_pos_edge_index)
    return dgae.test(z_rp, test_pos_edge_index, test_neg_edge_index)

In [18]:
dgae.reset_parameters()

epochs = 100
for epoch in range(1, epochs + 1):
    loss = train_dgae(dgae, dgae_opt, r_x, p_x, r_train_pos_edge_index, p_train_pos_edge_index)
    auc, ap = test_gae(dgae, r_x, p_x, r_train_pos_edge_index, p_train_pos_edge_index, r_data.test_pos_edge_index, r_data.test_neg_edge_index)
    if epoch % 10 == 0:
        print('Epoch: {:03d}, AUC: {:.4f}, AP: {:.4f}'.format(epoch, auc, ap))

NameError: name 'remove_self_loops' is not defined

### Combine R and P on latent space for corresponding features and then decode to R, say.

In [2]:
from sklearn.metrics import roc_auc_score, average_precision_score
from torch_geometric.utils import (negative_sampling, remove_self_loops, add_self_loops)
EPS = 1e-15

class SameGAE(nn.Module):
    # r and p in same encoder
    # either train sequentially i.e. concat r and p data or train on tuples of (r[i], p[i])

    def __init__(self, rp_encoder, decoder, type = "concat"):
        super(SameGAE, self).__init__()
        self.rp_encoder = rp_encoder
        self.decoder = decoder
        self.type = type # "concat" or "tuple"
        SameGAE.reset_parameters(self)
    
    def reset_parameters(self):
        reset(self.rp_encoder)
        reset(self.decoder)
        reset(self.type)
    
    def encode(self, *args, **kwargs):
        return self.rp_encoder(*args, **kwargs)
    
    def decode(self, *args, **kwargs):
        return self.decoder(*args, **kwargs)
    
    def recon_loss(self, z, pos_edge_index, neg_edge_index = None):
        pos_loss = - torch.log(self.decoder(z, pos_edge_index, sigmoid = True) + EPS).mean()

        # no self-loops in negative samples
        pos_edge_index, _ = remove_self_loops(pos_edge_index)
        pos_edge_index, _ = add_self_loops(pos_edge_index)
        if neg_edge_index is None:
            neg_edge_index = negative_sampling(pos_edge_index, z.size(0))
        neg_loss = - torch.log(1 - self.decoder(z, neg_edge_index, sigmoid = True) + EPS).mean()

        return pos_loss + neg_loss
    
    def test(self, z, pos_edge_index, neg_edge_index):
        pos_y = z.new_ones(pos_edge_index.size(1))
        neg_y = z.new_zeros(neg_edge_index.size(1))
        y = torch.cat([pos_y, neg_y], dim=0)

        pos_pred = self.decoder(z, pos_edge_index, sigmoid=True)
        neg_pred = self.decoder(z, neg_edge_index, sigmoid=True)
        pred = torch.cat([pos_pred, neg_pred], dim=0)

        y, pred = y.detach().cpu().numpy(), pred.detach().cpu().numpy()

        return roc_auc_score(y, pred), average_precision_score(y, pred)

class MolEncoder(nn.Module):
    def __init__(self, in_channels, out_channels):
        super(MolEncoder, self).__init__()
        self.conv = GCNConv(in_channels, out_channels)

    def forward(self, x, edge_index):
        return self.conv(x, edge_index)

class InnerProductDecoder(nn.Module):
    def forward(self, z, edge_index, sigmoid = True):
        value = (z[edge_index[0]] * z[edge_index[1]]).sum(dim = 1)
        return torch.sigmoid(value) if sigmoid else value
    
    def forward_all(self, z, sigmoid = True):
        adj = torch.matmul(z, z.t())
        return torch.sigmoid(adj) if sigmoid else adj

### Train SameGAE

In [3]:
def train_samegae(gae, opt, x, train_pos_edge_index):
    gae.train()
    opt.zero_grad()
    z = gae.encode(x, train_pos_edge_index)
    loss = gae.recon_loss(z, train_pos_edge_index)
    loss.backward()
    opt.step()
    return float(loss)

def test_samegae(gae, x, train_pos_edge_index, test_pos_edge_index, test_neg_edge_index):
    gae.eval()
    with torch.no_grad():
        z = gae.encode(x, train_pos_edge_index)
    return gae.test(z, test_pos_edge_index, test_neg_edge_index)

p_ae.reset_parameters()

epochs = 100
for epoch in range(1, epochs + 1):
    loss = train_gae(p_ae, p_opt, p_x, p_data.train_pos_edge_index)
    auc, ap = test_gae(p_ae, p_x, p_data.train_pos_edge_index, p_data.test_pos_edge_index, p_data.test_neg_edge_index)
    if epoch % 10 == 0:
        print('Epoch: {:03d}, AUC: {:.4f}, AP: {:.4f}'.format(epoch, auc, ap))

NameError: name 'p_ae' is not defined

In [5]:
# model data
base_path = r'data/'
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
val_ratio = 0
test_ratio = 0.1 # 50 reactants and 50 products. will end up training on 50 reactants + 40 products and testing on 10 products

# rp data
rp_dataset = ReactionDataset(base_path, geo_file = 'train_rp_50')
rp_data = rp_dataset.data
rp_data.train_mask = rp_data.val_mask = rp_data.test_mask = rp_data.y = None
rp_data = train_test_split_edges(data = rp_data, val_ratio = val_ratio, test_ratio = test_ratio)
rp_x = rp_data.x.to(device)
rp_train_pos_edge_index = rp_data.train_pos_edge_index.to(device)

# rp autoencoder
rp_num_node_fs = rp_data.num_node_features
rp_latent_dim = 2
rp_ae = GAE(MolEncoder(rp_num_node_fs, rp_latent_dim))
rp_opt = torch.optim.Adam(rp_ae.parameters(), lr = 0.01)

NameError: name 'MolEncoder' is not defined

In [80]:
r_dataset.data.__dict__.keys()

dict_keys(['x', 'edge_index', 'edge_attr', 'y', 'pos', 'normal', 'face', 'z', 'idx'])

In [85]:
# ways of combining:
#   - combine sdf files (need just first 100) and process that normally
#   - manually once created datasets

from torch_geometric.data.data import Data

rp_data_dict = {}

for dataset_param in r_dataset.data.__dict__.keys():
    rp_data_dict[dataset_param] = torch.cat((r_dataset.data[dataset_param], p_dataset.data[dataset_param]))


RuntimeError: Sizes of tensors must match except in dimension 0. Got 2486 and 2504 in dimension 1 (The offending index is 1)

In [79]:
r_dataset.data['edge_index']

tensor([[0, 0, 0,  ..., 7, 8, 9],
        [7, 8, 9,  ..., 3, 3, 5]])

In [74]:
print(len(r_dataset.data.edge_attr), len(p_dataset.data.edge_attr)) 

2486 2504


In [88]:
for i in r_dataset.data.__dict__.keys():
    print(i)

x
edge_index
edge_attr
y
pos
normal
face
z
idx


In [84]:
torch.cat((r_dataset.data['edge_attr'], p_dataset.data['edge_attr']))

tensor([[1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        ...,
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.]])

In [None]:
# model data
base_path = r'data/'
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
val_ratio = 0
test_ratio = 0.2

# reactant data
r_dataset = ReactionDataset(base_path, geo_file = 'train_r') 
r_data = r_dataset.data
r_data.train_mask = r_data.val_mask = r_data.test_mask = r_data.y = None
r_data = train_test_split_edges(data = r_data, val_ratio = val_ratio, test_ratio = test_ratio)
r_x = r_data.x.to(device)
r_train_pos_edge_index = r_data.train_pos_edge_index.to(device)

# product data
p_dataset = ReactionDataset(base_path, geo_file = 'train_p') 
p_data = p_dataset.data
p_data.train_mask = p_data.val_mask = p_data.test_mask = p_data.y = None
p_data = train_test_split_edges(data = p_data, val_ratio = val_ratio, test_ratio = test_ratio)
p_x = p_data.x.to(device)
p_train_pos_edge_index = p_data.train_pos_edge_index.to(device)

# product encoder
p_num_node_fs = p_data.num_node_features
p_latent_dim = 2
p_ae = GAE(MolEncoder(p_num_node_fs, p_latent_dim))
p_opt = torch.optim.Adam(p_ae.parameters(), lr = 0.01)