Go over notes and build up from simpler models

1. R AE 
2. R-P AE 
3. R encoder and TS decoder 
4. R-P encoder, TS decoder

TODO:
- Have an visualise function where you can plot embeddings e.g. umap, pca, tsne and interpolate between rs and ps

In [1]:
import torch
import torch.nn as nn
from torch_geometric.nn import GCNConv, GAE
from torch_geometric.utils import train_test_split_edges
from ts_vae.data_processors.grambow_processor import ReactionDataset

## Molecule Autoencoder

### Set up

In [4]:
def reset(nn):
    def _reset(item):
        if hasattr(item, 'reset_parameters'):
            item.reset_parameters()

    if nn is not None:
        if hasattr(nn, 'children') and len(list(nn.children())) > 0:
            for item in nn.children():
                _reset(item)
        else:
            _reset(nn)

In [5]:
from sklearn.metrics import roc_auc_score, average_precision_score
from torch_geometric.utils import (negative_sampling, remove_self_loops, add_self_loops)
EPS = 1e-15

class GAE(nn.Module):
    def __init__(self, encoder, decoder = None):
        super(GAE, self).__init__()
        self.encoder = encoder
        self.decoder = InnerProductDecoder()
        GAE.reset_parameters(self)

    def reset_parameters(self):
        reset(self.encoder)
        reset(self.decoder) 

    def encode(self, *args, **kwargs):
        return self.encoder(*args, **kwargs)

    def decode(self, *args, **kwargs):
        return self.decoder(*args, **kwargs)

    def recon_loss(self, z, pos_edge_index, neg_edge_index = None):
        """ BCE for input on its reconstruction. """
        pos_loss = -torch.log(self.decoder(z, pos_edge_index, sigmoid = True) + EPS).mean()

        # no self-loops in negative samples
        pos_edge_index, _ = remove_self_loops(pos_edge_index)
        pos_edge_index, _ = add_self_loops(pos_edge_index)
        if neg_edge_index is None:
            neg_edge_index = negative_sampling(pos_edge_index, z.size(0))
        neg_loss = -torch.log(1 - self.decoder(z, neg_edge_index, sigmoid = True) + EPS).mean()

        return pos_loss + neg_loss

    def test(self, z, pos_edge_index, neg_edge_index):
        pos_y = z.new_ones(pos_edge_index.size(1))
        neg_y = z.new_zeros(neg_edge_index.size(1))
        y = torch.cat([pos_y, neg_y], dim=0)

        pos_pred = self.decoder(z, pos_edge_index, sigmoid=True)
        neg_pred = self.decoder(z, neg_edge_index, sigmoid=True)
        pred = torch.cat([pos_pred, neg_pred], dim=0)

        y, pred = y.detach().cpu().numpy(), pred.detach().cpu().numpy()

        return roc_auc_score(y, pred), average_precision_score(y, pred)

class MolEncoder(nn.Module):
    def __init__(self, in_channels, out_channels):
        super(MolEncoder, self).__init__()
        self.conv = GCNConv(in_channels, out_channels)

    def forward(self, x, edge_index):
        return self.conv(x, edge_index)

class InnerProductDecoder(nn.Module):
    def forward(self, z, edge_index, sigmoid = True):
        value = (z[edge_index[0]] * z[edge_index[1]]).sum(dim = 1)
        return torch.sigmoid(value) if sigmoid else value
    
    def forward_all(self, z, sigmoid = True):
        adj = torch.matmul(z, z.t())
        return torch.sigmoid(adj) if sigmoid else adj


### Training

In [6]:
def train_gae(gae, opt, x, train_pos_edge_index):
    gae.train()
    opt.zero_grad()
    z = gae.encode(x, train_pos_edge_index)
    loss = gae.recon_loss(z, train_pos_edge_index)
    loss.backward()
    opt.step()
    return float(loss)

def test_gae(gae, x, train_pos_edge_index, test_pos_edge_index, test_neg_edge_index):
    gae.eval()
    with torch.no_grad():
        z = gae.encode(x, train_pos_edge_index)
    return gae.test(z, test_pos_edge_index, test_neg_edge_index)

### Reactant Autoencoder

In [7]:
# model data
base_path = r'data/'
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
val_ratio = 0
test_ratio = 0.2

# reactant data
r_dataset = ReactionDataset(base_path, geo_file = 'train_r') 
r_data = r_dataset.data
r_data.train_mask = r_data.val_mask = r_data.test_mask = r_data.y = None
r_data = train_test_split_edges(data = r_data, val_ratio = val_ratio, test_ratio = test_ratio)
r_x = r_data.x.to(device)
r_train_pos_edge_index = r_data.train_pos_edge_index.to(device)

# reactant encoder
r_num_node_fs = r_data.num_node_features
r_latent_dim = 2
r_ae = GAE(MolEncoder(r_num_node_fs, r_latent_dim))
r_opt = torch.optim.Adam(r_ae.parameters(), lr = 0.01)

In [8]:
r_ae.reset_parameters()

epochs = 100
for epoch in range(1, epochs + 1):
    loss = train_gae(r_ae, r_opt, r_x, r_data.train_pos_edge_index)
    auc, ap = test_gae(r_ae, r_x, r_data.train_pos_edge_index, r_data.test_pos_edge_index, r_data.test_neg_edge_index)
    if epoch % 10 == 0:
        print('Epoch: {:03d}, AUC: {:.4f}, AP: {:.4f}'.format(epoch, auc, ap))

Epoch: 010, AUC: 0.8173, AP: 0.6778
Epoch: 020, AUC: 0.8166, AP: 0.6770
Epoch: 030, AUC: 0.8162, AP: 0.6764
Epoch: 040, AUC: 0.7617, AP: 0.6447
Epoch: 050, AUC: 0.6425, AP: 0.5646
Epoch: 060, AUC: 0.5798, AP: 0.5157
Epoch: 070, AUC: 0.5806, AP: 0.5166
Epoch: 080, AUC: 0.5808, AP: 0.5169
Epoch: 090, AUC: 0.6277, AP: 0.5545
Epoch: 100, AUC: 0.6814, AP: 0.5949


### Product Autoencoder

In [12]:
# model data
base_path = r'data/'
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
val_ratio = 0
test_ratio = 0.2

# product data
p_dataset = ReactionDataset(base_path, geo_file = 'train_p') 
p_data = p_dataset.data
p_data.train_mask = p_data.val_mask = p_data.test_mask = p_data.y = None
p_data = train_test_split_edges(data = p_data, val_ratio = val_ratio, test_ratio = test_ratio)
p_x = p_data.x.to(device)
p_train_pos_edge_index = p_data.train_pos_edge_index.to(device)

# product encoder
p_num_node_fs = p_data.num_node_features
p_latent_dim = 2
p_ae = GAE(MolEncoder(p_num_node_fs, p_latent_dim))
p_opt = torch.optim.Adam(p_ae.parameters(), lr = 0.01)

In [13]:
p_ae.reset_parameters()

epochs = 100
for epoch in range(1, epochs + 1):
    loss = train_gae(p_ae, p_opt, p_x, p_data.train_pos_edge_index)
    auc, ap = test_gae(p_ae, p_x, p_data.train_pos_edge_index, p_data.test_pos_edge_index, p_data.test_neg_edge_index)
    if epoch % 10 == 0:
        print('Epoch: {:03d}, AUC: {:.4f}, AP: {:.4f}'.format(epoch, auc, ap))

Epoch: 010, AUC: 0.7871, AP: 0.6643
Epoch: 020, AUC: 0.7860, AP: 0.6698
Epoch: 030, AUC: 0.7876, AP: 0.6959
Epoch: 040, AUC: 0.8174, AP: 0.7411
Epoch: 050, AUC: 0.8711, AP: 0.7817
Epoch: 060, AUC: 0.8898, AP: 0.7884
Epoch: 070, AUC: 0.8851, AP: 0.7721
Epoch: 080, AUC: 0.8831, AP: 0.7654
Epoch: 090, AUC: 0.8754, AP: 0.7582
Epoch: 100, AUC: 0.8780, AP: 0.7616


### Same AE architecture but with reactant and product data concatenated

I did this first by hackily by creating a train_rp.sdf file with first 50 reactants and first 50 products so that the PyG data class and associated functions were automatically preserved. 

Next step is to see if I can concatenate the individual train_r and train_p datasets and then train on them normally.

In [10]:
# model data
base_path = r'data/'
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
val_ratio = 0
test_ratio = 0.2 # 50 reactants and 50 products. will end up training on 50 reactants + 30 products and testing on 20 products

# rp data
rp_dataset = ReactionDataset(base_path, geo_file = 'train_rp_50')
rp_data = rp_dataset.data
rp_data.train_mask = rp_data.val_mask = rp_data.test_mask = rp_data.y = None
rp_data = train_test_split_edges(data = rp_data, val_ratio = val_ratio, test_ratio = test_ratio)
rp_x = rp_data.x.to(device)
rp_train_pos_edge_index = rp_data.train_pos_edge_index.to(device)

# rp autoencoder
rp_num_node_fs = rp_data.num_node_features
rp_latent_dim = 2
rp_ae = GAE(MolEncoder(rp_num_node_fs, rp_latent_dim))
rp_opt = torch.optim.Adam(rp_ae.parameters(), lr = 0.01)

In [15]:
rp_ae.reset_parameters()

epochs = 100
for epoch in range(1, epochs + 1):
    loss = train_gae(rp_ae, rp_opt, rp_x, rp_data.train_pos_edge_index)
    auc, ap = test_gae(rp_ae, rp_x, rp_data.train_pos_edge_index, rp_data.test_pos_edge_index, rp_data.test_neg_edge_index)
    if epoch % 10 == 0:
        print('Epoch: {:03d}, AUC: {:.4f}, AP: {:.4f}'.format(epoch, auc, ap))

Epoch: 010, AUC: 0.6097, AP: 0.5554
Epoch: 020, AUC: 0.3769, AP: 0.4165
Epoch: 030, AUC: 0.5066, AP: 0.4703
Epoch: 040, AUC: 0.6515, AP: 0.5624
Epoch: 050, AUC: 0.7327, AP: 0.6283
Epoch: 060, AUC: 0.8057, AP: 0.6766
Epoch: 070, AUC: 0.8086, AP: 0.6799
Epoch: 080, AUC: 0.8160, AP: 0.6874
Epoch: 090, AUC: 0.8062, AP: 0.6833
Epoch: 100, AUC: 0.8261, AP: 0.6953


Concatenate individual files. 

The new datasets may need to follow PyT or PyG dataset conventions e.g. pass in to InMemoryDataset? \
All operations are done on rp_dataset.data which is of type torch_geometric.data.data.Data. So will need to pass into this. 

Key words for network: dual, double, siamese, twin networks.
- Siamese/twin: same weights while working in tandem on two different input vectors to compute comparable output vectors.

Key words for training: simultaneous training.

TODO:
- Create ConcatDataset or more similar class.

In [52]:
base_path = r'data/'
r_dataset = ReactionDataset(base_path, geo_file = 'train_r')
p_dataset = ReactionDataset(base_path, geo_file = 'train_p')
total_mols = len(r_dataset) + len(p_dataset)

concat_rp_dataset = []
for i in range(total_mols):
    if i < 100:
        concat_rp_dataset.append(r_dataset[i])
    else:
        concat_rp_dataset.append(p_dataset[i - 100])


In [53]:
from torch_geometric.data import Data, DataLoader
# data_list = [Data(...), ..., Data(...)] # loader = DataLoader(data_list, batch_size=32)

concat_rp_loader = DataLoader(concat_rp_dataset, batch_size = 5)


In [56]:
import torch
import torch.nn.functional as F
from torch_scatter import scatter
from torch_geometric.data import InMemoryDataset, Data
from rdkit import Chem
from rdkit.Chem.rdchem import HybridizationType
from rdkit.Chem.rdchem import BondType as BT
from tqdm import tqdm
from enum import Enum

TEMP_MOLS_LIMIT = 100

class ConcatReactionDataset(InMemoryDataset):
    types = {'H': 0, 'C': 1, 'N': 2, 'O': 3, 'F': 4}
    bonds = {BT.SINGLE: 0, BT.DOUBLE: 1, BT.TRIPLE: 2, BT.AROMATIC: 3}

    def __init__(self, root, transform=None, pre_transform=None):
        super(ConcatReactionDataset, self).__init__(root, transform, pre_transform)
        self.data, self.slices = torch.load(self.processed_paths[0])

    @property
    def raw_file_names(self):
        return ['/raw/train_reactants.sdf', '/raw/train_products.sdf']
    
    @property
    def processed_file_names(self):
        """ If files already in processed folder, this processing is skipped. 
            Convenient for accessing the individual processed files without having to recreate them each time. 
        """
        return ['train_concat_rp.pt']

    def download(self):
        """ Not required in this project. """
        pass

    def process(self):
        """ Processes each of the six geometry files and appends to a list. 
            Code mostly lifted from QM9 dataset creation https://pytorch-geometric.readthedocs.io/en/latest/_modules/torch_geometric/datasets/qm9.html 
            Transforms molecules to their atom features and adjacency lists.
        """

        limit = TEMP_MOLS_LIMIT

        concat_data_list = []

        for g_idx, geometry_file in enumerate(self.raw_file_names): 
            
            # data_list = [] # data_list for each geometry file
            full_path = self.root + geometry_file
            geometries = Chem.SDMolSupplier(full_path, removeHs=False, sanitize=False)

            # get atom and edge features for each geometry
            for i, mol in enumerate(tqdm(geometries)):

                # temp soln cos of split edge memory issues
                if i == limit:
                    break
                
                N = mol.GetNumAtoms()
                # get atom positions as matrix w shape [num_nodes, num_dimensions] = [num_atoms, 3]
                atom_data = geometries.GetItemText(i).split('\n')[4:4 + N] 
                atom_positions = [[float(x) for x in line.split()[:3]] for line in atom_data]
                atom_positions = torch.tensor(atom_positions, dtype=torch.float)
                # all the features
                type_idx = []
                atomic_number = []
                aromatic = []
                sp = []
                sp2 = []
                sp3 = []
                num_hs = []

                # atom/node features
                for atom in mol.GetAtoms():
                    type_idx.append(self.types[atom.GetSymbol()])
                    atomic_number.append(atom.GetAtomicNum())
                    aromatic.append(1 if atom.GetIsAromatic() else 0)
                    hybridisation = atom.GetHybridization()
                    sp.append(1 if hybridisation == HybridizationType.SP else 0)
                    sp2.append(1 if hybridisation == HybridizationType.SP2 else 0)
                    sp3.append(1 if hybridisation == HybridizationType.SP3 else 0)
                    # !!! should do the features that lucky does: whether bonded, 3d_rbf

                # bond/edge features
                row, col, edge_type = [], [], []
                for bond in mol.GetBonds(): 
                    start, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
                    row += [start, end]
                    col += [end, start]
                    # edge type for each bond type; *2 because both ways
                    edge_type += 2 * [self.bonds[bond.GetBondType()]]
                # edge_index is graph connectivity in COO format with shape [2, num_edges]
                edge_index = torch.tensor([row, col], dtype=torch.long)
                edge_type = torch.tensor(edge_type, dtype=torch.long)
                # edge_attr is edge feature matrix with shape [num_edges, num_edge_features]
                edge_attr = F.one_hot(edge_type, num_classes=len(self.bonds)).to(torch.float) 

                # order edges based on combined ascending order
                perm = (edge_index[0] * N + edge_index[1]).argsort() # TODO
                edge_index = edge_index[:, perm]
                edge_type = edge_type[perm]
                edge_attr = edge_attr[perm]

                row, col = edge_index
                z = torch.tensor(atomic_number, dtype=torch.long)
                hs = (z == 1).to(torch.float) # hydrogens
                num_hs = scatter(hs[row], col, dim_size=N).tolist() # scatter helps with one-hot
                
                x1 = F.one_hot(torch.tensor(type_idx), num_classes=len(self.types))
                x2 = torch.tensor([atomic_number, aromatic, sp, sp2, sp3, num_hs], dtype=torch.float).t().contiguous()
                x = torch.cat([x1.to(torch.float), x2], dim=-1)

                data = Data(x=x, z=z, pos=atom_positions, edge_index=edge_index, edge_attr=edge_attr, idx=i)
                concat_data_list.append(data)

            # concat_data_list.append(data_list)

        torch.save(self.collate(data_list), self.processed_paths[0]) 

In [57]:
base_path = r'data/'
ConcatReactionDataset(base_path)

Processing...


NameError: name 'TEMP_MOLS_LIMIT' is not defined

In [54]:
torch.save(self.collate(concat_rp_dataset), 'test.pt')

NameError: name 'self' is not defined

In [None]:
# Loop over epochs
for epoch in range(max_epochs):
    # Training
    for batch, labels in loader:
        # Transfer to GPU if available
        batch, labels = batch.to(device), labels.to(device)

        # Model computations
        [...]

In [49]:
rp_data

Data(edge_attr=[2558, 4], idx=[100], pos=[1330, 3], test_neg_edge_index=[2, 255], test_pos_edge_index=[2, 255], train_neg_adj_mask=[1330, 1330], train_pos_edge_index=[2, 146], val_neg_edge_index=[2, 0], val_pos_edge_index=[2, 0], x=[1330, 11], z=[1330])

In [None]:
# rp data
rp_dataset = ReactionDataset(base_path, geo_file = 'train_rp_50')
rp_data = rp_dataset.data
rp_data.train_mask = rp_data.val_mask = rp_data.test_mask = rp_data.y = None
rp_data = train_test_split_edges(data = rp_data, val_ratio = val_ratio, test_ratio = test_ratio)
rp_x = rp_data.x.to(device)
rp_train_pos_edge_index = rp_data.train_pos_edge_index.to(device)

# rp autoencoder
rp_num_node_fs = rp_data.num_node_features
rp_latent_dim = 2
rp_ae = GAE(MolEncoder(rp_num_node_fs, rp_latent_dim))
rp_opt = torch.optim.Adam(rp_ae.parameters(), lr = 0.01)


### (Reactant, Product) tuples?

Either create a tuple embedding (z_r, z_p) or an embedding of a tuple z_rp.

In [21]:
# data = Data(x=x, z=z, pos=atom_positions, edge_index=edge_index, edge_attr=edge_attr, idx=i)
# data_list.append(data)

from torch_geometric.data.data import Data

# rp_data_dict = {}

#for dataset_param in r_dataset.data.__dict__.keys():
#    rp_data_dict[dataset_param] = torch.cat((r_dataset.data[dataset_param], p_dataset.data[dataset_param]))

base_path = r'data/'
r_dataset = ReactionDataset(base_path, geo_file = 'train_r')
p_dataset = ReactionDataset(base_path, geo_file = 'train_p')

In [30]:
combined_rp_dataset = [(r_dataset[i], p_dataset[i]) for i in range(len(r_dataset))]

## Reactant-Product Autoencoder

Train reactant and product autoencoders simultaneously and decode to either (i) themselves or (ii) 

Do I want to train GAE for R and P separately then combine their latent space and decode to R, say. Or do I want to train R and P together?

### Combine R and P on latent space for corresponding features and then decode to R, say.

In [2]:
from sklearn.metrics import roc_auc_score, average_precision_score
from torch_geometric.utils import (negative_sampling, remove_self_loops, add_self_loops)
EPS = 1e-15

class SameGAE(nn.Module):
    # r and p in same encoder
    # either train sequentially i.e. concat r and p data or train on tuples of (r[i], p[i])

    def __init__(self, rp_encoder, decoder, type = "concat"):
        super(SameGAE, self).__init__()
        self.rp_encoder = rp_encoder
        self.decoder = decoder
        self.type = type # "concat" or "tuple"
        SameGAE.reset_parameters(self)
    
    def reset_parameters(self):
        reset(self.rp_encoder)
        reset(self.decoder)
        reset(self.type)
    
    def encode(self, *args, **kwargs):
        return self.rp_encoder(*args, **kwargs)
    
    def decode(self, *args, **kwargs):
        return self.decoder(*args, **kwargs)
    
    def recon_loss(self, z, pos_edge_index, neg_edge_index = None):
        pos_loss = - torch.log(self.decoder(z, pos_edge_index, sigmoid = True) + EPS).mean()

        # no self-loops in negative samples
        pos_edge_index, _ = remove_self_loops(pos_edge_index)
        pos_edge_index, _ = add_self_loops(pos_edge_index)
        if neg_edge_index is None:
            neg_edge_index = negative_sampling(pos_edge_index, z.size(0))
        neg_loss = - torch.log(1 - self.decoder(z, neg_edge_index, sigmoid = True) + EPS).mean()

        return pos_loss + neg_loss
    
    def test(self, z, pos_edge_index, neg_edge_index):
        pos_y = z.new_ones(pos_edge_index.size(1))
        neg_y = z.new_zeros(neg_edge_index.size(1))
        y = torch.cat([pos_y, neg_y], dim=0)

        pos_pred = self.decoder(z, pos_edge_index, sigmoid=True)
        neg_pred = self.decoder(z, neg_edge_index, sigmoid=True)
        pred = torch.cat([pos_pred, neg_pred], dim=0)

        y, pred = y.detach().cpu().numpy(), pred.detach().cpu().numpy()

        return roc_auc_score(y, pred), average_precision_score(y, pred)

class MolEncoder(nn.Module):
    def __init__(self, in_channels, out_channels):
        super(MolEncoder, self).__init__()
        self.conv = GCNConv(in_channels, out_channels)

    def forward(self, x, edge_index):
        return self.conv(x, edge_index)

class InnerProductDecoder(nn.Module):
    def forward(self, z, edge_index, sigmoid = True):
        value = (z[edge_index[0]] * z[edge_index[1]]).sum(dim = 1)
        return torch.sigmoid(value) if sigmoid else value
    
    def forward_all(self, z, sigmoid = True):
        adj = torch.matmul(z, z.t())
        return torch.sigmoid(adj) if sigmoid else adj

### Train SameGAE

In [3]:
def train_samegae(gae, opt, x, train_pos_edge_index):
    gae.train()
    opt.zero_grad()
    z = gae.encode(x, train_pos_edge_index)
    loss = gae.recon_loss(z, train_pos_edge_index)
    loss.backward()
    opt.step()
    return float(loss)

def test_samegae(gae, x, train_pos_edge_index, test_pos_edge_index, test_neg_edge_index):
    gae.eval()
    with torch.no_grad():
        z = gae.encode(x, train_pos_edge_index)
    return gae.test(z, test_pos_edge_index, test_neg_edge_index)

p_ae.reset_parameters()

epochs = 100
for epoch in range(1, epochs + 1):
    loss = train_gae(p_ae, p_opt, p_x, p_data.train_pos_edge_index)
    auc, ap = test_gae(p_ae, p_x, p_data.train_pos_edge_index, p_data.test_pos_edge_index, p_data.test_neg_edge_index)
    if epoch % 10 == 0:
        print('Epoch: {:03d}, AUC: {:.4f}, AP: {:.4f}'.format(epoch, auc, ap))

NameError: name 'p_ae' is not defined

In [5]:
# model data
base_path = r'data/'
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
val_ratio = 0
test_ratio = 0.1 # 50 reactants and 50 products. will end up training on 50 reactants + 40 products and testing on 10 products

# rp data
rp_dataset = ReactionDataset(base_path, geo_file = 'train_rp_50')
rp_data = rp_dataset.data
rp_data.train_mask = rp_data.val_mask = rp_data.test_mask = rp_data.y = None
rp_data = train_test_split_edges(data = rp_data, val_ratio = val_ratio, test_ratio = test_ratio)
rp_x = rp_data.x.to(device)
rp_train_pos_edge_index = rp_data.train_pos_edge_index.to(device)

# rp autoencoder
rp_num_node_fs = rp_data.num_node_features
rp_latent_dim = 2
rp_ae = GAE(MolEncoder(rp_num_node_fs, rp_latent_dim))
rp_opt = torch.optim.Adam(rp_ae.parameters(), lr = 0.01)

NameError: name 'MolEncoder' is not defined

In [80]:
r_dataset.data.__dict__.keys()

dict_keys(['x', 'edge_index', 'edge_attr', 'y', 'pos', 'normal', 'face', 'z', 'idx'])

In [85]:
# ways of combining:
#   - combine sdf files (need just first 100) and process that normally
#   - manually once created datasets

from torch_geometric.data.data import Data

rp_data_dict = {}

for dataset_param in r_dataset.data.__dict__.keys():
    rp_data_dict[dataset_param] = torch.cat((r_dataset.data[dataset_param], p_dataset.data[dataset_param]))


RuntimeError: Sizes of tensors must match except in dimension 0. Got 2486 and 2504 in dimension 1 (The offending index is 1)

In [79]:
r_dataset.data['edge_index']

tensor([[0, 0, 0,  ..., 7, 8, 9],
        [7, 8, 9,  ..., 3, 3, 5]])

In [74]:
print(len(r_dataset.data.edge_attr), len(p_dataset.data.edge_attr)) 

2486 2504


In [88]:
for i in r_dataset.data.__dict__.keys():
    print(i)

x
edge_index
edge_attr
y
pos
normal
face
z
idx


In [84]:
torch.cat((r_dataset.data['edge_attr'], p_dataset.data['edge_attr']))

tensor([[1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        ...,
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.]])

In [None]:
# model data
base_path = r'data/'
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
val_ratio = 0
test_ratio = 0.2

# reactant data
r_dataset = ReactionDataset(base_path, geo_file = 'train_r') 
r_data = r_dataset.data
r_data.train_mask = r_data.val_mask = r_data.test_mask = r_data.y = None
r_data = train_test_split_edges(data = r_data, val_ratio = val_ratio, test_ratio = test_ratio)
r_x = r_data.x.to(device)
r_train_pos_edge_index = r_data.train_pos_edge_index.to(device)

# product data
p_dataset = ReactionDataset(base_path, geo_file = 'train_p') 
p_data = p_dataset.data
p_data.train_mask = p_data.val_mask = p_data.test_mask = p_data.y = None
p_data = train_test_split_edges(data = p_data, val_ratio = val_ratio, test_ratio = test_ratio)
p_x = p_data.x.to(device)
p_train_pos_edge_index = p_data.train_pos_edge_index.to(device)

# product encoder
p_num_node_fs = p_data.num_node_features
p_latent_dim = 2
p_ae = GAE(MolEncoder(p_num_node_fs, p_latent_dim))
p_opt = torch.optim.Adam(p_ae.parameters(), lr = 0.01)