### Experiment Tracking with W&B

- config: stores the hyperparameters (hp) and metadata for each run
- wandb.init
- wandb.watch: log model gradients and params over time (helps detect bugs e.g. weird grad behaviour)
- wandb.log: log stuff we care about
- wandb.save: save online

Use `wandb.init` inside a `with` block (context-manager syntax).

In [None]:
# Log in to Weights & Biases before any run is started in this notebook.
# NOTE(review): wandb.login() presumably prompts for (or reuses a cached) API key;
# confirm how this behaves in a non-interactive session.
import wandb
wandb.login()

In [None]:
# Run configuration: epoch count plus the validation / test ratios.
config = {
    'epochs': 50,
    'val_ratio': 0,
    'test_ratio': 0.2,
}

In [None]:
def _load_split_data(base_path, geo_file, val_ratio, test_ratio):
    """Load one 'individual' ReactionDataset and split its edges.

    Args:
        base_path: root directory handed to ReactionDataset.
        geo_file: name of the geometry file to load (e.g. 'train_r').
        val_ratio: validation ratio forwarded to train_test_split_edges.
        test_ratio: test ratio forwarded to train_test_split_edges.

    Returns:
        The object returned by train_test_split_edges for this dataset.
    """
    data = ReactionDataset(base_path, geo_file = geo_file, dataset_type = 'individual').data
    # drop the masks and labels before splitting the edges
    data.train_mask = data.val_mask = data.test_mask = data.y = None
    return train_test_split_edges(data = data, val_ratio = val_ratio, test_ratio = test_ratio)


def make(base_path, val_ratio, test_ratio, encode_data_name, decode_data_name, latent_dim, lr = 0.01):
    """Build the GAE, its optimiser, and the encode / decode datasets.

    Args:
        base_path: root directory of the datasets.
        val_ratio: validation ratio used when splitting edges.
        test_ratio: test ratio used when splitting edges.
        encode_data_name: geometry file of the dataset to encode.
        decode_data_name: geometry file of the dataset to decode.
        latent_dim: second argument passed to MolEncoder.
        lr: learning rate for Adam (defaults to the previously hard-coded 0.01).

    Returns:
        Tuple (gae, opt, encode_data, decode_data).
    """
    # TODO: move the model and the edge tensors to the target device where they are used.
    # The previous version copied x / train_pos_edge_index to the device here but
    # discarded the copies, so nothing returned from this function was ever on it.

    # dataset to encode, then dataset to decode (same preprocessing for both)
    encode_data = _load_split_data(base_path, encode_data_name, val_ratio, test_ratio)
    decode_data = _load_split_data(base_path, decode_data_name, val_ratio, test_ratio)

    # model creation
    gae = GAE(MolEncoder(encode_data.num_node_features, latent_dim))
    opt = torch.optim.Adam(gae.parameters(), lr = lr)

    return gae, opt, encode_data, decode_data

In [None]:
def model_pipeline(hps):
    """Start a W&B run configured with `hps` and build the model, optimiser and data.

    Args:
        hps: hyperparameter mapping passed to wandb.init as the run config.
            'val_ratio' and 'test_ratio' are read from it (through wandb.config);
            when absent, the previous hard-coded values 0 and 0.2 are used.

    Note:
        Only the setup step exists so far; no training or logging is done here yet.
    """
    # start wandb
    with wandb.init(project="test", config=hps):

        # access hps through wandb.config so logging matches execution
        config = wandb.config
        val_ratio = config.get('val_ratio', 0)
        test_ratio = config.get('test_ratio', 0.2)

        # make model, data, opt problem
        ts_r_gae, ts_r_opt, r_data, ts_data = make(r'data/', val_ratio, test_ratio, 'train_r', 'train_ts', 2)

### Testing GAEs

In [1]:
# data processing
from ts_vae.data_processors.grambow_processor import ReactionDataset

# my GAEs
from ts_vae.gaes.n_gae import Node_AE, train_node_ae, test_node_ae
from ts_vae.gaes.ne_gae import NodeEdge_AE, train_ne_ae, test_ne_ae
from ts_vae.gaes.nec_gae import NodeEdgeCoord_AE, train_nec_ae, test_nec_ae, main

# torch
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import ConcatDataset

# torch geometric
from torch_geometric.data import DataLoader
from torch_geometric.utils import to_dense_adj

# other
import numpy as np
from sklearn.metrics import roc_auc_score, average_precision_score

In [2]:
# remove processed files
import os
import glob

# Only regular files are collected: os.remove() raises on a directory, which would
# abort the cleanup half-way if data/processed ever contains a sub-folder.
files = [path for path in glob.glob(r'data/processed/*') if os.path.isfile(path)]
for f in files:
    os.remove(f)

In [3]:
# Load the reactions and pull out the reactant / TS / product collections.
# NOTE(review): the recorded output of this cell is
# "AttributeError: 'OtherReactionTriple' object has no attribute 'r'", so the
# attribute access below presumably no longer matches the dataset class; confirm.
rxns = ReactionDataset(r'data')
reactants, transition_states, products = rxns.data.r, rxns.data.ts, rxns.data.p

# first 80% of the reactions are used for training, the rest for testing
train_ratio = 0.8
batch_size = 2
num_rxns = len(rxns)
num_train = int(np.floor(train_ratio * num_rxns))

# One loader per species so the original reactants can be recovered after encoding.
# No padding is applied here (the original note says PyG factors this in).
_species = {'r': reactants, 'ts': transition_states, 'p': products}
train_loaders = {key: DataLoader(mols[: num_train], batch_size) for key, mols in _species.items()}
test_loaders = {key: DataLoader(mols[num_train: ], batch_size) for key, mols in _species.items()}

# Reaction-level loaders: these do not create combined graphs for batches when
# batch_size > 1 (there are hacks to do it with the per-species loaders above).
train_loader = DataLoader(rxns[: num_train], batch_size, follow_batch = ['r'])
test_loader = DataLoader(rxns[num_train: ], batch_size, follow_batch = ['r'])

Processing...


  0%|          | 30/6739 [00:00<01:19, 84.17it/s]
  4%|▎         | 30/842 [00:00<00:01, 624.99it/s]
  0%|          | 30/6739 [00:00<00:10, 625.01it/s]
  4%|▎         | 30/842 [00:00<00:01, 535.82it/s]
  0%|          | 30/6739 [00:00<00:05, 1251.12it/s]
  4%|▎         | 30/842 [00:00<00:02, 351.67it/s]


Done!


AttributeError: 'OtherReactionTriple' object has no attribute 'r'

In [9]:
# Reaction-level train / test loaders that track a batch vector for every
# reactant / TS / product attribute listed in `to_follow`.
rxns = ReactionDataset(r'data')

num_rxns = len(rxns)
train_ratio = 0.8
num_train = int(np.floor(train_ratio * num_rxns))

batch_size = 2
# One entry per line group. A missing comma after 'edge_attr_p' used to fuse it with
# 'pos_r' into the single bogus key 'edge_attr_ppos_r' (implicit string concatenation).
to_follow = ['edge_index_r', 'edge_index_ts', 'edge_index_p',
             'edge_attr_r', 'edge_attr_ts', 'edge_attr_p',
             'pos_r', 'pos_ts', 'pos_p',
             'x_r', 'x_ts', 'x_p']


# use the batch_size defined above rather than repeating the literal
train_loader = DataLoader(rxns[: num_train], batch_size = batch_size, follow_batch = to_follow)
test_loader = DataLoader(rxns[num_train: ], batch_size = batch_size, follow_batch = to_follow)

## Old models

In [3]:
### Node AE
# training reactant graphs, used to size the model
_r_train = train_loaders['r'].dataset

# largest first dimension of `z` over the training reactants
max_num_nodes = max(mol.z.size(0) for mol in _r_train)

# every graph must have the same node-feature width as the first one
_ref_width = _r_train[0].x.size(1)
assert all(mol.x.size(1) == _ref_width for mol in _r_train)
num_node_fs = _ref_width
num_edge_fs = _r_train[0].edge_attr.size(1)
h_nf, emb_nf = 5, 2

# in_node_nf + in_edge_nf >= h_nf >= out_nf > emb_nf
_node_ae_dims = dict(in_node_nf = num_node_fs, in_edge_nf = num_edge_fs,
                     h_nf = h_nf, out_nf = h_nf, emb_nf = emb_nf)
node_ae = Node_AE(**_node_ae_dims)
node_opt = torch.optim.Adam(node_ae.parameters(), lr = 1e-3)

# single train and test pass; the epoch loop is added afterwards
train_loss, train_res = train_node_ae(node_ae, node_opt, train_loaders['r'])
test_loss, test_res = test_node_ae(node_ae, node_opt, test_loaders['r'])

In [4]:
### NodeEdge AE
# reactant graphs from the training split
_train_reactants = train_loaders['r'].dataset
_first_reactant = _train_reactants[0]

# largest first dimension of `z` across those graphs
max_num_nodes = max(graph.z.size(0) for graph in _train_reactants)

# sanity check: node-feature width is identical for every graph
_expected_width = _first_reactant.x.size(1)
assert all(graph.x.size(1) == _expected_width for graph in _train_reactants)

num_node_fs = _expected_width
num_edge_fs = _first_reactant.edge_attr.size(1)
h_nf = 5
emb_nf = 2

# model and opt
ne_ae = NodeEdge_AE(
    in_node_nf = num_node_fs,
    in_edge_nf = num_edge_fs,
    h_nf = h_nf,
    out_nf = h_nf,
    emb_nf = emb_nf,
)
ne_opt = torch.optim.Adam(ne_ae.parameters(), lr = 1e-3)

# one train pass followed by one test pass
train_loss, train_res = train_ne_ae(ne_ae, ne_opt, train_loaders['r'])
test_loss, test_res = test_ne_ae(ne_ae, test_loaders['r'])

In [None]:
### NodeEdge Model
# Train the NodeEdge AE on the reactant loader, testing every `test_interval`
# epochs and remembering the epoch with the lowest test loss.
epochs = 20
test_interval = 5
final_res = {
    'epochs': [],
    'train_loss_arr': [],
    'train_res_arr': [],
    'test_loss_arr': [],
    'test_res_arr': [],
    'best_test': 1e10,
    'best_epoch': 0,
}

# r_ae.reset_parameters()

for epoch in range(1, epochs + 1):

    train_loss, train_res = train_ne_ae(ne_ae, ne_opt, train_loaders['r'])
    final_res['train_loss_arr'].append(train_loss)
    final_res['train_res_arr'].append(train_res)
    print(f"===== Training epoch {epoch:03d} complete with loss: {train_loss:.4f} ====")

    # only evaluate on the scheduled epochs
    if epoch % test_interval != 0:
        continue

    test_loss, test_res = test_ne_ae(ne_ae, test_loaders['r'])
    final_res['test_loss_arr'].append(test_loss)
    final_res['test_res_arr'].append(test_res)
    print(f'===== Testing epoch: {epoch:03d}, Loss: {test_loss:.4f} ===== \n')

    # keep track of the best test loss seen so far
    if test_loss < final_res['best_test']:
        final_res.update(best_test = test_loss, best_epoch = epoch)

## Coordinate model

In [12]:
### NodeEdgeCoord AE
# NOTE(review): the recorded output of this cell is
# "NameError: name 'train_loaders' is not defined"; the per-species loaders must
# already exist in the session before this runs.
_r_dataset = train_loaders['r'].dataset

# largest first dimension of `z` among the training reactants
max_num_nodes = max(sample.z.size(0) for sample in _r_dataset)

# all graphs must share the node-feature width of the first graph
_node_width = _r_dataset[0].x.size(1)
assert all(sample.x.size(1) == _node_width for sample in _r_dataset)
num_node_fs = _node_width
num_edge_fs = _r_dataset[0].edge_attr.size(1)
h_nf, emb_nf = 5, 2

# model and opt
nec_ae = NodeEdgeCoord_AE(in_node_nf = num_node_fs,
                          in_edge_nf = num_edge_fs,
                          h_nf = h_nf,
                          out_nf = h_nf,
                          emb_nf = emb_nf)
nec_opt = torch.optim.Adam(nec_ae.parameters(), lr = 1e-3)

# train and test (currently disabled)
#train_loss, train_res = train_nec_ae(nec_ae, nec_opt, train_loaders['r'])
#test_loss, test_res = test_nec_ae(nec_ae, test_loaders['r'])

NameError: name 'train_loaders' is not defined

In [8]:
def train_nec_ae_r2ts(nec_ae, opt, loader):
    """Run one training pass that maps reactant inputs to TS coordinates.

    For every reaction in every batch the model is run on the reactant and the
    MSE between its coordinate output and the TS positions is the reaction loss.
    The reaction losses of a batch are summed, back-propagated once, and followed
    by a single optimiser step.

    Args:
        nec_ae: model called as nec_ae(x, edge_index, edge_attr, pos); it must
            return six values, the last two being adj_pred and coord_out.
        opt: optimiser providing zero_grad() and step().
        loader: iterable of batches exposing `.r` and `.ts`, two equally long
            indexable collections whose items carry idx, x, edge_index,
            edge_attr and pos.

    Returns:
        Tuple (mean reaction loss, res). res holds 'rxn_coord_loss' (one float per
        reaction), 'batch_coord_loss' (one float per batch) and 'num_rxns'.
        The mean is NaN when no reaction was processed.
    """
    res = {'rxn_coord_loss': [], 'batch_coord_loss': [], 'num_rxns': 0}

    # the mode does not change between batches, so set it once
    nec_ae.train()

    for rxn_batch in loader:

        opt.zero_grad()

        rs, tss = rxn_batch.r, rxn_batch.ts
        assert len(rs) == len(tss), "Don't have same number of R and TS in batch!"
        batch_size = len(rs)
        # Stays None until the first reaction loss is added; `None += tensor`
        # (the previous accumulation) raises TypeError.
        batch_loss = None

        for b in range(batch_size):

            r, ts = rs[b], tss[b]
            assert r.idx == ts.idx, "R and TS not in the same reaction!"

            # run model on reactant
            node_emb, edge_emb, recon_node_fs, recon_edge_fs, adj_pred, coord_out = nec_ae(r.x, r.edge_index, r.edge_attr, r.pos)

            # ground truth values
            adj_gt = to_dense_adj(ts.edge_index).squeeze(dim = 0)
            # the message used to reference an undefined name (node_feats), which
            # turned a failed check into a NameError instead of this report
            assert adj_gt.shape == adj_pred.shape, (
                f"Your adjacency matrices don't have the same shape! "
                f"GT shape: {adj_gt.shape}, Pred shape: {adj_pred.shape}, "
                f"Batch size: {batch_size}, Node fs shape: {r.x.shape} "
            )

            rxn_loss = F.mse_loss(coord_out, ts.pos)
            res['rxn_coord_loss'].append(rxn_loss.item())
            batch_loss = rxn_loss if batch_loss is None else batch_loss + rxn_loss

        # an empty batch has nothing to back-propagate
        if batch_loss is None:
            continue

        batch_loss.backward()
        opt.step()

        # record batch results
        res['batch_coord_loss'].append(batch_loss.item())
        res['num_rxns'] += batch_size

    # avoid ZeroDivisionError when the loader yielded no reactions
    if res['num_rxns'] == 0:
        return float('nan'), res

    return sum(res['rxn_coord_loss']) / res['num_rxns'], res

In [9]:
### NodeEdgeCoord Model 
### NEW
# Train with the R -> TS routine on the reaction-level loader.
# NOTE(review): training uses `train_loader`, but evaluation still calls
# `test_nec_ae` on `test_loaders['r']`; confirm that this mix is intended.

epochs = 5
test_interval = 5
final_res = {
    'epochs': [],
    'train_loss_arr': [],
    'train_res_arr': [],
    'test_loss_arr': [],
    'test_res_arr': [],
    'best_test': 1e10,
    'best_epoch': 0,
}

# r_ae.reset_parameters()

torch.set_printoptions(precision=2)

for epoch in range(1, epochs + 1):

    train_loss, train_res = train_nec_ae_r2ts(nec_ae, nec_opt, train_loader)
    final_res['train_loss_arr'].append(train_loss)
    final_res['train_res_arr'].append(train_res)
    print(f"===== Training epoch {epoch:03d} complete with loss: {train_loss:.4f} ====")

    # skip evaluation unless this is a scheduled test epoch
    if epoch % test_interval != 0:
        continue

    test_loss, test_res = test_nec_ae(nec_ae, test_loaders['r'])
    final_res['test_loss_arr'].append(test_loss)
    final_res['test_res_arr'].append(test_res)
    print(f'===== Testing epoch: {epoch:03d}, Loss: {test_loss:.4f} ===== \n')

    # remember the lowest test loss and the epoch it occurred in
    if test_loss < final_res['best_test']:
        final_res.update(best_test = test_loss, best_epoch = epoch)

===== Training epoch 001 complete with loss: 0.1663 ====


RuntimeError: Trying to backward through the graph a second time, but the saved intermediate results have already been freed. Specify retain_graph=True when calling .backward() or autograd.grad() the first time.

In [7]:
### NodeEdgeCoord Model
# Train the NodeEdgeCoord AE on the reactant loader; evaluate every
# `test_interval` epochs and record the best test loss.
epochs = 10
test_interval = 5
final_res = {
    'epochs': [],
    'train_loss_arr': [],
    'train_res_arr': [],
    'test_loss_arr': [],
    'test_res_arr': [],
    'best_test': 1e10,
    'best_epoch': 0,
}

# r_ae.reset_parameters()

torch.set_printoptions(precision=2)

for epoch in range(1, epochs + 1):

    train_loss, train_res = train_nec_ae(nec_ae, nec_opt, train_loaders['r'])
    final_res['train_loss_arr'].append(train_loss)
    final_res['train_res_arr'].append(train_res)
    print(f"===== Training epoch {epoch:03d} complete with loss: {train_loss:.4f} ====")

    # nothing more to do on non-test epochs
    if epoch % test_interval != 0:
        continue

    test_loss, test_res = test_nec_ae(nec_ae, test_loaders['r'])
    final_res['test_loss_arr'].append(test_loss)
    final_res['test_res_arr'].append(test_res)
    print(f'===== Testing epoch: {epoch:03d}, Loss: {test_loss:.4f} ===== \n')

    # update the running best
    if test_loss < final_res['best_test']:
        final_res.update(best_test = test_loss, best_epoch = epoch)

===== Training epoch 001 complete with loss: 0.0000 ====
===== Training epoch 002 complete with loss: 0.0000 ====
===== Training epoch 003 complete with loss: 0.0000 ====
===== Training epoch 004 complete with loss: 0.0000 ====
===== Training epoch 005 complete with loss: 0.0000 ====
===== Testing epoch: 005, Loss: 0.7080 ===== 

===== Training epoch 006 complete with loss: 0.0000 ====
===== Training epoch 007 complete with loss: 0.0000 ====
===== Training epoch 008 complete with loss: 0.0000 ====
===== Training epoch 009 complete with loss: 0.0000 ====
===== Training epoch 010 complete with loss: 0.0000 ====
===== Testing epoch: 010, Loss: 0.7080 ===== 



In [23]:
# final_res['train_res_arr'][0]['node_recon_loss_arr']
# improves slightly: 1.95-> 1.6
# final_res['train_res_arr'][0]['edge_recon_loss_arr']
# improves slightly: 0.19 -> 0.13
# final_res['train_res_arr'][0]['adj_loss_arr']
# stays same: 0.71
# final_res['train_res_arr'][0]['coord_loss_arr']
# stays same: 0

[0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0]

In [14]:

# Inspect the first recorded test result; the cell displays its 'adj_loss_arr' entry.
# final_res['test_res_arr'][0]['node_recon_loss_arr']
final_res['test_res_arr'][0]['adj_loss_arr']
# keys presumably available in each result dict (verify against test_nec_ae):
# ['total_loss', 'counter', 'total_loss_arr', 'coord_loss_arr', 'node_recon_loss_arr', 'edge_recon_loss_arr', 'adj_loss_arr']

# Observations:
# total_loss not changing, ~0.73
# coord_loss = 0 always [except for R->R]
# node_recon_loss 


# test_loss doesn't seem to be changing
# node_loss > 1.7, usually > 2 but gets worse...
# adj_loss always 0.3
# edge_loss goes from 0.8 -> 0.1


[0.7285862565040588,
 0.730161190032959,
 0.7295154929161072,
 0.7291362285614014,
 0.7294766306877136,
 0.7294126152992249,
 0.7305843830108643,
 0.7299090623855591,
 0.7289371490478516,
 0.7288703322410583,
 0.7297692894935608,
 0.7288429737091064,
 0.7287067770957947,
 0.7288702726364136,
 0.7291567921638489,
 0.7293187379837036,
 0.7297435402870178,
 0.7284173965454102,
 0.7233184576034546]

Convert MLP to GNN by swapping torch.nn.Linear with PyG's GNN operators e.g. GCN layer
Lucky's work
- PairFeatures: a manual message-passing (MP) step, I think. It has to be, otherwise what he's doing isn't a GNN.
- set edges: iterate: 
    - compute features (i.e. MP) -> MLP(features) -> update edges
    - compute features (i.e. MP) -> MLP(MLP(edges)) -> update vertices

Loose notes
- Can define data class for parameters e.g. 
    - @dataclass
      class GNNParams:
        input_dim: int
        output_dim: int
        ... (hidden_sizes, dropout, batchnorm, activation)