In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import os
import glob
from tqdm.notebook import tqdm

# tf.config.set_visible_devices([], 'GPU')

import sys
sys.path.append('../src/')

import logging
tf.get_logger().setLevel(logging.ERROR)

from gcn import datasets as gcn_datasets
from gcn import models as gcn_models

from rgcn import datasets as rgcn_datasets
from rgcn import models as rgcn_models

from utils import metrics


### Define (random) hyperparameter search

In [2]:

# Random-search space. Every entry is a zero-argument sampler that draws one
# value from NumPy's global RNG, so results are reproducible only if the caller
# seeds `np.random` and invokes the samplers in a fixed order.
#
# `np.random.randint` excludes its upper bound, hence the `+1` on the integer
# ranges that are meant to be inclusive.
parameters = {
    'num_bases':        lambda: np.random.choice([-1, 2, 4]),
    'num_gconv_layers': lambda: np.random.randint(3, 5+1),
    'num_gconv_units':  lambda: np.random.randint(128, 256+1),
    'learning_rate':    lambda: np.random.uniform(1e-4, 1e-3),
    'num_epochs':       lambda: np.random.randint(100, 300+1),
    'batch_size':       lambda: np.random.choice([32, 64, 128]),
    # log-uniform: the exponent is sampled uniformly in [-6, -3)
    'weight_decay':     lambda: 10**np.random.uniform(-6, -3),
    'num_dense_layers': lambda: np.random.randint(1, 2+1),
    # NOTE(review): unlike the other integer ranges there is no `+1` here, so
    # the largest value drawn is 1023. Left unchanged to keep previously
    # recorded trials reproducible -- confirm whether 1024 was intended.
    'num_dense_units':  lambda: np.random.randint(256, 1024),
    'dense_dropout':    lambda: np.random.uniform(0.0, 0.3),
}


# Each dataset is listed exactly once: the search loops seed the RNG with the
# trial index only and write to paths keyed by dataset name, so a repeated name
# would rerun the same search and overwrite its own outputs.
dataset_names = ['SMRT', 'RIKEN', 'Fiehn_HILIC']

# Number of random hyperparameter configurations evaluated per dataset.
NUM_SEARCHES = 20

### Helper function to generate prediction files

In [3]:
def generate_output(model_obj, model_params, model_weights, 
                    dataset_obj, save_path):
    """Write true/predicted values for every tfrecord file of one dataset.

    The dataset is identified by the parent directory of `save_path`
    (e.g. `../output/predictions/SMRT/gcn/` -> `SMRT`); all files matching
    `../input/tfrecords/<dataset>/*` are processed. For each file a fresh
    model is instantiated, given `model_weights`, and its predictions are
    stored as `<save_path>/<file stem>.npy`, holding `np.stack([trues, preds])`.

    Args:
        model_obj: Callable returning a model when called as
            `model_obj(**model_params)`. The model must be callable on
            `[adjacency_matrix, feature_matrix]` and provide `set_weights`
            and a `predict` that returns a `(trues, preds)` pair.
        model_params: Keyword arguments for `model_obj`.
        model_weights: Weights handed to `model.set_weights`.
        dataset_obj: Callable invoked as
            `dataset_obj(file, batch_size=128, training=False)`; the result
            must expose `get_iterator()`.
        save_path: Output directory, with or without a trailing separator.
            Created if it does not exist.
    """
    # normpath drops a trailing separator, so the directory arithmetic below
    # does not depend on how the caller spelled the path.
    save_dir = os.path.normpath(save_path)
    os.makedirs(save_dir, exist_ok=True)

    print(model_params)

    # <...>/<dataset_name>/<model_name> -> dataset_name
    dataset_name = os.path.basename(os.path.dirname(save_dir))
    files = sorted(glob.glob(f'../input/tfrecords/{dataset_name}/*'))

    for file in files:

        dataset = dataset_obj(file, batch_size=128, training=False)
        model = model_obj(**model_params)

        # Run one batch through the model before assigning the weights;
        # presumably this builds the model's variables so that `set_weights`
        # has something to fill -- TODO confirm against the model class.
        dummy_data = next(iter(dataset.get_iterator()))
        model([dummy_data['adjacency_matrix'], dummy_data['feature_matrix']])
        model.set_weights(model_weights)

        trues, preds = model.predict(dataset.get_iterator())
        #latent = model.get_latent_spaces(dataset.get_iterator())

        # 'train.tfrec' -> 'train'; np.save appends the '.npy' extension.
        stem = os.path.basename(file).split('.')[0]
        path = os.path.join(save_dir, stem)
        np.save(path, np.stack([trues, preds]))
        #np.save(path + '_latent', latent)
        

### GCN

In [None]:
# Random hyperparameter search for the GCN model: for every dataset, train
# NUM_SEARCHES randomly configured models, keep the one with the lowest
# validation MAE, and export its predictions.
for dataset_name in dataset_names:

    tfrecords_dir = f'../input/tfrecords/{dataset_name}'
    curves_dir = f'../output/learning_curves/{dataset_name}/gcn/'
    os.makedirs(curves_dir, exist_ok=True)

    # RIKEN and Fiehn_HILIC are evaluated on two test files (test_1, test_2);
    # every other dataset on a single one. The same flag drives both dataset
    # construction and reporting so the two cannot disagree.
    has_two_test_sets = dataset_name in ("RIKEN", "Fiehn_HILIC")

    # Reset per dataset: otherwise a dataset whose trials never improve on
    # `best_error` (e.g. NaN MAE) would export the previous dataset's model.
    best_error = float('inf')
    best_params = None
    best_weights = None

    for i in tqdm(range(NUM_SEARCHES)):

        # Seed per trial; the draw order below must stay fixed, since every
        # sampler consumes the same global RNG stream.
        np.random.seed(42+i)

        num_gconv_layers = parameters['num_gconv_layers']()
        num_gconv_units = parameters['num_gconv_units']()
        learning_rate = parameters['learning_rate']()
        batch_size = parameters['batch_size']()
        num_epochs = parameters['num_epochs']()
        weight_decay = parameters['weight_decay']()
        num_dense_layers = parameters['num_dense_layers']()
        num_dense_units = parameters['num_dense_units']()
        dense_dropout = parameters['dense_dropout']()

        params = {
            "gconv_units": [num_gconv_units] * num_gconv_layers,
            "gconv_regularizer": tf.keras.regularizers.L2(weight_decay),
            "initial_learning_rate": learning_rate,
            'dense_units': [num_dense_units] * num_dense_layers,
            'dense_dropout': dense_dropout,
        }

        train_dataset = gcn_datasets.GCNDataset(
            f'{tfrecords_dir}/train.tfrec', batch_size, True)
        valid_dataset = gcn_datasets.GCNDataset(
            f'{tfrecords_dir}/valid.tfrec', batch_size, False)

        if has_two_test_sets:
            test_1_dataset = gcn_datasets.GCNDataset(
                f'{tfrecords_dir}/test_1.tfrec', batch_size, False)
            test_2_dataset = gcn_datasets.GCNDataset(
                f'{tfrecords_dir}/test_2.tfrec', batch_size, False)
            additional_datasets = {
                'valid': valid_dataset.get_iterator(),
                'test_1': test_1_dataset.get_iterator(),
                'test_2': test_2_dataset.get_iterator(),
            }
        else:
            # NOTE(review): this branch used to read 'test_1.tfrec' while the
            # RGCN search reads 'test.tfrec' for the same datasets; aligned
            # with the RGCN cell -- confirm the file name on disk.
            test_dataset = gcn_datasets.GCNDataset(
                f'{tfrecords_dir}/test.tfrec', batch_size, False)
            additional_datasets = {
                'valid': valid_dataset.get_iterator(),
                'test': test_dataset.get_iterator(),
            }

        model = gcn_models.GCNModel(**params)
        model.fit(
            train_dataset.get_iterator(), 
            additional_datasets=additional_datasets,
            epochs=num_epochs, verbose=0
        )

        # Keep the learning curves of every trial, indexed by trial number.
        for k, v in model.learning_curves.items():
            np.save(f'{curves_dir}{k}_{i}.npy', np.array(list(v)))

        # Model selection uses the validation split only.
        trues, preds = model.predict(valid_dataset.get_iterator())

        error = metrics.get('mae')(trues, preds)
        print(f'MAE                       : {error}\n')
        if has_two_test_sets:
            print(f'test 1 = {model.learning_curves["test_1_mae"][-1]}')
            print(f'test 1 = {model.learning_curves["test_1_r2"][-1]}')
            print(f'test 2 = {model.learning_curves["test_2_mae"][-1]}')
            print(f'test 2 = {model.learning_curves["test_2_r2"][-1]}')
        else:
            print(f'test = {model.learning_curves["test_mae"][-1]}')
            print(f'test = {model.learning_curves["test_r2"][-1]}')

        if error < best_error:
            best_error = error
            best_params = params.copy()
            best_weights = model.get_weights()

            # Overwritten whenever a better trial is found.
            for k, v in model.learning_curves.items():
                np.save(f'{curves_dir}{k}_best.npy', np.array(list(v)))

    if best_params is None:
        print(f'No trial improved on the initial error for {dataset_name}; '
              'skipping prediction export.')
        continue

    generate_output(
        model_obj=gcn_models.GCNModel,
        model_params=best_params,
        model_weights=best_weights,
        dataset_obj=gcn_datasets.GCNDataset,
        save_path=f'../output/predictions/{dataset_name}/gcn/'
    )


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))

In [5]:
# 0, 2, 4, 8, 9, 10, 17, ..., ... 

### RGCN

In [6]:
# Random hyperparameter search for the RGCN model: for every dataset, train
# NUM_SEARCHES randomly configured models, keep the one with the lowest
# validation MAE, and export its predictions.
for dataset_name in dataset_names:

    tfrecords_dir = f'../input/tfrecords/{dataset_name}'
    curves_dir = f'../output/learning_curves/{dataset_name}/rgcn/'
    os.makedirs(curves_dir, exist_ok=True)

    # RIKEN and Fiehn_HILIC are evaluated on two test files (test_1, test_2);
    # every other dataset on a single one. The same flag drives both dataset
    # construction and reporting so the two cannot disagree.
    has_two_test_sets = dataset_name in ("RIKEN", "Fiehn_HILIC")

    # Reset per dataset: otherwise a dataset whose trials never improve on
    # `best_error` (e.g. NaN MAE) would export the previous dataset's model.
    best_error = float('inf')
    best_params = None
    best_weights = None

    for i in tqdm(range(NUM_SEARCHES)):

        # Seed per trial; the draw order below must stay fixed, since every
        # sampler consumes the same global RNG stream.
        np.random.seed(42+i)

        # `num_bases` is not passed to the model at the moment (see the
        # disabled entry in `params`), but it is still drawn: dropping the
        # draw would shift every subsequent sample of the trial.
        num_bases = parameters['num_bases']()
        num_gconv_layers = parameters['num_gconv_layers']()
        num_gconv_units = parameters['num_gconv_units']()
        learning_rate = parameters['learning_rate']()
        batch_size = parameters['batch_size']()
        num_epochs = parameters['num_epochs']()
        weight_decay = parameters['weight_decay']()
        num_dense_layers = parameters['num_dense_layers']()
        num_dense_units = parameters['num_dense_units']()
        dense_dropout = parameters['dense_dropout']()

        #if i not in [0, 2, 4, 8, 9, 10, 17, 18]:
        #    continue

        params = {
           # "gconv_num_bases": num_bases,
            "gconv_units": [num_gconv_units] * num_gconv_layers,
            "gconv_regularizer": tf.keras.regularizers.L2(weight_decay),
            "initial_learning_rate": learning_rate,
            'dense_units': [num_dense_units] * num_dense_layers,
            'dense_dropout': dense_dropout,
        }

        train_dataset = rgcn_datasets.RGCNDataset(
            f'{tfrecords_dir}/train.tfrec', batch_size, True)
        valid_dataset = rgcn_datasets.RGCNDataset(
            f'{tfrecords_dir}/valid.tfrec', batch_size, False)

        if has_two_test_sets:
            test_1_dataset = rgcn_datasets.RGCNDataset(
                f'{tfrecords_dir}/test_1.tfrec', batch_size, False)
            test_2_dataset = rgcn_datasets.RGCNDataset(
                f'{tfrecords_dir}/test_2.tfrec', batch_size, False)
            additional_datasets = {
                'valid': valid_dataset.get_iterator(),
                'test_1': test_1_dataset.get_iterator(),
                'test_2': test_2_dataset.get_iterator(),
            }
        else:
            test_dataset = rgcn_datasets.RGCNDataset(
                f'{tfrecords_dir}/test.tfrec', batch_size, False)
            additional_datasets = {
                'valid': valid_dataset.get_iterator(),
                'test': test_dataset.get_iterator(),
            }

        model = rgcn_models.RGCNModel(**params)
        model.fit(
            train_dataset.get_iterator(), 
            additional_datasets=additional_datasets,
            epochs=num_epochs, verbose=0
        )

        # Keep the learning curves of every trial, indexed by trial number.
        for k, v in model.learning_curves.items():
            np.save(f'{curves_dir}{k}_{i}.npy', np.array(list(v)))

        # Model selection uses the validation split only.
        trues, preds = model.predict(valid_dataset.get_iterator(), verbose=0)

        error = metrics.get('mae')(trues, preds)
        print(f'MAE                       : {error}\n')
        if has_two_test_sets:
            print(f'test 1 = {model.learning_curves["test_1_mae"][-1]}')
            print(f'test 1 = {model.learning_curves["test_1_r2"][-1]}')
            print(f'test 2 = {model.learning_curves["test_2_mae"][-1]}')
            print(f'test 2 = {model.learning_curves["test_2_r2"][-1]}')
        else:
            print(f'test = {model.learning_curves["test_mae"][-1]}')
            print(f'test = {model.learning_curves["test_r2"][-1]}')

        if error < best_error:
            best_error = error
            best_params = params.copy()
            best_weights = model.get_weights()

            # Overwritten whenever a better trial is found.
            for k, v in model.learning_curves.items():
                np.save(f'{curves_dir}{k}_best.npy', np.array(list(v)))

    if best_params is None:
        print(f'No trial improved on the initial error for {dataset_name}; '
              'skipping prediction export.')
        continue

    generate_output(
        model_obj=rgcn_models.RGCNModel,
        model_params=best_params,
        model_weights=best_weights,
        dataset_obj=rgcn_datasets.RGCNDataset,
        save_path=f'../output/predictions/{dataset_name}/rgcn/'
    )

HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.




Traceback (most recent call last):
  File "/home/alex/Envs/dl/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3417, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-6-c61280029467>", line 57, in <module>
    model.fit(
  File "/home/alex/Projects/research/finished/GCN-molecular-machine-learning/notebooks/../src/base_classes/models.py", line 89, in fit
    result = self._train_step(batch)
  File "/home/alex/Projects/research/finished/GCN-molecular-machine-learning/notebooks/../src/base_classes/models.py", line 48, in _train_step
    loss = self.compiled_loss(y, y_pred, regularization_losses=self.losses)
  File "/home/alex/Envs/dl/lib/python3.8/site-packages/tensorflow/python/keras/engine/base_layer.py", line 1477, in losses
    loss_tensor = regularizer()
  File "/home/alex/Envs/dl/lib/python3.8/site-packages/tensorflow/python/keras/engine/base_layer.py", line 1553, in _tag_callable
    loss = loss()
  File "/home/alex/Envs/dl

TypeError: object of type 'NoneType' has no len()