In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import os
import glob
from tqdm.notebook import tqdm

# tf.config.set_visible_devices([], 'GPU')

import sys
sys.path.append('../src/')

import logging
tf.get_logger().setLevel(logging.ERROR)

from gcn import datasets as gcn_datasets
from gcn import models as gcn_models

from rgcn import datasets as rgcn_datasets
from rgcn import models as rgcn_models

from mlp import datasets as mlp_datasets
from mlp import models as mlp_models

from ml import datasets as ml_datasets
from ml import models as ml_models

from utils import metrics


In [None]:
# Number of random hyperparameter configurations tried per model and dataset.
NUM_SEARCHES = 20

# Dataset identifiers; the cells below interpolate them into the input and
# output paths (e.g. '../input/tfrecords/<name>/', '../input/datasets/<name>.csv').
dataset_names = [
    'RIKEN',
    'Fiehn_HILIC',
    'SMRT',
]

### GCN and RGCN

In [None]:
def generate_output(model_obj, model_params, model_weights, 
                    dataset_obj, save_path):
    """Rebuild a graph model from saved weights and dump its predictions.

    For every file found in ``../input/tfrecords/<dataset_name>/`` the model's
    ``(trues, preds)`` output is stacked and written to
    ``<save_path>/<file stem>.npy``.

    Args:
        model_obj: Callable returning a model; called as ``model_obj(**model_params)``.
            The model must be callable on ``[adjacency_matrix, feature_matrix]``
            and expose ``set_weights`` and ``predict``.
        model_params: Keyword arguments used to construct the model.
        model_weights: Weights passed to ``model.set_weights``.
        dataset_obj: Callable used as ``dataset_obj(path, batch_size=..., training=...)``;
            the result must expose ``get_iterator()``.
        save_path: Output directory of the form ``.../<dataset_name>/<model_name>``
            (with or without a trailing slash). Created if missing.

    Raises:
        FileNotFoundError: If no input files exist for the dataset.
    """
    os.makedirs(save_path, exist_ok=True)

    # The dataset name is the parent directory of the model directory.
    # normpath drops a trailing slash so both '.../RIKEN/gcn/' and
    # '.../RIKEN/gcn' resolve to 'RIKEN'.
    dataset_name = os.path.basename(
        os.path.dirname(os.path.normpath(save_path)))

    # glob returns entries in arbitrary order; sort for a deterministic run.
    files = sorted(glob.glob(f'../input/tfrecords/{dataset_name}/*'))
    if not files:
        raise FileNotFoundError(
            f'No input files found in ../input/tfrecords/{dataset_name}/')

    model = model_obj(**model_params)
    # One forward pass on a real batch before loading weights; presumably this
    # builds the model's variables so that set_weights has something to fill.
    dummy_data = next(iter(dataset_obj(
        files[0], batch_size=128, training=False).get_iterator()))
    model([dummy_data['adjacency_matrix'], dummy_data['feature_matrix']])
    model.set_weights(model_weights)

    for file in files:
        dataset = dataset_obj(file, batch_size=128, training=False)
        trues, preds = model.predict(dataset.get_iterator())
        # File stem ('train', 'valid', ...) names the output; np.save adds '.npy'.
        subset = os.path.basename(file).split('.')[0]
        np.save(os.path.join(save_path, subset), np.stack([trues, preds]))
        
        
# Sampling distributions for the GCN/RGCN random search. Each entry is a
# zero-argument callable that draws one value from NumPy's global RNG, so the
# order in which they are called determines the values obtained for a given seed.
# np.random.randint excludes its upper bound, hence the '+1' on integer ranges.
parameters = {
    'num_bases':        lambda:       np.random.choice([-1, 2, 4]),
    'num_gconv_layers': lambda:       np.random.randint(*[3, 5+1]),
    'num_gconv_units':  lambda:       np.random.randint(*[128, 256+1]),
    'learning_rate':    lambda:       np.random.uniform(*[1e-4, 1e-3]),
    'num_epochs':       lambda:       np.random.randint(*[100, 300+1]),
    'batch_size':       lambda:       np.random.choice([32, 64, 128]),
    # Log-uniform over [1e-6, 1e-3].
    'weight_decay':     lambda:   10**np.random.uniform(*[-6, -3]),
    'num_dense_layers': lambda:       np.random.randint(*[1, 2+1]),
    # Upper bound made inclusive (was 1024, i.e. max 1023) to match every
    # other integer range in this notebook.
    'num_dense_units':  lambda:       np.random.randint(*[256, 1024+1]),
    'dense_dropout':    lambda:       np.random.uniform(*[0.0, 0.3]),
}

        
# Random hyperparameter search for the GCN model, run independently per dataset.
# The configuration with the lowest validation MAE is kept and its predictions
# on every split are written out by generate_output.
for dataset_name in dataset_names:

    # Reset the incumbent for every dataset. Without this, a dataset on which
    # no trial beats `inf` (e.g. every MAE is NaN, or NUM_SEARCHES is 0) would
    # silently export the previous dataset's model.
    best_error = float('inf')
    best_params = None
    best_weights = None
    
    for i in tqdm(range(NUM_SEARCHES)):

        # The seed depends only on the trial index, so trial i draws the same
        # configuration for every dataset. Only NumPy's RNG is seeded here.
        np.random.seed(42+i)

        # Draw order matters: changing it changes the values for a given seed.
        num_gconv_layers = parameters['num_gconv_layers']()
        num_gconv_units = parameters['num_gconv_units']()
        learning_rate = parameters['learning_rate']()
        batch_size = parameters['batch_size']()
        num_epochs = parameters['num_epochs']()
        weight_decay = parameters['weight_decay']()
        num_dense_layers = parameters['num_dense_layers']()
        num_dense_units = parameters['num_dense_units']()
        dense_dropout = parameters['dense_dropout']()
        
        params = {
            "gconv_units": [num_gconv_units] * num_gconv_layers,
            "gconv_regularizer": tf.keras.regularizers.L2(weight_decay),
            "initial_learning_rate": learning_rate,
            'dense_units': [num_dense_units] * num_dense_layers,
            'dense_dropout': dense_dropout,
        }
        
        # Third positional argument is presumably the `training` flag
        # (generate_output passes it by keyword) -- TODO confirm.
        train_dataset = gcn_datasets.GCNDataset(
            f'../input/tfrecords/{dataset_name}/train.tfrec', batch_size, True)
        valid_dataset = gcn_datasets.GCNDataset(
            f'../input/tfrecords/{dataset_name}/valid.tfrec', batch_size, False)
        
        model = gcn_models.GCNModel(**params)
        model.fit(
            train_dataset.get_iterator(), 
            epochs=num_epochs, 
            verbose=0
        )
            
        trues, preds = model.predict(valid_dataset.get_iterator())

        error = metrics.get('mae')(trues, preds)
 
        if error < best_error:
            best_error = error
            best_params = params.copy()
            best_weights = model.get_weights()

    if best_params is None:
        raise RuntimeError(
            f'GCN search on {dataset_name}: no trial produced a validation '
            f'MAE below inf; refusing to export predictions.')
            
    generate_output(
        model_obj=gcn_models.GCNModel,
        model_params=best_params,
        model_weights=best_weights,
        dataset_obj=gcn_datasets.GCNDataset,
        save_path=f'../output/predictions/{dataset_name}/gcn/'
    )

    
# Random hyperparameter search for the RGCN model, run independently per dataset.
# The configuration with the lowest validation MAE is kept and its predictions
# on every split are written out by generate_output.
for dataset_name in dataset_names:

    # Reset the incumbent for every dataset. Without this, a dataset on which
    # no trial beats `inf` (e.g. every MAE is NaN, or NUM_SEARCHES is 0) would
    # silently export a model selected in an earlier iteration or cell.
    best_error = float('inf')
    best_params = None
    best_weights = None
    
    for i in tqdm(range(NUM_SEARCHES)):

        # The seed depends only on the trial index, so trial i draws the same
        # configuration for every dataset. Only NumPy's RNG is seeded here.
        np.random.seed(42+i)
        
        # `num_bases` is currently not passed to the model (see the disabled
        # "gconv_num_bases" entry below). The draw is kept on purpose: removing
        # it would shift every subsequent draw for the same seed.
        num_bases = parameters['num_bases']()
        num_gconv_layers = parameters['num_gconv_layers']()
        num_gconv_units = parameters['num_gconv_units']()
        learning_rate = parameters['learning_rate']()
        batch_size = parameters['batch_size']()
        num_epochs = parameters['num_epochs']()
        weight_decay = parameters['weight_decay']()
        num_dense_layers = parameters['num_dense_layers']()
        num_dense_units = parameters['num_dense_units']()
        dense_dropout = parameters['dense_dropout']()
        
        
        params = {
           # "gconv_num_bases": num_bases,
            "gconv_units": [num_gconv_units] * num_gconv_layers,
            "gconv_regularizer": tf.keras.regularizers.L2(weight_decay),
            "initial_learning_rate": learning_rate,
            'dense_units': [num_dense_units] * num_dense_layers,
            'dense_dropout': dense_dropout,
        }

        # Third positional argument is presumably the `training` flag
        # (generate_output passes it by keyword) -- TODO confirm.
        train_dataset = rgcn_datasets.RGCNDataset(
            f'../input/tfrecords/{dataset_name}/train.tfrec', batch_size, True)
        valid_dataset = rgcn_datasets.RGCNDataset(
            f'../input/tfrecords/{dataset_name}/valid.tfrec', batch_size, False)
            
        model = rgcn_models.RGCNModel(**params)
        model.fit(
            train_dataset.get_iterator(), 
            epochs=num_epochs, 
            verbose=0
        )

        trues, preds = model.predict(valid_dataset.get_iterator(), verbose=0)

        error = metrics.get('mae')(trues, preds)
        
        if error < best_error:
            best_error = error
            best_params = params.copy()
            best_weights = model.get_weights()

    if best_params is None:
        raise RuntimeError(
            f'RGCN search on {dataset_name}: no trial produced a validation '
            f'MAE below inf; refusing to export predictions.')
    
    generate_output(
        model_obj=rgcn_models.RGCNModel,
        model_params=best_params,
        model_weights=best_weights,
        dataset_obj=rgcn_datasets.RGCNDataset,
        save_path=f'../output/predictions/{dataset_name}/rgcn/'
    )

### MLP

In [None]:
def generate_output(model_obj,
                    model_params,
                    model_weights,
                    save_path):
    """Rebuild an MLP from saved weights and dump its predictions per split.

    The ECFP datasets are regenerated with the fingerprint settings stored in
    ``model_params`` and, for each split that is not ``None``, the stacked
    ``[y_true, y_pred]`` array is written to ``<save_path>/<split>.npy``.

    Args:
        model_obj: Callable returning a model; called with the entries of
            ``model_params`` that are not consumed here.
        model_params: Dict that must contain ``'batch_size'``, ``'num_epochs'``,
            ``'bits'``, ``'radius'`` and ``'use_counts'`` in addition to the
            model's constructor arguments. It is NOT modified.
        model_weights: Weights passed to ``model.set_weights``.
        save_path: Output directory of the form ``.../<dataset_name>/<model_name>``
            (with or without a trailing slash). Created if missing.

    Raises:
        KeyError: If one of the required keys is missing from ``model_params``.
    """
    os.makedirs(save_path, exist_ok=True)

    # Work on a copy so the caller's dict survives the call (the previous
    # implementation deleted keys in place, breaking any second use of it).
    model_kwargs = dict(model_params)
    # Training-only settings: required to be present, but unused here.
    model_kwargs.pop('batch_size')
    model_kwargs.pop('num_epochs')
    # Fingerprint settings needed to regenerate the model's input features.
    bits = model_kwargs.pop('bits')
    radius = model_kwargs.pop('radius')
    use_counts = model_kwargs.pop('use_counts')

    # The dataset name is the parent directory of the model directory;
    # normpath makes this independent of a trailing slash.
    dataset_name = os.path.basename(
        os.path.dirname(os.path.normpath(save_path)))

    train, valid, test_1, test_2 = mlp_datasets.get_ecfp_datasets(
        f"../input/datasets/{dataset_name}.csv",
        bits=bits, radius=radius, use_counts=use_counts,
    )

    model = model_obj(**model_kwargs)
    # One forward pass on a single sample before loading weights; presumably
    # this builds the model's variables so that set_weights can fill them.
    model(train['X'][:1])
    model.set_weights(model_weights)

    for name, dataset in zip(['train', 'valid', 'test_1', 'test_2'], [train, valid, test_1, test_2]):
        if dataset is not None:
            # predict returns a pair; index 1 is taken as the predictions.
            y_pred = model.predict(dataset['X'], dataset['y'])[1]
            np.save(os.path.join(save_path, name), np.stack([dataset['y'], y_pred]))
    
# Sampling distributions for the MLP random search. Every entry is a
# zero-argument callable drawing one value from NumPy's global RNG.
# np.random.randint excludes its upper bound, so the integer ranges below
# are written with the exclusive limit (inclusive maximum + 1).
parameters = {
    'num_layers':    lambda: np.random.randint(1, 4),        # 1..3
    'num_units':     lambda: np.random.randint(256, 1025),   # 256..1024
    'learning_rate': lambda: np.random.uniform(1e-4, 1e-3),
    'num_epochs':    lambda: np.random.randint(50, 201),     # 50..200
    'batch_size':    lambda: np.random.choice([32, 64, 128]),
    'dropout_rate':  lambda: np.random.uniform(0.0, 0.3),
    'bits':          lambda: np.random.randint(512, 2049),   # 512..2048
    'radius':        lambda: np.random.randint(1, 4),        # 1..3
    'use_counts':    lambda: np.random.choice([True, False]),
}


# Random hyperparameter search for the MLP on ECFP fingerprints, run
# independently per dataset. The configuration with the lowest validation MAE
# is kept and its predictions on every split are written out by generate_output.
for dataset_name in dataset_names:
    
    # Reset the incumbent for every dataset. Without this, a dataset on which
    # no trial beats `inf` (e.g. every MAE is NaN, or NUM_SEARCHES is 0) would
    # silently export a model selected in an earlier iteration or cell.
    best_error = float('inf')
    best_params = None
    best_weights = None
    
    for i in tqdm(range(NUM_SEARCHES)):
        
        # The seed depends only on the trial index, so trial i draws the same
        # configuration for every dataset. Only NumPy's RNG is seeded here.
        np.random.seed(42+i)
        
        # Draw order matters: changing it changes the values for a given seed.
        num_layers = parameters['num_layers']()
        num_units = parameters['num_units']()
        learning_rate = parameters['learning_rate']()
        batch_size = parameters['batch_size']()
        num_epochs = parameters['num_epochs']()
        dropout_rate = parameters['dropout_rate']()
        bits = parameters['bits']()
        radius = parameters['radius']()
        use_counts = parameters['use_counts']()
        
        # The fingerprint settings are part of the search space, so the
        # datasets have to be regenerated for every trial.
        train, valid, test_1, test_2 = mlp_datasets.get_ecfp_datasets(
            '../input/datasets/{}.csv'.format(dataset_name),
            bits=bits, radius=radius, use_counts=use_counts,
        )
        
        model = mlp_models.MLPModel(
            hidden_units=[num_units] * num_layers,
            dropout_rate=dropout_rate,
            loss_fn=tf.keras.losses.Huber,
            optimizer=tf.keras.optimizers.Adam,
            initial_learning_rate=learning_rate,
        )
        
        model.fit(train['X'], train['y'], 
                  batch_size=batch_size, 
                  verbose=0,
                  epochs=num_epochs)
        
        trues, preds = model.predict(valid['X'], valid['y'])

        error = metrics.get('mae')(trues, preds)

        if error < best_error:
            best_error = error
            # Stores both the model constructor arguments and the settings
            # generate_output needs to rebuild the matching input features.
            best_params = {
                "num_epochs": num_epochs,
                "batch_size": batch_size,
                "hidden_units": [num_units] * num_layers,
                "initial_learning_rate": learning_rate,
                "dropout_rate": dropout_rate,
                "bits": bits,
                "radius": radius,
                "use_counts": use_counts,
            }
            best_weights = model.get_weights()
            

    if best_params is None:
        raise RuntimeError(
            'MLP search on {}: no trial produced a validation MAE below inf; '
            'refusing to export predictions.'.format(dataset_name))

    generate_output(
        model_obj=mlp_models.MLPModel,
        model_params=best_params,
        model_weights=best_weights,
        save_path='../output/predictions/{}/{}'.format(dataset_name, 'mlp')
    )

### RF, GB, AB and SVM

In [None]:
def generate_output(model_obj, datasets, save_path):
    """Write a fitted model's predictions for each available data split.

    Args:
        model_obj: Fitted estimator exposing ``predict(X)``.
        datasets: Sequence of splits in the order train, valid, test_1, test_2.
            Each split is a mapping with ``'X'`` and ``'y'``, or ``None`` when
            the split does not exist (it is then skipped).
        save_path: Output directory; created if it does not exist. One file
            ``<save_path>/<split>.npy`` holding the stacked ``[y, y_pred]``
            array is written per split.
    """
    os.makedirs(save_path, exist_ok=True)

    split_names = ('train', 'valid', 'test_1', 'test_2')
    for split_name, split in zip(split_names, datasets):
        if split is None:
            continue
        predictions = model_obj.predict(split['X'])
        stacked = np.stack([split['y'], predictions])
        np.save(save_path + '/' + split_name, stacked)
    

# Model search for the descriptor-based baselines ('rf', 'gb', 'ab', 'svm').
# For each dataset and model family, every candidate yielded by ModelGenerator
# is fitted, and the one with the lowest validation MAE is exported.
for dataset_name in dataset_names:
    
    # Descriptors do not depend on the model, so they are loaded once per dataset.
    train, valid, test_1, test_2 = ml_datasets.get_descriptor_datasets(
            dataset_path=f'../input/datasets/{dataset_name}.csv')
    
    for model_name in ['rf', 'gb', 'ab', 'svm']:
    
        model_iter = ml_models.ModelGenerator(model_name, NUM_SEARCHES)
        
        # Reset the incumbent for every (dataset, model family) pair so a
        # search that selects nothing cannot export a stale model from a
        # previous pair.
        best_error = float('inf')
        best_model = None
        for model in tqdm(model_iter):
            
            model.fit(train['X'], train['y'])
            preds = model.predict(valid['X'])
            error = metrics.get('mae')(valid['y'], preds)
            
            if error < best_error:
                best_error = error
                # Keeps a reference to the fitted candidate; this assumes
                # ModelGenerator yields a distinct object per iteration
                # -- TODO confirm.
                best_model = model

        if best_model is None:
            raise RuntimeError(
                '{} search on {}: no candidate produced a validation MAE '
                'below inf; refusing to export predictions.'.format(
                    model_name, dataset_name))
        
        generate_output(
            model_obj=best_model,
            datasets=[train, valid, test_1, test_2],
            save_path='../output/predictions/{}/{}'.format(
                dataset_name, model_name)
        )
        

### Obtain results

In [None]:
models = ['gcn', 'rgcn', 'mlp', 'rf', 'svm', 'gb', 'ab']
datasets = ['RIKEN', 'Fiehn_HILIC', 'SMRT']

# d[dataset][MODEL] is a flat list of metrics, four per split
# ([mae, mre, rmse, r2]), with splits in the order train, valid, test, test_2.
# The dataset key order (SMRT, RIKEN, Fiehn_HILIC) is the order in which a
# table built from `d` lists its rows.
d = {
    name: {model.upper(): [] for model in models}
    for name in ['SMRT', 'RIKEN', 'Fiehn_HILIC']
}

for dataset in datasets:
    for model in models:
        files = glob.glob(f'../output/predictions/{dataset}/{model}/*')

        # glob returns files in arbitrary, filesystem-dependent order, so the
        # splits are looked up by file stem instead of by position.
        by_subset = {os.path.basename(file).split('.')[0]: file
                     for file in files}

        # The first test split is stored either as 'test' or as 'test_1'.
        test_key = 'test' if 'test' in by_subset else 'test_1'
        ordered = ['train', 'valid', test_key]
        # A second test split is optional.
        if 'test_2' in by_subset:
            ordered.append('test_2')

        missing = [subset for subset in ordered if subset not in by_subset]
        if missing:
            raise FileNotFoundError(
                f'Missing prediction file(s) {missing} in '
                f'../output/predictions/{dataset}/{model}/')

        for subset in ordered:
            # Each file holds the stacked pair written by the cells above:
            # row 0 = true values, row 1 = predictions.
            data = np.load(by_subset[subset])
            mae = metrics.get('mae')(data[0], data[1])
            mre = metrics.get('mre')(data[0], data[1])
            rmse = metrics.get('rmse')(data[0], data[1])
            r2  = metrics.get('r2')(data[0], data[1])

            d[dataset][model.upper()].extend([mae, mre, rmse, r2])

            

In [None]:
# One row per (dataset, model) pair, one column per collected metric.
# Rows with fewer metrics (no second test split) are padded with NaN by pandas.
data = pd.DataFrame.from_dict(
    {(i, j): d[i][j]
     for i in d.keys()
     for j in d[i].keys()},
    orient='index')

column_names = [
    f'{subset}_{metric}'
    for subset in ('train', 'valid', 'test1', 'test2')
    for metric in ('mae', 'mre', 'rmse', 'r2')
]
# Only label the columns that actually exist: if no dataset has a second test
# split the table has 12 columns, and assigning all 16 names would raise.
data.columns = column_names[:data.shape[1]]
data