In [None]:
import h5py, time
import click
import numpy as np
import pandas as pd
import os
from pathlib import Path

from filelock import FileLock
from ray import tune
from ray.tune.integration.keras import TuneReportCallback
from ray.tune.schedulers import AsyncHyperBandScheduler
from ray.air.callbacks.wandb import WandbLoggerCallback
import tensorflow as tf
from tensorflow import keras
from scipy.stats import spearmanr
import wandb
import yaml

from hominid import model_zoo, utils

# Seed NumPy's global random generator so NumPy-based randomness is repeatable.
np.random.seed(seed=0)

In [None]:
# Log in to Weights & Biases for this session; the tuning run further down
# attaches a WandbLoggerCallback.
wandb.login()

In [None]:
# metrics

def Spearman(y_true, y_pred):
    """Keras metric that evaluates SciPy's ``spearmanr`` on one batch.

    Both tensors are cast to float32 and handed to ``spearmanr`` through
    ``tf.py_function`` (predictions first, targets second). The op is
    declared with a single float32 output.

    NOTE(review): ``spearmanr`` returns a (statistic, p-value) result, while
    only one output dtype is declared here -- confirm that the reported
    value is the correlation statistic alone.
    """
    predictions = tf.cast(y_pred, tf.float32)
    targets = tf.cast(y_true, tf.float32)
    return tf.py_function(spearmanr, [predictions, targets], Tout=tf.float32)
from keras import backend as K

def pearson_r(y_true, y_pred):
    """Smoothed Pearson correlation coefficient usable as a Keras metric.

    Means and sums are taken without an axis argument, i.e. over every
    element of the inputs, so one coefficient is produced per call.

    Adapted from https://github.com/WenYanger/Keras_Metrics
    """
    # Small constant added to the denominator so that a zero denominator
    # does not turn the result into NaN.
    smoothing = 10e-5

    true_centered = y_true - K.mean(y_true)
    pred_centered = y_pred - K.mean(y_pred)

    cross_sum = K.sum(true_centered * pred_centered)
    true_sq_sum = K.sum(true_centered * true_centered)
    pred_sq_sum = K.sum(pred_centered * pred_centered)

    coefficient = cross_sum / (K.sqrt(true_sq_sum * pred_sq_sum) + smoothing)
    return K.mean(coefficient)

def hominid_pipeline(config, dataset_path='/home/chandana/projects/hominid/data/hepg2.h5'):
    """Load the train/valid/test arrays from an HDF5 file and build the model.

    Parameters
    ----------
    config : dict
        Keyword arguments forwarded to ``model_zoo.base_model``. The dict is
        modified in place: ``"input_shape"`` and ``"output_shape"`` are
        overwritten with values derived from the loaded training arrays.
    dataset_path : str, optional
        HDF5 file that contains the datasets ``x_train``, ``y_train``,
        ``x_valid``, ``y_valid``, ``x_test`` and ``y_test``. Defaults to the
        location that used to be hard-coded in the function body, so existing
        callers are unaffected.

    Returns
    -------
    tuple
        ``(x_train, y_train, x_valid, y_valid, x_test, y_test, model)``.
    """

    # ==============================================================================
    # Load dataset
    # ==============================================================================

    # Read every split fully into memory so the file can be closed right away.
    with h5py.File(dataset_path, "r") as f:
        x_train = f["x_train"][:]
        y_train = f["y_train"][:]

        x_valid = f["x_valid"][:]
        y_valid = f["y_valid"][:]

        x_test = f["x_test"][:]
        y_test = f["y_test"][:]

    # x_train must be three-dimensional; the last two axes define the model input.
    N, L, A = x_train.shape
    output_shape = y_train.shape[-1]

    print(f"Input shape: {N, L, A}. Output shape: {output_shape}")

    # The data-dependent entries of the config are filled in here.
    config["input_shape"] = (L, A)
    config["output_shape"] = output_shape

    print(output_shape)

    # ==============================================================================
    # Build model
    # ==============================================================================

    print("Building model...")

    model = model_zoo.base_model(**config)

    return x_train, y_train, x_valid, y_valid, x_test, y_test, model

def tune_hominid(config: dict):
    """Ray Tune trainable: build, compile and fit one model for a sampled config.

    Parameters
    ----------
    config : dict
        Hyperparameters for this trial; passed on to ``hominid_pipeline``,
        which also fills in the input/output shapes.

    Reported metrics (once per epoch, through ``TuneReportCallback``)
    ----------------------------------------------------------------
    pearson_r
        Pearson correlation on the *validation* split (Keras log key
        ``val_pearson_r``). Trial scheduling and model selection should be
        driven by held-out data rather than by the training fit.
    train_pearson_r
        Pearson correlation on the training split (Keras log key
        ``pearson_r``), kept for reference.
    """

    # The test split is returned by the pipeline but not used during tuning.
    x_train, y_train, x_valid, y_valid, _x_test, _y_test, model = hominid_pipeline(config)

    model.compile(
        # `learning_rate` is the supported argument name; the old `lr` alias
        # is deprecated and rejected by newer Keras optimizers.
        tf.keras.optimizers.Adam(learning_rate=0.001),
        loss='mse',
        metrics=[Spearman, pearson_r]
    )
    model.summary()

    # train model
    model.fit(
        x_train, y_train,
        epochs=60,
        batch_size=128,
        shuffle=True,
        validation_data=(x_valid, y_valid),
        # Mapping is {name reported to Tune: key in the Keras logs}.
        callbacks=[TuneReportCallback({
            "pearson_r": "val_pearson_r",
            "train_pearson_r": "pearson_r"
        })]
    )

In [None]:
# Hyperparameter search space handed to tune.run below.
# Entries wrapped in tune.choice are sampled from the listed options;
# plain values are the same for every trial.
config = {
    "conv1_activation": tune.choice(["exponential", "relu"]),  # author's note: relu
    "conv1_batchnorm": tune.choice([True, False]),
    "conv1_channel_weight": tune.choice(["softconv", "se"]),
    "conv1_dropout": 0.2,
    "conv1_filters": tune.choice([128, 192, 256, 512]),  # range extended up to 256 and 512
    "conv1_kernel_size": tune.choice([11, 15, 19]),  # candidate kernel sizes: 11, 15, 19
    "conv1_max_pool": 10,
    "conv1_pool_type": "attention",
    "conv1_type": "pw",
    "dense_activation": "relu",
    "dense_batchnorm": True,
    "dense_dropout": tune.choice([[0.3, 0.3], [0.4, 0.4], [0.5, 0.5]]),  # pairs from [0.3, 0.3] up to [0.5, 0.5]
    "dense_units": tune.choice([[128, 256],[512, 256], [256, 256], [512, 512],[512, 1024]]),  # author's note: use [512, 256]; also explore values in between and higher
    # Placeholder: overwritten by hominid_pipeline from the loaded training data.
    "input_shape": None,
    "mha_d_model": tune.choice([96, 192]),
    "mha_dropout": 0.1,
    "mha_head_type": "pool",
    "mha_heads": tune.choice([4, 8]),
    "mha_layernorm": False,
    "mha_pool_type": "attention",
    "output_activation": "linear",
    # Placeholder: overwritten by hominid_pipeline from the loaded training data.
    "output_shape": None
}

In [None]:
# Set to True for a quick check (2 samples, at most 5 training iterations);
# False runs the full experiment (50 samples, at most 100 training iterations).
smoke_test = False

analysis = tune.run(
    tune_hominid,
    name="tune_hominid",
    config=config,
    # Objective that Tune maximises and the scheduler ranks trials by.
    metric="pearson_r",
    mode="max",
    num_samples=(2 if smoke_test else 50),
    resources_per_trial={"cpu": 4, "gpu": 1},
    # A trial is stopped as soon as either criterion is met.
    stop={
        "pearson_r": 0.9,
        "training_iteration": (5 if smoke_test else 100),
    },
    scheduler=AsyncHyperBandScheduler(
        grace_period=20,
        max_t=400,
        time_attr="training_iteration",
    ),
    callbacks=[WandbLoggerCallback(project="raytune-expanded-v1")],
)

In [None]:
# Show the configuration of the best trial according to the metric/mode given to tune.run.
print("Best hyperparameters found were: ", analysis.best_config)

In [None]:
# Directory that receives the CSV exports of the tuning results.
RESULTS_DIR = '/home/chandana/projects/hominid/results/'
# Create the directory up front: without it the exports below would fail
# on a missing path only after the whole tuning run has finished.
os.makedirs(RESULTS_DIR, exist_ok=True)

print("Best hyperparameters found were: ", analysis.best_config)

# Best configuration, one row per hyperparameter name.
pd.DataFrame.from_dict(
        analysis.best_config,
        orient='index'
).to_csv(
        os.path.join(RESULTS_DIR, 'tune_best.csv'),
        header=True,
        index=True
        )
# Results table of all trials as exposed by the analysis object.
analysis.results_df.to_csv(os.path.join(RESULTS_DIR, 'tune_all.csv'))