In [1]:
# Imports
import csv
import os
import pickle as pkl

import numpy as np
import tensorflow_addons as tfa
import tensorflow.keras as keras
import kerastuner
from sklearn.model_selection import train_test_split
from sklearn.utils import compute_class_weight
from tensorflow.keras.callbacks import TensorBoard

from tools import keras as tk
import tools.preprocessing as tp

In [2]:
 # GLOBALS   
# Experiment switches, tuning budget and split configuration.
DAY_ONE_ONLY = True
TIME_SEQ = 225
TARGET = "multi_class"
BATCH_SIZE = 128
EPOCHS = 15
MAX_TRIALS = 500
TEST_SPLIT = 0.2
VAL_SPLIT = 0.1
RAND = 2021
TB_UPDATE_FREQ = 200
WEIGHTED_LOSS = False

# Paths
# `_dh` is looked up in the notebook globals (provided by IPython); its first
# entry is used as the base directory for every path below.
pwd = globals()['_dh'][0]
project_root = os.path.join(pwd, "..")

output_dir = os.path.abspath(os.path.join(project_root, "output"))
data_dir = os.path.abspath(os.path.join(project_root, "data", "data"))
tensorboard_dir = os.path.abspath(
    os.path.join(data_dir, "..", "model_checkpoints"))

# Sub-folders of the output directory
pkl_dir = os.path.join(output_dir, "pkl")
stats_dir = os.path.join(output_dir, "analysis")

# Make sure the analysis folder is there before anything writes to it
os.makedirs(stats_dir, exist_ok=True)

stats_filename = f"{TARGET}_stats.csv"

In [3]:
# Data load
def _read_pickle(filename):
    """Unpickle and return the object stored in ``pkl_dir/filename``.

    NOTE: pickle can execute arbitrary code on load; only point this at
    files produced by this project's own preprocessing.
    """
    with open(os.path.join(pkl_dir, filename), "rb") as handle:
        return pkl.load(handle)


# Each element of `inputs` is unpacked below as a 3-tuple whose second item
# has a length and whose third item is the class label.
inputs = _read_pickle(TARGET + "_trimmed_seqs.pkl")
vocab = _read_pickle("all_ftrs_dict.pkl")
all_feats = _read_pickle("feature_lookup.pkl")
demog_lookup = _read_pickle("demog_dict.pkl")

# Sizes derived from the loaded lookups and samples
N_VOCAB = 1 + len(vocab)
N_DEMOG = 1 + len(demog_lookup)
MAX_DEMOG = max(len(demog) for _, demog, _ in inputs)
N_CLASS = 1 + max(label for _, _, label in inputs)


In [4]:
# Model Metrics and callbacks
# TensorBoard logging for the tuning runs; profiling and graph dumps are off.
tb_callback = TensorBoard(
    log_dir=os.path.join(tensorboard_dir, "dan_hp_tune_tb", ""),
    histogram_freq=1,
    profile_batch=0,
    write_graph=False,
    update_freq=TB_UPDATE_FREQ,
)

# Stop a trial once validation loss has not improved for 3 epochs and roll
# the model back to its best weights.
early_stop_callback = keras.callbacks.EarlyStopping(
    monitor="val_loss",
    min_delta=0,
    patience=3,
    restore_best_weights=True,
    mode="min",
)

callbacks = [tb_callback, early_stop_callback]

# Metrics reported alongside the loss
N_THRESHOLDS = int(1e5)
roc_auc = keras.metrics.AUC(num_thresholds=N_THRESHOLDS, name="ROC-AUC")
pr_auc = keras.metrics.AUC(num_thresholds=N_THRESHOLDS,
                           curve="PR",
                           name="PR-AUC")
# NOTE: F1 Score is kind of wonky here, but it is included anyway.
weighted_f1 = tfa.metrics.F1Score(num_classes=N_CLASS, average="weighted")

metrics = [roc_auc, pr_auc, weighted_f1]

In [5]:
# TTV
# Splitting the data: sample indices are split (not the samples themselves),
# first into train/test, then train into train/validation. Both splits are
# stratified on the class label (third element of each sample).
all_labels = [labs for _, _, labs in inputs]

train, test = train_test_split(
    range(len(inputs)),
    test_size=TEST_SPLIT,
    stratify=all_labels,
    random_state=RAND)

# `train` comes back shuffled, so the stratification labels must be looked up
# in the same order as the indices in `train`. Building them by scanning
# `inputs` in ascending order would pair each index with another sample's
# label.
train, validation = train_test_split(
    train,
    test_size=VAL_SPLIT,
    stratify=[all_labels[i] for i in train],
    random_state=RAND)

In [6]:
# NOTE: this cell reads N_VOCAB, then reassigns it, and also mutates `vocab`
# in place -- run it exactly once after the data-load cell.
if DAY_ONE_ONLY:
    # Optionally limiting the features to only those from the first day
    # of the actual COVID visit
    features = [sample[0][-1] for sample in inputs]
else:
    features = [tp.flatten(sample[0]) for sample in inputs]

# Shift demographic codes so they sit after the feature vocabulary
demog_offset = N_VOCAB - 1
new_demog = [[code + demog_offset for code in sample[1]] for sample in inputs]

# Append each sample's shifted demographic codes to its feature codes
features = [codes + demog for codes, demog in zip(features, new_demog)]

# Fold the shifted demographic lookup into the shared vocabulary
demog_vocab = {
    key: index + demog_offset
    for key, index in demog_lookup.items()
}
vocab.update(demog_vocab)

# Vocabulary size is now one more than the largest code actually present
N_VOCAB = np.max([np.max(codes) for codes in features]) + 1

# Making the variables: post-padded code matrix and label vector
X = keras.preprocessing.sequence.pad_sequences(features, padding='post')
y = np.array([sample[2] for sample in inputs])

N_FEATS = X.shape[1]

In [7]:
# Class labels (third element of every sample), collected once and reused.
labels = [labs for _, _, labs in inputs]
classes = np.unique(labels).tolist()

if WEIGHTED_LOSS:
    # scikit-learn documents `classes` as an ndarray and newer releases
    # reject a plain list, so hand it an array here.
    class_weights = compute_class_weight(
        class_weight="balanced",
        classes=np.asarray(classes),
        y=labels,
    )

    # Map each class (plain Python value from `.tolist()`) to its weight.
    class_weights = dict(zip(classes, class_weights))

    print(class_weights)

## Generate Hypermodel

In [8]:

# Hypermodel: sized from the padded input matrix and the vocabulary
hyper_model = tk.DANHyper(vocab_size=N_VOCAB,
                          input_size=N_FEATS,
                          metrics=metrics,
                          n_classes=N_CLASS)

# Bayesian-optimisation tuner that minimises validation loss
tuner_options = {
    "max_trials": MAX_TRIALS,
    "objective": "val_loss",
    "project_name": "dan_hp_tune",
    # NOTE: This could be in output as well if we don't want to track/version it
    "directory": tensorboard_dir,
}
tuner = kerastuner.tuners.BayesianOptimization(hyper_model, **tuner_options)

INFO:tensorflow:Reloading Oracle from existing project C:\Users\oet5\premier_analysis\data\model_checkpoints\dan_hp_tune\oracle.json
INFO:tensorflow:Reloading Tuner from C:\Users\oet5\premier_analysis\data\model_checkpoints\dan_hp_tune\tuner0.json


## Search

In [9]:
# Announce the search space: print every hyperparameter the tuner will vary,
# so the ranges can be checked before the search is started.
tuner.search_space_summary()

Search space summary
Default search space size: 7
Feature Embedding L1 (Choice)
{'default': 0.0, 'conditions': [], 'values': [0.0, 1e-06, 1e-05, 0.0001, 0.001, 0.01, 0.1], 'ordered': True}
Feature Embedding L2 (Choice)
{'default': 0.0, 'conditions': [], 'values': [0.0, 1e-06, 1e-05, 0.0001, 0.001, 0.01, 0.1], 'ordered': True}
Embedding Dimension (Int)
{'default': 64, 'conditions': [], 'min_value': 64, 'max_value': 512, 'step': 64, 'sampling': None}
Dropout from Embeddings (Float)
{'default': 0.0, 'conditions': [], 'min_value': 0.0, 'max_value': 0.9, 'step': 0.05, 'sampling': None}
Dense Units (Int)
{'default': 32, 'conditions': [], 'min_value': 2, 'max_value': 128, 'step': 1, 'sampling': 'log'}
Learning Rate (Choice)
{'default': 1e-06, 'conditions': [], 'values': [1e-06, 5e-06, 1e-05, 5e-05, 0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1], 'ordered': True}
Momentum (Float)
{'default': 0.0, 'conditions': [], 'min_value': 0.0, 'max_value': 0.9, 'step': 0.1, 'sampling': None}


In [None]:
# Pick the label array used for fitting, then run a single search call so the
# training labels are always supplied and both cases share the same arguments.
if N_CLASS > 2:
    # We have to pass one-hot labels for model fit, but CLF metrics
    # will take indices
    y_one_hot = np.eye(N_CLASS)[y]
    y_fit = y_one_hot
else:
    # Two classes: use the label vector as-is (no one-hot array exists here)
    y_fit = y

# NOTE(review): BATCH_SIZE and, when WEIGHTED_LOSS is set, class_weights are
# defined earlier in the notebook but are not passed to the search -- confirm
# whether they should be.
tuner.search(
    X[train],
    y_fit[train],
    validation_data=(X[validation], y_fit[validation]),
    epochs=EPOCHS,
    callbacks=callbacks,
)


Search: Running Trial #1

Hyperparameter    |Value             |Best Value So Far 
Feature Embeddi...|0.01              |1e-06             
Feature Embeddi...|0.01              |0                 
Embedding Dimen...|384               |64                
Dropout from Em...|0.65              |0.85              
Dense Units       |70                |94                
Learning Rate     |0.001             |0.005             
Momentum          |0.4               |0.5               

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15

## Pull Best Model

In [None]:
# Print the tuner's summary of the trials it has run.
tuner.results_summary()

In [None]:
# Pull the best model
# Best hyperparameter set found by the search (kept for inspection/reuse).
best_hp = tuner.get_best_hyperparameters()[0]

# `tuner.hypermodel.build(best_hp)` would only create a freshly initialised,
# untrained network. `get_best_models` rebuilds the best trial's model and
# restores the weights from that trial's checkpoint.
best_model = tuner.get_best_models(num_models=1)[0]

best_model.summary()

In [None]:
# Write `best_model` to <tensorboard_dir>/dan_hp_tune/best, i.e. inside the
# tuner's own project directory.
best_model.save(os.path.join(tensorboard_dir, "dan_hp_tune", "best"))