In [1]:
# Imports
import csv
import os
import pickle as pkl
import pandas as pd
import argparse
import cond_rnn

import numpy as np
import tensorflow as tf
import tensorflow.keras as keras
import kerastuner
from kerastuner import HyperModel
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import TensorBoard

import tools.analysis as ta
from tools import keras as tk
import tools.preprocessing as tp

In [4]:
 # GLOBALS   
TIME_SEQ = 225
TARGET = "multi_class"
BATCH_SIZE = 32
EPOCHS = 15
TEST_SPLIT = 0.2
VAL_SPLIT = 0.1
RAND = 2021
TB_UPDATE_FREQ = 200

# Paths
output_dir = os.path.abspath(os.path.join("..", "output"))
data_dir = os.path.abspath(os.path.join("..", "data", "data"))
tensorboard_dir = os.path.abspath(
    os.path.join(data_dir, "..", "model_checkpoints"))
pkl_dir = os.path.join(output_dir, "pkl")
stats_dir = os.path.join(output_dir, "analysis")

# Create analysis dir if it doesn't exist
os.makedirs(stats_dir, exist_ok=True)

stats_filename = TARGET + "_stats.csv"

In [5]:
# Data load
with open(os.path.join(pkl_dir, TARGET + "_trimmed_seqs.pkl"), "rb") as f:
    inputs = pkl.load(f)

with open(os.path.join(pkl_dir, "all_ftrs_dict.pkl"), "rb") as f:
    vocab = pkl.load(f)

with open(os.path.join(pkl_dir, "feature_lookup.pkl"), "rb") as f:
    all_feats = pkl.load(f)

with open(os.path.join(pkl_dir, "demog_dict.pkl"), "rb") as f:
    demog_lookup = pkl.load(f)

# Determining number of vocab entries
N_VOCAB = len(vocab) + 1
N_DEMOG = len(demog_lookup) + 1
MAX_DEMOG = max(max(x) for _, x, _ in inputs)
N_CLASS = max(x for _, _, x in inputs) + 1


In [6]:
# Model Metrics and callbacks
callbacks = [
    TensorBoard(
    log_dir=os.path.join(tensorboard_dir, "new_lstm_topo", ""),
    histogram_freq=1,
    update_freq=TB_UPDATE_FREQ
    ),
    keras.callbacks.EarlyStopping(monitor="val_loss",
                                min_delta=0,
                                patience=3,
                                restore_best_weights=True,
                                mode="auto")
]

# Create some metrics
metrics = [
    keras.metrics.AUC(num_thresholds=int(1e5), name="ROC-AUC"),
    keras.metrics.AUC(num_thresholds=int(1e5), curve="PR", name="PR-AUC"),
]

# Define loss function
# NOTE: We were experimenting with focal loss at one point, maybe we can try that again at some point
loss_fn = keras.losses.categorical_crossentropy if TARGET == "multi_class" else keras.losses.binary_crossentropy


In [7]:
# TTV
# Splitting the data
train, test = train_test_split(
    range(len(inputs)),
    test_size=TEST_SPLIT,
    stratify=[labs for _, _, labs in inputs],
    random_state=RAND)

train, validation = train_test_split(
    train,
    test_size=VAL_SPLIT,
    stratify=[samp[2] for i, samp in enumerate(inputs) if i in train],
    random_state=RAND)

train_gen = tk.create_ragged_data_gen(
    [inputs[samp] for samp in train],
    max_demog=MAX_DEMOG,
    epochs=EPOCHS,
    multiclass=N_CLASS > 2,
    random_seed=RAND,
    batch_size=BATCH_SIZE)

validation_gen = tk.create_ragged_data_gen(
    [inputs[samp] for samp in validation],
    max_demog=MAX_DEMOG,
    epochs=EPOCHS,
    shuffle=False,
    multiclass=N_CLASS > 2,
    random_seed=RAND,
    batch_size=BATCH_SIZE)

# NOTE: don't shuffle test data
test_gen = tk.create_ragged_data_gen([inputs[samp] for samp in test],
                                        max_demog=MAX_DEMOG,
                                        epochs=1,
                                        multiclass=N_CLASS > 2,
                                        shuffle=False,
                                        random_seed=RAND,
                                        batch_size=BATCH_SIZE)

# %% Compute steps-per-epoch
# NOTE: Sometimes it can't determine this properly from tf.data
STEPS_PER_EPOCH = np.ceil(len(train) / BATCH_SIZE)
VALID_STEPS_PER_EPOCH = np.ceil(len(validation) / BATCH_SIZE)

(10188, 11)
(1132, 11)
(2831, 11)


In [20]:
# Hyperparameter Model builder
class LSTMHyper(kerastuner.HyperModel):
    def __init__(self, vocab_size, metrics, loss, n_classes=1, n_demog=32):
        self.vocab_size = vocab_size
        self.n_classes = n_classes
        self.n_demog = n_demog
        self.metrics = metrics,
        self.loss = loss

    def build(self, hp: kerastuner.HyperParameters) -> keras.Model:
        # Model Topology

        # Should we use a "conditional RNN"
        rnn_conditional = hp.Boolean("Conditional RNN")

        # Should we multiply the feature embeddings by their averages?
        weighting = hp.Boolean("Feature Weighting")

        # Should we add a dense layer between RNN and output?
        final_dense = hp.Boolean("Final Dense Layer")

        # Feature Embedding Params
        emb_l1 = hp.Float("Feature Embedding L1",
                                    min_value=0.0,
                                    max_value=0.1,
                                    step=0.01)
        emb_l2 = hp.Float("Feature Embedding L2",
                                    min_value=0.0,
                                    max_value=0.1,
                                    step=0.01)
        emb_n = hp.Int("Embedding Dimension",
                                    min_value=64,
                                    max_value=512,
                                    default=64,
                                    step=64)

        # Average Embedding Params
        avg_l1 = hp.Float("Average Embedding L1",
                                    min_value=0.0,
                                    max_value=0.1,
                                    step=0.01,
                                    parent_name = "Feature Weighting",
                                    parent_values = [True])
        avg_l2 = hp.Float("Average Embedding L2",
                                    min_value=0.0,
                                    max_value=0.1,
                                    step=0.01,
                                    parent_name = "Feature Weighting",
                                    parent_values = [True])

        # LSTM Params
        lstm_n = hp.Int("LSTM Units",
                        min_value=32,
                        max_value=512,
                        default=32,
                        step=32)
        lstm_dropout = hp.Float("LSTM Dropout",
                                min_value=0.0,
                                max_value=0.9,
                                default=0.4,
                                step=0.01)
        lstm_recurrent_dropout = hp.Float("LSTM Recurrent Dropout",
                                            min_value=0.0,
                                            max_value=0.9,
                                            default=0.4,
                                            step=0.01)
        lstm_l1 = hp.Float("LSTM weights L1",
                            min_value=0.0,
                            max_value=0.1,
                            step=0.01)
        lstm_l2 = hp.Float("LSTM weights L2",
                            min_value=0.0,
                            max_value=0.1,
                            step=0.01)
        
        # Final dense layer
        dense_n = hp.Int("Dense Units",
                         min_value=2,
                         max_value=128,
                         sampling="log",
                         parent_name="Final Dense Layer",
                         parent_values=[True]
                         )
        # Model code
        feat_input = keras.Input(shape=(None, None), ragged=True)
        demog_input = keras.Input(shape=(self.n_demog, ))

        emb1 = keras.layers.Embedding(self.vocab_size,
                                    output_dim=emb_n,
                                    embeddings_regularizer=keras.regularizers.l1_l2(emb_l1, emb_l2),
                                    mask_zero=True,
                                    name="Feature_Embeddings")(feat_input)
        
        if weighting:
            emb2 = keras.layers.Embedding(self.vocab_size,
                                          output_dim=1,
                                          embeddings_regularizer=keras.regularizers.l1_l2(avg_l1, avg_l2),
                                          mask_zero=True,
                                          name="Average_Embeddings")(feat_input)

            # Multiplying the code embeddings by their respective weights
            mult = keras.layers.Multiply(name="Embeddings_by_Average")([emb1, emb2])
            avg = keras.layers.Lambda(lambda x: tf.math.reduce_mean(x, axis=2), name="Averaging")(mult)
        else:
            avg = keras.layers.Lambda(lambda x: tf.math.reduce_mean(x, axis=2), name="Averaging")(emb1)
        
        if rnn_conditional:
            lstm_layer = cond_rnn.ConditionalRNN(lstm_n, 
                                   dropout=lstm_dropout,
                                   recurrent_dropout=lstm_recurrent_dropout,
                                   kernel_regularizer=keras.regularizers.l1_l2(lstm_l1, lstm_l2),
                                   name="Recurrent")([avg, demog_input])

        else:
            lstm_layer = keras.layers.LSTM(lstm_n, 
                                   dropout=lstm_dropout,
                                   recurrent_dropout=lstm_recurrent_dropout,
                                   kernel_regularizer=keras.regularizers.l1_l2(lstm_l1, lstm_l2),
                                   name="Recurrent")(avg)
        
            demog_avg = keras.layers.Dense(lstm_n)(demog_input)
            lstm_layer = keras.layers.Concatenate()([lstm_layer, demog_avg])

        if final_dense:
            lstm_layer = keras.layers.Dense(dense_n, activation = "relu", name = "pre_output")(lstm_layer)

        output = keras.layers.Dense(
            self.n_classes if self.n_classes > 2 else 1,
            activation="sigmoid",
            name="Output")(lstm_layer)

        model = keras.Model([feat_input, demog_input], output)

        model.compile(optimizer = "adam", loss=self.loss, metrics=self.metrics)

        return model

In [21]:
hyper_model = LSTMHyper(
    vocab_size = N_VOCAB,
    metrics = metrics,
    loss = loss_fn,
    n_classes = N_CLASS,
    n_demog = MAX_DEMOG
)

tuner = kerastuner.tuners.Hyperband(
    hyper_model,
    objective="val_loss",
    max_epochs=EPOCHS,
    hyperband_iterations=5,
    project_name="new_lstm_topo",
    # NOTE: This could be in output as well if we don't want to track/version it
    directory=tensorboard_dir,
)

INFO:tensorflow:Reloading Oracle from existing project data/model_checkpoints/new_lstm_topo\oracle.json
INFO:tensorflow:Reloading Tuner from data/model_checkpoints/new_lstm_topo\tuner0.json


In [22]:
# Announce the search space
tuner.search_space_summary()

Search space summary
Default search space size: 14
Conditional RNN (Boolean)
{'default': False, 'conditions': []}
Feature Weighting (Boolean)
{'default': False, 'conditions': []}
Final Dense Layer (Boolean)
{'default': False, 'conditions': []}
Feature Embedding L1 (Float)
{'default': 0.0, 'conditions': [], 'min_value': 0.0, 'max_value': 0.1, 'step': 0.01, 'sampling': None}
Feature Embedding L2 (Float)
{'default': 0.0, 'conditions': [], 'min_value': 0.0, 'max_value': 0.1, 'step': 0.01, 'sampling': None}
Embedding Dimension (Int)
{'default': 64, 'conditions': [], 'min_value': 64, 'max_value': 512, 'step': 64, 'sampling': None}
Average Embedding L1 (Float)
{'default': 0.0, 'conditions': [{'class_name': 'Parent', 'config': {'name': 'Feature Weighting', 'values': [1]}}], 'min_value': 0.0, 'max_value': 0.1, 'step': 0.01, 'sampling': None}
Average Embedding L2 (Float)
{'default': 0.0, 'conditions': [{'class_name': 'Parent', 'config': {'name': 'Feature Weighting', 'values': [1]}}], 'min_value'

In [23]:
tuner.search(train_gen,
             validation_data=validation_gen,
             epochs=EPOCHS,
             steps_per_epoch=STEPS_PER_EPOCH,
             validation_steps=VALID_STEPS_PER_EPOCH,
             callbacks=callbacks
)

Trial 149 Complete [00h 04m 56s]
val_loss: 18.912052154541016

Best val_loss So Far: 1.2175601720809937
Total elapsed time: 10h 25m 24s
INFO:tensorflow:Oracle triggered exit


In [24]:
tuner.results_summary()

Results summary
Results in data/model_checkpoints/new_lstm_topo
Showing 10 best trials
Objective(name='val_loss', direction='min')
Trial summary
Hyperparameters:
Conditional RNN: True
Feature Weighting: False
Final Dense Layer: False
Feature Embedding L1: 0.0
Feature Embedding L2: 0.0
Embedding Dimension: 448
LSTM Units: 64
LSTM Dropout: 0.44
LSTM Recurrent Dropout: 0.38
LSTM weights L1: 0.03
LSTM weights L2: 0.09
tuner/epochs: 15
tuner/initial_epoch: 5
tuner/bracket: 2
tuner/round: 2
tuner/trial_id: aea680825de0164cbcc05e038c91dfbd
Score: 1.2175601720809937
Trial summary
Hyperparameters:
Conditional RNN: True
Feature Weighting: False
Final Dense Layer: False
Feature Embedding L1: 0.0
Feature Embedding L2: 0.0
Embedding Dimension: 448
LSTM Units: 64
LSTM Dropout: 0.44
LSTM Recurrent Dropout: 0.38
LSTM weights L1: 0.03
LSTM weights L2: 0.09
tuner/epochs: 5
tuner/initial_epoch: 2
tuner/bracket: 2
tuner/round: 1
tuner/trial_id: 962aad15801ce5e55cc60b4710c7299b
Score: 1.222415566444397
Tri

In [25]:
# Pull the best model
best_hp = tuner.get_best_hyperparameters()[0]
best_model = tuner.hypermodel.build(best_hp)

best_model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 11)]         0                                            
__________________________________________________________________________________________________
tf_op_layer_ExpandDims (TensorF [(1, None, 11)]      0           input_2[0][0]                    
__________________________________________________________________________________________________
input_1 (InputLayer)            [(None, None, None)] 0                                            
__________________________________________________________________________________________________
tf_op_layer_Tile (TensorFlowOpL [(2, None, 11)]      0           tf_op_layer_ExpandDims[0][0]     
_______________________________________________________________________________________

In [26]:
best_model.save(os.path.join(tensorboard_dir, "new_lstm_topo", "best"))

INFO:tensorflow:Assets written to: c:\Users\blues\work\premier_analysis\data\model_checkpoints\new_lstm_topo\best\assets
