In [1]:
# Imports
import csv
import os
import pickle as pkl
import pandas as pd
import argparse
import cond_rnn

import numpy as np
import tensorflow as tf
import tensorflow_addons as tfa
import tensorflow.keras as keras
import kerastuner
from kerastuner import HyperModel
from sklearn.model_selection import train_test_split
from sklearn.utils import compute_class_weight
from tensorflow.keras.callbacks import TensorBoard

import tools.analysis as ta
from tools import keras as tk
import tools.preprocessing as tp

In [2]:
 # GLOBALS   
TIME_SEQ = 225
TARGET = "multi_class"
BATCH_SIZE = 128
EPOCHS = 15
TEST_SPLIT = 0.2
VAL_SPLIT = 0.1
RAND = 2021
TB_UPDATE_FREQ = 200

# Paths
output_dir = os.path.abspath(os.path.join("..", "output"))
data_dir = os.path.abspath(os.path.join("..", "data", "data"))
tensorboard_dir = os.path.abspath(
    os.path.join(data_dir, "..", "model_checkpoints"))
pkl_dir = os.path.join(output_dir, "pkl")
stats_dir = os.path.join(output_dir, "analysis")

# Create analysis dir if it doesn't exist
os.makedirs(stats_dir, exist_ok=True)

stats_filename = TARGET + "_stats.csv"

In [3]:
# Data load
with open(os.path.join(pkl_dir, TARGET + "_trimmed_seqs.pkl"), "rb") as f:
    inputs = pkl.load(f)

with open(os.path.join(pkl_dir, "all_ftrs_dict.pkl"), "rb") as f:
    vocab = pkl.load(f)

with open(os.path.join(pkl_dir, "feature_lookup.pkl"), "rb") as f:
    all_feats = pkl.load(f)

with open(os.path.join(pkl_dir, "demog_dict.pkl"), "rb") as f:
    demog_lookup = pkl.load(f)

# Determining number of vocab entries
N_VOCAB = len(vocab) + 1
N_DEMOG = len(demog_lookup) + 1
MAX_DEMOG = max(len(x) for _, x, _ in inputs)
N_CLASS = max(x for _, _, x in inputs) + 1


In [4]:
# Model Metrics and callbacks
callbacks = [
    TensorBoard(
    log_dir=os.path.join(tensorboard_dir, "new_lstm_topo_tb", ""),
    histogram_freq=1,
    profile_batch=0,
    write_graph=False,
    update_freq=TB_UPDATE_FREQ
    ),
    keras.callbacks.EarlyStopping(monitor="val_loss",
                                min_delta=0,
                                patience=3,
                                restore_best_weights=True,
                                mode="min")
]

# Create some metrics
metrics = [
    keras.metrics.AUC(num_thresholds=int(1e5), name="ROC-AUC"),
    keras.metrics.AUC(num_thresholds=int(1e5), curve="PR", name="PR-AUC"),
    tfa.metrics.F1Score(num_classes=N_CLASS, average="weighted")
]

# Define loss function
# NOTE: We were experimenting with focal loss at one point, maybe we can try that again at some point
# loss_fn = keras.losses.categorical_crossentropy if TARGET == "multi_class" else keras.losses.binary_crossentropy


In [5]:
# TTV
# Splitting the data
train, test = train_test_split(
    range(len(inputs)),
    test_size=TEST_SPLIT,
    stratify=[labs for _, _, labs in inputs],
    random_state=RAND)

train, validation = train_test_split(
    train,
    test_size=VAL_SPLIT,
    stratify=[samp[2] for i, samp in enumerate(inputs) if i in train],
    random_state=RAND)

train_gen = tk.create_ragged_data_gen(
    [inputs[samp] for samp in train],
    max_demog=MAX_DEMOG,
    epochs=EPOCHS,
    multiclass=N_CLASS > 2,
    random_seed=RAND,
    batch_size=BATCH_SIZE)

validation_gen = tk.create_ragged_data_gen(
    [inputs[samp] for samp in validation],
    max_demog=MAX_DEMOG,
    epochs=EPOCHS,
    shuffle=False,
    multiclass=N_CLASS > 2,
    random_seed=RAND,
    batch_size=BATCH_SIZE)

# NOTE: don't shuffle test data
test_gen = tk.create_ragged_data_gen([inputs[samp] for samp in test],
                                        max_demog=MAX_DEMOG,
                                        epochs=1,
                                        multiclass=N_CLASS > 2,
                                        shuffle=False,
                                        random_seed=RAND,
                                        batch_size=BATCH_SIZE)

# %% Compute steps-per-epoch
# NOTE: Sometimes it can't determine this properly from tf.data
STEPS_PER_EPOCH = np.ceil(len(train) / BATCH_SIZE)
VALID_STEPS_PER_EPOCH = np.ceil(len(validation) / BATCH_SIZE)

In [6]:
classes = np.unique([labs for _, _, labs in inputs]).tolist()

class_weights = compute_class_weight(
    class_weight="balanced",
    classes=classes,
    y=[labs for _, _, labs in inputs],
)

class_weights = dict(zip(classes, class_weights))

print(class_weights)

{0: 3.530688622754491, 1: 1.4372333942717854, 2: 0.4948075107521242}


In [7]:
# Hyperparameter Model builder
class LSTMHyper(kerastuner.HyperModel):
    def __init__(self, vocab_size, metrics, loss = None, n_classes=1, n_demog=32, n_demog_bags=6):
        self.vocab_size = vocab_size
        self.n_classes = n_classes
        self.n_demog = n_demog
        self.n_demog_bags = n_demog_bags
        self.metrics = metrics
        self.loss = loss

    def build(self, hp: kerastuner.HyperParameters) -> keras.Model:

        # L1/L2 vals
        reg_vals = [0.0, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1]

        # Model Topology

        # Should we use a "conditional RNN"
        rnn_conditional = hp.Boolean("Conditional RNN")

        # Should we multiply the feature embeddings by their averages?
        weighting = hp.Boolean("Feature Weighting")

        # Should we add a dense layer between RNN and output?
        final_dense = hp.Boolean("Final Dense Layer")

        # Feature Embedding Params
        emb_l1 = hp.Choice("Feature Embedding L1", reg_vals)
        emb_l2 = hp.Choice("Feature Embedding L2", reg_vals)

        emb_n = hp.Int("Embedding Dimension",
                                    min_value=64,
                                    max_value=512,
                                    default=64,
                                    step=64)

        # Demog Embedding
        demog_emb_n = hp.Int("Demographics Embedding Dimension",
                            min_value=1,
                            max_value=64,
                            default=self.n_demog
                            )

        # Average Embedding Params
        avg_l1 = hp.Choice("Average Embedding L1", reg_vals,
                                    parent_name = "Feature Weighting",
                                    parent_values = [True])
        avg_l2 = hp.Choice("Average Embedding L2", reg_vals,
                                    parent_name = "Feature Weighting",
                                    parent_values = [True])

        # LSTM Params
        lstm_n = hp.Int("LSTM Units",
                        min_value=32,
                        max_value=512,
                        default=32,
                        step=32)
        lstm_dropout = hp.Float("LSTM Dropout",
                                min_value=0.0,
                                max_value=0.9,
                                default=0.4,
                                step=0.01)
        lstm_recurrent_dropout = hp.Float("LSTM Recurrent Dropout",
                                            min_value=0.0,
                                            max_value=0.9,
                                            default=0.4,
                                            step=0.01)
        lstm_l1 = hp.Choice("LSTM weights L1", reg_vals)
        lstm_l2 = hp.Choice("LSTM weights L2", reg_vals)
        
        # Final dense layer
        dense_n = hp.Int("Dense Units",
                         min_value=2,
                         max_value=128,
                         sampling="log",
                         parent_name="Final Dense Layer",
                         parent_values=[True]
                         )
        # Model code
        feat_input = keras.Input(shape=(None, None), ragged=True)
        demog_input = keras.Input(shape=(self.n_demog_bags, ))

        demog_emb = keras.layers.Embedding(self.n_demog,
                                        output_dim=demog_emb_n,
                                        mask_zero=True,
                                       name="Demographic_Embeddings"
        )(demog_input)

        demog_avg = keras.layers.Flatten()(demog_emb)

        emb1 = keras.layers.Embedding(self.vocab_size,
                                    output_dim=emb_n,
                                    embeddings_regularizer=keras.regularizers.l1_l2(emb_l1, emb_l2),
                                    mask_zero=True,
                                    name="Feature_Embeddings")(feat_input)
        
        if weighting:
            emb2 = keras.layers.Embedding(self.vocab_size,
                                          output_dim=1,
                                          embeddings_regularizer=keras.regularizers.l1_l2(avg_l1, avg_l2),
                                          mask_zero=True,
                                          name="Average_Embeddings")(feat_input)

            # Multiplying the code embeddings by their respective weights
            mult = keras.layers.Multiply(name="Embeddings_by_Average")([emb1, emb2])
            avg = keras.layers.Lambda(lambda x: tf.math.reduce_mean(x, axis=2), name="Averaging")(mult)
        else:
            avg = keras.layers.Lambda(lambda x: tf.math.reduce_mean(x, axis=2), name="Averaging")(emb1)
        
        if rnn_conditional:
            lstm_layer = cond_rnn.ConditionalRNN(lstm_n,
                                   dropout=lstm_dropout,
                                   recurrent_dropout=lstm_recurrent_dropout,
                                   recurrent_regularizer=keras.regularizers.l1_l2(lstm_l1, lstm_l2),
                                   name="Recurrent")([avg, demog_avg])

        else:
            lstm_layer = keras.layers.LSTM(lstm_n, 
                                   dropout=lstm_dropout,
                                   recurrent_dropout=lstm_recurrent_dropout,
                                   recurrent_regularizer=keras.regularizers.l1_l2(lstm_l1, lstm_l2),
                                   name="Recurrent")(avg)
        
            # demog_avg = keras.layers.Dense(lstm_n)(demog_input)
            lstm_layer = keras.layers.Concatenate()([lstm_layer, demog_avg])

        if final_dense:
            lstm_layer = keras.layers.Dense(dense_n, activation = "relu", name = "pre_output")(lstm_layer)

        output = keras.layers.Dense(
            self.n_classes if self.n_classes > 2 else 1,
            activation="softmax" if self.n_classes > 2 else "sigmoid",
            name="Output")(lstm_layer)

        model = keras.Model([feat_input, demog_input], output)

        # --- Focal Loss Hyperparameters
        # Note: For gamma=0, focal loss is identical to crossentropy
        hyper_gamma = hp.Choice("Focal gamma", [0., 0.1, 0.2, 0.5, 1.0, 2.0, 5.0])
        hyper_alpha = hp.Choice("Focal alpha", [.1, .25, .5, .75, .9, .99, .999, 1.0, 2.0])

        # --- Learning rate and momentum
        lr = hp.Choice("Learning Rate", [1e-6, 5e-6, 1e-5, 5e-5, 1e-4, 5e-4, 1e-3, 5e-3, 1e-2, 5e-2, 1e-1])
        momentum = hp.Float("Momentum", min_value=0.0, max_value=0.9, step=0.1)

        model.compile(optimizer = keras.optimizers.SGD(lr, momentum=momentum), loss=tk.FocalLoss(gamma=hyper_gamma, alpha=hyper_alpha, name="loss"), metrics=self.metrics)

        return model

In [8]:
hyper_model = LSTMHyper(
    vocab_size = N_VOCAB,
    metrics = metrics,
    n_classes = N_CLASS,
    n_demog = N_DEMOG,
    n_demog_bags=MAX_DEMOG
)

tuner = kerastuner.tuners.Hyperband(
    hyper_model,
    objective=kerastuner.Objective("val_f1_score",direction="max"),
    max_epochs=EPOCHS,
    hyperband_iterations=5,
    project_name="new_lstm_topo",
    # NOTE: This could be in output as well if we don't want to track/version it
    directory=tensorboard_dir,
)



In [9]:
# Announce the search space
tuner.search_space_summary()

Search space summary
Default search space size: 19
Conditional RNN (Boolean)
{'default': False, 'conditions': []}
Feature Weighting (Boolean)
{'default': False, 'conditions': []}
Final Dense Layer (Boolean)
{'default': False, 'conditions': []}
Feature Embedding L1 (Choice)
{'default': 0.0, 'conditions': [], 'values': [0.0, 1e-06, 1e-05, 0.0001, 0.001, 0.01, 0.1], 'ordered': True}
Feature Embedding L2 (Choice)
{'default': 0.0, 'conditions': [], 'values': [0.0, 1e-06, 1e-05, 0.0001, 0.001, 0.01, 0.1], 'ordered': True}
Embedding Dimension (Int)
{'default': 64, 'conditions': [], 'min_value': 64, 'max_value': 512, 'step': 64, 'sampling': None}
Demographics Embedding Dimension (Int)
{'default': 12, 'conditions': [], 'min_value': 1, 'max_value': 64, 'step': 1, 'sampling': None}
Average Embedding L1 (Choice)
{'default': 0.0, 'conditions': [{'class_name': 'Parent', 'config': {'name': 'Feature Weighting', 'values': [1]}}], 'values': [0.0, 1e-06, 1e-05, 0.0001, 0.001, 0.01, 0.1], 'ordered': True}

In [10]:
tuner.search(train_gen,
             validation_data=validation_gen,
             epochs=EPOCHS,
             steps_per_epoch=STEPS_PER_EPOCH,
             validation_steps=VALID_STEPS_PER_EPOCH,
             callbacks=callbacks
)

Trial 150 Complete [00h 11m 02s]
val_f1_score: 0.5897813439369202

Best val_f1_score So Far: 0.5961918234825134
Total elapsed time: 10h 09m 57s
INFO:tensorflow:Oracle triggered exit


In [None]:
tuner.results_summary()

Results summary
Results in c:\Users\blues\work\premier_analysis\data\model_checkpoints\new_lstm_topo
Showing 10 best trials
Objective(name='val_f1_score', direction='max')
Trial summary
Hyperparameters:
Conditional RNN: True
Feature Weighting: False
Final Dense Layer: True
Feature Embedding L1: 0.0
Feature Embedding L2: 0.0001
Embedding Dimension: 320
Demographics Embedding Dimension: 19
LSTM Units: 128
LSTM Dropout: 0.06
LSTM Recurrent Dropout: 0.01
LSTM weights L1: 0.1
LSTM weights L2: 1e-06
Dense Units: 24
Focal gamma: 1.0
Focal alpha: 0.999
Learning Rate: 0.0001
Momentum: 0.5
tuner/epochs: 2
tuner/initial_epoch: 0
tuner/bracket: 2
tuner/round: 0
Score: 0.5961918234825134
Trial summary
Hyperparameters:
Conditional RNN: False
Feature Weighting: False
Final Dense Layer: False
Feature Embedding L1: 0.01
Feature Embedding L2: 0.01
Embedding Dimension: 192
Demographics Embedding Dimension: 47
LSTM Units: 128
LSTM Dropout: 0.85
LSTM Recurrent Dropout: 0.73
LSTM weights L1: 0.0001
LSTM wei

In [None]:
# Pull the best model
best_hp = tuner.get_best_hyperparameters()[0]
best_model = tuner.hypermodel.build(best_hp)

best_model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 6)]          0                                            
__________________________________________________________________________________________________
Demographic_Embeddings (Embeddi (None, 6, 19)        228         input_2[0][0]                    
__________________________________________________________________________________________________
flatten (Flatten)               (None, 114)          0           Demographic_Embeddings[0][0]     
__________________________________________________________________________________________________
tf_op_layer_ExpandDims (TensorF [(1, None, 114)]     0           flatten[0][0]                    
_______________________________________________________________________________________

In [None]:
best_model.save(os.path.join(tensorboard_dir, "new_lstm_topo", "best"))

Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
INFO:tensorflow:Assets written to: c:\Users\blues\work\premier_analysis\data\model_checkpoints\new_lstm_topo\best\assets
