In [13]:
import tensorflow as tf
import keras_tuner as kt
import tensorflow.keras.backend as K
import pickle
from sklearn.utils import class_weight
from model_definition import *
from tuner_trial_functions import *
from preprocessing import vocab_size

In [14]:
with open('./data/train_cleaned_routes', 'rb') as f:
    X_train, y_train = pickle.load(f).values()
#with open('./data/test_preprocessed_routes', 'rb') as f:
#    X_test, y_test = pickle.load(f).values()
with open('./data/val_cleaned_routes', 'rb') as f:
    X_val, y_val = pickle.load(f).values()

In [15]:
X_train[:, 0] = 3
#X_test[:, 0] = 3
X_val[:, 0] = 3

In [16]:
# Variables:

#constant
num_classes = 9
epochs = 40

# Iteration one, 68 trials :

# num_layers   =  2    -  8,   step 2   ->   2
# d_model      =  64   -  512, step 64  ->   64
# dff          =  512  -  2048, step 256 ->  768
# num_heads    =  4    -  10,   step 2   ->  10
# dropout_rate =  0.1  -  0.4,  step 0.1 ->  0.1
# warmup_steps =  2500 -  5500, step 500 ->  3500



# Iteration two, 100 trials:

# num_layers   =  1    -  4,   step 1   ->  3
# d_model      =  16   -  128, step 8  -> 128
# dff          =  512  -  1024, step 32 -> 736
# num_heads    =  8    -  14,   step 1   -> 8
# dropout_rate =  0.04 -  0.24,  step 0.02 -> 0.2
# warmup_steps =  2500 -  4750, step 250 -> 3250
# beta_1       =  0.79 -  0.95, step 0.02 -> 0.79
# beta_2       =  0.95 -  0.99, step 0.005 -> 0.98
# epsilon      = 1e-11 -  1e-7, step NA   -> 6.35e-08

# Iteration three, 99 trials:

# num_layers   =  1    -  4,   step 1   ->  
    # default: 2
# d_model      =  16   -  192, step 16  -> 
    # default: 128
# dff          =  512  -  1280, step 64 ->
    # default: 768
# num_heads    =  8    -  14,   step 1   -> 
    # default: 10
# dropout_rate =  0.125 -  0.30,  step 0.025 -> 
    # default: 0.2
# warmup_steps =  2000 -  7000, step 500 -> 
    # default: 4000
# beta_1       =  0.74 -  0.93, step 0.0025 -> 
    # default: 0.8
# beta_2       =  0.95 -  0.99, step 0.005 -> 
    # default: 0.98
# epsilon      = 1e-11 -  1e-7, sample 'log' ->
    # default: 1e-8
# global_batch =  16   -  128,  step 16  ->
    # default: 64
    
# Iteration four, 50 trials:

# num_layers   =  2    -  8,   step 2   ->  
    # default:
# d_model      =  64   -  256, step 64  -> 
    # default:
# dff          =  512  -  2048, step 256 ->
    # default:
# num_heads    =  6    -  24,   step 6   -> 
    # default:
# dropout_rate =  0.1 -  0.4,  step 0.1 -> 
    # default:
# warmup_steps =  2000 -  6000, step 1000 -> 
    # default:
# batch_size =  32   -  128,  step 32  ->
    # default:
# lr_scalar =  0.01   -  0.13,  step 0.02  ->
    # default:
    
    
# Iteration five, 50 trials:

# num_layers   =  2    -  8,   step 2   ->  
    # default:
# d_model      =  64   -  256, step 64  -> 
    # default:
# dff          =  512  -  2048, step 256 ->
    # default:
# num_heads    =  6    -  18,   step 6   -> 
    # default:
# dropout_rate =  0.1 -  0.4,  step 0.1 -> 
    # default:
# warmup_steps =  2000 -  6000, step 1000 -> 
    # default:
# batch_size =  64   -  256,  step 64  ->
    # default:
# lr_ramp_scalar =  0.01   -  0.13,  step 0.02  ->
    # default:
# lr_decay_expo =  0.01   -  0.13,  step 0.02  ->
    # default:


# learning_rate = transformerSchedule(d_model, warmup_steps=warmup_steps, ramp_scalar=lr_ramp_scalar, decay_expo=lr_decay_scalar)
# optimizer = tf.keras.optimizers.Nadam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-7, use_ema = True)

In [17]:
class WithinKAccuracy(tf.keras.metrics.Metric):
    def __init__(self, k, name='within_k_acc', **kwargs):
        super().__init__(name=f'within_{k}_acc', **kwargs)
        self.total = self.add_weight(name='total', initializer='zeros')
        self.correct_within_k = self.add_weight(name=f'correct_within_{k}', initializer='zeros')
        self.k = k

    def update_state(self, y_true, y_pred, sample_weight=None):
        y_true = tf.cast(y_true, dtype='float32')
        y_pred = tf.cast(y_pred, dtype='float32')
        abs_diff = K.abs(y_true - y_pred)
        within = tf.cast(K.less_equal(abs_diff, self.k), dtype='float32')
        self.total.assign_add(tf.cast(tf.size(within), dtype='float32'))
        self.correct_within_k.assign_add(tf.reduce_sum(within))

    def result(self):
        return self.correct_within_k / self.total
    
    def reset_state(self):
        self.total.assign(0.0)
        self.correct_within_k.assign(0.0)

In [18]:
class MyHyperModel(kt.HyperModel):
    def __init__(self, num_classes, vocab_size):
        self.num_classes = num_classes
        self.vocab_size = vocab_size
        self.acc_metric = tf.keras.metrics.SparseCategoricalAccuracy()
        self.within1_metric = WithinKAccuracy(k=1)
        self.within2_metric = WithinKAccuracy(k=2)

    def build(self, hp):
        self.batch_size = hp.get('batch_size')
        
        model = EncoderClassifier(
            num_layers=hp.get('num_layers'),
            d_model=hp.get('d_model'),
            num_heads=hp.get('num_heads'),
            dff=hp.get('dff'),
            vocab_size=self.vocab_size,
            num_classes=self.num_classes,
            dropout_rate=hp.get('dropout_rate'),
            activation=hp.get('activation'),
        )
        
        # Define the new learning rate schedule
        learning_rate = transformerSchedule(hp.get('d_model'), warmup_steps=hp.get('warmup_steps'), 
                                            ramp_scalar=hp.get('lr_ramp_scalar'), decay_scalar=hp.get('lr_decay_scalar'))
        # Update the optimizer with the new learning rate
        if hp.get('Nadam'):
            optimizer = tf.keras.optimizers.Nadam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-7, use_ema=hp.get('opt_special'))
        else:
            optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-7, amsgrad=hp.get('opt_special'))
            
        model.compile(
            loss=tf.keras.losses.SparseCategoricalCrossentropy(),
            optimizer=optimizer,
            metrics=['accuracy']#, self.within1_metric, self.within2_metric, self.weighted_accuracy],
        )    
        
        return model
    
    def fit(self, hp, model, x, y, epochs, validation_data, class_weight=None, callbacks=None, verbose=1, **kwargs):
        batch_size = self.batch_size
        
        # Split the validation data into x_val and y_val
        X_val, y_val = validation_data

        # Convert the datasets to tf.data.Dataset.        
        train_dataset = tf.data.Dataset.from_tensor_slices((x, y)).batch(batch_size)
        val_dataset = tf.data.Dataset.from_tensor_slices((X_val, y_val)).batch(batch_size)

        history = model.fit(train_dataset, epochs=epochs, validation_data=val_dataset, class_weight=class_weight, 
                            callbacks=callbacks, verbose=1)
        
        return history
    
def weighted_accuracy(self, y_true, y_pred):
    acc_metric = self.acc_metric
    within1_metric = self.within1_metric
    within2_metric = self.within2_metric

    # Update the metric states before calculating the metric value
    acc_metric.update_state(y_true, y_pred)
    acc = acc_metric.result()

    within1_metric.update_state(y_true, y_pred)
    within1 = within1_metric.result()

    within2_metric.update_state(y_true, y_pred)
    within2 = within2_metric.result()

    weighted_acc = 0.3 * acc + 0.5 * within1 + 0.2 * within2
    return weighted_acc

def within1_acc(self, y_true, y_pred):
    within1_metric = self.within1_metric
    within1_metric.update_state(y_true, y_pred)
    return within1_metric.result()

def within2_acc(self, y_true, y_pred):
    within2_metric = self.within2_metric
    within2_metric.update_state(y_true, y_pred)
    return within2_metric.result()


In [19]:
def create_base_hp(hp=kt.HyperParameters()):  #, hp_type=hp_type):  
    #hp.Fixed("Tuning_Mode", value=hp_type)
    
    # These Hyperparameters affect the size/complexity of the model layers
    num_layers = hp.Int("num_layers", min_value=2, max_value=8, default=2, step=2)
    d_model = hp.Int("d_model", min_value=64, max_value=256, default=128, step=64)
    dff = hp.Int("dff", min_value=512, max_value=2048, default=768, step=256)
    num_heads = hp.Int("num_heads", min_value=6, max_value=18, default=10, step=6)
    dropout_rate = hp.Float("dropout_rate", min_value=0.1, max_value=0.4, default=0.2, step=0.1)
        
    #if(hp_type == "learn" or hp_type == "base"):
    # These Hyperparameters affect the learning rate
    warmup_steps = hp.Int("warmup_steps", min_value=2000, max_value=6000, default=4000, step=1000)
    batch_size = hp.Int("batch_size", min_value=64, max_value=256, step=64)
    lr_ramp_scalar = hp.Float("lr_ramp_scalar", min_value=0.01, max_value=0.13, default=0.1, step=0.02)
    lr_decay_scalar = hp.Float("lr_decay_scalar", min_value=0.01, max_value=0.13, default=0.1, step=0.02)

    #if(hp_type == "arch" or hp_type == "base"):
    hp.Choice("activation", values=['relu', 'selu'], default='relu')
    hp.Boolean("Nadam", default=True)
    hp.Boolean("opt_special", default=True)
        
    return hp

In [20]:
tuner_num = 7 #initialization
#hp_type = ""

In [24]:
def get_tuner(tune_new=True, trials=10, tuner_num=tuner_num):#, hp_type=hp_type):
    #hp = create_updated_hp(tuner_num=tuner_num, hp_type=hp_type)
    hp = create_base_hp()
    return kt.BayesianOptimization(
        hypermodel = MyHyperModel(9, vocab_size),
        hyperparameters = hp,
        #tune_new_entries = tune_new,
        objective='val_accuracy',
        max_trials=trials,
        overwrite=False,
        directory="tuners",
        project_name=f'tuner{tuner_num}',
    )

In [25]:
class_weights = class_weight.compute_class_weight(class_weight='balanced',
                                                  classes=np.unique(y_train),
                                                  y=y_train) 
class_weights=dict(zip(np.unique(y_train), class_weights))

callbacks = tf.keras.callbacks.TensorBoard(log_dir=f'./tuner{tuner_num}logs')

In [None]:
base_tuner = get_tuner(trials=99, tuner_num=tuner_num)
   
base_tuner.search(x=X_train, y=y_train, epochs=40, validation_data=(X_val, y_val), 
                  class_weight=class_weights, callbacks=[callbacks])

Trial 51 Complete [00h 04m 20s]
val_accuracy: 0.48172324895858765

Best val_accuracy So Far: 0.4973890483379364
Total elapsed time: 00h 04m 20s

Search: Running Trial #52

Value             |Best Value So Far |Hyperparameter
8                 |6                 |num_layers
256               |256               |d_model
2048              |1536              |dff
6                 |12                |num_heads
0.2               |0.4               |dropout_rate
2000              |2000              |warmup_steps
64                |64                |batch_size
0.07              |0.07              |lr_ramp_scalar
0.09              |0.09              |lr_decay_scalar
relu              |relu              |activation
True              |True              |Nadam
False             |True              |opt_special

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40

In [None]:
best_model = tuner.get_best_models(num_models=1)[0]
best_model.save('tuner_models/' + f'tuner_model_{tuner_num}.h5')

In [None]:
tuner_num = 4
hp_type = ""

base_num = 10
layer_num = 6
learn_num = 6
arch_num = 3

iterations = 1
num_trials = iterations*(layer_num+learn_num+arch_num)+base_num

assert base_num >= 10
print(f'Number of trials: {num_trials}')

In [12]:
#base_hp = create_hp(hp_type="base")
#layer_hp = create_hp(hp_type="layer")
#learn_hp = create_hp(hp_type="learn")
#arch_hp = create_hp(hp_type="arch")

In [32]:
#base_tuner = get_tuner(create_hp(hp_type="base"), trials=base_num, tuner_num=tuner_num)

# run 10 models to get a starting point
#base_tuner.search(x=X_train, y=y_train, epochs=3, validation_data=(X_val, y_val), 
             #class_weight=class_weights)

# run 4 * 15 models: 6 focused on tuning layer values, 6 focused on tuning learn values, 
# and 3 focused on changing model architecture
for i in range(1, iterations+1):
    layer_trial = layer_num * i + base_num
    learn_trial = learn_num * i + base_num + layer_trial
    arch_trial  = arch_num  * i + base_num + learn_trial
    
    
    layer_tuner = get_tuner(tune_new=False, trials=layer_trial, tuner_num=tuner_num, hp_type='layer')
    layer_tuner.search(x=X_train, y=y_train, epochs=3, validation_data=(X_val, y_val), 
             class_weight=class_weights)
    
    
    learn_tuner = get_tuner(tune_new=False, trials=learn_trial, tuner_num=tuner_num, hp_type='learn')
    learn_tuner.search(x=X_train, y=y_train, epochs=3, validation_data=(X_val, y_val), 
             class_weight=class_weights)
    
    arch_tuner = get_tuner(tune_new=False, trials=arch_trial, tuner_num=tuner_num, hp_type='arch')
    arch_tuner.search(x=X_train, y=y_train, epochs=3, validation_data=(X_val, y_val), 
             class_weight=class_weights)

NameError: name 'iterations' is not defined

In [8]:
def create_updated_hp(hp=keras_tuner.HyperParameters(), hp_type=hp_type, tuner_num=tuner_num):
    trials_np = np.zeros(0)
    base_dir = f'tuners/tuner{tuner_num}'
    num_trials = get_num_trials(base_dir)
    trials_np = create_trials_np(num_trials, base_dir)

    tuner_df = create_tuner_df(num_trials, trials_np)
    df = best_trials(tuner_df, num_trials=1)    
    
    hp.Fixed("Tuning_Mode", value=hp_type)
           
    if (hp_type == "layer" or hp_type == "base"):
        # These Hyperparameters affect the size/complexity of the model layers
        num_layers = hp.Int("num_layers", min_value=1, max_value=4, default=2, step=1)
        d_model = hp.Int("d_model", min_value=16, max_value=192, default=128, step=16)
        dff = hp.Int("dff", min_value=512, max_value=1280, default=768, step=64)
        num_heads = hp.Int("num_heads", min_value=8, max_value=14, default=10, step=1)
        dropout_rate = hp.Float("dropout_rate", min_value=0.125, max_value=0.3, default=0.2, step=0.025)
        if(hp_type != "base"):
            hp.Fixed("warmup_steps", value=df['warmup_steps'].iloc[0])
            hp.Fixed("batch_size", value=df['batch_size'].iloc[0])
            hp.Fixed("beta_1", value=df['beta_1'].iloc[0])
            hp.Fixed("beta_2", value=df['beta_2'].iloc[0])
            hp.Fixed("epsilon", value=df['epsilon'].iloc[0])

            hp.Fixed("activation", value='relu')
            hp.Fixed("sequential", value=True)
    if (hp_type == "learn" or hp_type == "base"):
        # These Hyperparameters affect the learning rate
        if(hp_type != "base"):
            hp.Fixed("num_layers", value=df['num_layers'].iloc[0])
            hp.Fixed("d_model", value=df['d_model'].iloc[0])
            hp.Fixed("dff", value=df['dff'].iloc[0])
            hp.Fixed("num_heads", value=df['num_heads'].iloc[0])
            hp.Fixed("dropout_rate", value=df['dropout_rate'].iloc[0])
        
        warmup_steps = hp.Int("warmup_steps", min_value=2000, max_value=7000, default=4000, step=500)
        batch_size = hp.Int("batch_size", min_value=16, max_value=128, step=16)
        beta_1 = hp.Float("beta_1", min_value=0.74, max_value=0.93, default= .8, step=0.0025)
        beta_2 = hp.Float("beta_2", min_value=0.95, max_value=0.99, default= .98, step=0.005)
        epsilon = hp.Float("epsilon", min_value=1e-9, max_value=5e-7, default= 1e-8, sampling="log")
        
        if(hp_type != "base"):
            hp.Fixed("activation", value='relu')
            hp.Fixed("sequential", value=True)
    if(hp_type == "arch" or hp_type == "base"):
        if(hp_type != "base"):
            hp.Fixed("num_layers", value=df['num_layers'].iloc[0])
            hp.Fixed("d_model", value=df['d_model'].iloc[0])
            hp.Fixed("dff", value=df['dff'].iloc[0])
            hp.Fixed("num_heads", value=df['num_heads'].iloc[0])
            hp.Fixed("dropout_rate", value=df['dropout_rate'].iloc[0])
            
            hp.Fixed("warmup_steps", value=df['warmup_steps'].iloc[0])
            hp.Fixed("batch_size", value=df['batch_size'].iloc[0])
            hp.Fixed("beta_1", value=df['beta_1'].iloc[0])
            hp.Fixed("beta_2", value=df['beta_2'].iloc[0])
            hp.Fixed("epsilon", value=df['epsilon'].iloc[0])
        
        hp.Choice("activation", values=['relu', 'swish'], default='relu')
        hp.Boolean("sequential", default=True)

    return hp

In [1]:
class LegacyHyperModel(keras_tuner.HyperModel):
    def build(self, hp):     
        num_classes = 9
        self.batch_size = hp.get('batch_size')
       
        model = EncoderClassifier(
            num_layers=hp.get('num_layers'),
            d_model=hp.get('d_model'),
            num_heads=hp.get('num_heads'),
            dff=hp.get('dff'),
            vocab_size=vocab_size,
            num_classes=num_classes,
            dropout_rate=hp.get('dropout_rate'),
            activation=hp.get('activation'),
        )
        
        learning_rate = transformerSchedule(hp.get('d_model'), warmup_steps=hp.get('warmup_steps'), 
                                            ramp_scalar=hp.get('lr_ramp_scalar'), decay_scalar=hp.get('lr_decay_scalar'))
        if(hp.get('Nadam')):
            optimizer = tf.keras.optimizers.Nadam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-7, use_ema = hp.get('opt_special'))
        else:
            optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-7, amsgrad = hp.get('opt_special'))
            
            
        model.compile(
            loss=tf.keras.losses.SparseCategoricalCrossentropy(),
            optimizer=optimizer,
            metrics=['accuracy', within1_acc, within2_acc, weighted_average])    
        
        return model

    def fit(self, hp, model, x, y, epochs, validation_data, class_weight=None, callbacks=None, verbose=1, **kwargs):
        batch_size = self.batch_size
                
        # Convert the datasets to tf.data.Dataset.        
        train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train)).batch(batch_size)
        val_dataset = tf.data.Dataset.from_tensor_slices((X_val, y_val)).batch(batch_size)

        history = model.fit(train_dataset, epochs=epochs, validation_data=val_dataset, class_weight=class_weight, 
                            callbacks=callbacks, verbose=1)
        
        return history

NameError: name 'keras_tuner' is not defined