In [2]:
import keras_tuner
import pickle
from sklearn.utils import class_weight
from model_definition import *
from tuner_trial_functions import *
from preprocessing import vocab_size

In [81]:
with open('./data/train_preprocessed_routes', 'rb') as f:
    X_train, y_train = pickle.load(f).values()
#with open('./data/test_preprocessed_routes', 'rb') as f:
#    X_test, y_test = pickle.load(f).values()
with open('./data/val_preprocessed_routes', 'rb') as f:
    X_val, y_val = pickle.load(f).values()

In [82]:
X_train[:, 0] = 3
#X_test[:, 0] = 3
X_val[:, 0] = 3

In [83]:
tuner_num = 0 #initialization
hp_type = ""

In [84]:
# Variables:

#constant
num_classes = 9
epochs = 30

# Iteration one, 68 trials :

# num_layers   =  2    -  8,   step 2   ->   2
# d_model      =  64   -  512, step 64  ->   64
# dff          =  512  -  2048, step 256 ->  768
# num_heads    =  4    -  10,   step 2   ->  10
# dropout_rate =  0.1  -  0.4,  step 0.1 ->  0.1
# warmup_steps =  2500 -  5500, step 500 ->  3500



# Iteration two, 100 trials:

# num_layers   =  1    -  4,   step 1   ->  3
# d_model      =  16   -  128, step 8  -> 128
# dff          =  512  -  1024, step 32 -> 736
# num_heads    =  8    -  14,   step 1   -> 8
# dropout_rate =  0.04 -  0.24,  step 0.02 -> 0.2
# warmup_steps =  2500 -  4750, step 250 -> 3250
# beta_1       =  0.79 -  0.95, step 0.02 -> 0.79
# beta_2       =  0.95 -  0.99, step 0.005 -> 0.98
# epsilon      = 1e-11 -  1e-7, step NA   -> 6.35e-08

# Iteration three, 99 trials:

# num_layers   =  1    -  4,   step 1   ->  
    # default: 2
# d_model      =  16   -  192, step 16  -> 
    # default: 128
# dff          =  512  -  1280, step 64 ->
    # default: 768
# num_heads    =  8    -  14,   step 1   -> 
    # default: 10
# dropout_rate =  0.125 -  0.30,  step 0.025 -> 
    # default: 0.2
# warmup_steps =  2000 -  7000, step 500 -> 
    # default: 4000
# beta_1       =  0.74 -  0.93, step 0.0025 -> 
    # default: 0.8
# beta_2       =  0.95 -  0.99, step 0.005 -> 
    # default: 0.98
# epsilon      = 1e-11 -  1e-7, sample 'log' ->
    # default: 1e-8
# global_batch =  16   -  128,  step 16  ->
    # default: 64


# Implement:
# global_batch =  16   -  128,  step 16  ->

# learning_rate = CustomSchedule(d_model, warmup_steps=warmup_steps)
# optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

In [85]:
class MyHyperModel(keras_tuner.HyperModel):
    def build(self, hp):     
        num_classes = 9
        self.batch_size = hp.get('batch_size')

        model = EncoderClassifier(
            num_layers=hp.get('num_layers'),
            d_model=hp.get('d_model'),
            num_heads=hp.get('num_heads'),
            dff=hp.get('dff'),
            vocab_size=vocab_size,
            num_classes=num_classes,
            dropout_rate=hp.get('dropout_rate'),
            activation=hp.get('activation'),
            sequential=hp.get('sequential')
        )
        
        learning_rate = CustomSchedule(hp.get('d_model'), warmup_steps=hp.get('warmup_steps'))
        optimizer = tf.keras.optimizers.Adam(learning_rate, 
                                beta_1=hp.get("beta_1"),
                                beta_2=hp.get("beta_2"),
                                epsilon=hp.get("epsilon"))
        
        model.compile(
            loss=tf.keras.losses.SparseCategoricalCrossentropy(),
            optimizer=optimizer,
            metrics=['accuracy'])
    
        return model

    def fit(self, hp, model, x, y, epochs, validation_data, verbose=1, **kwargs):
        batch_size = self.batch_size
        
        # Convert the datasets to tf.data.Dataset.        
        train_ds = tf.data.Dataset.from_tensor_slices((X_train, y_train)).batch(batch_size)
        validation_data = tf.data.Dataset.from_tensor_slices(validation_data).batch(batch_size)

        history = model.fit(train_ds, epochs=epochs, validation_data=validation_data, verbose=1,
                  class_weight=class_weights)
           
        return history

In [86]:
def update_hp(hp, hp_type=hp_type, tuner_num=tuner_num):
    trials_np = np.zeros(0)
    base_dir = f'tuners/tuner{tuner_num}'
    num_trials = get_num_trials(base_dir)
    trials_np = create_trials_np(num_trials, base_dir)

    tuner_df = create_tuner_df(num_trials, trials_np)
    df = best_trials(tuner_df, num_trials=1)    
    
    hp.Fixed("Tuning_Mode", value=hp_type)
    
    if(hp_type == "layer"):
        # These Hyperparameters affect the size/complexity of the model layers
        hp.Fixed("warmup_steps", value=df['warmup_steps'].iloc[0])
        hp.Fixed("batch_size", value=df['batch_size'].iloc[0])
        hp.Fixed("beta_1", value=df['beta_1'].iloc[0])
        hp.Fixed("beta_2", value=df['beta_2'].iloc[0])
        hp.Fixed("epsilon", value=df['epsilon'].iloc[0])
        
        hp.Fixed("activation", value='relu')
        hp.Fixed("sequential", value=True)
    elif(hp_type == "learn"):
        # These Hyperparameters affect the learning rate
        hp.Fixed("num_layers", value=df['num_layers'].iloc[0])
        hp.Fixed("d_model", value=df['d_model'].iloc[0])
        hp.Fixed("dff", value=df['dff'].iloc[0])
        hp.Fixed("num_heads", value=df['num_heads'].iloc[0])
        hp.Fixed("dropout_rate", value=df['dropout_rate'].iloc[0])
        
        hp.Fixed("activation", value='relu')
        hp.Fixed("sequential", value=True)
    else:
        hp.Choice("activation", values=['relu', 'swish'], default='relu')
        hp.Boolean("sequential", default=True)

    return hp

In [87]:
def create_hp(hp=keras_tuner.HyperParameters(), hp_type=hp_type, tuner_num=tuner_num):  
    hp.Fixed("Tuning_Mode", value=hp_type)
    if(hp_type == "layer" or hp_type == "base"):
        # These Hyperparameters affect the size/complexity of the model layers
        num_layers = hp.Int("num_layers", min_value=1, max_value=4, default=2, step=1)
        d_model = hp.Int("d_model", min_value=16, max_value=192, default=128, step=16)
        dff = hp.Int("dff", min_value=512, max_value=1280, default=768, step=64)
        num_heads = hp.Int("num_heads", min_value=8, max_value=14, default=10, step=1)
        dropout_rate = hp.Float("dropout_rate", min_value=0.125, max_value=0.3, default=0.2, step=0.025)
        
    if(hp_type == "learn" or hp_type == "base"):
        # These Hyperparameters affect the learning rate
        warmup_steps = hp.Int("warmup_steps", min_value=2000, max_value=7000, default=4000, step=500)
        batch_size = hp.Int("batch_size", min_value=16, max_value=128, step=16)
        beta_1 = hp.Float("beta_1", min_value=0.74, max_value=0.93, default= .8, step=0.0025)
        beta_2 = hp.Float("beta_2", min_value=0.95, max_value=0.99, default= .98, step=0.005)
        epsilon = hp.Float("epsilon", min_value=1e-9, max_value=5e-7, default= 1e-8, sampling="log")
        
    if(hp_type == "arch" or hp_type == "base"):
        hp.Choice("activation", values=['relu', 'swish'], default='relu')
        hp.Boolean("sequential", default=True)
        
    return hp

In [88]:
def get_tuner(hp, tune_new=True, trials=10, tuner_num=tuner_num, update=False, hp_type=hp_type):
    if(update):
        hp = update_hp(hp)
    return keras_tuner.BayesianOptimization(
        hypermodel=MyHyperModel(),
        hyperparameters = hp,
        tune_new_entries = tune_new,
        objective="val_accuracy",
        max_trials=trials,
        overwrite=False,
        directory="tuners",
        project_name=f'tuner{tuner_num}',
    )

In [89]:
class_weights = class_weight.compute_class_weight(class_weight='balanced',
                                                  classes=np.unique(y_train),
                                                  y=y_train) 
class_weights=dict(zip(np.unique(y_train), class_weights))

my_callbacks = tf.keras.callbacks.EarlyStopping(
    monitor="val_accuracy", 
    min_delta=0.05, patience=3,
    verbose=2, baseline=0.40, start_from_epoch=1)

In [93]:
tuner_num = 4
hp_type = ""

base_num = 10
layer_num = 6
learn_num = 6
arch_num = 3

iterations = 1
num_trials = iterations*(layer_num+learn_num+arch_num)+base_num

assert base_num > 9
print(f'Number of trials: {num_trials}')

Number of trials: 25


In [94]:
base_hp = create_hp(hp_type="base")
layer_hp = create_hp(hp_type="layer")
learn_hp = create_hp(hp_type="learn")
arch_hp = create_hp(hp_type="arch")

In [None]:
#hp_type="base"
#base_tuner = get_tuner(hp=base_hp, trials=base_num, tuner_num=tuner_num)

# run 10 models to get a starting point
#base_tuner.search(x=X_train, y=y_train, epochs=3, validation_data=(X_val, y_val), 
             #class_weight=class_weights)

# run 4 * 15 models: 6 focused on tuning layer values, 6 focused on tuning learn values, 
# and 3 focused on changing model architecture
for i in range(1, iterations+1):
    layer_trial = layer_num * i + base_num
    learn_trial = learn_num * i + base_num + layer_trial
    arch_trial  = arch_num  * i + base_num + learn_trial
    
    hp_type="layer"
    layer_tuner = get_tuner(hp=layer_hp, tune_new=False, trials=layer_trial, update=True, hp_type=hp_type)
    layer_tuner.search(x=X_train, y=y_train, epochs=3, validation_data=(X_val, y_val), 
             class_weight=class_weights)
    hp_type="learn"
    learn_tuner = get_tuner(hp=learn_hp, tune_new=False, trials=learn_trial, update=True, hp_type=hp_type)
    learn_tuner.search(x=X_train, y=y_train, epochs=3, validation_data=(X_val, y_val), 
             class_weight=class_weights)
    hp_type="arch"
    arch_tuner = get_tuner(hp=arch_hp, tune_new=False, trials=arch_trial, update=True, hp_type=hp_type)
    arch_tuner.search(x=X_train, y=y_train, epochs=3, validation_data=(X_val, y_val), 
             class_weight=class_weights)

Trial 13 Complete [00h 00m 17s]
val_accuracy: 0.3762102425098419

Best val_accuracy So Far: 0.46565237641334534
Total elapsed time: 00h 02m 37s

Search: Running Trial #14

Value             |Best Value So Far |Hyperparameter
1                 |1                 |num_layers
112               |160               |d_model
1152              |576               |dff
8                 |14                |num_heads
0.225             |0.125             |dropout_rate
7000              |6000              |warmup_steps
32                |16                |batch_size
0.775             |0.8275            |beta_1
0.955             |0.96              |beta_2
9.5557e-08        |2.0502e-08        |epsilon
swish             |swish             |activation
False             |False             |sequential

Epoch 1/3
134/597 [=====>........................] - ETA: 18s - loss: 2.6659 - accuracy: 0.1369