In [7]:
!pip install keras-tuner
import numpy as np
import tensorflow as tf
import keras_tuner as kt
from sklearn.model_selection import train_test_split
from collections import Counter



In [8]:
##############################################################################
# 1) Custom Layers for Indices
##############################################################################
class PatchIndex(tf.keras.layers.Layer):
    """Produce per-sample patch position indices.

    The layer ignores the values of its input and only reads the size of the
    leading (batch) axis. It returns an integer tensor of shape
    ``(batch, num_patches)`` in which every row is ``[0, 1, ..., num_patches-1]``.
    The result is intended to be fed to an ``Embedding`` layer to look up
    positional embeddings.

    Args:
        num_patches: Number of positions to enumerate per sample.
        **kwargs: Forwarded to ``tf.keras.layers.Layer``.
    """
    def __init__(self, num_patches, **kwargs):
        super().__init__(**kwargs)
        self.num_patches = num_patches

    def call(self, x):
        # Batch size is read dynamically so the layer works with any batch size.
        batch_size = tf.shape(x)[0]
        positions = tf.expand_dims(tf.range(self.num_patches), 0)  # (1, num_patches)
        return tf.tile(positions, [batch_size, 1])                 # (batch, num_patches)

    def get_config(self):
        # `num_patches` is a constructor argument, so it must be part of the
        # config; otherwise a saved model containing this layer cannot be
        # re-created (older Keras releases refuse to save it at all).
        config = super().get_config()
        config.update({'num_patches': self.num_patches})
        return config

class ClassTokenIndex(tf.keras.layers.Layer):
    """Emit a single index ``0`` per sample, i.e. a ``(batch, 1)`` integer tensor.

    Only the leading (batch) dimension of the input is used; the output is
    meant to index a one-row ``Embedding`` that holds the class token.
    """
    def call(self, x):
        n_rows = tf.shape(x)[0]
        # tf.range(1) == [0]; reshape to a (1, 1) template and repeat per sample.
        zero_template = tf.expand_dims(tf.range(1), 0)
        return tf.tile(zero_template, [n_rows, 1])

In [9]:
##############################################################################
# 2) Warmup + CosineDecay Learning Rate
##############################################################################
def create_warmup_cosine_lr(initial_lr, warmup_steps, total_steps):
    """Build a warmup + cosine-decay learning-rate schedule.

    Composite schedule:
      - Warmup from 0 -> initial_lr over `warmup_steps`
      - Cosine decay from initial_lr -> 0 over `(total_steps - warmup_steps)`

    The previous implementation passed a plain Python closure to
    ``tf.keras.optimizers.schedules.LearningRateSchedule``. That base class is
    abstract and does not accept a callable, and the closure branched with a
    Python ``if`` on a tensor, which is not usable inside a traced graph.
    This factory now returns an instance of the ``WarmupCosineSchedule``
    subclass defined in this notebook, which expresses the same schedule with
    tensor ops.

    Args:
        initial_lr: Peak learning rate reached at the end of warmup.
        warmup_steps: Number of optimizer steps spent warming up.
        total_steps: Total number of optimizer steps covered by the schedule.

    Returns:
        A ``LearningRateSchedule`` instance usable as an optimizer's
        ``learning_rate``.
    """
    # Name is resolved at call time, so the class only needs to be defined
    # before this factory is *called*, not before it is defined.
    return WarmupCosineSchedule(initial_lr, warmup_steps, total_steps)


In [10]:
class WarmupCosineSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Linear warmup followed by cosine decay.

    1) Warmup from 0 to initial_lr over warmup_steps
    2) Cosine decay from initial_lr down to 0 over (total_steps - warmup_steps).
    3) After total_steps the learning rate stays at 0 (the decay progress is
       clamped to 1), so `total_steps` should cover the whole training run.

    Args:
        initial_lr: Peak learning rate reached at the end of warmup.
        warmup_steps: Number of steps of linear warmup (>= 0).
        total_steps: Step at which the cosine decay reaches 0
            (>= warmup_steps).

    Raises:
        ValueError: If `warmup_steps` is negative or `total_steps` is smaller
            than `warmup_steps`.
    """
    def __init__(self, initial_lr, warmup_steps, total_steps):
        super().__init__()
        self.initial_lr = initial_lr
        self.warmup_steps = float(warmup_steps)
        self.total_steps = float(total_steps)
        if self.warmup_steps < 0:
            raise ValueError(f"warmup_steps must be >= 0, got {warmup_steps}")
        if self.total_steps < self.warmup_steps:
            raise ValueError(
                f"total_steps ({total_steps}) must be >= warmup_steps ({warmup_steps})"
            )

    def __call__(self, step):
        step = tf.cast(step, tf.float32)

        # Floor both spans at 1 step so neither branch can divide by zero
        # (warmup_steps == 0, or total_steps == warmup_steps).
        warmup_span = max(self.warmup_steps, 1.0)
        decay_span = max(self.total_steps - self.warmup_steps, 1.0)

        # Warmup phase: linear ramp 0 -> initial_lr.
        warmup_lr = self.initial_lr * (step / warmup_span)

        # Cosine decay phase. Without the clamp, progress > 1 makes the cosine
        # wrap around and the learning rate climbs again after total_steps.
        progress = tf.clip_by_value(
            (step - self.warmup_steps) / decay_span, 0.0, 1.0
        )
        cosine_lr = 0.5 * self.initial_lr * (1.0 + tf.cos(np.pi * progress))

        # Both branches are finite, so an element-wise select is safe here.
        return tf.where(step < self.warmup_steps, warmup_lr, cosine_lr)

    def get_config(self):
        # Keys mirror the constructor arguments so the default
        # `from_config` (cls(**config)) can rebuild the schedule.
        return {
            'initial_lr': self.initial_lr,
            'warmup_steps': self.warmup_steps,
            'total_steps': self.total_steps
        }


In [11]:
##############################################################################
# 3) Hybrid CNN-Transformer with More Depth & LR Schedule
##############################################################################
def build_deeper_hybrid(hp, warmup_steps=2000, total_steps=20000):
    """Build and compile the hybrid CNN + attention classifier for Keras Tuner.

    Architecture (input shape ``(6, 7, 2)``, 7 output classes):
      1. CNN embedding: 2-3 blocks of two 3x3 'same' convolutions whose width
         grows as ``embed_dim * (block_index + 1)``, then a 1x1 convolution
         back to ``embed_dim`` and a reshape to ``(42, embed_dim)`` tokens.
      2. Token projection to ``hidden_dim`` plus a learned positional
         embedding, with a learned class token prepended.
      3. 2-4 pre-norm multi-head self-attention layers with residual
         connections.
      4. LayerNorm on the class token followed by a softmax classifier.

    NOTE(review): the attention layers have no feed-forward (MLP) sub-layer.
    The original code defined an unused ``mlp_dim = hidden_dim``, which
    suggests one was planned. The architecture is left unchanged here so that
    existing tuner results stay comparable -- confirm whether this is
    intended.

    Args:
        hp: ``keras_tuner.HyperParameters`` used to sample
            ``num_conv_blocks``, ``embed_dim``, ``hidden_dim``,
            ``num_layers``, ``num_heads``, ``dropout_rate`` and ``base_lr``.
        warmup_steps: Warmup length (optimizer steps) of the LR schedule.
        total_steps: Total length (optimizer steps) of the LR schedule.
            Should match the number of steps the model is trained for.

    Returns:
        A compiled ``tf.keras.Model`` (Adam optimizer, sparse categorical
        cross-entropy loss, accuracy metric).
    """
    n, m = 6, 7
    num_patches = n * m
    num_classes = 7

    # Hyperparameters
    num_conv_blocks = hp.Int('num_conv_blocks', min_value=2, max_value=3)
    embed_dim       = hp.Choice('embed_dim', [32, 64, 128])
    hidden_dim      = hp.Choice('hidden_dim', [128, 256, 512])
    num_layers      = hp.Int('num_layers', min_value=2, max_value=4)
    num_heads       = hp.Choice('num_heads', [2, 4, 8])
    dropout_rate    = hp.Choice('dropout_rate', [0.0, 0.1, 0.2])
    base_lr         = hp.Choice('base_lr', [1e-4, 3e-4, 1e-3])

    # Per-head sizes; every hidden_dim choice is divisible by every num_heads choice.
    key_dim   = hidden_dim // num_heads
    value_dim = key_dim * 2

    inp = tf.keras.layers.Input(shape=(n, m, 2))

    # 1) CNN Embedding
    x = inp
    for i in range(num_conv_blocks):
        filters = embed_dim * (i + 1)
        x = tf.keras.layers.Conv2D(filters=filters, kernel_size=3, padding='same', activation='relu')(x)
        x = tf.keras.layers.Conv2D(filters=filters, kernel_size=3, padding='same', activation='relu')(x)

    # 'same' padding keeps the n x m grid, so each grid cell becomes one token.
    x = tf.keras.layers.Conv2D(embed_dim, kernel_size=1, activation='relu')(x)
    x = tf.keras.layers.Reshape((num_patches, embed_dim))(x)

    # 2) Attention layers
    x = tf.keras.layers.Dense(hidden_dim)(x)
    patch_idx = PatchIndex(num_patches)(x)
    pos_emb = tf.keras.layers.Embedding(input_dim=num_patches, output_dim=hidden_dim)(patch_idx)
    x = tf.keras.layers.Add()([x, pos_emb])

    # Learned class token, prepended so it sits at sequence position 0.
    cls_idx = ClassTokenIndex()(x)
    cls_token = tf.keras.layers.Embedding(input_dim=1, output_dim=hidden_dim)(cls_idx)
    x = tf.keras.layers.Concatenate(axis=1)([cls_token, x])

    for _ in range(num_layers):
        normed = tf.keras.layers.LayerNormalization()(x)
        attn_out = tf.keras.layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=key_dim, value_dim=value_dim, dropout=dropout_rate
        )(normed, normed, normed)
        x = tf.keras.layers.Add()([x, attn_out])

    # Classify from the class token only.
    cls_out = x[:, 0, :]
    cls_out = tf.keras.layers.LayerNormalization()(cls_out)
    # Softmax output = class probabilities (not logits); this matches the
    # loss below, which uses its default from_logits=False.
    class_probs = tf.keras.layers.Dense(num_classes, activation='softmax')(cls_out)

    model = tf.keras.models.Model(inp, class_probs)

    # 3) LR Schedule
    lr_schedule = WarmupCosineSchedule(base_lr, warmup_steps, total_steps)
    optimizer   = tf.keras.optimizers.Adam(lr_schedule)

    model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

In [None]:
##############################################################################
# 4) Tuning & Training Example
##############################################################################
if __name__ == "__main__":
    # Local imports keep this cell self-contained.
    import os
    from pathlib import Path

    # ---- Run configuration -------------------------------------------------
    BATCH_SIZE = 128
    SEARCH_EPOCHS = 10      # Hyperband max epochs per trial
    FINAL_EPOCHS = 60       # epochs for the final training run
    MAX_WARMUP_STEPS = 2000

    # Locations can be overridden through environment variables; the defaults
    # are the paths this notebook was originally run with.
    data_dir = Path(os.environ.get(
        "CONNECT4_DATA_DIR",
        "C:/Users/argon/Documents/Desktop Prime/MS Business Analytics/Spring Semester/Optimization - II/Connect 4 Project",
    ))
    ckpt_path = os.environ.get("CONNECT4_CKPT_PATH", '/content/best_hybrid_model_fixed_2.keras')

    # ---- Data --------------------------------------------------------------
    full_comb_X = np.load(data_dir / "final_deduplicated_X.npy")
    full_comb_Y = np.load(data_dir / "final_deduplicated_Y.npy")

    X_train, X_val, y_train, y_val = train_test_split(full_comb_X, full_comb_Y, test_size=0.15, shuffle=True, random_state=42)

    # ---- Hyperparameter search ----------------------------------------------
    # To discard results of an earlier search, delete the 'deeper_hybrid_tuner'
    # directory (or pass overwrite=True) before running.
    tuner = kt.Hyperband(
        build_deeper_hybrid,
        objective='val_accuracy',
        max_epochs=SEARCH_EPOCHS,
        factor=3,
        directory='deeper_hybrid_tuner',
        project_name='cnn_transformer_improved'
    )

    # NOTE(review): with patience=10 and at most 10 epochs per trial this
    # callback cannot fire during the search; kept to preserve search behavior.
    search_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)

    tuner.search(X_train, y_train, validation_data=(X_val, y_val), epochs=SEARCH_EPOCHS, batch_size=BATCH_SIZE, callbacks=[search_stop], shuffle=True)

    best_hps = tuner.get_best_hyperparameters(1)[0]

    # Print all available hyperparameters before accessing them
    print("\nAvailable hyperparameters:", best_hps.values)

    # Report the tuned hyperparameters individually
    print("\nBest Hyperparams found by Hyperband:")
    print(" - num_conv_blocks=", best_hps.get('num_conv_blocks'))
    print(" - embed_dim     =", best_hps.get('embed_dim'))
    print(" - hidden_dim    =", best_hps.get('hidden_dim'))
    print(" - num_layers    =", best_hps.get('num_layers'))
    print(" - num_heads     =", best_hps.get('num_heads'))
    print(" - dropout_rate  =", best_hps.get('dropout_rate'))
    print(" - base_lr       =", best_hps.get('base_lr'))

    # ---- Final training ------------------------------------------------------
    best_model = tuner.hypermodel.build(best_hps)

    # The builder compiles with a schedule of fixed length. Re-compile with a
    # schedule that spans this run's actual number of optimizer steps so the
    # cosine decay ends when training ends. Loss and metrics mirror the
    # builder's compile() call and must be kept in sync with it.
    steps_per_epoch = -(-len(X_train) // BATCH_SIZE)  # ceil division
    final_total_steps = steps_per_epoch * FINAL_EPOCHS
    final_warmup_steps = min(MAX_WARMUP_STEPS, final_total_steps // 10)
    best_model.compile(
        optimizer=tf.keras.optimizers.Adam(
            WarmupCosineSchedule(best_hps.get('base_lr'), final_warmup_steps, final_total_steps)
        ),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )

    # Fresh callback for the final run. restore_best_weights makes the
    # evaluation below report the best epoch rather than the last one when
    # early stopping triggers.
    final_stop = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss', patience=10, restore_best_weights=True)
    ckpt_cb = tf.keras.callbacks.ModelCheckpoint(
        ckpt_path, monitor='val_loss', save_best_only=True)

    final_history = best_model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=FINAL_EPOCHS, batch_size=BATCH_SIZE, callbacks=[final_stop, ckpt_cb], shuffle=True)

    val_loss, val_acc = best_model.evaluate(X_val, y_val)
    print(f"\nFinal validation accuracy: {val_acc:.4f}")



Search: Running Trial #1

Value             |Best Value So Far |Hyperparameter
3                 |3                 |num_conv_blocks
128               |128               |embed_dim
512               |512               |hidden_dim
3                 |3                 |num_layers
4                 |4                 |num_heads
0                 |0                 |dropout_rate
0.0001            |0.0001            |base_lr
2                 |2                 |tuner/epochs
0                 |0                 |tuner/initial_epoch
2                 |2                 |tuner/bracket
0                 |0                 |tuner/round

