<a href="https://colab.research.google.com/github/ayyucedemirbas/MixtureOfExperts/blob/main/MoE_implementation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import tensorflow as tf
from tensorflow.keras import layers
import numpy as np

In [2]:
class Expert(layers.Layer):
    """A single expert: one fully connected layer with ReLU activation.

    Args:
        units: Output width of the expert's dense layer.
        **kwargs: Forwarded unchanged to the base ``Layer`` constructor.
    """

    def __init__(self, units=32, **kwargs):
        super().__init__(**kwargs)
        # Kept so get_config() can report the constructor argument.
        self.units = units
        self.dense = layers.Dense(units, activation='relu')

    def call(self, inputs):
        """Apply the expert's dense layer to ``inputs`` and return the result."""
        return self.dense(inputs)

    def get_config(self):
        """Return the base layer config extended with ``units``.

        Defining this explicitly lets the layer be rebuilt from its config
        with the same width instead of falling back to the default.
        """
        config = super().get_config()
        config.update({'units': self.units})
        return config

In [3]:
class MoELayer(layers.Layer):
    """Mixture-of-experts layer with top-k gating.

    A dense gate produces one logit per expert; the logits are softmaxed and
    the ``k`` largest probabilities select which expert outputs are combined.
    Every expert is evaluated on the full input (the selection only affects
    which outputs enter the weighted sum), and the selected softmax
    probabilities are used as weights as-is, without renormalising them over
    the top-k.

    Args:
        num_experts: Number of ``Expert`` sub-layers to create.
        k: How many of the highest-probability experts to combine per input.
        units: Output width of every expert.
        **kwargs: Forwarded unchanged to the base ``Layer`` constructor.

    Raises:
        ValueError: If ``k`` is not in the range ``1..num_experts``.
    """

    def __init__(self, num_experts=3, k=2, units=32, **kwargs):
        super().__init__(**kwargs)
        # Fail early with a clear message; otherwise an invalid k only
        # surfaces later as an error from tf.nn.top_k inside call().
        if not 1 <= k <= num_experts:
            raise ValueError(
                f"k must satisfy 1 <= k <= num_experts, "
                f"got k={k}, num_experts={num_experts}"
            )
        self.num_experts = num_experts
        self.k = k
        self.units = units
        self.experts = [Expert(units) for _ in range(num_experts)]
        self.gate = layers.Dense(num_experts)

    def call(self, inputs):
        """Return the gate-weighted sum of the top-k expert outputs."""
        # Gate logits -> probabilities over the experts (last axis)
        gate_logits = self.gate(inputs)
        gate_probs = tf.nn.softmax(gate_logits, axis=-1)

        # Probabilities and indices of the k most likely experts
        top_k_values, top_k_indices = tf.nn.top_k(gate_probs, k=self.k)

        # Run every expert and stack the results with experts on the last axis
        expert_outputs = tf.stack(
            [expert(inputs) for expert in self.experts], axis=-1
        )

        # Pick the selected experts from the last axis. batch_dims=-1 treats
        # all leading axes of the indices as batch axes, so each input row
        # gathers with its own indices.
        top_k_expert_outputs = tf.gather(
            expert_outputs, top_k_indices, axis=-1, batch_dims=-1
        )

        # Insert an axis so the weights broadcast across the units axis,
        # then sum the weighted outputs over the k selected experts.
        weights = tf.expand_dims(top_k_values, -2)
        return tf.reduce_sum(top_k_expert_outputs * weights, axis=-1)

    def get_config(self):
        """Return the base layer config extended with the constructor args."""
        config = super().get_config()
        config.update({
            'num_experts': self.num_experts,
            'k': self.k,
            'units': self.units,
        })
        return config


In [4]:
def build_moe_model(input_shape, num_experts=3, k=2, units=32):
    """Assemble a model: input -> MoELayer -> single-unit Dense head.

    Args:
        input_shape: Shape passed to ``layers.Input`` (batch axis excluded).
        num_experts: Forwarded to ``MoELayer``.
        k: Forwarded to ``MoELayer``.
        units: Forwarded to ``MoELayer``.

    Returns:
        The resulting ``tf.keras.Model``; it is not compiled here.
    """
    model_input = layers.Input(shape=input_shape)

    moe = MoELayer(num_experts=num_experts, k=k, units=units)
    moe_features = moe(model_input)

    # One linear output unit on top of the mixture output
    prediction = layers.Dense(1)(moe_features)

    return tf.keras.Model(inputs=model_input, outputs=prediction)

In [5]:
# Synthetic data: 1000 samples of 10 uniform-random features with
# uniform-random scalar targets (NumPy is seeded so the arrays repeat).
np.random.seed(42)
n_samples, n_features = 1000, 10
x_train = np.random.rand(n_samples, n_features)
y_train = np.random.rand(n_samples, 1)

# MoE model with four experts of width 64, combining the top two per input
moe_model = build_moe_model(
    input_shape=(n_features,),
    num_experts=4,
    k=2,
    units=64,
)
moe_model.compile(loss='mse', optimizer='adam')

# Fit for 10 epochs using mini-batches of 32
moe_model.fit(x_train, y_train, batch_size=32, epochs=10)


Epoch 1/10
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 40ms/step - loss: 0.1385
Epoch 2/10
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.0839
Epoch 3/10
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0868
Epoch 4/10
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0891
Epoch 5/10
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.0840
Epoch 6/10
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0840
Epoch 7/10
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0859
Epoch 8/10
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0821
Epoch 9/10
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0810
Epoch 10/10
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.0828


<keras.src.callbacks.history.History at 0x7f1c38ee08e0>

In [6]:
# Print the Keras summary of the model built above
moe_model.summary()