## Encoder Speed

In [1]:
import time
import torch as t
import torch.nn as nn

In [3]:
# Sizes used by the encoder-speed benchmark cells that follow.
batch_size = 2 ** 10   # 1024 rows in the random input batch
input_dim = 2 ** 9     # 512 input features per row
dict_size = 10 ** 6    # 1_000_000 output features for the large linear layer

In [4]:
# Standard-normal input batch of shape (batch_size, input_dim), copied to the GPU.
x = t.randn((batch_size, input_dim)).to("cuda")

In [5]:
# Two linear layers sharing the same input width; the small one has 1/16 of
# the large one's output width. Both are placed on the GPU.
W_small = nn.Linear(in_features=input_dim, out_features=dict_size // 16).to("cuda")
W_big = nn.Linear(in_features=input_dim, out_features=dict_size).to("cuda")

In [20]:
# Time a single forward pass of W_small on x with CUDA events.
# Autograd is switched off so the measurement covers the forward computation
# only and no graph state is kept alive by the benchmark.
with t.no_grad():
    # Warm-up run. The result is not bound to a name, so its memory is
    # released immediately and can be reused by the timed run below.
    W_small(x)

    # Ensure the warm-up has finished before timing starts
    t.cuda.synchronize()

    # Time the operation
    start = t.cuda.Event(enable_timing=True)
    end = t.cuda.Event(enable_timing=True)

    start.record()
    output = W_small(x)
    end.record()

    # Both events must have completed before elapsed_time() can be read
    t.cuda.synchronize()

elapsed_time = start.elapsed_time(end)  # milliseconds
print(f"Forward pass took {elapsed_time:.3f} milliseconds")


Forward pass took 3.243 milliseconds


In [21]:
# Time a single forward pass of W_big on x with CUDA events.
# Autograd is switched off so the measurement covers the forward computation
# only and no graph state is kept alive by the benchmark.
with t.no_grad():
    # Warm-up run. The result (batch_size x dict_size floats) is not bound to
    # a name, so its memory is released immediately and the timed run below
    # can reuse it instead of needing a second buffer of the same size.
    W_big(x)

    # Ensure the warm-up has finished before timing starts
    t.cuda.synchronize()

    # Time the operation
    start = t.cuda.Event(enable_timing=True)
    end = t.cuda.Event(enable_timing=True)

    start.record()
    output = W_big(x)
    end.record()

    # Both events must have completed before elapsed_time() can be read
    t.cuda.synchronize()

elapsed_time = start.elapsed_time(end)  # milliseconds
print(f"Forward pass took {elapsed_time:.3f} milliseconds")


Forward pass took 55.207 milliseconds


## MoE Library

In [10]:
import torch as t
from torch import nn
from mixture_of_experts import MoE


In [11]:
# MoE layer with a single expert (num_experts=1).
moe1 = MoE(
    dim=768,
    num_experts=1,                  # number of experts held by the layer
    hidden_dim=32 * 768,            # hidden width of every expert (library default: 4 * dim)
    activation=nn.ReLU,             # replaces the default GELU
    # Top-2 gating: policy deciding whether the second-place expert is used.
    # Options: all (always) | none (never) | threshold (gate value > threshold)
    #          | random (gate value > threshold * random_uniform(0, 1))
    second_policy_train='random',
    second_policy_eval='random',
    second_threshold_train=0.2,
    second_threshold_eval=0.2,
    # Experts have a fixed capacity per batch; the factors should be >= 1 to
    # leave headroom when the gating is not perfectly balanced.
    capacity_factor_train=1.25,
    capacity_factor_eval=2.0,
    loss_coef=0.01,                 # multiplier on the auxiliary expert-balancing loss
)


In [17]:
# MoE layer with four experts (num_experts=4).
moe4 = MoE(
    dim=768,
    num_experts=4,                  # number of experts held by the layer
    hidden_dim=32 * 768,            # hidden width of every expert (library default: 4 * dim)
    activation=nn.ReLU,             # replaces the default GELU
    # Top-2 gating: policy deciding whether the second-place expert is used.
    # Options: all (always) | none (never) | threshold (gate value > threshold)
    #          | random (gate value > threshold * random_uniform(0, 1))
    second_policy_train='random',
    second_policy_eval='random',
    second_threshold_train=0.2,
    second_threshold_eval=0.2,
    # Experts have a fixed capacity per batch; the factors should be >= 1 to
    # leave headroom when the gating is not perfectly balanced.
    capacity_factor_train=1.25,
    capacity_factor_eval=2.0,
    loss_coef=0.01,                 # multiplier on the auxiliary expert-balancing loss
)


In [12]:
# MoE layer with sixteen experts (num_experts=16).
moe16 = MoE(
    dim=768,
    num_experts=16,                 # number of experts held by the layer
    hidden_dim=32 * 768,            # hidden width of every expert (library default: 4 * dim)
    activation=nn.ReLU,             # replaces the default GELU
    # Top-2 gating: policy deciding whether the second-place expert is used.
    # Options: all (always) | none (never) | threshold (gate value > threshold)
    #          | random (gate value > threshold * random_uniform(0, 1))
    second_policy_train='random',
    second_policy_eval='random',
    second_threshold_train=0.2,
    second_threshold_eval=0.2,
    # Experts have a fixed capacity per batch; the factors should be >= 1 to
    # leave headroom when the gating is not perfectly balanced.
    capacity_factor_train=1.25,
    capacity_factor_eval=2.0,
    loss_coef=0.01,                 # multiplier on the auxiliary expert-balancing loss
)

In [21]:
# MoE layer with sixty-four experts (num_experts=64).
moe64 = MoE(
    dim=768,
    num_experts=64,                 # number of experts held by the layer
    hidden_dim=32 * 768,            # hidden width of every expert (library default: 4 * dim)
    activation=nn.ReLU,             # replaces the default GELU
    # Top-2 gating: policy deciding whether the second-place expert is used.
    # Options: all (always) | none (never) | threshold (gate value > threshold)
    #          | random (gate value > threshold * random_uniform(0, 1))
    second_policy_train='random',
    second_policy_eval='random',
    second_threshold_train=0.2,
    second_threshold_eval=0.2,
    # Experts have a fixed capacity per batch; the factors should be >= 1 to
    # leave headroom when the gating is not perfectly balanced.
    capacity_factor_train=1.25,
    capacity_factor_eval=2.0,
    loss_coef=0.01,                 # multiplier on the auxiliary expert-balancing loss
)

In [24]:
# MoE layer with 1024 experts (num_experts=1024).
moe1024 = MoE(
    dim=768,
    num_experts=1024,               # number of experts held by the layer
    hidden_dim=32 * 768,            # hidden width of every expert (library default: 4 * dim)
    activation=nn.ReLU,             # replaces the default GELU
    # Top-2 gating: policy deciding whether the second-place expert is used.
    # Options: all (always) | none (never) | threshold (gate value > threshold)
    #          | random (gate value > threshold * random_uniform(0, 1))
    second_policy_train='random',
    second_policy_eval='random',
    second_threshold_train=0.2,
    second_threshold_eval=0.2,
    # Experts have a fixed capacity per batch; the factors should be >= 1 to
    # leave headroom when the gating is not perfectly balanced.
    capacity_factor_train=1.25,
    capacity_factor_eval=2.0,
    loss_coef=0.01,                 # multiplier on the auxiliary expert-balancing loss
)

In [None]:
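## Are the weights stored on CPU? None of the MoE(...) calls above is followed by .cuda()/.to(device), so presumably yes — verify with next(moe1.parameters()).device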

In [14]:
# Standard-normal test input for the MoE layers; the last axis (768) matches
# their `dim`. Leading axes are presumably (batch, tokens) — TODO confirm.
inputs = t.randn((1, 8192, 768))

In [18]:
# Time a single forward pass of moe1.
# Neither the module nor `inputs` is given an explicit device where they are
# created, so the benchmark follows whatever device the weights are on instead
# of assuming CUDA: wall-clock timing is valid on CPU and GPU alike, and the
# CUDA synchronisation is only issued when the model really runs on a GPU.
device = next(moe1.parameters()).device
moe_inputs = inputs.to(device)  # returns `inputs` itself when already on `device`
on_cuda = device.type == "cuda"

# Autograd is switched off so only the forward computation is measured.
with t.no_grad():
    # Warm-up run; the result is discarded immediately
    moe1(moe_inputs)

    # Ensure all CUDA operations are completed before the clock starts
    if on_cuda:
        t.cuda.synchronize(device)

    # Time the operation
    start = time.perf_counter()
    out, aux_loss = moe1(moe_inputs)

    # Ensure all CUDA operations are completed before the clock stops
    if on_cuda:
        t.cuda.synchronize(device)
    end = time.perf_counter()

elapsed_time = (end - start) * 1000.0  # seconds -> milliseconds
print(f"Forward pass took {elapsed_time:.3f} milliseconds")


Forward pass took 3704.199 milliseconds


In [19]:
# Time a single forward pass of moe4.
# Neither the module nor `inputs` is given an explicit device where they are
# created, so the benchmark follows whatever device the weights are on instead
# of assuming CUDA: wall-clock timing is valid on CPU and GPU alike, and the
# CUDA synchronisation is only issued when the model really runs on a GPU.
device = next(moe4.parameters()).device
moe_inputs = inputs.to(device)  # returns `inputs` itself when already on `device`
on_cuda = device.type == "cuda"

# Autograd is switched off so only the forward computation is measured.
with t.no_grad():
    # Warm-up run; the result is discarded immediately
    moe4(moe_inputs)

    # Ensure all CUDA operations are completed before the clock starts
    if on_cuda:
        t.cuda.synchronize(device)

    # Time the operation
    start = time.perf_counter()
    out, aux_loss = moe4(moe_inputs)

    # Ensure all CUDA operations are completed before the clock stops
    if on_cuda:
        t.cuda.synchronize(device)
    end = time.perf_counter()

elapsed_time = (end - start) * 1000.0  # seconds -> milliseconds
print(f"Forward pass took {elapsed_time:.3f} milliseconds")


Forward pass took 5826.192 milliseconds


In [23]:
# Time a single forward pass of moe16.
# Neither the module nor `inputs` is given an explicit device where they are
# created, so the benchmark follows whatever device the weights are on instead
# of assuming CUDA: wall-clock timing is valid on CPU and GPU alike, and the
# CUDA synchronisation is only issued when the model really runs on a GPU.
device = next(moe16.parameters()).device
moe_inputs = inputs.to(device)  # returns `inputs` itself when already on `device`
on_cuda = device.type == "cuda"

# Autograd is switched off so only the forward computation is measured.
with t.no_grad():
    # Warm-up run; the result is discarded immediately
    moe16(moe_inputs)

    # Ensure all CUDA operations are completed before the clock starts
    if on_cuda:
        t.cuda.synchronize(device)

    # Time the operation
    start = time.perf_counter()
    out, aux_loss = moe16(moe_inputs)

    # Ensure all CUDA operations are completed before the clock stops
    if on_cuda:
        t.cuda.synchronize(device)
    end = time.perf_counter()

elapsed_time = (end - start) * 1000.0  # seconds -> milliseconds
print(f"Forward pass took {elapsed_time:.3f} milliseconds")


Forward pass took 8306.871 milliseconds


In [22]:
# Time a single forward pass of moe64.
# Neither the module nor `inputs` is given an explicit device where they are
# created, so the benchmark follows whatever device the weights are on instead
# of assuming CUDA: wall-clock timing is valid on CPU and GPU alike, and the
# CUDA synchronisation is only issued when the model really runs on a GPU.
device = next(moe64.parameters()).device
moe_inputs = inputs.to(device)  # returns `inputs` itself when already on `device`
on_cuda = device.type == "cuda"

# Autograd is switched off so only the forward computation is measured.
with t.no_grad():
    # Warm-up run; the result is discarded immediately
    moe64(moe_inputs)

    # Ensure all CUDA operations are completed before the clock starts
    if on_cuda:
        t.cuda.synchronize(device)

    # Time the operation
    start = time.perf_counter()
    out, aux_loss = moe64(moe_inputs)

    # Ensure all CUDA operations are completed before the clock stops
    if on_cuda:
        t.cuda.synchronize(device)
    end = time.perf_counter()

elapsed_time = (end - start) * 1000.0  # seconds -> milliseconds
print(f"Forward pass took {elapsed_time:.3f} milliseconds")


Forward pass took 7080.979 milliseconds


In [None]:
# Time a single forward pass of moe1024.
# Neither the module nor `inputs` is given an explicit device where they are
# created, so the benchmark follows whatever device the weights are on instead
# of assuming CUDA: wall-clock timing is valid on CPU and GPU alike, and the
# CUDA synchronisation is only issued when the model really runs on a GPU.
# NOTE(review): with 1024 experts of hidden_dim 768 * 32 this model is very
# large; confirm it fits in memory on the chosen device before running.
device = next(moe1024.parameters()).device
moe_inputs = inputs.to(device)  # returns `inputs` itself when already on `device`
on_cuda = device.type == "cuda"

# Autograd is switched off so only the forward computation is measured.
with t.no_grad():
    # Warm-up run; the result is discarded immediately
    moe1024(moe_inputs)

    # Ensure all CUDA operations are completed before the clock starts
    if on_cuda:
        t.cuda.synchronize(device)

    # Time the operation
    start = time.perf_counter()
    out, aux_loss = moe1024(moe_inputs)

    # Ensure all CUDA operations are completed before the clock stops
    if on_cuda:
        t.cuda.synchronize(device)
    end = time.perf_counter()

elapsed_time = (end - start) * 1000.0  # seconds -> milliseconds
print(f"Forward pass took {elapsed_time:.3f} milliseconds")
