In [1]:
import glob
import html
import json
import math
import os
import time
from collections import defaultdict
from dataclasses import asdict, dataclass
from datetime import datetime

import torch
import torch.nn.functional as F
import wandb
from dotenv import load_dotenv
from torch import nn
from transformers import AutoTokenizer

from config import ModelConf, TrainConf
from helpers.dataset import load_shard_as_dataloader
from helpers.logging import get_gradient_stats
from helpers.memory import check_memory, profile_memory
from helpers.moe_utils import check_cosine_similarity
from moe import OlmoeModel
from train import train


# Snapshot of per-GPU memory before the model is built; compare with the second
# check_memory() call made right after the model is created.
check_memory()



Device 0: NVIDIA H200
  Allocated: 0.00 GB
  Reserved: 0.00 GB
  Total: 139.83 GB

Device 1: NVIDIA H200
  Allocated: 0.00 GB
  Reserved: 0.00 GB
  Total: 139.83 GB

Device 2: NVIDIA H200
  Allocated: 0.00 GB
  Reserved: 0.00 GB
  Total: 139.83 GB

Device 3: NVIDIA H200
  Allocated: 0.00 GB
  Reserved: 0.00 GB
  Total: 139.83 GB



In [5]:
import torch
from helpers.moe_utils import differentiable_gram_schmidt
# Sanity test for differentiable_gram_schmidt: the rows of the output must be
# mutually orthogonal, and with keep_magnitude=True each row is expected to keep
# the norm of the corresponding input row (the recorded run of this cell shows
# identical Gram-matrix diagonals for input and output).
n_experts = 3
D = 4

# Local, seeded generator and explicit float32: the input is reproducible, the
# global RNG state is untouched, and the tolerances below do not depend on
# whatever torch.set_default_dtype() is in effect when the cell is run.
test_rng = torch.Generator().manual_seed(0)
x = torch.randn(n_experts, D, generator=test_rng, dtype=torch.float32)

ortho_x = differentiable_gram_schmidt(x, keep_magnitude=True, use_random_order=True)

gram_matrix = ortho_x @ ortho_x.T
gram_matrix_x = x @ x.T

# the non-diagonal elements should be close to 0
off_diagonal = gram_matrix - torch.diag(torch.diagonal(gram_matrix))
torch.testing.assert_close(
    off_diagonal, torch.zeros_like(off_diagonal), atol=1e-4, rtol=0.0
)

# keep_magnitude=True: per-row norms should match the input's.
torch.testing.assert_close(
    ortho_x.norm(dim=-1), x.norm(dim=-1), atol=1e-4, rtol=1e-4
)

# Kept for visual inspection of the before/after tensors.
print(gram_matrix)
print(gram_matrix_x)

print(ortho_x)
print(x)


tensor([[ 3.5339e+00, -1.3411e-07,  0.0000e+00],
        [-1.3411e-07,  1.4727e+00,  2.3842e-07],
        [ 0.0000e+00,  2.3842e-07,  3.6907e+00]])
tensor([[ 3.5339,  1.4842, -0.4625],
        [ 1.4842,  1.4727,  1.3688],
        [-0.4625,  1.3688,  3.6907]])
tensor([[-1.8224,  0.3410, -0.0952, -0.2958],
        [ 0.1713,  0.2342, -1.0960, -0.4328],
        [ 0.3380,  0.6748,  0.8158, -1.5670]])
tensor([[-1.8224,  0.3410, -0.0952, -0.2958],
        [-0.5618,  0.5124, -0.0708, -0.9432],
        [ 0.5737,  0.6246,  0.8216, -1.5154]])


In [2]:
# Model hyperparameters for this run. Field meanings are defined by ModelConf
# (see config.py); the groupings below are for readability only.
model_conf = ModelConf(
    # Sizes (D is presumably the model dimension, cf. the scaling notes below -- confirm in config.py)
    D=768,
    H=8,
    I=512,
    n_layers=10,
    max_position_embeddings=2048,
    # Mixture-of-experts routing
    n_experts=30,
    n_shared_experts=2,
    top_k=4,
    norm_topk_prob=False,
    # Router orthogonalisation options (keep_magnitude / use_random_order carry the
    # same names as the differentiable_gram_schmidt arguments)
    gate_orthogonal=False,
    keep_magnitude=True,
    use_random_order=True,
    # Misc
    is_freeze_weights=False,
    main_device='cuda:0',
)

# Training hyperparameters; everything not listed keeps the TrainConf default.
train_conf = TrainConf(router_cos_loss_coef=0.01)

# Shared RNG seed: passed to torch.manual_seed() before model init and to train().
seed = 1234

In [3]:
"""
Load the model.
- The device on which the non-expert layers live is passed explicitly as `primary_device`
  (the experts are placed according to `expert_device_map` on model init).
- Set the default dtype to specify the model dtype: all params will be in this dtype except
  for those explicitly declared with a different dtype in their class definition.
  - In the default OLMoE, RMSNorm is required to be f32 whereas all other params are bf16.
"""
# NOTE: torch.set_default_device(model_conf.main_device) is buggy here, don't use it;
# devices are handed to OlmoeModel explicitly below instead.
torch.set_default_dtype(torch.bfloat16)
torch.set_float32_matmul_precision('medium') # See https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html
# Seed before constructing the model so that weight initialisation is reproducible.
torch.manual_seed(seed)

model = OlmoeModel(
    model_conf,
    primary_device = model_conf.main_device, # Where to store dense layers and shared experts
    expert_device_map = [model_conf.main_device] * model_conf.n_experts # One device per expert; for this test all experts sit on the main device
)
# `model` is rebound to the compiled wrapper; every later cell uses the compiled module.
model = torch.compile(model)
tokenizer = AutoTokenizer.from_pretrained('allenai/OLMoE-1B-7B-0924', add_eos_token = False, add_bos_token = False)
print(f"Total parameters: {sum(p.numel() for p in model.parameters()):,}")
# Per-GPU memory after model creation.
check_memory()

Total parameters: 478,609,152
Device 0: NVIDIA H200
  Allocated: 0.89 GB
  Reserved: 1.14 GB
  Total: 139.83 GB

Device 1: NVIDIA H200
  Allocated: 0.00 GB
  Reserved: 0.00 GB
  Total: 139.83 GB

Device 2: NVIDIA H200
  Allocated: 0.00 GB
  Reserved: 0.00 GB
  Total: 139.83 GB

Device 3: NVIDIA H200
  Allocated: 0.00 GB
  Reserved: 0.00 GB
  Total: 139.83 GB



# some data
Default setting: 


    1.02 vs 0.0027
    D 768-> 768*4 : 0.530 vs 0.001
    n_experts 30-> 120 : 4.305 vs 0.010

    D 768-> 768/4 : 2.166 vs 0.0054
    n_experts 30-> 8: 0.261 vs 0.00089

    D 768-> 768*4 and n_experts 30-> 120: 2.151 vs 0.0052
    D 768-> 768/4 and n_experts 30-> 8:  0.476 vs 0.0018

- Roughly linear dependency on the number of experts
- $O(1/\sqrt{d})$ dependency on the dimension $D$

For reference, the LM loss is ~11 and the aux loss is 2~4

In [4]:
"""
Set up a Wandb run for logging. Choose a run name and notes for the run!
"""
RUN_NAME = 'test'
RUN_NOTES = 'None'

# The API key is read from the environment; secrets.env is loaded first so the
# key never has to be written into the notebook.
load_dotenv('./../../secrets.env')
wandb.login(key = os.getenv('WANDB_API_KEY'))
run = wandb.init(
    project = 'interpretable-moes', 
    name = RUN_NAME,
    notes = RUN_NOTES,
    # Flat merge of both configs; on a key collision the train_conf value wins.
    config = {**asdict(model_conf), **asdict(train_conf)}
)

# (Optional) Also log various info as a wandb media object.
additional_log_notes = {
    'run_name': RUN_NAME,
    'notes': RUN_NOTES,
    'created_at': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    'total_model_params': sum(p.numel() for p in model.parameters()),
    'available_cuda_gpus': [torch.cuda.get_device_properties(i).name for i in range(torch.cuda.device_count())],
    'model_conf': asdict(model_conf),
    'train_conf': asdict(train_conf)
}

# The JSON text is embedded in HTML markup, so '<', '>' and '&' coming from free-text
# fields (e.g. RUN_NOTES) must be escaped or they would corrupt the rendered panel.
# quote=False keeps the JSON double quotes readable; they are harmless inside <pre>.
conf_json = html.escape(json.dumps(additional_log_notes, indent = 2), quote = False)
wandb.log({'conf': wandb.Html(f"<pre style='font-size:12px;'>{conf_json}</pre>")})

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33myuanbo096[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [4]:
# Validation dataloader built from the single validation shard. seq_len equals
# model_conf.max_position_embeddings (2048); the tokenizer's EOS id is passed as
# the separator id.
val_shard_path = './../../data/val_shard.json'
val_dl = load_shard_as_dataloader(
    val_shard_path,
    tokenizer,
    batch_size=32,
    seq_len=2048,
    eos_seperator_id=tokenizer.eos_token_id,
)

In [5]:
# Run training and always close the wandb run afterwards. Without the guard, an
# exception or a manual interrupt inside train() (see the KeyboardInterrupt in the
# recorded output) skips wandb.finish() and leaves the run open.
try:
    train(model, tokenizer, train_conf, model_conf, val_dl, seed, save_dir = 'test')
except BaseException:
    # BaseException so that KeyboardInterrupt is covered too; the non-zero exit
    # code marks the run as failed instead of finished. The error is re-raised
    # unchanged so the notebook still shows the original traceback.
    wandb.finish(exit_code = 1)
    raise
else:
    wandb.finish()

Found 1946 shards.

=== Loading shard ./../../data/train_shard_0.json (index 0) ===


ERROR:tornado.general:SEND Error: Host unreachable


KeyboardInterrupt: 