In [1]:
import torch
from torch import nn
import torch.nn.functional as F
from dotenv import load_dotenv
import wandb
import math
from helpers.memory import check_memory, profile_memory
from helpers.logging import get_gradient_stats
from helpers.moe_utils import check_cosine_similarity
from helpers.dataset import load_shard_as_dataloader
from dataclasses import dataclass, asdict
import time
from collections import defaultdict
import os
import glob 
import json
from datetime import datetime
from transformers import AutoTokenizer

from config import ModelConf, TrainConf
from moe import OlmoeModel
from train import train


# Print per-GPU allocated / reserved / total memory as a baseline before the model is built.
check_memory()

Device 0: NVIDIA H200
  Allocated: 0.00 GB
  Reserved: 0.00 GB
  Total: 139.83 GB

Device 1: NVIDIA H200
  Allocated: 0.00 GB
  Reserved: 0.00 GB
  Total: 139.83 GB

Device 2: NVIDIA H200
  Allocated: 0.00 GB
  Reserved: 0.00 GB
  Total: 139.83 GB

Device 3: NVIDIA H200
  Allocated: 0.00 GB
  Reserved: 0.00 GB
  Total: 139.83 GB



In [2]:
# Seed used for torch.manual_seed at model construction and passed on to train().
seed = 1234

# Loss coefficient for the router cosine term (name suggests a router cosine-similarity loss — TODO confirm in train.py).
train_conf = TrainConf(router_cos_loss_coef = 0.01)

model_conf = ModelConf(
    # Backbone sizes (see ModelConf for the exact meaning of D / H / I).
    D = 768,
    H = 8,
    I = 512,
    n_layers = 10,
    max_position_embeddings = 2048,
    # Expert layout and routing.
    n_experts = 30,
    n_shared_experts = 2,
    top_k = 4,
    norm_topk_prob = False,
    # Router / gate options.
    gate_orthogonal = True,
    is_freeze_weights = False,
    # Device used as primary_device when the model is built.
    main_device = 'cuda:0',
)

In [3]:
""" 
Let's load the model
- Set the default_device to specify where all the non-expert layers live (the experts are moved on model init)
- Set the default_dtype to specify the model dtype, all params will be in this dtype except for this explicitly specified differently in class definition
  - In the default OlMoE, RMSNorm is required to be f32 whereas all other params are bf16. 
"""
# torch.set_default_device(conf.main_device) # This is buggy, don't use
torch.set_default_dtype(torch.bfloat16)
torch.set_float32_matmul_precision('medium') # See https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html 
torch.manual_seed(seed)

model = OlmoeModel(
    model_conf,
    primary_device = model_conf.main_device, # Where to store dense layers and shared experts
    expert_device_map = [model_conf.main_device] * model_conf.n_experts #=, here let's test them with all of them on cuda:0
)
model = torch.compile(model)
tokenizer = AutoTokenizer.from_pretrained('allenai/OLMoE-1B-7B-0924', add_eos_token = False, add_bos_token = False)
print(f"Total parameters: {sum(p.numel() for p in model.parameters()):,}")
check_memory()

Total parameters: 478,609,152
Device 0: NVIDIA H200
  Allocated: 0.89 GB
  Reserved: 1.14 GB
  Total: 139.83 GB

Device 1: NVIDIA H200
  Allocated: 0.00 GB
  Reserved: 0.00 GB
  Total: 139.83 GB

Device 2: NVIDIA H200
  Allocated: 0.00 GB
  Reserved: 0.00 GB
  Total: 139.83 GB

Device 3: NVIDIA H200
  Allocated: 0.00 GB
  Reserved: 0.00 GB
  Total: 139.83 GB



# some data
Default setting: 


    1.02 vs 0.0027
    D 768-> 768*4 : 0.530 vs 0.001
    n_experts 30-> 120 : 4.305 vs 0.010

    D 768-> 768/4 : 2.166 vs 0.0054
    n_experts 30-> 8: 0.261 vs 0.00089

    D 768-> 768*4 and n_experts 30-> 120: 2.151 vs 0.0052
    D 768-> 768/4 and n_experts 30-> 8:  0.476 vs 0.0018

Observed scaling: roughly linear dependence on the number of experts,
and $O(1/\sqrt{d})$ dependence on the dimension $d$ (`D`).

For reference, the LM loss is ~11, and the aux loss is 2–4.

In [4]:
"""
Setup a Wandb run for logging. Choose a run name and notes for the run!
"""
RUN_NAME = 'test-01 -single-gpu -experts-32 -topk-4 -forward-slow'
RUN_NOTES = 'Baseline test with routing orthogonal initialization and no gate update'

load_dotenv('./../../secrets.env')
wandb.login(key = os.getenv('WANDB_API_KEY'))
run = wandb.init(
    project = 'interpretable-moes', 
    name = RUN_NAME,
    notes = RUN_NOTES,
    config = {**asdict(model_conf), **asdict(train_conf)}
)

# (Optional) Also log various info as a wandb media object.
additional_log_notes = {
    'run_name': RUN_NAME,
    'notes': RUN_NOTES,
    'created_at': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    'total_model_params': sum(p.numel() for p in model.parameters()),
    'available_cuda_gpus': [torch.cuda.get_device_properties(i).name for i in range(torch.cuda.device_count())],
    'model_conf': asdict(model_conf),
    'train_conf': asdict(train_conf)
}

wandb.log({'conf': wandb.Html(f"<pre style='font-size:12px;'>{json.dumps(additional_log_notes, indent = 2)}</pre>")})

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33myuanbo096[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [5]:
# Build the validation dataloader from the validation shard file.
# seq_len here equals model_conf.max_position_embeddings (2048); the tokenizer's EOS id is passed as the separator id.
# NOTE(review): `eos_seperator_id` (sic) is the keyword the helper is called with — rename it in helpers.dataset
# and here together, not at this call site alone.
val_dl = load_shard_as_dataloader(
    './../../data/val_shard.json',
    tokenizer,
    batch_size = 32,
    seq_len = 2048,
    eos_seperator_id = tokenizer.eos_token_id
)

In [7]:
# Run training, then close the W&B run.
# The run is closed even when training fails or the cell is interrupted (KeyboardInterrupt is a
# BaseException, hence the broad clause). In that case it is finished with a non-zero exit code so
# it is recorded as failed rather than left open; the original exception is re-raised unchanged.
try:
    train(model, tokenizer, train_conf, model_conf, val_dl, seed)
except BaseException:
    wandb.finish(exit_code = 1)
    raise
else:
    wandb.finish()

Found 3 shards.

=== Loading shard ./../../data/train_shard_0.json (index 0) ===


W0221 18:55:36.566000 13222 torch/_dynamo/variables/tensor.py:776] [0/0] Graph break from `Tensor.item()`, consider setting:
W0221 18:55:36.566000 13222 torch/_dynamo/variables/tensor.py:776] [0/0]     torch._dynamo.config.capture_scalar_outputs = True
W0221 18:55:36.566000 13222 torch/_dynamo/variables/tensor.py:776] [0/0] or:
W0221 18:55:36.566000 13222 torch/_dynamo/variables/tensor.py:776] [0/0]     env TORCHDYNAMO_CAPTURE_SCALAR_OUTPUTS=1
W0221 18:55:36.566000 13222 torch/_dynamo/variables/tensor.py:776] [0/0] to include these operations in the captured graph.
W0221 18:55:36.566000 13222 torch/_dynamo/variables/tensor.py:776] [0/0] 
W0221 18:55:36.566000 13222 torch/_dynamo/variables/tensor.py:776] [0/0] Graph break: from user code at:
W0221 18:55:36.566000 13222 torch/_dynamo/variables/tensor.py:776] [0/0]   File "/workspace/interpretable-moes/experiments_cli/base_olmoe_cos_loss/moe.py", line 679, in forward
W0221 18:55:36.566000 13222 torch/_dynamo/variables/tensor.py:776] [0/0]

Step 0: avg_loss=11.0751 | fwd_time=12.86s | bwd_time=8.84s | batch_time = 22.07 | lr=2.4e-04
Step 1: avg_loss=10.4715 | fwd_time=1.13s | bwd_time=3.39s | batch_time = 4.80 | lr=2.4e-04
Step 2: avg_loss=9.9102 | fwd_time=1.14s | bwd_time=3.34s | batch_time = 4.76 | lr=2.5e-04
Step 3: avg_loss=9.6184 | fwd_time=1.14s | bwd_time=3.34s | batch_time = 4.77 | lr=2.5e-04
Step 4: avg_loss=9.4199 | fwd_time=1.14s | bwd_time=3.35s | batch_time = 4.77 | lr=2.5e-04
Step 5: avg_loss=9.2567 | fwd_time=1.13s | bwd_time=3.34s | batch_time = 4.77 | lr=2.5e-04
Step 6: avg_loss=9.0935 | fwd_time=1.14s | bwd_time=3.32s | batch_time = 4.75 | lr=2.5e-04
Step 7: avg_loss=8.9047 | fwd_time=1.14s | bwd_time=3.32s | batch_time = 4.74 | lr=2.6e-04
Step 8: avg_loss=8.7626 | fwd_time=1.13s | bwd_time=3.30s | batch_time = 4.72 | lr=2.6e-04
Step 9: avg_loss=8.6098 | fwd_time=1.13s | bwd_time=3.30s | batch_time = 4.72 | lr=2.6e-04
Step 10: avg_loss=8.4577 | fwd_time=1.13s | bwd_time=3.24s | batch_time = 4.65 | lr=2.

KeyboardInterrupt: 

In [9]:
# Export this notebook to a plain script, without prompts and without markdown cells.
# NOTE(review): `{your_notebook_name}` is an unfilled placeholder — replace it with this notebook's file name
# (or define a Python variable with that name so IPython interpolates it). As recorded, the command only
# printed nbconvert's help text.
!jupyter nbconvert --to script "{your_notebook_name}.ipynb" --no-prompt --TemplateExporter.exclude_markdown=True

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


This application is used to convert notebook files (*.ipynb)
        to various other formats.


Options
The options below are convenience aliases to configurable class-options,
as listed in the "Equivalent to" description-line of the aliases.
To see all configurable class-options for some <cmd>, use:
    <cmd> --help-all

--debug
    set log level to logging.DEBUG (maximize logging output)
    Equivalent to: [--Application.log_level=10]
--show-config
    Show the application's configuration (human-readable format)
    Equivalent to: [--Application.show_config=True]
--show-config-json
    Show the application's configuration (json format)
    Equivalent to: [--Application.show_config_json=True]
--generate-config
    generate default config file
    Equivalent to: [--JupyterApp.generate_config=True]
-y
    Answer yes to any questions instead of prompting.
    Equivalent to: [--JupyterApp.answer_yes=True]
--execute
    Execute the notebook prior to export.
    Equivalent to: [--ExecutePr