In [16]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import TrainingArguments, Trainer, GPT2Tokenizer, default_data_collator
from typing import Optional
import logging
from torch.utils.tensorboard import SummaryWriter
from model_hf import GPTConfig, GPTLMHeadModel, MoEUsageLoggingCallback
import numpy as np


In [17]:
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("training.log", mode='w'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger('MoELogger')


# Load the GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# Add special tokens, such as PAD token (if not already present)
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})  # Set pad token to eos token



loading file vocab.json from cache at C:\Users\Kosaraju\.cache\huggingface\hub\models--gpt2\snapshots\607a30d783dfa663caf39e06633721c8d4cfcd7e\vocab.json
loading file merges.txt from cache at C:\Users\Kosaraju\.cache\huggingface\hub\models--gpt2\snapshots\607a30d783dfa663caf39e06633721c8d4cfcd7e\merges.txt
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at C:\Users\Kosaraju\.cache\huggingface\hub\models--gpt2\snapshots\607a30d783dfa663caf39e06633721c8d4cfcd7e\tokenizer_config.json
loading file tokenizer.json from cache at C:\Users\Kosaraju\.cache\huggingface\hub\models--gpt2\snapshots\607a30d783dfa663caf39e06633721c8d4cfcd7e\tokenizer.json
loading configuration file config.json from cache at C:\Users\Kosaraju\.cache\huggingface\hub\models--gpt2\snapshots\607a30d783dfa663caf39e06633721c8d4cfcd7e\config.json
Model config GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": 

In [5]:
# Prepare the model configuration
block_size = 128

config = GPTConfig(
    block_size=block_size,  # Small block size for debugging
    vocab_size=tokenizer.vocab_size,  # Small vocab size for debugging
    n_layer=4,      # Few layers for faster training
    n_head=4,
    n_embd=256,
    use_moe=True,  # Enable Mixture of Experts
    num_experts=5,
    num_experts_per_tok=2,
    moe_loss=True,
    moe_loss_type = "entropy_regularization",  # Type of load balancing loss "variance_penalty", "entropy_regularization", "diversity_regularization"
    moe_loss_coef = 1e0, 
)

# Initialize the model
model = GPTLMHeadModel(config)

GPT has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


Using Mixture of Experts (MoE) in MLP
Using Mixture of Experts (MoE) in MLP
Using Mixture of Experts (MoE) in MLP
Using Mixture of Experts (MoE) in MLP
Number of parameters: 24.44M


In [7]:
model = GPTLMHeadModel.from_pretrained(pretrained_model_name_or_path="./results", local_files_only=True, device_map='cuda')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

Using Mixture of Experts (MoE) in MLP
Using Mixture of Experts (MoE) in MLP
Using Mixture of Experts (MoE) in MLP
Using Mixture of Experts (MoE) in MLP
Number of parameters: 37.31M


GPTLMHeadModel(
  (transformer): ModuleDict(
    (wte): Embedding(50257, 256)
    (wpe): Embedding(128, 256)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0-3): 4 x Block(
        (ln_1): LayerNorm()
        (attn): CausalSelfAttention(
          (c_attn): Linear(in_features=256, out_features=768, bias=True)
          (c_proj): Linear(in_features=256, out_features=256, bias=True)
          (attn_dropout): Dropout(p=0.0, inplace=False)
          (resid_dropout): Dropout(p=0.0, inplace=False)
        )
        (ln_2): LayerNorm()
        (mlp): MoE(
          (experts): ModuleList(
            (0-4): 5 x MLP(
              (c_fc): Linear(in_features=256, out_features=1024, bias=True)
              (gelu): GELU(approximate='none')
              (c_proj): Linear(in_features=1024, out_features=256, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (gate): Linear(in_features=256, out_features=5, bias=False)
        )

In [2]:
import torch
from transformers import GPT2Tokenizer
import logging
from model_hf import GPTLMHeadModel

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("training.log", mode='w'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger('MoELogger')

# Load the GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Add special tokens, such as PAD token (if not already present)
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})  # Set pad token to eos token

# Load the fine-tuned model (from your results folder)
model = GPTLMHeadModel.from_pretrained(pretrained_model_name_or_path="./results", local_files_only=True, device_map='cuda')

# Set the device (use GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Input prompt for text generation
input_text = "To be, or not to be, that is the question:"

# Encode the input text to get input IDs
input_ids = torch.tensor(tokenizer.encode(input_text), device='cuda')[None,...]
# Generate text with the model
generated_ids = model.generate(
    input_ids,
    max_length=100,        # Maximum number of tokens to generate
    temperature=0.7,       # Controls randomness in generation (lower is more deterministic)
    top_k=50,              # Keep only top k tokens with highest probability
    top_p=0.95,            # Nucleus sampling: focus on the top p cumulative probability
    repetition_penalty=1.2,  # Discourage repetition
    do_sample=True,        # Sample from the distribution instead of taking argmax
    num_return_sequences=1  # Number of sequences to generate
)

# Decode the generated text back to readable format
# generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)

print("Generated Text:")
print(generated_text)


Using Mixture of Experts (MoE) in MLP
Using Mixture of Experts (MoE) in MLP
Using Mixture of Experts (MoE) in MLP
Using Mixture of Experts (MoE) in MLP
Number of parameters: 37.31M
Generated Text:
To be, or not to be, that is the question:
ClAnd will upon:Iee you Now good,A and out spirit sleep othersOf too of sovereign,But speak noble.
Second: Henry again earthIs so.
Firstman
 Citizen
ICH:Ay to up the of by own!
INUS
o have person so;For might him do thee
T to his, your,He no state as bear unto fair tongueWhich you, my,'d


In [10]:
# Load the GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Add special tokens, such as PAD token (if not already present)
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})  # Set pad token to eos token

# Load the fine-tuned model
model = GPTLMHeadModel.from_pretrained(pretrained_model_name_or_path="./results", local_files_only=True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Input prompt for text generation
input_text = "To be, or not to be, that is the question:"

# Encode the input text to get input IDs
input_ids = tokenizer.encode(input_text, return_tensors='pt').to(device)
if input_ids.size(1) > model.config.block_size:
    input_ids = input_ids[:, :model.config.block_size]
# Generate text with the model
generated_ids = model.generate(
    input_ids,
    max_length=model.config.block_size+1,        # Maximum number of tokens to generate
    temperature=0.9,       # Controls randomness in generation (lower is more deterministic)
    top_k=5000,              # Keep only top k tokens with highest probability
    top_p=0.99,            # Nucleus sampling: focus on the top p cumulative probability
    repetition_penalty=1.2,  # Discourage repetition
    do_sample=True,        # Sample from the distribution instead of taking argmax
    num_return_sequences=1  # Number of sequences to generate
)

# Decode the generated text back to readable format
generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

print("Generated Text:")
print(generated_text)


Using Mixture of Experts (MoE) in MLP
Using Mixture of Experts (MoE) in MLP
Using Mixture of Experts (MoE) in MLP
Using Mixture of Experts (MoE) in MLP
Number of parameters: 24.44M
Generated Text:
To be, or not to be, that is the question:
Thirdman
 bear jumps impartialWh her him us him Angelo
 your a and ina for head--As I.
All
Third:The for rough this Cam,. victory
 yourru,.Upon- queen but how.
 high.
Prov:This; honour no,First put another
 his, fellow
 ' to g by himself
 so and is What
ad like
.
aw.What this Can unly You thou,.3 HRY for as first part him again
Prov:It be, manily A judgmentWereI


In [11]:
# Load the GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Add special tokens, such as PAD token (if not already present)
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})  # Set pad token to eos token

# Load the fine-tuned model
model = GPTLMHeadModel.from_pretrained(pretrained_model_name_or_path="./results", local_files_only=True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Input prompt for text generation
input_text = "To be, or not to be, that is the question:"

# Encode the input text to get input IDs
input_ids = tokenizer.encode(input_text, return_tensors='pt').to(device)

# Generate text with the model using cache (past_key_values)
generated_ids = model.generate(
    input_ids,
    max_length=model.config.block_size + 50,  # Generate up to 50 tokens beyond block size
    temperature=0.9,       # Controls randomness in generation (lower is more deterministic)
    top_k=50,              # Keep only top k tokens with highest probability
    top_p=0.95,            # Nucleus sampling: focus on the top p cumulative probability
    repetition_penalty=1.2,  # Discourage repetition
    do_sample=True,        # Sample from the distribution instead of taking argmax
    num_return_sequences=1,  # Number of sequences to generate
    use_cache=True         # Enable past_key_values to allow generation beyond block_size
)

# Decode the generated text back to readable format
generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

print("Generated Text:")
print(generated_text)


Using Mixture of Experts (MoE) in MLP
Using Mixture of Experts (MoE) in MLP
Using Mixture of Experts (MoE) in MLP
Using Mixture of Experts (MoE) in MLP
Number of parameters: 24.44M


AssertionError: Cannot forward sequence of length 129, block size is only 128

# trainign

In [1]:
import torch
from torch.utils.data import Dataset
from transformers import TrainingArguments, Trainer, GPT2Tokenizer, default_data_collator
from model_hf import GPTConfig, GPTLMHeadModel, MoEUsageLoggingCallback
import numpy as np
import logging
from omegaconf import OmegaConf


In [2]:
# Define the NumpyMemmapDataset class
class NumpyMemmapDataset(Dataset):
    """
    PyTorch Dataset for loading numpy memmapped arrays efficiently.
    """
    def __init__(self, filename="data/mock_train.bin", block_size=1024, device="cpu", iterate='random', 
                 return_labels=False, eval_data=False, eval_samples=1000):
        """
        Args:
        - filename (str): Path to the binary file.
        - block_size (int): Length of the sequence.
        - device (str): Device to move the tensors ('cpu' or 'cuda').
        - iterate (str): 'random' or 'linear'. If 'random', samples random slices from the dataset.
        - return_labels (bool): Whether to return labels (used for GPT-2 LMHeadModel).
        - eval_data (bool): If True, dataset works in evaluation mode and samples a limited number of sequences.
        - eval_samples (int): Number of samples for evaluation mode (if eval_data=True).
        """
        self.data = np.memmap(filename, dtype=np.uint16, mode='r')  # Load binary file with memmap
        self.max_len = len(self.data)
        self.block_size = block_size
        self.iterate = iterate
        self.device = torch.device(device)
        self.return_labels = return_labels
        self.eval_data = eval_data
        self.eval_samples = eval_samples

    def __len__(self):
        # If evaluation mode, return eval_samples, otherwise return full length
        return self.eval_samples if self.eval_data else self.max_len // self.block_size

    def __getitem__(self, index):
        # Handle random sampling if 'iterate' is set to 'random'
        if self.iterate == 'random':
            index = torch.randint(0, self.max_len // self.block_size, (1,)).item()

        idx = index * self.block_size
        x = torch.from_numpy(self.data[idx:idx + self.block_size].astype(np.int64))
        y = torch.from_numpy(self.data[idx + 1:idx + 1 + self.block_size].astype(np.int64))

        if self.device.type == 'cuda':
            # Pin memory for GPU asynchronous transfer
            x = x.pin_memory().to(self.device, non_blocking=True)
            y = y.pin_memory().to(self.device, non_blocking=True)

        # If `return_labels` is true, return both input and target (for GPT training)
        if self.return_labels:
            return {"input_ids": x, "labels": y}
        return x


logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("training.log", mode='w'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger('MoELogger')

In [None]:
# Load config using OmegaConf
config_file = 'config.yaml'  # Path to your config file
cfg = OmegaConf.load(config_file)

# Load the GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Add special tokens, such as PAD token (if not already present)
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})  # Set pad token to eos token


block_size = 128

# Prepare the model configuration
model_config = GPTConfig(**cfg.model)
model = GPTLMHeadModel(model_config)
train_dataset = NumpyMemmapDataset(**cfg.dataset.train)
eval_dataset = NumpyMemmapDataset(**cfg.dataset.eval)
training_args = TrainingArguments(**cfg.train.training_args)
# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=default_data_collator
)

# Optional MoE logging callback
moe_logging_callback = MoEUsageLoggingCallback(
    trainer=trainer, logger=logger,**cfg.train.moe_log
)
trainer.add_callback(moe_logging_callback)
# Train the model
trainer.train()

# Save the trained model
trainer.model.save_pretrained(cfg.train.model_save_dir)

GPT has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


Using Mixture of Experts (MoE) in MLP
Using Mixture of Experts (MoE) in MLP
Using Mixture of Experts (MoE) in MLP
Using Mixture of Experts (MoE) in MLP
Number of parameters: 24.41M


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
max_steps is given, it will override any value given in num_train_epochs
Using auto half precision backend
***** Running training *****
  Num examples = 2,359
  Num Epochs = 85,715
  Instantaneous batch size per device = 30
  Total train batch size (w. parallel, distributed & accumulation) = 300
  Gradient Accumulation steps = 10
  Total optimization steps = 600,000
  Number of trainable parameters = 24,440,320


  0%|          | 0/600000 [00:00<?, ?it/s]

  y = F.scaled_dot_product_attention(
2024-10-22 09:22:12,736 - INFO - MoE Layer 1 Usage: Expert 0: 21.09%, Expert 1: 22.10%, Expert 2: 13.25%, Expert 3: 22.28%, Expert 4: 21.28%
2024-10-22 09:22:12,737 - INFO - MoE Layer 2 Usage: Expert 0: 22.25%, Expert 1: 22.40%, Expert 2: 19.73%, Expert 3: 22.45%, Expert 4: 13.17%
2024-10-22 09:22:12,738 - INFO - MoE Layer 3 Usage: Expert 0: 21.33%, Expert 1: 20.86%, Expert 2: 23.40%, Expert 3: 17.23%, Expert 4: 17.18%
2024-10-22 09:22:12,749 - INFO - MoE Layer 4 Usage: Expert 0: 24.09%, Expert 1: 22.09%, Expert 2: 9.04%, Expert 3: 20.45%, Expert 4: 24.33%


Logging MoE usage stats...


2024-10-22 09:22:16,687 - INFO - MoE Layer 1 Usage: Expert 0: 21.39%, Expert 1: 21.86%, Expert 2: 13.39%, Expert 3: 22.06%, Expert 4: 21.31%
2024-10-22 09:22:16,688 - INFO - MoE Layer 2 Usage: Expert 0: 24.28%, Expert 1: 22.17%, Expert 2: 17.73%, Expert 3: 23.73%, Expert 4: 12.09%
2024-10-22 09:22:16,689 - INFO - MoE Layer 3 Usage: Expert 0: 22.54%, Expert 1: 17.57%, Expert 2: 23.42%, Expert 3: 18.78%, Expert 4: 17.68%
2024-10-22 09:22:16,690 - INFO - MoE Layer 4 Usage: Expert 0: 29.62%, Expert 1: 21.12%, Expert 2: 5.88%, Expert 3: 16.17%, Expert 4: 27.21%


Logging MoE usage stats...


2024-10-22 09:22:20,659 - INFO - MoE Layer 1 Usage: Expert 0: 22.20%, Expert 1: 21.79%, Expert 2: 13.51%, Expert 3: 21.65%, Expert 4: 20.86%
2024-10-22 09:22:20,660 - INFO - MoE Layer 2 Usage: Expert 0: 27.66%, Expert 1: 21.32%, Expert 2: 15.00%, Expert 3: 26.16%, Expert 4: 9.86%
2024-10-22 09:22:20,661 - INFO - MoE Layer 3 Usage: Expert 0: 23.93%, Expert 1: 12.04%, Expert 2: 23.26%, Expert 3: 21.58%, Expert 4: 19.19%
2024-10-22 09:22:20,663 - INFO - MoE Layer 4 Usage: Expert 0: 35.12%, Expert 1: 18.67%, Expert 2: 2.04%, Expert 3: 12.04%, Expert 4: 32.13%


Logging MoE usage stats...


2024-10-22 09:22:24,670 - INFO - MoE Layer 1 Usage: Expert 0: 23.15%, Expert 1: 21.92%, Expert 2: 13.83%, Expert 3: 21.08%, Expert 4: 20.02%
2024-10-22 09:22:24,671 - INFO - MoE Layer 2 Usage: Expert 0: 29.25%, Expert 1: 21.04%, Expert 2: 14.65%, Expert 3: 26.82%, Expert 4: 8.25%
2024-10-22 09:22:24,672 - INFO - MoE Layer 3 Usage: Expert 0: 23.62%, Expert 1: 9.02%, Expert 2: 20.49%, Expert 3: 23.83%, Expert 4: 23.03%
2024-10-22 09:22:24,673 - INFO - MoE Layer 4 Usage: Expert 0: 30.72%, Expert 1: 21.29%, Expert 2: 0.28%, Expert 3: 16.25%, Expert 4: 31.46%


Logging MoE usage stats...


2024-10-22 09:22:28,544 - INFO - MoE Layer 1 Usage: Expert 0: 23.70%, Expert 1: 21.94%, Expert 2: 14.47%, Expert 3: 20.63%, Expert 4: 19.26%
2024-10-22 09:22:28,545 - INFO - MoE Layer 2 Usage: Expert 0: 24.32%, Expert 1: 22.92%, Expert 2: 19.52%, Expert 3: 22.59%, Expert 4: 10.65%
2024-10-22 09:22:28,546 - INFO - MoE Layer 3 Usage: Expert 0: 20.81%, Expert 1: 16.48%, Expert 2: 18.05%, Expert 3: 21.86%, Expert 4: 22.80%
2024-10-22 09:22:28,547 - INFO - MoE Layer 4 Usage: Expert 0: 26.24%, Expert 1: 24.55%, Expert 2: 0.01%, Expert 3: 22.07%, Expert 4: 27.14%


Logging MoE usage stats...


2024-10-22 09:22:32,361 - INFO - MoE Layer 1 Usage: Expert 0: 23.06%, Expert 1: 21.32%, Expert 2: 16.01%, Expert 3: 20.44%, Expert 4: 19.17%
2024-10-22 09:22:32,362 - INFO - MoE Layer 2 Usage: Expert 0: 20.53%, Expert 1: 20.15%, Expert 2: 19.85%, Expert 3: 20.56%, Expert 4: 18.91%
2024-10-22 09:22:32,363 - INFO - MoE Layer 3 Usage: Expert 0: 20.48%, Expert 1: 19.08%, Expert 2: 19.29%, Expert 3: 20.70%, Expert 4: 20.45%
2024-10-22 09:22:32,364 - INFO - MoE Layer 4 Usage: Expert 0: 26.20%, Expert 1: 25.44%, Expert 2: 0.00%, Expert 3: 21.55%, Expert 4: 26.82%


Logging MoE usage stats...


2024-10-22 09:22:36,213 - INFO - MoE Layer 1 Usage: Expert 0: 21.39%, Expert 1: 20.53%, Expert 2: 18.29%, Expert 3: 20.06%, Expert 4: 19.73%
2024-10-22 09:22:36,214 - INFO - MoE Layer 2 Usage: Expert 0: 21.72%, Expert 1: 19.63%, Expert 2: 19.21%, Expert 3: 19.51%, Expert 4: 19.93%
2024-10-22 09:22:36,215 - INFO - MoE Layer 3 Usage: Expert 0: 19.79%, Expert 1: 20.46%, Expert 2: 19.83%, Expert 3: 19.92%, Expert 4: 20.00%
2024-10-22 09:22:36,216 - INFO - MoE Layer 4 Usage: Expert 0: 25.59%, Expert 1: 24.83%, Expert 2: 0.00%, Expert 3: 23.93%, Expert 4: 25.65%


Logging MoE usage stats...


2024-10-22 09:22:39,997 - INFO - MoE Layer 1 Usage: Expert 0: 19.96%, Expert 1: 20.20%, Expert 2: 20.14%, Expert 3: 19.48%, Expert 4: 20.23%
2024-10-22 09:22:39,998 - INFO - MoE Layer 2 Usage: Expert 0: 20.30%, Expert 1: 19.89%, Expert 2: 19.80%, Expert 3: 20.26%, Expert 4: 19.75%
2024-10-22 09:22:39,999 - INFO - MoE Layer 3 Usage: Expert 0: 20.05%, Expert 1: 19.49%, Expert 2: 19.75%, Expert 3: 20.02%, Expert 4: 20.68%
2024-10-22 09:22:40,000 - INFO - MoE Layer 4 Usage: Expert 0: 25.05%, Expert 1: 25.21%, Expert 2: 0.00%, Expert 3: 25.04%, Expert 4: 24.71%


Logging MoE usage stats...


2024-10-22 09:22:43,776 - INFO - MoE Layer 1 Usage: Expert 0: 19.83%, Expert 1: 20.39%, Expert 2: 20.48%, Expert 3: 19.22%, Expert 4: 20.08%
2024-10-22 09:22:43,777 - INFO - MoE Layer 2 Usage: Expert 0: 20.74%, Expert 1: 20.36%, Expert 2: 20.13%, Expert 3: 19.38%, Expert 4: 19.38%
2024-10-22 09:22:43,778 - INFO - MoE Layer 3 Usage: Expert 0: 20.30%, Expert 1: 19.90%, Expert 2: 19.46%, Expert 3: 19.71%, Expert 4: 20.63%
2024-10-22 09:22:43,779 - INFO - MoE Layer 4 Usage: Expert 0: 25.51%, Expert 1: 24.92%, Expert 2: 0.00%, Expert 3: 24.70%, Expert 4: 24.87%


Logging MoE usage stats...


2024-10-22 09:22:47,535 - INFO - MoE Layer 1 Usage: Expert 0: 20.24%, Expert 1: 20.35%, Expert 2: 20.20%, Expert 3: 19.47%, Expert 4: 19.75%
2024-10-22 09:22:47,536 - INFO - MoE Layer 2 Usage: Expert 0: 20.86%, Expert 1: 20.01%, Expert 2: 20.08%, Expert 3: 19.81%, Expert 4: 19.25%
2024-10-22 09:22:47,537 - INFO - MoE Layer 3 Usage: Expert 0: 20.15%, Expert 1: 20.29%, Expert 2: 19.64%, Expert 3: 19.51%, Expert 4: 20.42%
2024-10-22 09:22:47,538 - INFO - MoE Layer 4 Usage: Expert 0: 25.19%, Expert 1: 24.98%, Expert 2: 0.00%, Expert 3: 24.69%, Expert 4: 25.15%

***** Running Evaluation *****
  Num examples = 1000
  Batch size = 30


Logging MoE usage stats...


  0%|          | 0/34 [00:00<?, ?it/s]

Saving model checkpoint to ./results/pretraining_moe/checkpoints\checkpoint-100
Configuration saved in ./results/pretraining_moe/checkpoints\checkpoint-100\config.json
Configuration saved in ./results/pretraining_moe/checkpoints\checkpoint-100\generation_config.json
Model weights saved in ./results/pretraining_moe/checkpoints\checkpoint-100\model.safetensors
tokenizer config file saved in ./results/pretraining_moe/checkpoints\checkpoint-100\tokenizer_config.json
Special tokens file saved in ./results/pretraining_moe/checkpoints\checkpoint-100\special_tokens_map.json


{'eval_loss': 9.569494247436523, 'eval_runtime': 0.5094, 'eval_samples_per_second': 1963.261, 'eval_steps_per_second': 66.751, 'epoch': 12.66}


2024-10-22 09:22:52,184 - INFO - MoE Layer 1 Usage: Expert 0: 20.45%, Expert 1: 20.14%, Expert 2: 19.93%, Expert 3: 19.41%, Expert 4: 20.07%
2024-10-22 09:22:52,185 - INFO - MoE Layer 2 Usage: Expert 0: 20.70%, Expert 1: 20.21%, Expert 2: 20.20%, Expert 3: 19.91%, Expert 4: 18.98%
2024-10-22 09:22:52,186 - INFO - MoE Layer 3 Usage: Expert 0: 19.88%, Expert 1: 20.24%, Expert 2: 20.09%, Expert 3: 19.55%, Expert 4: 20.24%
2024-10-22 09:22:52,188 - INFO - MoE Layer 4 Usage: Expert 0: 25.41%, Expert 1: 25.09%, Expert 2: 0.00%, Expert 3: 24.73%, Expert 4: 24.77%


Logging MoE usage stats...


2024-10-22 09:22:55,980 - INFO - MoE Layer 1 Usage: Expert 0: 20.30%, Expert 1: 19.78%, Expert 2: 20.34%, Expert 3: 19.76%, Expert 4: 19.82%
2024-10-22 09:22:55,981 - INFO - MoE Layer 2 Usage: Expert 0: 20.64%, Expert 1: 19.97%, Expert 2: 20.15%, Expert 3: 20.37%, Expert 4: 18.86%
2024-10-22 09:22:55,982 - INFO - MoE Layer 3 Usage: Expert 0: 20.23%, Expert 1: 19.84%, Expert 2: 20.19%, Expert 3: 19.53%, Expert 4: 20.21%
2024-10-22 09:22:55,983 - INFO - MoE Layer 4 Usage: Expert 0: 24.86%, Expert 1: 24.93%, Expert 2: 0.00%, Expert 3: 24.79%, Expert 4: 25.43%


Logging MoE usage stats...


2024-10-22 09:22:59,824 - INFO - MoE Layer 1 Usage: Expert 0: 20.36%, Expert 1: 19.73%, Expert 2: 20.49%, Expert 3: 19.71%, Expert 4: 19.71%
2024-10-22 09:22:59,825 - INFO - MoE Layer 2 Usage: Expert 0: 20.69%, Expert 1: 19.86%, Expert 2: 20.29%, Expert 3: 20.10%, Expert 4: 19.07%
2024-10-22 09:22:59,826 - INFO - MoE Layer 3 Usage: Expert 0: 20.21%, Expert 1: 19.66%, Expert 2: 20.24%, Expert 3: 19.85%, Expert 4: 20.04%
2024-10-22 09:22:59,828 - INFO - MoE Layer 4 Usage: Expert 0: 24.88%, Expert 1: 24.75%, Expert 2: 0.00%, Expert 3: 25.08%, Expert 4: 25.29%


Logging MoE usage stats...


2024-10-22 09:23:03,637 - INFO - MoE Layer 1 Usage: Expert 0: 20.22%, Expert 1: 19.81%, Expert 2: 20.34%, Expert 3: 19.84%, Expert 4: 19.80%
2024-10-22 09:23:03,638 - INFO - MoE Layer 2 Usage: Expert 0: 20.44%, Expert 1: 19.91%, Expert 2: 20.17%, Expert 3: 20.28%, Expert 4: 19.20%
2024-10-22 09:23:03,639 - INFO - MoE Layer 3 Usage: Expert 0: 20.34%, Expert 1: 19.51%, Expert 2: 20.44%, Expert 3: 19.82%, Expert 4: 19.88%
2024-10-22 09:23:03,641 - INFO - MoE Layer 4 Usage: Expert 0: 24.81%, Expert 1: 24.35%, Expert 2: 0.00%, Expert 3: 25.34%, Expert 4: 25.50%


Logging MoE usage stats...


2024-10-22 09:23:07,434 - INFO - MoE Layer 1 Usage: Expert 0: 20.34%, Expert 1: 19.33%, Expert 2: 20.58%, Expert 3: 19.74%, Expert 4: 20.01%
2024-10-22 09:23:07,435 - INFO - MoE Layer 2 Usage: Expert 0: 20.45%, Expert 1: 20.00%, Expert 2: 20.11%, Expert 3: 20.34%, Expert 4: 19.10%
2024-10-22 09:23:07,436 - INFO - MoE Layer 3 Usage: Expert 0: 20.34%, Expert 1: 19.72%, Expert 2: 20.10%, Expert 3: 19.68%, Expert 4: 20.16%
2024-10-22 09:23:07,437 - INFO - MoE Layer 4 Usage: Expert 0: 24.97%, Expert 1: 25.23%, Expert 2: 0.00%, Expert 3: 24.67%, Expert 4: 25.13%


Logging MoE usage stats...


2024-10-22 09:23:11,270 - INFO - MoE Layer 1 Usage: Expert 0: 20.27%, Expert 1: 19.46%, Expert 2: 20.40%, Expert 3: 19.91%, Expert 4: 19.96%
2024-10-22 09:23:11,271 - INFO - MoE Layer 2 Usage: Expert 0: 20.45%, Expert 1: 19.66%, Expert 2: 20.35%, Expert 3: 20.43%, Expert 4: 19.12%
2024-10-22 09:23:11,272 - INFO - MoE Layer 3 Usage: Expert 0: 20.52%, Expert 1: 19.97%, Expert 2: 20.02%, Expert 3: 19.34%, Expert 4: 20.15%
2024-10-22 09:23:11,273 - INFO - MoE Layer 4 Usage: Expert 0: 25.05%, Expert 1: 24.35%, Expert 2: 0.00%, Expert 3: 25.19%, Expert 4: 25.41%


Logging MoE usage stats...


2024-10-22 09:23:15,080 - INFO - MoE Layer 1 Usage: Expert 0: 20.25%, Expert 1: 19.40%, Expert 2: 20.51%, Expert 3: 19.84%, Expert 4: 19.99%
2024-10-22 09:23:15,081 - INFO - MoE Layer 2 Usage: Expert 0: 20.19%, Expert 1: 19.77%, Expert 2: 20.23%, Expert 3: 20.76%, Expert 4: 19.05%
2024-10-22 09:23:15,082 - INFO - MoE Layer 3 Usage: Expert 0: 20.41%, Expert 1: 19.38%, Expert 2: 20.52%, Expert 3: 19.90%, Expert 4: 19.79%
2024-10-22 09:23:15,083 - INFO - MoE Layer 4 Usage: Expert 0: 24.92%, Expert 1: 23.92%, Expert 2: 0.00%, Expert 3: 25.46%, Expert 4: 25.70%


Logging MoE usage stats...


2024-10-22 09:23:18,888 - INFO - MoE Layer 1 Usage: Expert 0: 20.15%, Expert 1: 19.28%, Expert 2: 20.39%, Expert 3: 20.11%, Expert 4: 20.06%
2024-10-22 09:23:18,889 - INFO - MoE Layer 2 Usage: Expert 0: 20.27%, Expert 1: 19.56%, Expert 2: 20.53%, Expert 3: 20.39%, Expert 4: 19.25%
2024-10-22 09:23:18,889 - INFO - MoE Layer 3 Usage: Expert 0: 20.39%, Expert 1: 19.70%, Expert 2: 20.40%, Expert 3: 19.42%, Expert 4: 20.09%
2024-10-22 09:23:18,891 - INFO - MoE Layer 4 Usage: Expert 0: 24.52%, Expert 1: 25.14%, Expert 2: 0.00%, Expert 3: 25.12%, Expert 4: 25.23%


Logging MoE usage stats...


2024-10-22 09:23:22,677 - INFO - MoE Layer 1 Usage: Expert 0: 20.36%, Expert 1: 19.22%, Expert 2: 20.32%, Expert 3: 19.90%, Expert 4: 20.19%
2024-10-22 09:23:22,678 - INFO - MoE Layer 2 Usage: Expert 0: 20.33%, Expert 1: 19.51%, Expert 2: 20.54%, Expert 3: 20.92%, Expert 4: 18.70%
2024-10-22 09:23:22,679 - INFO - MoE Layer 3 Usage: Expert 0: 20.48%, Expert 1: 20.15%, Expert 2: 20.49%, Expert 3: 19.27%, Expert 4: 19.61%
2024-10-22 09:23:22,680 - INFO - MoE Layer 4 Usage: Expert 0: 25.56%, Expert 1: 24.29%, Expert 2: 0.00%, Expert 3: 24.92%, Expert 4: 25.23%


Logging MoE usage stats...


2024-10-22 09:23:26,503 - INFO - MoE Layer 1 Usage: Expert 0: 20.29%, Expert 1: 19.18%, Expert 2: 20.10%, Expert 3: 20.21%, Expert 4: 20.21%
2024-10-22 09:23:26,504 - INFO - MoE Layer 2 Usage: Expert 0: 20.10%, Expert 1: 19.56%, Expert 2: 20.67%, Expert 3: 20.76%, Expert 4: 18.92%
2024-10-22 09:23:26,505 - INFO - MoE Layer 3 Usage: Expert 0: 20.14%, Expert 1: 19.64%, Expert 2: 21.15%, Expert 3: 19.70%, Expert 4: 19.37%
2024-10-22 09:23:26,506 - INFO - MoE Layer 4 Usage: Expert 0: 25.03%, Expert 1: 24.71%, Expert 2: 0.00%, Expert 3: 25.00%, Expert 4: 25.26%

***** Running Evaluation *****
  Num examples = 1000
  Batch size = 30


Logging MoE usage stats...


  0%|          | 0/34 [00:00<?, ?it/s]

Saving model checkpoint to ./results/pretraining_moe/checkpoints\checkpoint-200
Configuration saved in ./results/pretraining_moe/checkpoints\checkpoint-200\config.json
Configuration saved in ./results/pretraining_moe/checkpoints\checkpoint-200\generation_config.json
Model weights saved in ./results/pretraining_moe/checkpoints\checkpoint-200\model.safetensors
tokenizer config file saved in ./results/pretraining_moe/checkpoints\checkpoint-200\tokenizer_config.json
Special tokens file saved in ./results/pretraining_moe/checkpoints\checkpoint-200\special_tokens_map.json


{'eval_loss': 8.01181697845459, 'eval_runtime': 0.5115, 'eval_samples_per_second': 1954.996, 'eval_steps_per_second': 66.47, 'epoch': 25.32}


2024-10-22 09:23:31,118 - INFO - MoE Layer 1 Usage: Expert 0: 20.31%, Expert 1: 18.94%, Expert 2: 20.31%, Expert 3: 20.04%, Expert 4: 20.40%
2024-10-22 09:23:31,118 - INFO - MoE Layer 2 Usage: Expert 0: 19.80%, Expert 1: 19.92%, Expert 2: 20.97%, Expert 3: 20.50%, Expert 4: 18.81%
2024-10-22 09:23:31,119 - INFO - MoE Layer 3 Usage: Expert 0: 20.66%, Expert 1: 20.39%, Expert 2: 20.51%, Expert 3: 18.67%, Expert 4: 19.77%
2024-10-22 09:23:31,121 - INFO - MoE Layer 4 Usage: Expert 0: 25.16%, Expert 1: 25.45%, Expert 2: 0.00%, Expert 3: 24.67%, Expert 4: 24.72%


Logging MoE usage stats...


2024-10-22 09:23:34,972 - INFO - MoE Layer 1 Usage: Expert 0: 20.60%, Expert 1: 19.16%, Expert 2: 20.19%, Expert 3: 20.05%, Expert 4: 20.00%
2024-10-22 09:23:34,973 - INFO - MoE Layer 2 Usage: Expert 0: 20.08%, Expert 1: 19.55%, Expert 2: 20.60%, Expert 3: 21.12%, Expert 4: 18.65%
2024-10-22 09:23:34,974 - INFO - MoE Layer 3 Usage: Expert 0: 19.90%, Expert 1: 20.52%, Expert 2: 21.07%, Expert 3: 19.23%, Expert 4: 19.28%
2024-10-22 09:23:34,975 - INFO - MoE Layer 4 Usage: Expert 0: 25.27%, Expert 1: 25.06%, Expert 2: 0.04%, Expert 3: 24.26%, Expert 4: 25.37%


Logging MoE usage stats...


2024-10-22 09:23:38,880 - INFO - MoE Layer 1 Usage: Expert 0: 20.37%, Expert 1: 19.14%, Expert 2: 20.09%, Expert 3: 20.24%, Expert 4: 20.16%
2024-10-22 09:23:38,881 - INFO - MoE Layer 2 Usage: Expert 0: 20.43%, Expert 1: 19.50%, Expert 2: 20.30%, Expert 3: 21.17%, Expert 4: 18.60%
2024-10-22 09:23:38,882 - INFO - MoE Layer 3 Usage: Expert 0: 19.69%, Expert 1: 20.45%, Expert 2: 21.31%, Expert 3: 19.19%, Expert 4: 19.36%
2024-10-22 09:23:38,883 - INFO - MoE Layer 4 Usage: Expert 0: 25.31%, Expert 1: 24.93%, Expert 2: 0.44%, Expert 3: 24.04%, Expert 4: 25.28%


Logging MoE usage stats...


2024-10-22 09:23:42,784 - INFO - MoE Layer 1 Usage: Expert 0: 20.39%, Expert 1: 18.98%, Expert 2: 20.00%, Expert 3: 20.20%, Expert 4: 20.43%
2024-10-22 09:23:42,785 - INFO - MoE Layer 2 Usage: Expert 0: 20.51%, Expert 1: 19.02%, Expert 2: 20.42%, Expert 3: 21.31%, Expert 4: 18.74%
2024-10-22 09:23:42,785 - INFO - MoE Layer 3 Usage: Expert 0: 19.59%, Expert 1: 20.90%, Expert 2: 21.46%, Expert 3: 18.96%, Expert 4: 19.09%
2024-10-22 09:23:42,787 - INFO - MoE Layer 4 Usage: Expert 0: 24.62%, Expert 1: 24.20%, Expert 2: 2.70%, Expert 3: 23.96%, Expert 4: 24.51%


Logging MoE usage stats...


2024-10-22 09:23:46,672 - INFO - MoE Layer 1 Usage: Expert 0: 20.34%, Expert 1: 19.15%, Expert 2: 20.20%, Expert 3: 20.29%, Expert 4: 20.02%
2024-10-22 09:23:46,673 - INFO - MoE Layer 2 Usage: Expert 0: 20.72%, Expert 1: 18.89%, Expert 2: 19.89%, Expert 3: 21.83%, Expert 4: 18.67%
2024-10-22 09:23:46,674 - INFO - MoE Layer 3 Usage: Expert 0: 19.43%, Expert 1: 21.02%, Expert 2: 21.35%, Expert 3: 19.03%, Expert 4: 19.17%
2024-10-22 09:23:46,675 - INFO - MoE Layer 4 Usage: Expert 0: 20.31%, Expert 1: 19.84%, Expert 2: 18.88%, Expert 3: 20.37%, Expert 4: 20.60%


Logging MoE usage stats...


2024-10-22 09:23:50,531 - INFO - MoE Layer 1 Usage: Expert 0: 20.30%, Expert 1: 18.98%, Expert 2: 20.04%, Expert 3: 20.26%, Expert 4: 20.43%
2024-10-22 09:23:50,532 - INFO - MoE Layer 2 Usage: Expert 0: 20.94%, Expert 1: 18.84%, Expert 2: 19.87%, Expert 3: 21.62%, Expert 4: 18.73%
2024-10-22 09:23:50,533 - INFO - MoE Layer 3 Usage: Expert 0: 19.39%, Expert 1: 21.16%, Expert 2: 21.42%, Expert 3: 19.05%, Expert 4: 18.98%
2024-10-22 09:23:50,535 - INFO - MoE Layer 4 Usage: Expert 0: 20.13%, Expert 1: 20.16%, Expert 2: 16.36%, Expert 3: 22.05%, Expert 4: 21.30%


Logging MoE usage stats...


2024-10-22 09:23:54,457 - INFO - MoE Layer 1 Usage: Expert 0: 20.42%, Expert 1: 19.04%, Expert 2: 20.07%, Expert 3: 20.17%, Expert 4: 20.30%
2024-10-22 09:23:54,458 - INFO - MoE Layer 2 Usage: Expert 0: 21.17%, Expert 1: 18.51%, Expert 2: 19.86%, Expert 3: 21.90%, Expert 4: 18.56%
2024-10-22 09:23:54,459 - INFO - MoE Layer 3 Usage: Expert 0: 18.67%, Expert 1: 21.85%, Expert 2: 21.49%, Expert 3: 18.79%, Expert 4: 19.20%
2024-10-22 09:23:54,461 - INFO - MoE Layer 4 Usage: Expert 0: 20.77%, Expert 1: 20.93%, Expert 2: 17.84%, Expert 3: 20.68%, Expert 4: 19.79%


Logging MoE usage stats...


2024-10-22 09:23:58,487 - INFO - MoE Layer 1 Usage: Expert 0: 20.62%, Expert 1: 19.08%, Expert 2: 19.92%, Expert 3: 20.07%, Expert 4: 20.31%
2024-10-22 09:23:58,488 - INFO - MoE Layer 2 Usage: Expert 0: 21.05%, Expert 1: 18.90%, Expert 2: 19.80%, Expert 3: 21.91%, Expert 4: 18.34%
2024-10-22 09:23:58,489 - INFO - MoE Layer 3 Usage: Expert 0: 18.14%, Expert 1: 21.67%, Expert 2: 22.02%, Expert 3: 19.04%, Expert 4: 19.13%
2024-10-22 09:23:58,490 - INFO - MoE Layer 4 Usage: Expert 0: 20.02%, Expert 1: 20.50%, Expert 2: 17.82%, Expert 3: 21.41%, Expert 4: 20.26%


Logging MoE usage stats...


2024-10-22 09:24:02,407 - INFO - MoE Layer 1 Usage: Expert 0: 20.64%, Expert 1: 18.98%, Expert 2: 19.97%, Expert 3: 20.11%, Expert 4: 20.31%
2024-10-22 09:24:02,408 - INFO - MoE Layer 2 Usage: Expert 0: 21.49%, Expert 1: 18.49%, Expert 2: 19.59%, Expert 3: 21.70%, Expert 4: 18.73%
2024-10-22 09:24:02,409 - INFO - MoE Layer 3 Usage: Expert 0: 17.95%, Expert 1: 22.00%, Expert 2: 22.63%, Expert 3: 18.62%, Expert 4: 18.81%
2024-10-22 09:24:02,410 - INFO - MoE Layer 4 Usage: Expert 0: 20.22%, Expert 1: 20.83%, Expert 2: 17.58%, Expert 3: 21.41%, Expert 4: 19.96%


Logging MoE usage stats...


2024-10-22 09:24:06,396 - INFO - MoE Layer 1 Usage: Expert 0: 20.62%, Expert 1: 19.03%, Expert 2: 20.00%, Expert 3: 19.83%, Expert 4: 20.52%
2024-10-22 09:24:06,397 - INFO - MoE Layer 2 Usage: Expert 0: 21.57%, Expert 1: 18.75%, Expert 2: 19.75%, Expert 3: 21.93%, Expert 4: 18.00%
2024-10-22 09:24:06,398 - INFO - MoE Layer 3 Usage: Expert 0: 17.42%, Expert 1: 22.15%, Expert 2: 23.21%, Expert 3: 18.35%, Expert 4: 18.87%
2024-10-22 09:24:06,399 - INFO - MoE Layer 4 Usage: Expert 0: 20.16%, Expert 1: 20.58%, Expert 2: 17.85%, Expert 3: 21.70%, Expert 4: 19.72%

***** Running Evaluation *****
  Num examples = 1000
  Batch size = 30


Logging MoE usage stats...


  0%|          | 0/34 [00:00<?, ?it/s]

Saving model checkpoint to ./results/pretraining_moe/checkpoints\checkpoint-300
Configuration saved in ./results/pretraining_moe/checkpoints\checkpoint-300\config.json
Configuration saved in ./results/pretraining_moe/checkpoints\checkpoint-300\generation_config.json
Model weights saved in ./results/pretraining_moe/checkpoints\checkpoint-300\model.safetensors
tokenizer config file saved in ./results/pretraining_moe/checkpoints\checkpoint-300\tokenizer_config.json
Special tokens file saved in ./results/pretraining_moe/checkpoints\checkpoint-300\special_tokens_map.json


{'eval_loss': 6.638370037078857, 'eval_runtime': 0.5202, 'eval_samples_per_second': 1922.375, 'eval_steps_per_second': 65.361, 'epoch': 37.97}


Deleting older checkpoint [results\pretraining_moe\checkpoints\checkpoint-100] due to args.save_total_limit
2024-10-22 09:24:11,156 - INFO - MoE Layer 1 Usage: Expert 0: 20.73%, Expert 1: 18.96%, Expert 2: 19.90%, Expert 3: 20.07%, Expert 4: 20.35%
2024-10-22 09:24:11,157 - INFO - MoE Layer 2 Usage: Expert 0: 21.81%, Expert 1: 19.29%, Expert 2: 19.61%, Expert 3: 21.76%, Expert 4: 17.52%
2024-10-22 09:24:11,158 - INFO - MoE Layer 3 Usage: Expert 0: 16.93%, Expert 1: 22.57%, Expert 2: 23.45%, Expert 3: 18.24%, Expert 4: 18.81%
2024-10-22 09:24:11,159 - INFO - MoE Layer 4 Usage: Expert 0: 20.27%, Expert 1: 20.46%, Expert 2: 17.50%, Expert 3: 21.94%, Expert 4: 19.83%


Logging MoE usage stats...


2024-10-22 09:24:15,060 - INFO - MoE Layer 1 Usage: Expert 0: 20.72%, Expert 1: 18.88%, Expert 2: 19.78%, Expert 3: 20.04%, Expert 4: 20.58%
2024-10-22 09:24:15,061 - INFO - MoE Layer 2 Usage: Expert 0: 21.82%, Expert 1: 18.76%, Expert 2: 19.53%, Expert 3: 21.76%, Expert 4: 18.13%
2024-10-22 09:24:15,062 - INFO - MoE Layer 3 Usage: Expert 0: 15.98%, Expert 1: 22.50%, Expert 2: 24.62%, Expert 3: 18.22%, Expert 4: 18.68%
2024-10-22 09:24:15,063 - INFO - MoE Layer 4 Usage: Expert 0: 20.27%, Expert 1: 20.82%, Expert 2: 17.12%, Expert 3: 22.01%, Expert 4: 19.78%


Logging MoE usage stats...


2024-10-22 09:24:18,969 - INFO - MoE Layer 1 Usage: Expert 0: 20.95%, Expert 1: 19.00%, Expert 2: 19.36%, Expert 3: 20.17%, Expert 4: 20.53%
2024-10-22 09:24:18,970 - INFO - MoE Layer 2 Usage: Expert 0: 21.41%, Expert 1: 19.37%, Expert 2: 19.22%, Expert 3: 22.10%, Expert 4: 17.90%
2024-10-22 09:24:18,971 - INFO - MoE Layer 3 Usage: Expert 0: 15.65%, Expert 1: 22.82%, Expert 2: 24.82%, Expert 3: 17.88%, Expert 4: 18.83%
2024-10-22 09:24:18,973 - INFO - MoE Layer 4 Usage: Expert 0: 20.57%, Expert 1: 20.85%, Expert 2: 17.54%, Expert 3: 21.67%, Expert 4: 19.37%


Logging MoE usage stats...


2024-10-22 09:24:22,838 - INFO - MoE Layer 1 Usage: Expert 0: 20.92%, Expert 1: 19.09%, Expert 2: 19.60%, Expert 3: 20.13%, Expert 4: 20.26%
2024-10-22 09:24:22,838 - INFO - MoE Layer 2 Usage: Expert 0: 21.62%, Expert 1: 19.06%, Expert 2: 19.78%, Expert 3: 21.61%, Expert 4: 17.94%
2024-10-22 09:24:22,840 - INFO - MoE Layer 3 Usage: Expert 0: 14.99%, Expert 1: 22.71%, Expert 2: 25.41%, Expert 3: 17.94%, Expert 4: 18.95%
2024-10-22 09:24:22,841 - INFO - MoE Layer 4 Usage: Expert 0: 20.00%, Expert 1: 21.40%, Expert 2: 17.48%, Expert 3: 21.71%, Expert 4: 19.41%


Logging MoE usage stats...


2024-10-22 09:24:26,720 - INFO - MoE Layer 1 Usage: Expert 0: 20.78%, Expert 1: 18.88%, Expert 2: 19.73%, Expert 3: 20.26%, Expert 4: 20.36%
2024-10-22 09:24:26,720 - INFO - MoE Layer 2 Usage: Expert 0: 21.55%, Expert 1: 19.26%, Expert 2: 19.59%, Expert 3: 21.97%, Expert 4: 17.63%
2024-10-22 09:24:26,721 - INFO - MoE Layer 3 Usage: Expert 0: 15.15%, Expert 1: 22.82%, Expert 2: 24.76%, Expert 3: 18.34%, Expert 4: 18.93%
2024-10-22 09:24:26,723 - INFO - MoE Layer 4 Usage: Expert 0: 19.78%, Expert 1: 21.07%, Expert 2: 17.59%, Expert 3: 22.33%, Expert 4: 19.23%


Logging MoE usage stats...


2024-10-22 09:24:30,568 - INFO - MoE Layer 1 Usage: Expert 0: 21.28%, Expert 1: 18.87%, Expert 2: 19.59%, Expert 3: 20.22%, Expert 4: 20.03%
2024-10-22 09:24:30,569 - INFO - MoE Layer 2 Usage: Expert 0: 21.50%, Expert 1: 19.15%, Expert 2: 19.18%, Expert 3: 22.13%, Expert 4: 18.04%
2024-10-22 09:24:30,570 - INFO - MoE Layer 3 Usage: Expert 0: 14.78%, Expert 1: 23.17%, Expert 2: 24.22%, Expert 3: 18.46%, Expert 4: 19.37%
2024-10-22 09:24:30,571 - INFO - MoE Layer 4 Usage: Expert 0: 19.98%, Expert 1: 20.76%, Expert 2: 17.93%, Expert 3: 22.87%, Expert 4: 18.46%


Logging MoE usage stats...


2024-10-22 09:24:34,407 - INFO - MoE Layer 1 Usage: Expert 0: 20.83%, Expert 1: 19.17%, Expert 2: 19.64%, Expert 3: 19.95%, Expert 4: 20.41%
2024-10-22 09:24:34,408 - INFO - MoE Layer 2 Usage: Expert 0: 21.13%, Expert 1: 18.88%, Expert 2: 19.99%, Expert 3: 22.06%, Expert 4: 17.94%
2024-10-22 09:24:34,409 - INFO - MoE Layer 3 Usage: Expert 0: 14.76%, Expert 1: 23.84%, Expert 2: 24.19%, Expert 3: 17.89%, Expert 4: 19.32%
2024-10-22 09:24:34,410 - INFO - MoE Layer 4 Usage: Expert 0: 20.14%, Expert 1: 21.04%, Expert 2: 17.90%, Expert 3: 22.49%, Expert 4: 18.43%


Logging MoE usage stats...


2024-10-22 09:24:38,243 - INFO - MoE Layer 1 Usage: Expert 0: 20.84%, Expert 1: 18.39%, Expert 2: 19.81%, Expert 3: 20.26%, Expert 4: 20.70%
2024-10-22 09:24:38,245 - INFO - MoE Layer 2 Usage: Expert 0: 21.29%, Expert 1: 19.29%, Expert 2: 19.46%, Expert 3: 21.79%, Expert 4: 18.17%
2024-10-22 09:24:38,245 - INFO - MoE Layer 3 Usage: Expert 0: 15.08%, Expert 1: 22.87%, Expert 2: 24.73%, Expert 3: 18.17%, Expert 4: 19.15%
2024-10-22 09:24:38,247 - INFO - MoE Layer 4 Usage: Expert 0: 19.14%, Expert 1: 22.14%, Expert 2: 17.99%, Expert 3: 21.98%, Expert 4: 18.75%


Logging MoE usage stats...


2024-10-22 09:24:42,094 - INFO - MoE Layer 1 Usage: Expert 0: 21.07%, Expert 1: 19.12%, Expert 2: 19.25%, Expert 3: 20.42%, Expert 4: 20.13%
2024-10-22 09:24:42,096 - INFO - MoE Layer 2 Usage: Expert 0: 21.40%, Expert 1: 19.74%, Expert 2: 18.92%, Expert 3: 21.91%, Expert 4: 18.03%
2024-10-22 09:24:42,096 - INFO - MoE Layer 3 Usage: Expert 0: 14.89%, Expert 1: 23.51%, Expert 2: 24.41%, Expert 3: 17.74%, Expert 4: 19.45%
2024-10-22 09:24:42,097 - INFO - MoE Layer 4 Usage: Expert 0: 19.08%, Expert 1: 21.74%, Expert 2: 18.04%, Expert 3: 22.72%, Expert 4: 18.43%


Logging MoE usage stats...


2024-10-22 09:24:45,927 - INFO - MoE Layer 1 Usage: Expert 0: 20.91%, Expert 1: 19.03%, Expert 2: 19.54%, Expert 3: 19.98%, Expert 4: 20.54%
2024-10-22 09:24:45,928 - INFO - MoE Layer 2 Usage: Expert 0: 20.90%, Expert 1: 18.72%, Expert 2: 19.89%, Expert 3: 22.68%, Expert 4: 17.82%
2024-10-22 09:24:45,929 - INFO - MoE Layer 3 Usage: Expert 0: 14.87%, Expert 1: 23.70%, Expert 2: 23.29%, Expert 3: 17.82%, Expert 4: 20.32%
2024-10-22 09:24:45,931 - INFO - MoE Layer 4 Usage: Expert 0: 18.93%, Expert 1: 20.99%, Expert 2: 18.57%, Expert 3: 23.22%, Expert 4: 18.29%

***** Running Evaluation *****
  Num examples = 1000
  Batch size = 30


Logging MoE usage stats...


  0%|          | 0/34 [00:00<?, ?it/s]

Saving model checkpoint to ./results/pretraining_moe/checkpoints\checkpoint-400
Configuration saved in ./results/pretraining_moe/checkpoints\checkpoint-400\config.json
Configuration saved in ./results/pretraining_moe/checkpoints\checkpoint-400\generation_config.json
Model weights saved in ./results/pretraining_moe/checkpoints\checkpoint-400\model.safetensors
tokenizer config file saved in ./results/pretraining_moe/checkpoints\checkpoint-400\tokenizer_config.json
Special tokens file saved in ./results/pretraining_moe/checkpoints\checkpoint-400\special_tokens_map.json


{'eval_loss': 6.159518718719482, 'eval_runtime': 0.5179, 'eval_samples_per_second': 1931.004, 'eval_steps_per_second': 65.654, 'epoch': 50.63}


Deleting older checkpoint [results\pretraining_moe\checkpoints\checkpoint-200] due to args.save_total_limit
2024-10-22 09:24:50,586 - INFO - MoE Layer 1 Usage: Expert 0: 20.95%, Expert 1: 18.80%, Expert 2: 19.67%, Expert 3: 20.42%, Expert 4: 20.15%
2024-10-22 09:24:50,586 - INFO - MoE Layer 2 Usage: Expert 0: 20.65%, Expert 1: 20.07%, Expert 2: 18.95%, Expert 3: 21.72%, Expert 4: 18.60%
2024-10-22 09:24:50,587 - INFO - MoE Layer 3 Usage: Expert 0: 14.73%, Expert 1: 24.12%, Expert 2: 23.35%, Expert 3: 17.95%, Expert 4: 19.84%
2024-10-22 09:24:50,589 - INFO - MoE Layer 4 Usage: Expert 0: 19.70%, Expert 1: 22.80%, Expert 2: 16.91%, Expert 3: 22.55%, Expert 4: 18.04%


Logging MoE usage stats...


2024-10-22 09:24:54,448 - INFO - MoE Layer 1 Usage: Expert 0: 21.10%, Expert 1: 19.02%, Expert 2: 19.66%, Expert 3: 20.15%, Expert 4: 20.06%
2024-10-22 09:24:54,449 - INFO - MoE Layer 2 Usage: Expert 0: 20.55%, Expert 1: 19.31%, Expert 2: 19.33%, Expert 3: 21.87%, Expert 4: 18.94%
2024-10-22 09:24:54,450 - INFO - MoE Layer 3 Usage: Expert 0: 14.52%, Expert 1: 24.45%, Expert 2: 23.14%, Expert 3: 18.58%, Expert 4: 19.32%
2024-10-22 09:24:54,451 - INFO - MoE Layer 4 Usage: Expert 0: 19.36%, Expert 1: 21.30%, Expert 2: 18.24%, Expert 3: 23.42%, Expert 4: 17.69%


Logging MoE usage stats...


2024-10-22 09:24:58,304 - INFO - MoE Layer 1 Usage: Expert 0: 21.20%, Expert 1: 19.11%, Expert 2: 18.71%, Expert 3: 20.63%, Expert 4: 20.35%
2024-10-22 09:24:58,304 - INFO - MoE Layer 2 Usage: Expert 0: 20.70%, Expert 1: 19.18%, Expert 2: 19.78%, Expert 3: 21.93%, Expert 4: 18.42%
2024-10-22 09:24:58,305 - INFO - MoE Layer 3 Usage: Expert 0: 15.33%, Expert 1: 23.38%, Expert 2: 24.38%, Expert 3: 17.34%, Expert 4: 19.58%
2024-10-22 09:24:58,307 - INFO - MoE Layer 4 Usage: Expert 0: 19.63%, Expert 1: 21.57%, Expert 2: 17.88%, Expert 3: 22.90%, Expert 4: 18.02%


Logging MoE usage stats...


2024-10-22 09:25:02,139 - INFO - MoE Layer 1 Usage: Expert 0: 21.13%, Expert 1: 18.98%, Expert 2: 19.08%, Expert 3: 20.43%, Expert 4: 20.38%
2024-10-22 09:25:02,140 - INFO - MoE Layer 2 Usage: Expert 0: 20.70%, Expert 1: 19.74%, Expert 2: 18.68%, Expert 3: 22.04%, Expert 4: 18.84%
2024-10-22 09:25:02,140 - INFO - MoE Layer 3 Usage: Expert 0: 14.43%, Expert 1: 23.97%, Expert 2: 23.78%, Expert 3: 18.06%, Expert 4: 19.75%
2024-10-22 09:25:02,142 - INFO - MoE Layer 4 Usage: Expert 0: 20.61%, Expert 1: 20.70%, Expert 2: 18.10%, Expert 3: 22.81%, Expert 4: 17.78%


Logging MoE usage stats...


2024-10-22 09:25:05,995 - INFO - MoE Layer 1 Usage: Expert 0: 21.00%, Expert 1: 18.79%, Expert 2: 19.39%, Expert 3: 20.59%, Expert 4: 20.23%
2024-10-22 09:25:05,996 - INFO - MoE Layer 2 Usage: Expert 0: 20.46%, Expert 1: 19.91%, Expert 2: 19.26%, Expert 3: 21.86%, Expert 4: 18.51%
2024-10-22 09:25:05,997 - INFO - MoE Layer 3 Usage: Expert 0: 14.43%, Expert 1: 23.92%, Expert 2: 23.38%, Expert 3: 18.33%, Expert 4: 19.95%
2024-10-22 09:25:05,999 - INFO - MoE Layer 4 Usage: Expert 0: 20.61%, Expert 1: 21.02%, Expert 2: 17.80%, Expert 3: 23.46%, Expert 4: 17.11%


Logging MoE usage stats...


2024-10-22 09:25:09,851 - INFO - MoE Layer 1 Usage: Expert 0: 20.99%, Expert 1: 18.93%, Expert 2: 19.19%, Expert 3: 20.70%, Expert 4: 20.19%
2024-10-22 09:25:09,852 - INFO - MoE Layer 2 Usage: Expert 0: 20.21%, Expert 1: 19.67%, Expert 2: 19.69%, Expert 3: 21.61%, Expert 4: 18.81%
2024-10-22 09:25:09,853 - INFO - MoE Layer 3 Usage: Expert 0: 14.74%, Expert 1: 24.27%, Expert 2: 23.44%, Expert 3: 18.32%, Expert 4: 19.22%
2024-10-22 09:25:09,854 - INFO - MoE Layer 4 Usage: Expert 0: 20.30%, Expert 1: 21.29%, Expert 2: 17.51%, Expert 3: 23.39%, Expert 4: 17.51%


Logging MoE usage stats...


2024-10-22 09:25:13,685 - INFO - MoE Layer 1 Usage: Expert 0: 20.90%, Expert 1: 19.06%, Expert 2: 19.19%, Expert 3: 20.59%, Expert 4: 20.26%
2024-10-22 09:25:13,686 - INFO - MoE Layer 2 Usage: Expert 0: 20.61%, Expert 1: 19.47%, Expert 2: 19.47%, Expert 3: 21.73%, Expert 4: 18.72%
2024-10-22 09:25:13,687 - INFO - MoE Layer 3 Usage: Expert 0: 15.16%, Expert 1: 23.80%, Expert 2: 23.60%, Expert 3: 17.86%, Expert 4: 19.58%
2024-10-22 09:25:13,688 - INFO - MoE Layer 4 Usage: Expert 0: 21.46%, Expert 1: 21.63%, Expert 2: 17.08%, Expert 3: 23.06%, Expert 4: 16.77%


Logging MoE usage stats...


2024-10-22 09:25:17,563 - INFO - MoE Layer 1 Usage: Expert 0: 21.18%, Expert 1: 18.68%, Expert 2: 19.27%, Expert 3: 20.52%, Expert 4: 20.36%
2024-10-22 09:25:17,564 - INFO - MoE Layer 2 Usage: Expert 0: 20.42%, Expert 1: 19.58%, Expert 2: 19.72%, Expert 3: 21.30%, Expert 4: 18.98%
2024-10-22 09:25:17,565 - INFO - MoE Layer 3 Usage: Expert 0: 14.59%, Expert 1: 23.72%, Expert 2: 23.78%, Expert 3: 18.34%, Expert 4: 19.57%
2024-10-22 09:25:17,567 - INFO - MoE Layer 4 Usage: Expert 0: 21.87%, Expert 1: 20.99%, Expert 2: 17.27%, Expert 3: 23.48%, Expert 4: 16.39%


Logging MoE usage stats...


2024-10-22 09:25:21,390 - INFO - MoE Layer 1 Usage: Expert 0: 21.24%, Expert 1: 18.99%, Expert 2: 19.02%, Expert 3: 20.56%, Expert 4: 20.19%
2024-10-22 09:25:21,391 - INFO - MoE Layer 2 Usage: Expert 0: 20.31%, Expert 1: 20.02%, Expert 2: 19.51%, Expert 3: 21.30%, Expert 4: 18.86%
2024-10-22 09:25:21,392 - INFO - MoE Layer 3 Usage: Expert 0: 15.17%, Expert 1: 23.53%, Expert 2: 23.89%, Expert 3: 18.12%, Expert 4: 19.28%
2024-10-22 09:25:21,394 - INFO - MoE Layer 4 Usage: Expert 0: 21.76%, Expert 1: 20.68%, Expert 2: 17.62%, Expert 3: 23.74%, Expert 4: 16.20%


Logging MoE usage stats...


2024-10-22 09:25:25,236 - INFO - MoE Layer 1 Usage: Expert 0: 20.76%, Expert 1: 18.85%, Expert 2: 19.24%, Expert 3: 20.74%, Expert 4: 20.41%
2024-10-22 09:25:25,237 - INFO - MoE Layer 2 Usage: Expert 0: 20.01%, Expert 1: 19.70%, Expert 2: 19.75%, Expert 3: 21.92%, Expert 4: 18.62%
2024-10-22 09:25:25,238 - INFO - MoE Layer 3 Usage: Expert 0: 15.45%, Expert 1: 24.17%, Expert 2: 23.18%, Expert 3: 17.87%, Expert 4: 19.33%
2024-10-22 09:25:25,239 - INFO - MoE Layer 4 Usage: Expert 0: 21.64%, Expert 1: 20.95%, Expert 2: 17.09%, Expert 3: 24.12%, Expert 4: 16.20%

***** Running Evaluation *****
  Num examples = 1000
  Batch size = 30


Logging MoE usage stats...


  0%|          | 0/34 [00:00<?, ?it/s]

Saving model checkpoint to ./results/pretraining_moe/checkpoints\checkpoint-500
Configuration saved in ./results/pretraining_moe/checkpoints\checkpoint-500\config.json
Configuration saved in ./results/pretraining_moe/checkpoints\checkpoint-500\generation_config.json
Model weights saved in ./results/pretraining_moe/checkpoints\checkpoint-500\model.safetensors
tokenizer config file saved in ./results/pretraining_moe/checkpoints\checkpoint-500\tokenizer_config.json
Special tokens file saved in ./results/pretraining_moe/checkpoints\checkpoint-500\special_tokens_map.json


{'eval_loss': 6.058013439178467, 'eval_runtime': 0.5081, 'eval_samples_per_second': 1967.984, 'eval_steps_per_second': 66.911, 'epoch': 63.29}


Deleting older checkpoint [results\pretraining_moe\checkpoints\checkpoint-300] due to args.save_total_limit
2024-10-22 09:25:29,897 - INFO - MoE Layer 1 Usage: Expert 0: 21.22%, Expert 1: 19.11%, Expert 2: 19.25%, Expert 3: 20.11%, Expert 4: 20.30%
2024-10-22 09:25:29,898 - INFO - MoE Layer 2 Usage: Expert 0: 20.23%, Expert 1: 20.19%, Expert 2: 19.30%, Expert 3: 21.63%, Expert 4: 18.66%
2024-10-22 09:25:29,899 - INFO - MoE Layer 3 Usage: Expert 0: 15.36%, Expert 1: 24.05%, Expert 2: 23.51%, Expert 3: 18.75%, Expert 4: 18.33%
2024-10-22 09:25:29,900 - INFO - MoE Layer 4 Usage: Expert 0: 21.33%, Expert 1: 21.11%, Expert 2: 17.16%, Expert 3: 24.16%, Expert 4: 16.23%


Logging MoE usage stats...


2024-10-22 09:25:33,744 - INFO - MoE Layer 1 Usage: Expert 0: 20.96%, Expert 1: 18.85%, Expert 2: 19.08%, Expert 3: 20.94%, Expert 4: 20.16%
2024-10-22 09:25:33,745 - INFO - MoE Layer 2 Usage: Expert 0: 20.01%, Expert 1: 19.99%, Expert 2: 19.64%, Expert 3: 21.79%, Expert 4: 18.57%
2024-10-22 09:25:33,746 - INFO - MoE Layer 3 Usage: Expert 0: 15.53%, Expert 1: 23.66%, Expert 2: 23.06%, Expert 3: 18.19%, Expert 4: 19.56%
2024-10-22 09:25:33,747 - INFO - MoE Layer 4 Usage: Expert 0: 21.98%, Expert 1: 20.53%, Expert 2: 17.30%, Expert 3: 24.47%, Expert 4: 15.72%


Logging MoE usage stats...


2024-10-22 09:25:37,553 - INFO - MoE Layer 1 Usage: Expert 0: 21.05%, Expert 1: 18.86%, Expert 2: 19.24%, Expert 3: 20.30%, Expert 4: 20.54%
2024-10-22 09:25:37,554 - INFO - MoE Layer 2 Usage: Expert 0: 20.41%, Expert 1: 20.26%, Expert 2: 19.43%, Expert 3: 21.14%, Expert 4: 18.76%
2024-10-22 09:25:37,555 - INFO - MoE Layer 3 Usage: Expert 0: 15.63%, Expert 1: 22.72%, Expert 2: 24.21%, Expert 3: 18.16%, Expert 4: 19.27%
2024-10-22 09:25:37,556 - INFO - MoE Layer 4 Usage: Expert 0: 22.82%, Expert 1: 20.57%, Expert 2: 16.81%, Expert 3: 24.48%, Expert 4: 15.32%


Logging MoE usage stats...


2024-10-22 09:25:41,375 - INFO - MoE Layer 1 Usage: Expert 0: 21.00%, Expert 1: 19.04%, Expert 2: 18.89%, Expert 3: 20.86%, Expert 4: 20.21%
2024-10-22 09:25:41,376 - INFO - MoE Layer 2 Usage: Expert 0: 19.95%, Expert 1: 19.93%, Expert 2: 19.73%, Expert 3: 21.69%, Expert 4: 18.70%
2024-10-22 09:25:41,377 - INFO - MoE Layer 3 Usage: Expert 0: 15.45%, Expert 1: 23.81%, Expert 2: 23.15%, Expert 3: 18.31%, Expert 4: 19.28%
2024-10-22 09:25:41,378 - INFO - MoE Layer 4 Usage: Expert 0: 22.79%, Expert 1: 20.45%, Expert 2: 16.74%, Expert 3: 24.57%, Expert 4: 15.45%


Logging MoE usage stats...


2024-10-22 09:25:45,201 - INFO - MoE Layer 1 Usage: Expert 0: 20.63%, Expert 1: 19.12%, Expert 2: 19.14%, Expert 3: 20.74%, Expert 4: 20.37%
2024-10-22 09:25:45,202 - INFO - MoE Layer 2 Usage: Expert 0: 19.96%, Expert 1: 19.94%, Expert 2: 19.49%, Expert 3: 21.72%, Expert 4: 18.88%
2024-10-22 09:25:45,203 - INFO - MoE Layer 3 Usage: Expert 0: 15.90%, Expert 1: 23.17%, Expert 2: 24.02%, Expert 3: 17.92%, Expert 4: 18.99%
2024-10-22 09:25:45,204 - INFO - MoE Layer 4 Usage: Expert 0: 23.02%, Expert 1: 20.29%, Expert 2: 16.82%, Expert 3: 24.79%, Expert 4: 15.08%


Logging MoE usage stats...


2024-10-22 09:25:49,018 - INFO - MoE Layer 1 Usage: Expert 0: 21.27%, Expert 1: 18.71%, Expert 2: 19.04%, Expert 3: 20.89%, Expert 4: 20.09%
2024-10-22 09:25:49,019 - INFO - MoE Layer 2 Usage: Expert 0: 19.89%, Expert 1: 20.24%, Expert 2: 19.42%, Expert 3: 21.59%, Expert 4: 18.87%
2024-10-22 09:25:49,020 - INFO - MoE Layer 3 Usage: Expert 0: 15.49%, Expert 1: 23.52%, Expert 2: 23.72%, Expert 3: 18.29%, Expert 4: 18.99%
2024-10-22 09:25:49,021 - INFO - MoE Layer 4 Usage: Expert 0: 23.07%, Expert 1: 20.32%, Expert 2: 17.02%, Expert 3: 24.83%, Expert 4: 14.76%


Logging MoE usage stats...


2024-10-22 09:25:52,859 - INFO - MoE Layer 1 Usage: Expert 0: 21.05%, Expert 1: 18.84%, Expert 2: 19.16%, Expert 3: 20.66%, Expert 4: 20.29%
2024-10-22 09:25:52,860 - INFO - MoE Layer 2 Usage: Expert 0: 20.22%, Expert 1: 19.91%, Expert 2: 19.56%, Expert 3: 21.67%, Expert 4: 18.63%
2024-10-22 09:25:52,861 - INFO - MoE Layer 3 Usage: Expert 0: 15.79%, Expert 1: 23.30%, Expert 2: 23.52%, Expert 3: 18.25%, Expert 4: 19.14%
2024-10-22 09:25:52,862 - INFO - MoE Layer 4 Usage: Expert 0: 23.57%, Expert 1: 20.00%, Expert 2: 16.89%, Expert 3: 24.78%, Expert 4: 14.76%


Logging MoE usage stats...


2024-10-22 09:25:56,668 - INFO - MoE Layer 1 Usage: Expert 0: 20.96%, Expert 1: 18.98%, Expert 2: 18.91%, Expert 3: 20.77%, Expert 4: 20.38%
2024-10-22 09:25:56,669 - INFO - MoE Layer 2 Usage: Expert 0: 19.90%, Expert 1: 20.22%, Expert 2: 19.64%, Expert 3: 21.47%, Expert 4: 18.76%
2024-10-22 09:25:56,670 - INFO - MoE Layer 3 Usage: Expert 0: 15.80%, Expert 1: 23.11%, Expert 2: 23.73%, Expert 3: 18.50%, Expert 4: 18.85%
2024-10-22 09:25:56,671 - INFO - MoE Layer 4 Usage: Expert 0: 23.25%, Expert 1: 19.60%, Expert 2: 16.99%, Expert 3: 25.42%, Expert 4: 14.74%


Logging MoE usage stats...


2024-10-22 09:26:00,481 - INFO - MoE Layer 1 Usage: Expert 0: 21.18%, Expert 1: 18.54%, Expert 2: 18.86%, Expert 3: 20.79%, Expert 4: 20.64%
2024-10-22 09:26:00,482 - INFO - MoE Layer 2 Usage: Expert 0: 19.76%, Expert 1: 20.33%, Expert 2: 19.41%, Expert 3: 21.70%, Expert 4: 18.80%
2024-10-22 09:26:00,483 - INFO - MoE Layer 3 Usage: Expert 0: 16.36%, Expert 1: 22.93%, Expert 2: 23.46%, Expert 3: 18.34%, Expert 4: 18.91%
2024-10-22 09:26:00,485 - INFO - MoE Layer 4 Usage: Expert 0: 23.68%, Expert 1: 20.09%, Expert 2: 16.89%, Expert 3: 25.23%, Expert 4: 14.11%


Logging MoE usage stats...


2024-10-22 09:26:04,358 - INFO - MoE Layer 1 Usage: Expert 0: 20.91%, Expert 1: 18.96%, Expert 2: 18.92%, Expert 3: 21.05%, Expert 4: 20.16%
2024-10-22 09:26:04,359 - INFO - MoE Layer 2 Usage: Expert 0: 19.91%, Expert 1: 20.03%, Expert 2: 19.64%, Expert 3: 21.44%, Expert 4: 18.98%
2024-10-22 09:26:04,360 - INFO - MoE Layer 3 Usage: Expert 0: 16.50%, Expert 1: 22.80%, Expert 2: 23.65%, Expert 3: 18.11%, Expert 4: 18.93%
2024-10-22 09:26:04,361 - INFO - MoE Layer 4 Usage: Expert 0: 23.30%, Expert 1: 19.84%, Expert 2: 17.42%, Expert 3: 25.10%, Expert 4: 14.33%

***** Running Evaluation *****
  Num examples = 1000
  Batch size = 30


Logging MoE usage stats...


  0%|          | 0/34 [00:00<?, ?it/s]

Saving model checkpoint to ./results/pretraining_moe/checkpoints\checkpoint-600
Configuration saved in ./results/pretraining_moe/checkpoints\checkpoint-600\config.json
Configuration saved in ./results/pretraining_moe/checkpoints\checkpoint-600\generation_config.json
Model weights saved in ./results/pretraining_moe/checkpoints\checkpoint-600\model.safetensors
tokenizer config file saved in ./results/pretraining_moe/checkpoints\checkpoint-600\tokenizer_config.json
Special tokens file saved in ./results/pretraining_moe/checkpoints\checkpoint-600\special_tokens_map.json


{'eval_loss': 6.03071403503418, 'eval_runtime': 0.5095, 'eval_samples_per_second': 1962.55, 'eval_steps_per_second': 66.727, 'epoch': 75.95}


Deleting older checkpoint [results\pretraining_moe\checkpoints\checkpoint-400] due to args.save_total_limit
2024-10-22 09:26:09,013 - INFO - MoE Layer 1 Usage: Expert 0: 21.03%, Expert 1: 18.88%, Expert 2: 18.98%, Expert 3: 20.57%, Expert 4: 20.54%
2024-10-22 09:26:09,015 - INFO - MoE Layer 2 Usage: Expert 0: 19.75%, Expert 1: 20.14%, Expert 2: 19.27%, Expert 3: 22.09%, Expert 4: 18.75%
2024-10-22 09:26:09,017 - INFO - MoE Layer 3 Usage: Expert 0: 16.07%, Expert 1: 23.35%, Expert 2: 23.24%, Expert 3: 19.05%, Expert 4: 18.30%
2024-10-22 09:26:09,018 - INFO - MoE Layer 4 Usage: Expert 0: 23.46%, Expert 1: 19.73%, Expert 2: 16.85%, Expert 3: 25.01%, Expert 4: 14.95%


Logging MoE usage stats...


2024-10-22 09:26:12,905 - INFO - MoE Layer 1 Usage: Expert 0: 21.19%, Expert 1: 18.53%, Expert 2: 19.03%, Expert 3: 20.85%, Expert 4: 20.40%
2024-10-22 09:26:12,906 - INFO - MoE Layer 2 Usage: Expert 0: 19.87%, Expert 1: 20.10%, Expert 2: 19.68%, Expert 3: 21.65%, Expert 4: 18.70%
2024-10-22 09:26:12,907 - INFO - MoE Layer 3 Usage: Expert 0: 16.12%, Expert 1: 22.92%, Expert 2: 23.22%, Expert 3: 18.71%, Expert 4: 19.04%
2024-10-22 09:26:12,908 - INFO - MoE Layer 4 Usage: Expert 0: 23.66%, Expert 1: 20.06%, Expert 2: 17.11%, Expert 3: 24.89%, Expert 4: 14.28%


Logging MoE usage stats...


2024-10-22 09:26:16,731 - INFO - MoE Layer 1 Usage: Expert 0: 20.90%, Expert 1: 18.55%, Expert 2: 18.97%, Expert 3: 21.01%, Expert 4: 20.57%
2024-10-22 09:26:16,732 - INFO - MoE Layer 2 Usage: Expert 0: 20.18%, Expert 1: 19.94%, Expert 2: 19.32%, Expert 3: 21.51%, Expert 4: 19.06%
2024-10-22 09:26:16,733 - INFO - MoE Layer 3 Usage: Expert 0: 16.43%, Expert 1: 23.17%, Expert 2: 22.94%, Expert 3: 18.74%, Expert 4: 18.72%
2024-10-22 09:26:16,735 - INFO - MoE Layer 4 Usage: Expert 0: 23.47%, Expert 1: 19.92%, Expert 2: 17.36%, Expert 3: 25.13%, Expert 4: 14.12%


Logging MoE usage stats...


2024-10-22 09:26:20,578 - INFO - MoE Layer 1 Usage: Expert 0: 20.79%, Expert 1: 18.83%, Expert 2: 18.89%, Expert 3: 21.09%, Expert 4: 20.39%
2024-10-22 09:26:20,579 - INFO - MoE Layer 2 Usage: Expert 0: 19.88%, Expert 1: 19.94%, Expert 2: 19.52%, Expert 3: 21.70%, Expert 4: 18.96%
2024-10-22 09:26:20,580 - INFO - MoE Layer 3 Usage: Expert 0: 16.49%, Expert 1: 23.17%, Expert 2: 22.99%, Expert 3: 18.92%, Expert 4: 18.44%
2024-10-22 09:26:20,581 - INFO - MoE Layer 4 Usage: Expert 0: 23.29%, Expert 1: 19.84%, Expert 2: 17.44%, Expert 3: 25.13%, Expert 4: 14.30%


Logging MoE usage stats...


2024-10-22 09:26:24,443 - INFO - MoE Layer 1 Usage: Expert 0: 21.05%, Expert 1: 18.78%, Expert 2: 18.86%, Expert 3: 20.99%, Expert 4: 20.32%
2024-10-22 09:26:24,444 - INFO - MoE Layer 2 Usage: Expert 0: 20.16%, Expert 1: 20.13%, Expert 2: 19.52%, Expert 3: 21.40%, Expert 4: 18.79%
2024-10-22 09:26:24,444 - INFO - MoE Layer 3 Usage: Expert 0: 16.39%, Expert 1: 22.50%, Expert 2: 22.79%, Expert 3: 19.02%, Expert 4: 19.30%
2024-10-22 09:26:24,446 - INFO - MoE Layer 4 Usage: Expert 0: 23.64%, Expert 1: 19.58%, Expert 2: 17.57%, Expert 3: 25.04%, Expert 4: 14.18%


Logging MoE usage stats...


2024-10-22 09:26:28,297 - INFO - MoE Layer 1 Usage: Expert 0: 20.71%, Expert 1: 18.91%, Expert 2: 18.84%, Expert 3: 21.10%, Expert 4: 20.44%
2024-10-22 09:26:28,299 - INFO - MoE Layer 2 Usage: Expert 0: 19.84%, Expert 1: 19.99%, Expert 2: 19.63%, Expert 3: 21.84%, Expert 4: 18.70%
2024-10-22 09:26:28,300 - INFO - MoE Layer 3 Usage: Expert 0: 16.51%, Expert 1: 22.73%, Expert 2: 23.09%, Expert 3: 18.86%, Expert 4: 18.81%
2024-10-22 09:26:28,301 - INFO - MoE Layer 4 Usage: Expert 0: 23.34%, Expert 1: 19.56%, Expert 2: 17.72%, Expert 3: 25.21%, Expert 4: 14.16%


Logging MoE usage stats...


2024-10-22 09:26:32,118 - INFO - MoE Layer 1 Usage: Expert 0: 20.93%, Expert 1: 18.81%, Expert 2: 19.06%, Expert 3: 20.70%, Expert 4: 20.50%
2024-10-22 09:26:32,119 - INFO - MoE Layer 2 Usage: Expert 0: 20.15%, Expert 1: 19.94%, Expert 2: 19.56%, Expert 3: 21.53%, Expert 4: 18.82%
2024-10-22 09:26:32,120 - INFO - MoE Layer 3 Usage: Expert 0: 16.66%, Expert 1: 22.91%, Expert 2: 23.41%, Expert 3: 18.66%, Expert 4: 18.37%
2024-10-22 09:26:32,121 - INFO - MoE Layer 4 Usage: Expert 0: 23.28%, Expert 1: 19.59%, Expert 2: 17.65%, Expert 3: 25.30%, Expert 4: 14.18%


Logging MoE usage stats...


2024-10-22 09:26:35,943 - INFO - MoE Layer 1 Usage: Expert 0: 21.13%, Expert 1: 18.69%, Expert 2: 18.80%, Expert 3: 20.93%, Expert 4: 20.44%
2024-10-22 09:26:35,944 - INFO - MoE Layer 2 Usage: Expert 0: 19.69%, Expert 1: 20.13%, Expert 2: 19.53%, Expert 3: 21.54%, Expert 4: 19.12%
2024-10-22 09:26:35,945 - INFO - MoE Layer 3 Usage: Expert 0: 16.59%, Expert 1: 22.75%, Expert 2: 23.31%, Expert 3: 18.96%, Expert 4: 18.40%
2024-10-22 09:26:35,946 - INFO - MoE Layer 4 Usage: Expert 0: 23.10%, Expert 1: 19.40%, Expert 2: 17.64%, Expert 3: 25.59%, Expert 4: 14.28%


Logging MoE usage stats...


2024-10-22 09:26:39,771 - INFO - MoE Layer 1 Usage: Expert 0: 20.93%, Expert 1: 18.72%, Expert 2: 18.77%, Expert 3: 21.14%, Expert 4: 20.43%
2024-10-22 09:26:39,772 - INFO - MoE Layer 2 Usage: Expert 0: 19.98%, Expert 1: 19.83%, Expert 2: 19.60%, Expert 3: 21.64%, Expert 4: 18.94%
2024-10-22 09:26:39,773 - INFO - MoE Layer 3 Usage: Expert 0: 16.73%, Expert 1: 22.46%, Expert 2: 23.13%, Expert 3: 18.93%, Expert 4: 18.75%
2024-10-22 09:26:39,775 - INFO - MoE Layer 4 Usage: Expert 0: 23.12%, Expert 1: 19.72%, Expert 2: 18.11%, Expert 3: 24.78%, Expert 4: 14.27%


Logging MoE usage stats...


2024-10-22 09:26:43,612 - INFO - MoE Layer 1 Usage: Expert 0: 20.88%, Expert 1: 18.73%, Expert 2: 18.86%, Expert 3: 20.97%, Expert 4: 20.56%
2024-10-22 09:26:43,613 - INFO - MoE Layer 2 Usage: Expert 0: 19.95%, Expert 1: 19.79%, Expert 2: 19.53%, Expert 3: 21.62%, Expert 4: 19.11%
2024-10-22 09:26:43,614 - INFO - MoE Layer 3 Usage: Expert 0: 16.82%, Expert 1: 22.71%, Expert 2: 22.62%, Expert 3: 18.96%, Expert 4: 18.88%
2024-10-22 09:26:43,615 - INFO - MoE Layer 4 Usage: Expert 0: 23.52%, Expert 1: 19.81%, Expert 2: 17.87%, Expert 3: 24.76%, Expert 4: 14.03%

***** Running Evaluation *****
  Num examples = 1000
  Batch size = 30


Logging MoE usage stats...


  0%|          | 0/34 [00:00<?, ?it/s]

Saving model checkpoint to ./results/pretraining_moe/checkpoints\checkpoint-700
Configuration saved in ./results/pretraining_moe/checkpoints\checkpoint-700\config.json
Configuration saved in ./results/pretraining_moe/checkpoints\checkpoint-700\generation_config.json
Model weights saved in ./results/pretraining_moe/checkpoints\checkpoint-700\model.safetensors
tokenizer config file saved in ./results/pretraining_moe/checkpoints\checkpoint-700\tokenizer_config.json
Special tokens file saved in ./results/pretraining_moe/checkpoints\checkpoint-700\special_tokens_map.json


{'eval_loss': 6.148586750030518, 'eval_runtime': 0.5136, 'eval_samples_per_second': 1947.177, 'eval_steps_per_second': 66.204, 'epoch': 88.61}


Deleting older checkpoint [results\pretraining_moe\checkpoints\checkpoint-500] due to args.save_total_limit
2024-10-22 09:26:48,274 - INFO - MoE Layer 1 Usage: Expert 0: 21.00%, Expert 1: 18.87%, Expert 2: 18.81%, Expert 3: 20.80%, Expert 4: 20.52%
2024-10-22 09:26:48,275 - INFO - MoE Layer 2 Usage: Expert 0: 19.91%, Expert 1: 20.16%, Expert 2: 19.62%, Expert 3: 21.61%, Expert 4: 18.70%
2024-10-22 09:26:48,275 - INFO - MoE Layer 3 Usage: Expert 0: 16.32%, Expert 1: 23.00%, Expert 2: 22.61%, Expert 3: 19.88%, Expert 4: 18.18%
2024-10-22 09:26:48,277 - INFO - MoE Layer 4 Usage: Expert 0: 23.09%, Expert 1: 19.43%, Expert 2: 18.07%, Expert 3: 25.26%, Expert 4: 14.15%


Logging MoE usage stats...


2024-10-22 09:26:52,174 - INFO - MoE Layer 1 Usage: Expert 0: 20.74%, Expert 1: 18.86%, Expert 2: 18.84%, Expert 3: 20.94%, Expert 4: 20.62%
2024-10-22 09:26:52,175 - INFO - MoE Layer 2 Usage: Expert 0: 20.10%, Expert 1: 20.01%, Expert 2: 19.39%, Expert 3: 21.38%, Expert 4: 19.12%
2024-10-22 09:26:52,176 - INFO - MoE Layer 3 Usage: Expert 0: 16.62%, Expert 1: 22.29%, Expert 2: 22.99%, Expert 3: 19.45%, Expert 4: 18.64%
2024-10-22 09:26:52,178 - INFO - MoE Layer 4 Usage: Expert 0: 22.87%, Expert 1: 19.19%, Expert 2: 18.17%, Expert 3: 25.45%, Expert 4: 14.33%


Logging MoE usage stats...


2024-10-22 09:26:56,207 - INFO - MoE Layer 1 Usage: Expert 0: 20.95%, Expert 1: 18.82%, Expert 2: 18.51%, Expert 3: 21.29%, Expert 4: 20.42%
2024-10-22 09:26:56,208 - INFO - MoE Layer 2 Usage: Expert 0: 19.99%, Expert 1: 19.76%, Expert 2: 19.62%, Expert 3: 21.59%, Expert 4: 19.04%
2024-10-22 09:26:56,209 - INFO - MoE Layer 3 Usage: Expert 0: 16.90%, Expert 1: 22.62%, Expert 2: 23.03%, Expert 3: 18.79%, Expert 4: 18.67%
2024-10-22 09:26:56,210 - INFO - MoE Layer 4 Usage: Expert 0: 22.71%, Expert 1: 19.66%, Expert 2: 18.22%, Expert 3: 24.99%, Expert 4: 14.41%


Logging MoE usage stats...


2024-10-22 09:27:00,044 - INFO - MoE Layer 1 Usage: Expert 0: 20.88%, Expert 1: 18.99%, Expert 2: 18.84%, Expert 3: 20.73%, Expert 4: 20.56%
2024-10-22 09:27:00,045 - INFO - MoE Layer 2 Usage: Expert 0: 20.03%, Expert 1: 19.63%, Expert 2: 19.73%, Expert 3: 21.70%, Expert 4: 18.91%
2024-10-22 09:27:00,046 - INFO - MoE Layer 3 Usage: Expert 0: 16.92%, Expert 1: 22.63%, Expert 2: 22.55%, Expert 3: 19.36%, Expert 4: 18.54%
2024-10-22 09:27:00,047 - INFO - MoE Layer 4 Usage: Expert 0: 22.82%, Expert 1: 19.58%, Expert 2: 18.26%, Expert 3: 25.19%, Expert 4: 14.15%


Logging MoE usage stats...


2024-10-22 09:27:03,893 - INFO - MoE Layer 1 Usage: Expert 0: 21.26%, Expert 1: 18.77%, Expert 2: 18.81%, Expert 3: 21.02%, Expert 4: 20.15%
2024-10-22 09:27:03,894 - INFO - MoE Layer 2 Usage: Expert 0: 19.73%, Expert 1: 19.85%, Expert 2: 19.75%, Expert 3: 21.61%, Expert 4: 19.06%
2024-10-22 09:27:03,894 - INFO - MoE Layer 3 Usage: Expert 0: 16.75%, Expert 1: 22.25%, Expert 2: 22.85%, Expert 3: 19.63%, Expert 4: 18.53%
2024-10-22 09:27:03,896 - INFO - MoE Layer 4 Usage: Expert 0: 22.93%, Expert 1: 19.56%, Expert 2: 18.12%, Expert 3: 25.04%, Expert 4: 14.35%


Logging MoE usage stats...


2024-10-22 09:27:07,735 - INFO - MoE Layer 1 Usage: Expert 0: 20.79%, Expert 1: 18.75%, Expert 2: 19.05%, Expert 3: 20.92%, Expert 4: 20.48%
2024-10-22 09:27:07,736 - INFO - MoE Layer 2 Usage: Expert 0: 20.11%, Expert 1: 19.68%, Expert 2: 19.73%, Expert 3: 21.40%, Expert 4: 19.08%
2024-10-22 09:27:07,737 - INFO - MoE Layer 3 Usage: Expert 0: 16.78%, Expert 1: 22.75%, Expert 2: 22.73%, Expert 3: 19.18%, Expert 4: 18.56%
2024-10-22 09:27:07,738 - INFO - MoE Layer 4 Usage: Expert 0: 23.20%, Expert 1: 19.60%, Expert 2: 18.43%, Expert 3: 24.51%, Expert 4: 14.27%


Logging MoE usage stats...


2024-10-22 09:27:11,636 - INFO - MoE Layer 1 Usage: Expert 0: 21.05%, Expert 1: 18.66%, Expert 2: 18.65%, Expert 3: 21.11%, Expert 4: 20.53%
2024-10-22 09:27:11,637 - INFO - MoE Layer 2 Usage: Expert 0: 20.07%, Expert 1: 19.91%, Expert 2: 19.28%, Expert 3: 21.71%, Expert 4: 19.03%
2024-10-22 09:27:11,638 - INFO - MoE Layer 3 Usage: Expert 0: 16.62%, Expert 1: 22.54%, Expert 2: 22.58%, Expert 3: 19.27%, Expert 4: 18.99%
2024-10-22 09:27:11,639 - INFO - MoE Layer 4 Usage: Expert 0: 22.81%, Expert 1: 19.60%, Expert 2: 18.62%, Expert 3: 24.82%, Expert 4: 14.16%


Logging MoE usage stats...


2024-10-22 09:27:15,538 - INFO - MoE Layer 1 Usage: Expert 0: 21.10%, Expert 1: 18.87%, Expert 2: 18.65%, Expert 3: 21.08%, Expert 4: 20.30%
2024-10-22 09:27:15,539 - INFO - MoE Layer 2 Usage: Expert 0: 19.84%, Expert 1: 19.87%, Expert 2: 19.63%, Expert 3: 21.64%, Expert 4: 19.02%
2024-10-22 09:27:15,541 - INFO - MoE Layer 3 Usage: Expert 0: 16.69%, Expert 1: 22.38%, Expert 2: 22.62%, Expert 3: 19.72%, Expert 4: 18.60%
2024-10-22 09:27:15,542 - INFO - MoE Layer 4 Usage: Expert 0: 22.63%, Expert 1: 19.73%, Expert 2: 18.53%, Expert 3: 24.65%, Expert 4: 14.46%


Logging MoE usage stats...


2024-10-22 09:27:19,448 - INFO - MoE Layer 1 Usage: Expert 0: 20.92%, Expert 1: 18.84%, Expert 2: 18.76%, Expert 3: 21.04%, Expert 4: 20.44%
2024-10-22 09:27:19,449 - INFO - MoE Layer 2 Usage: Expert 0: 20.28%, Expert 1: 19.71%, Expert 2: 19.53%, Expert 3: 21.63%, Expert 4: 18.85%
2024-10-22 09:27:19,449 - INFO - MoE Layer 3 Usage: Expert 0: 16.83%, Expert 1: 22.82%, Expert 2: 22.15%, Expert 3: 19.39%, Expert 4: 18.81%
2024-10-22 09:27:19,451 - INFO - MoE Layer 4 Usage: Expert 0: 22.48%, Expert 1: 19.23%, Expert 2: 18.82%, Expert 3: 25.17%, Expert 4: 14.30%


Logging MoE usage stats...


2024-10-22 09:27:23,359 - INFO - MoE Layer 1 Usage: Expert 0: 21.21%, Expert 1: 18.73%, Expert 2: 18.74%, Expert 3: 21.08%, Expert 4: 20.24%
2024-10-22 09:27:23,360 - INFO - MoE Layer 2 Usage: Expert 0: 19.87%, Expert 1: 20.13%, Expert 2: 19.27%, Expert 3: 21.64%, Expert 4: 19.09%
2024-10-22 09:27:23,362 - INFO - MoE Layer 3 Usage: Expert 0: 16.74%, Expert 1: 22.45%, Expert 2: 22.73%, Expert 3: 19.61%, Expert 4: 18.48%
2024-10-22 09:27:23,364 - INFO - MoE Layer 4 Usage: Expert 0: 22.88%, Expert 1: 19.97%, Expert 2: 18.44%, Expert 3: 24.51%, Expert 4: 14.19%

***** Running Evaluation *****
  Num examples = 1000
  Batch size = 30


Logging MoE usage stats...


  0%|          | 0/34 [00:00<?, ?it/s]

Saving model checkpoint to ./results/pretraining_moe/checkpoints\checkpoint-800
Configuration saved in ./results/pretraining_moe/checkpoints\checkpoint-800\config.json
Configuration saved in ./results/pretraining_moe/checkpoints\checkpoint-800\generation_config.json
Model weights saved in ./results/pretraining_moe/checkpoints\checkpoint-800\model.safetensors
tokenizer config file saved in ./results/pretraining_moe/checkpoints\checkpoint-800\tokenizer_config.json
Special tokens file saved in ./results/pretraining_moe/checkpoints\checkpoint-800\special_tokens_map.json


{'eval_loss': 6.366456031799316, 'eval_runtime': 0.5504, 'eval_samples_per_second': 1816.728, 'eval_steps_per_second': 61.769, 'epoch': 101.27}


Deleting older checkpoint [results\pretraining_moe\checkpoints\checkpoint-700] due to args.save_total_limit
2024-10-22 09:27:28,182 - INFO - MoE Layer 1 Usage: Expert 0: 20.90%, Expert 1: 18.71%, Expert 2: 18.83%, Expert 3: 21.06%, Expert 4: 20.50%
2024-10-22 09:27:28,183 - INFO - MoE Layer 2 Usage: Expert 0: 19.65%, Expert 1: 19.78%, Expert 2: 19.42%, Expert 3: 21.97%, Expert 4: 19.18%
2024-10-22 09:27:28,184 - INFO - MoE Layer 3 Usage: Expert 0: 16.43%, Expert 1: 23.03%, Expert 2: 22.43%, Expert 3: 19.96%, Expert 4: 18.14%
2024-10-22 09:27:28,186 - INFO - MoE Layer 4 Usage: Expert 0: 22.06%, Expert 1: 20.01%, Expert 2: 18.64%, Expert 3: 24.72%, Expert 4: 14.57%


Logging MoE usage stats...


2024-10-22 09:27:32,109 - INFO - MoE Layer 1 Usage: Expert 0: 21.19%, Expert 1: 18.92%, Expert 2: 18.65%, Expert 3: 21.01%, Expert 4: 20.23%
2024-10-22 09:27:32,110 - INFO - MoE Layer 2 Usage: Expert 0: 20.30%, Expert 1: 19.62%, Expert 2: 19.36%, Expert 3: 21.68%, Expert 4: 19.04%
2024-10-22 09:27:32,111 - INFO - MoE Layer 3 Usage: Expert 0: 16.61%, Expert 1: 22.23%, Expert 2: 22.70%, Expert 3: 19.36%, Expert 4: 19.09%
2024-10-22 09:27:32,112 - INFO - MoE Layer 4 Usage: Expert 0: 22.38%, Expert 1: 19.81%, Expert 2: 18.99%, Expert 3: 24.69%, Expert 4: 14.13%


Logging MoE usage stats...


2024-10-22 09:27:36,026 - INFO - MoE Layer 1 Usage: Expert 0: 20.91%, Expert 1: 18.89%, Expert 2: 18.64%, Expert 3: 21.31%, Expert 4: 20.25%
2024-10-22 09:27:36,027 - INFO - MoE Layer 2 Usage: Expert 0: 19.88%, Expert 1: 19.90%, Expert 2: 19.47%, Expert 3: 21.73%, Expert 4: 19.01%
2024-10-22 09:27:36,028 - INFO - MoE Layer 3 Usage: Expert 0: 16.51%, Expert 1: 22.51%, Expert 2: 22.64%, Expert 3: 19.52%, Expert 4: 18.82%
2024-10-22 09:27:36,029 - INFO - MoE Layer 4 Usage: Expert 0: 22.30%, Expert 1: 19.92%, Expert 2: 18.83%, Expert 3: 24.70%, Expert 4: 14.25%


Logging MoE usage stats...


2024-10-22 09:27:39,926 - INFO - MoE Layer 1 Usage: Expert 0: 20.95%, Expert 1: 18.97%, Expert 2: 18.73%, Expert 3: 21.03%, Expert 4: 20.33%
2024-10-22 09:27:39,927 - INFO - MoE Layer 2 Usage: Expert 0: 20.11%, Expert 1: 19.84%, Expert 2: 19.30%, Expert 3: 21.71%, Expert 4: 19.05%
2024-10-22 09:27:39,928 - INFO - MoE Layer 3 Usage: Expert 0: 16.73%, Expert 1: 22.50%, Expert 2: 22.48%, Expert 3: 19.57%, Expert 4: 18.72%
2024-10-22 09:27:39,929 - INFO - MoE Layer 4 Usage: Expert 0: 22.36%, Expert 1: 19.86%, Expert 2: 18.97%, Expert 3: 24.54%, Expert 4: 14.27%


Logging MoE usage stats...


2024-10-22 09:27:43,854 - INFO - MoE Layer 1 Usage: Expert 0: 20.89%, Expert 1: 18.82%, Expert 2: 18.70%, Expert 3: 21.05%, Expert 4: 20.54%
2024-10-22 09:27:43,855 - INFO - MoE Layer 2 Usage: Expert 0: 20.09%, Expert 1: 19.73%, Expert 2: 19.55%, Expert 3: 21.66%, Expert 4: 18.97%
2024-10-22 09:27:43,856 - INFO - MoE Layer 3 Usage: Expert 0: 16.54%, Expert 1: 22.74%, Expert 2: 22.28%, Expert 3: 19.57%, Expert 4: 18.87%
2024-10-22 09:27:43,857 - INFO - MoE Layer 4 Usage: Expert 0: 22.36%, Expert 1: 20.21%, Expert 2: 19.08%, Expert 3: 24.22%, Expert 4: 14.13%


Logging MoE usage stats...


2024-10-22 09:27:47,799 - INFO - MoE Layer 1 Usage: Expert 0: 20.86%, Expert 1: 18.77%, Expert 2: 18.69%, Expert 3: 21.39%, Expert 4: 20.28%
2024-10-22 09:27:47,800 - INFO - MoE Layer 2 Usage: Expert 0: 19.93%, Expert 1: 19.78%, Expert 2: 19.62%, Expert 3: 21.53%, Expert 4: 19.14%
2024-10-22 09:27:47,801 - INFO - MoE Layer 3 Usage: Expert 0: 16.38%, Expert 1: 22.36%, Expert 2: 22.67%, Expert 3: 19.70%, Expert 4: 18.89%
2024-10-22 09:27:47,802 - INFO - MoE Layer 4 Usage: Expert 0: 22.25%, Expert 1: 20.22%, Expert 2: 18.96%, Expert 3: 24.49%, Expert 4: 14.08%


Logging MoE usage stats...


2024-10-22 09:27:51,673 - INFO - MoE Layer 1 Usage: Expert 0: 20.84%, Expert 1: 18.89%, Expert 2: 18.76%, Expert 3: 21.09%, Expert 4: 20.41%
2024-10-22 09:27:51,674 - INFO - MoE Layer 2 Usage: Expert 0: 19.96%, Expert 1: 19.63%, Expert 2: 19.40%, Expert 3: 21.86%, Expert 4: 19.14%
2024-10-22 09:27:51,675 - INFO - MoE Layer 3 Usage: Expert 0: 16.51%, Expert 1: 22.54%, Expert 2: 22.58%, Expert 3: 19.72%, Expert 4: 18.65%
2024-10-22 09:27:51,676 - INFO - MoE Layer 4 Usage: Expert 0: 21.93%, Expert 1: 20.08%, Expert 2: 19.11%, Expert 3: 24.68%, Expert 4: 14.20%


Logging MoE usage stats...


2024-10-22 09:27:55,595 - INFO - MoE Layer 1 Usage: Expert 0: 21.13%, Expert 1: 18.62%, Expert 2: 18.66%, Expert 3: 21.08%, Expert 4: 20.51%
2024-10-22 09:27:55,596 - INFO - MoE Layer 2 Usage: Expert 0: 20.04%, Expert 1: 19.88%, Expert 2: 19.46%, Expert 3: 21.69%, Expert 4: 18.94%
2024-10-22 09:27:55,597 - INFO - MoE Layer 3 Usage: Expert 0: 16.55%, Expert 1: 22.36%, Expert 2: 22.59%, Expert 3: 19.58%, Expert 4: 18.92%
2024-10-22 09:27:55,598 - INFO - MoE Layer 4 Usage: Expert 0: 21.95%, Expert 1: 20.21%, Expert 2: 19.19%, Expert 3: 24.66%, Expert 4: 13.99%


Logging MoE usage stats...


2024-10-22 09:27:59,487 - INFO - MoE Layer 1 Usage: Expert 0: 21.25%, Expert 1: 18.81%, Expert 2: 18.84%, Expert 3: 21.01%, Expert 4: 20.08%
2024-10-22 09:27:59,488 - INFO - MoE Layer 2 Usage: Expert 0: 20.07%, Expert 1: 19.81%, Expert 2: 19.43%, Expert 3: 21.74%, Expert 4: 18.95%
2024-10-22 09:27:59,489 - INFO - MoE Layer 3 Usage: Expert 0: 16.35%, Expert 1: 22.40%, Expert 2: 22.46%, Expert 3: 19.99%, Expert 4: 18.79%
2024-10-22 09:27:59,490 - INFO - MoE Layer 4 Usage: Expert 0: 22.00%, Expert 1: 20.36%, Expert 2: 19.21%, Expert 3: 24.37%, Expert 4: 14.06%


Logging MoE usage stats...


2024-10-22 09:28:03,408 - INFO - MoE Layer 1 Usage: Expert 0: 21.02%, Expert 1: 18.74%, Expert 2: 18.61%, Expert 3: 21.14%, Expert 4: 20.49%
2024-10-22 09:28:03,409 - INFO - MoE Layer 2 Usage: Expert 0: 20.16%, Expert 1: 19.47%, Expert 2: 19.63%, Expert 3: 21.72%, Expert 4: 19.01%
2024-10-22 09:28:03,410 - INFO - MoE Layer 3 Usage: Expert 0: 16.56%, Expert 1: 22.73%, Expert 2: 22.28%, Expert 3: 19.61%, Expert 4: 18.83%
2024-10-22 09:28:03,412 - INFO - MoE Layer 4 Usage: Expert 0: 21.71%, Expert 1: 20.36%, Expert 2: 19.17%, Expert 3: 24.81%, Expert 4: 13.96%

***** Running Evaluation *****
  Num examples = 1000
  Batch size = 30


Logging MoE usage stats...


  0%|          | 0/34 [00:00<?, ?it/s]

Saving model checkpoint to ./results/pretraining_moe/checkpoints\checkpoint-900
Configuration saved in ./results/pretraining_moe/checkpoints\checkpoint-900\config.json
Configuration saved in ./results/pretraining_moe/checkpoints\checkpoint-900\generation_config.json
Model weights saved in ./results/pretraining_moe/checkpoints\checkpoint-900\model.safetensors
tokenizer config file saved in ./results/pretraining_moe/checkpoints\checkpoint-900\tokenizer_config.json
Special tokens file saved in ./results/pretraining_moe/checkpoints\checkpoint-900\special_tokens_map.json


{'eval_loss': 6.569530010223389, 'eval_runtime': 0.5302, 'eval_samples_per_second': 1885.94, 'eval_steps_per_second': 64.122, 'epoch': 113.92}


Deleting older checkpoint [results\pretraining_moe\checkpoints\checkpoint-800] due to args.save_total_limit
2024-10-22 09:28:08,172 - INFO - MoE Layer 1 Usage: Expert 0: 20.64%, Expert 1: 18.91%, Expert 2: 18.99%, Expert 3: 20.93%, Expert 4: 20.53%
2024-10-22 09:28:08,173 - INFO - MoE Layer 2 Usage: Expert 0: 19.24%, Expert 1: 19.97%, Expert 2: 19.24%, Expert 3: 22.20%, Expert 4: 19.35%
2024-10-22 09:28:08,174 - INFO - MoE Layer 3 Usage: Expert 0: 16.18%, Expert 1: 23.14%, Expert 2: 22.61%, Expert 3: 20.12%, Expert 4: 17.95%
2024-10-22 09:28:08,175 - INFO - MoE Layer 4 Usage: Expert 0: 21.61%, Expert 1: 19.89%, Expert 2: 19.35%, Expert 3: 25.04%, Expert 4: 14.11%


Logging MoE usage stats...


2024-10-22 09:28:12,146 - INFO - MoE Layer 1 Usage: Expert 0: 20.93%, Expert 1: 18.75%, Expert 2: 18.70%, Expert 3: 21.31%, Expert 4: 20.30%
2024-10-22 09:28:12,147 - INFO - MoE Layer 2 Usage: Expert 0: 20.02%, Expert 1: 19.74%, Expert 2: 19.56%, Expert 3: 21.58%, Expert 4: 19.10%
2024-10-22 09:28:12,148 - INFO - MoE Layer 3 Usage: Expert 0: 16.28%, Expert 1: 22.40%, Expert 2: 22.59%, Expert 3: 19.84%, Expert 4: 18.90%
2024-10-22 09:28:12,150 - INFO - MoE Layer 4 Usage: Expert 0: 21.90%, Expert 1: 20.38%, Expert 2: 19.15%, Expert 3: 24.78%, Expert 4: 13.80%


Logging MoE usage stats...


2024-10-22 09:28:16,082 - INFO - MoE Layer 1 Usage: Expert 0: 21.10%, Expert 1: 18.81%, Expert 2: 18.68%, Expert 3: 21.00%, Expert 4: 20.40%
2024-10-22 09:28:16,083 - INFO - MoE Layer 2 Usage: Expert 0: 20.19%, Expert 1: 19.64%, Expert 2: 19.71%, Expert 3: 21.58%, Expert 4: 18.88%
2024-10-22 09:28:16,084 - INFO - MoE Layer 3 Usage: Expert 0: 16.37%, Expert 1: 22.66%, Expert 2: 22.43%, Expert 3: 19.72%, Expert 4: 18.81%
2024-10-22 09:28:16,085 - INFO - MoE Layer 4 Usage: Expert 0: 21.68%, Expert 1: 20.50%, Expert 2: 19.35%, Expert 3: 24.67%, Expert 4: 13.80%


Logging MoE usage stats...


2024-10-22 09:28:20,011 - INFO - MoE Layer 1 Usage: Expert 0: 21.09%, Expert 1: 18.76%, Expert 2: 18.76%, Expert 3: 20.94%, Expert 4: 20.45%
2024-10-22 09:28:20,012 - INFO - MoE Layer 2 Usage: Expert 0: 20.48%, Expert 1: 19.82%, Expert 2: 19.07%, Expert 3: 21.69%, Expert 4: 18.95%
2024-10-22 09:28:20,013 - INFO - MoE Layer 3 Usage: Expert 0: 16.27%, Expert 1: 22.51%, Expert 2: 22.36%, Expert 3: 19.89%, Expert 4: 18.96%
2024-10-22 09:28:20,014 - INFO - MoE Layer 4 Usage: Expert 0: 21.68%, Expert 1: 20.51%, Expert 2: 19.44%, Expert 3: 24.59%, Expert 4: 13.79%


Logging MoE usage stats...


2024-10-22 09:28:24,019 - INFO - MoE Layer 1 Usage: Expert 0: 21.13%, Expert 1: 18.77%, Expert 2: 18.67%, Expert 3: 20.99%, Expert 4: 20.43%
2024-10-22 09:28:24,020 - INFO - MoE Layer 2 Usage: Expert 0: 20.23%, Expert 1: 19.63%, Expert 2: 19.43%, Expert 3: 21.58%, Expert 4: 19.13%
2024-10-22 09:28:24,021 - INFO - MoE Layer 3 Usage: Expert 0: 16.28%, Expert 1: 22.66%, Expert 2: 22.27%, Expert 3: 19.73%, Expert 4: 19.05%
2024-10-22 09:28:24,022 - INFO - MoE Layer 4 Usage: Expert 0: 21.99%, Expert 1: 20.66%, Expert 2: 19.24%, Expert 3: 24.49%, Expert 4: 13.61%


Logging MoE usage stats...


2024-10-22 09:28:27,881 - INFO - MoE Layer 1 Usage: Expert 0: 21.11%, Expert 1: 18.85%, Expert 2: 18.68%, Expert 3: 21.03%, Expert 4: 20.33%
2024-10-22 09:28:27,882 - INFO - MoE Layer 2 Usage: Expert 0: 20.29%, Expert 1: 19.55%, Expert 2: 19.39%, Expert 3: 21.81%, Expert 4: 18.96%
2024-10-22 09:28:27,883 - INFO - MoE Layer 3 Usage: Expert 0: 16.33%, Expert 1: 22.41%, Expert 2: 22.47%, Expert 3: 19.93%, Expert 4: 18.86%
2024-10-22 09:28:27,884 - INFO - MoE Layer 4 Usage: Expert 0: 21.81%, Expert 1: 20.77%, Expert 2: 19.36%, Expert 3: 24.54%, Expert 4: 13.53%


Logging MoE usage stats...


2024-10-22 09:28:31,745 - INFO - MoE Layer 1 Usage: Expert 0: 20.93%, Expert 1: 18.85%, Expert 2: 18.70%, Expert 3: 21.25%, Expert 4: 20.28%
2024-10-22 09:28:31,746 - INFO - MoE Layer 2 Usage: Expert 0: 20.39%, Expert 1: 19.55%, Expert 2: 19.48%, Expert 3: 21.49%, Expert 4: 19.10%
2024-10-22 09:28:31,747 - INFO - MoE Layer 3 Usage: Expert 0: 16.25%, Expert 1: 22.48%, Expert 2: 22.32%, Expert 3: 19.88%, Expert 4: 19.07%
2024-10-22 09:28:31,748 - INFO - MoE Layer 4 Usage: Expert 0: 21.93%, Expert 1: 20.71%, Expert 2: 19.34%, Expert 3: 24.55%, Expert 4: 13.46%


Logging MoE usage stats...


2024-10-22 09:28:35,679 - INFO - MoE Layer 1 Usage: Expert 0: 21.19%, Expert 1: 18.83%, Expert 2: 18.88%, Expert 3: 20.75%, Expert 4: 20.35%
2024-10-22 09:28:35,680 - INFO - MoE Layer 2 Usage: Expert 0: 20.32%, Expert 1: 19.79%, Expert 2: 19.28%, Expert 3: 21.69%, Expert 4: 18.92%
2024-10-22 09:28:35,680 - INFO - MoE Layer 3 Usage: Expert 0: 16.12%, Expert 1: 22.57%, Expert 2: 22.52%, Expert 3: 20.00%, Expert 4: 18.80%
2024-10-22 09:28:35,682 - INFO - MoE Layer 4 Usage: Expert 0: 21.81%, Expert 1: 20.68%, Expert 2: 19.31%, Expert 3: 24.74%, Expert 4: 13.46%


Logging MoE usage stats...


2024-10-22 09:28:39,534 - INFO - MoE Layer 1 Usage: Expert 0: 21.09%, Expert 1: 18.89%, Expert 2: 18.68%, Expert 3: 20.98%, Expert 4: 20.36%
2024-10-22 09:28:39,535 - INFO - MoE Layer 2 Usage: Expert 0: 20.39%, Expert 1: 19.65%, Expert 2: 19.16%, Expert 3: 21.70%, Expert 4: 19.10%
2024-10-22 09:28:39,536 - INFO - MoE Layer 3 Usage: Expert 0: 16.31%, Expert 1: 22.51%, Expert 2: 22.45%, Expert 3: 19.67%, Expert 4: 19.06%
2024-10-22 09:28:39,537 - INFO - MoE Layer 4 Usage: Expert 0: 21.79%, Expert 1: 21.04%, Expert 2: 19.34%, Expert 3: 24.44%, Expert 4: 13.39%


Logging MoE usage stats...


2024-10-22 09:28:43,381 - INFO - MoE Layer 1 Usage: Expert 0: 21.17%, Expert 1: 18.79%, Expert 2: 18.73%, Expert 3: 20.99%, Expert 4: 20.32%
2024-10-22 09:28:43,382 - INFO - MoE Layer 2 Usage: Expert 0: 20.29%, Expert 1: 19.75%, Expert 2: 19.40%, Expert 3: 21.69%, Expert 4: 18.87%
2024-10-22 09:28:43,383 - INFO - MoE Layer 3 Usage: Expert 0: 16.10%, Expert 1: 22.60%, Expert 2: 22.46%, Expert 3: 19.88%, Expert 4: 18.96%
2024-10-22 09:28:43,384 - INFO - MoE Layer 4 Usage: Expert 0: 22.21%, Expert 1: 20.65%, Expert 2: 19.23%, Expert 3: 24.53%, Expert 4: 13.39%

***** Running Evaluation *****
  Num examples = 1000
  Batch size = 30


Logging MoE usage stats...
{'loss': 5.8663, 'grad_norm': 4.364841461181641, 'learning_rate': 0.00025, 'epoch': 126.58}


  0%|          | 0/34 [00:00<?, ?it/s]

Saving model checkpoint to ./results/pretraining_moe/checkpoints\checkpoint-1000
Configuration saved in ./results/pretraining_moe/checkpoints\checkpoint-1000\config.json
Configuration saved in ./results/pretraining_moe/checkpoints\checkpoint-1000\generation_config.json
Model weights saved in ./results/pretraining_moe/checkpoints\checkpoint-1000\model.safetensors
tokenizer config file saved in ./results/pretraining_moe/checkpoints\checkpoint-1000\tokenizer_config.json
Special tokens file saved in ./results/pretraining_moe/checkpoints\checkpoint-1000\special_tokens_map.json


{'eval_loss': 6.708746910095215, 'eval_runtime': 0.5099, 'eval_samples_per_second': 1961.056, 'eval_steps_per_second': 66.676, 'epoch': 126.58}


Deleting older checkpoint [results\pretraining_moe\checkpoints\checkpoint-900] due to args.save_total_limit
2024-10-22 09:28:48,167 - INFO - MoE Layer 1 Usage: Expert 0: 21.06%, Expert 1: 19.00%, Expert 2: 18.78%, Expert 3: 20.72%, Expert 4: 20.44%
2024-10-22 09:28:48,168 - INFO - MoE Layer 2 Usage: Expert 0: 19.53%, Expert 1: 19.90%, Expert 2: 19.14%, Expert 3: 22.19%, Expert 4: 19.25%
2024-10-22 09:28:48,169 - INFO - MoE Layer 3 Usage: Expert 0: 16.15%, Expert 1: 23.18%, Expert 2: 22.12%, Expert 3: 19.86%, Expert 4: 18.69%
2024-10-22 09:28:48,170 - INFO - MoE Layer 4 Usage: Expert 0: 21.35%, Expert 1: 21.33%, Expert 2: 19.39%, Expert 3: 24.67%, Expert 4: 13.26%


Logging MoE usage stats...


2024-10-22 09:28:52,134 - INFO - MoE Layer 1 Usage: Expert 0: 21.25%, Expert 1: 18.73%, Expert 2: 18.70%, Expert 3: 20.99%, Expert 4: 20.34%
2024-10-22 09:28:52,135 - INFO - MoE Layer 2 Usage: Expert 0: 20.31%, Expert 1: 19.79%, Expert 2: 19.38%, Expert 3: 21.52%, Expert 4: 19.00%
2024-10-22 09:28:52,136 - INFO - MoE Layer 3 Usage: Expert 0: 15.87%, Expert 1: 22.70%, Expert 2: 22.43%, Expert 3: 20.01%, Expert 4: 18.99%
2024-10-22 09:28:52,138 - INFO - MoE Layer 4 Usage: Expert 0: 21.95%, Expert 1: 20.52%, Expert 2: 19.19%, Expert 3: 25.01%, Expert 4: 13.34%


Logging MoE usage stats...


2024-10-22 09:28:56,092 - INFO - MoE Layer 1 Usage: Expert 0: 21.14%, Expert 1: 18.81%, Expert 2: 18.81%, Expert 3: 20.90%, Expert 4: 20.35%
2024-10-22 09:28:56,093 - INFO - MoE Layer 2 Usage: Expert 0: 20.39%, Expert 1: 19.79%, Expert 2: 19.48%, Expert 3: 21.68%, Expert 4: 18.66%
2024-10-22 09:28:56,094 - INFO - MoE Layer 3 Usage: Expert 0: 16.05%, Expert 1: 22.61%, Expert 2: 22.29%, Expert 3: 19.99%, Expert 4: 19.07%
2024-10-22 09:28:56,095 - INFO - MoE Layer 4 Usage: Expert 0: 21.88%, Expert 1: 20.54%, Expert 2: 19.15%, Expert 3: 25.25%, Expert 4: 13.18%


Logging MoE usage stats...


2024-10-22 09:29:00,075 - INFO - MoE Layer 1 Usage: Expert 0: 21.17%, Expert 1: 18.81%, Expert 2: 18.66%, Expert 3: 21.03%, Expert 4: 20.34%
2024-10-22 09:29:00,076 - INFO - MoE Layer 2 Usage: Expert 0: 20.29%, Expert 1: 19.76%, Expert 2: 19.32%, Expert 3: 21.72%, Expert 4: 18.92%
2024-10-22 09:29:00,077 - INFO - MoE Layer 3 Usage: Expert 0: 16.13%, Expert 1: 22.60%, Expert 2: 22.53%, Expert 3: 19.92%, Expert 4: 18.82%
2024-10-22 09:29:00,078 - INFO - MoE Layer 4 Usage: Expert 0: 22.08%, Expert 1: 20.69%, Expert 2: 19.13%, Expert 3: 24.83%, Expert 4: 13.27%


Logging MoE usage stats...


2024-10-22 09:29:04,001 - INFO - MoE Layer 1 Usage: Expert 0: 21.23%, Expert 1: 18.70%, Expert 2: 18.74%, Expert 3: 20.94%, Expert 4: 20.39%
2024-10-22 09:29:04,003 - INFO - MoE Layer 2 Usage: Expert 0: 20.47%, Expert 1: 19.72%, Expert 2: 19.19%, Expert 3: 21.64%, Expert 4: 18.98%
2024-10-22 09:29:04,004 - INFO - MoE Layer 3 Usage: Expert 0: 16.21%, Expert 1: 22.56%, Expert 2: 22.42%, Expert 3: 19.91%, Expert 4: 18.90%
2024-10-22 09:29:04,005 - INFO - MoE Layer 4 Usage: Expert 0: 21.71%, Expert 1: 20.94%, Expert 2: 19.09%, Expert 3: 25.07%, Expert 4: 13.19%


Logging MoE usage stats...


2024-10-22 09:29:07,958 - INFO - MoE Layer 1 Usage: Expert 0: 21.20%, Expert 1: 18.81%, Expert 2: 18.84%, Expert 3: 20.86%, Expert 4: 20.29%
2024-10-22 09:29:07,959 - INFO - MoE Layer 2 Usage: Expert 0: 20.38%, Expert 1: 19.75%, Expert 2: 19.48%, Expert 3: 21.54%, Expert 4: 18.85%
2024-10-22 09:29:07,960 - INFO - MoE Layer 3 Usage: Expert 0: 16.00%, Expert 1: 22.59%, Expert 2: 22.33%, Expert 3: 19.98%, Expert 4: 19.10%
2024-10-22 09:29:07,961 - INFO - MoE Layer 4 Usage: Expert 0: 21.95%, Expert 1: 20.94%, Expert 2: 19.14%, Expert 3: 24.95%, Expert 4: 13.02%


Logging MoE usage stats...


2024-10-22 09:29:11,934 - INFO - MoE Layer 1 Usage: Expert 0: 21.34%, Expert 1: 18.90%, Expert 2: 18.60%, Expert 3: 20.81%, Expert 4: 20.34%
2024-10-22 09:29:11,935 - INFO - MoE Layer 2 Usage: Expert 0: 20.20%, Expert 1: 19.89%, Expert 2: 19.27%, Expert 3: 21.57%, Expert 4: 19.07%
2024-10-22 09:29:11,936 - INFO - MoE Layer 3 Usage: Expert 0: 16.01%, Expert 1: 22.58%, Expert 2: 22.46%, Expert 3: 20.05%, Expert 4: 18.91%
2024-10-22 09:29:11,937 - INFO - MoE Layer 4 Usage: Expert 0: 22.02%, Expert 1: 20.57%, Expert 2: 19.08%, Expert 3: 25.16%, Expert 4: 13.18%


Logging MoE usage stats...


2024-10-22 09:29:15,889 - INFO - MoE Layer 1 Usage: Expert 0: 21.11%, Expert 1: 18.72%, Expert 2: 18.51%, Expert 3: 21.14%, Expert 4: 20.52%
2024-10-22 09:29:15,890 - INFO - MoE Layer 2 Usage: Expert 0: 20.30%, Expert 1: 19.65%, Expert 2: 19.51%, Expert 3: 21.68%, Expert 4: 18.87%
2024-10-22 09:29:15,891 - INFO - MoE Layer 3 Usage: Expert 0: 15.93%, Expert 1: 22.39%, Expert 2: 22.44%, Expert 3: 20.22%, Expert 4: 19.02%
2024-10-22 09:29:15,892 - INFO - MoE Layer 4 Usage: Expert 0: 21.68%, Expert 1: 21.11%, Expert 2: 19.18%, Expert 3: 24.82%, Expert 4: 13.21%


Logging MoE usage stats...


2024-10-22 09:29:19,895 - INFO - MoE Layer 1 Usage: Expert 0: 21.21%, Expert 1: 18.71%, Expert 2: 18.69%, Expert 3: 20.99%, Expert 4: 20.40%
2024-10-22 09:29:19,896 - INFO - MoE Layer 2 Usage: Expert 0: 20.29%, Expert 1: 19.79%, Expert 2: 19.29%, Expert 3: 21.69%, Expert 4: 18.94%
2024-10-22 09:29:19,896 - INFO - MoE Layer 3 Usage: Expert 0: 16.20%, Expert 1: 22.59%, Expert 2: 22.07%, Expert 3: 20.08%, Expert 4: 19.06%
2024-10-22 09:29:19,907 - INFO - MoE Layer 4 Usage: Expert 0: 21.85%, Expert 1: 20.76%, Expert 2: 19.06%, Expert 3: 25.19%, Expert 4: 13.14%


Logging MoE usage stats...


2024-10-22 09:29:23,892 - INFO - MoE Layer 1 Usage: Expert 0: 21.29%, Expert 1: 18.51%, Expert 2: 18.66%, Expert 3: 21.18%, Expert 4: 20.36%
2024-10-22 09:29:23,893 - INFO - MoE Layer 2 Usage: Expert 0: 20.23%, Expert 1: 19.69%, Expert 2: 19.44%, Expert 3: 21.69%, Expert 4: 18.96%
2024-10-22 09:29:23,894 - INFO - MoE Layer 3 Usage: Expert 0: 16.11%, Expert 1: 22.70%, Expert 2: 22.18%, Expert 3: 19.94%, Expert 4: 19.08%
2024-10-22 09:29:23,895 - INFO - MoE Layer 4 Usage: Expert 0: 21.84%, Expert 1: 20.93%, Expert 2: 19.15%, Expert 3: 25.09%, Expert 4: 12.99%

***** Running Evaluation *****
  Num examples = 1000
  Batch size = 30


Logging MoE usage stats...


  0%|          | 0/34 [00:00<?, ?it/s]

Saving model checkpoint to ./results/pretraining_moe/checkpoints\checkpoint-1100
Configuration saved in ./results/pretraining_moe/checkpoints\checkpoint-1100\config.json
Configuration saved in ./results/pretraining_moe/checkpoints\checkpoint-1100\generation_config.json
Model weights saved in ./results/pretraining_moe/checkpoints\checkpoint-1100\model.safetensors
tokenizer config file saved in ./results/pretraining_moe/checkpoints\checkpoint-1100\tokenizer_config.json
Special tokens file saved in ./results/pretraining_moe/checkpoints\checkpoint-1100\special_tokens_map.json


{'eval_loss': 6.9355340003967285, 'eval_runtime': 0.555, 'eval_samples_per_second': 1801.93, 'eval_steps_per_second': 61.266, 'epoch': 139.24}


Deleting older checkpoint [results\pretraining_moe\checkpoints\checkpoint-1000] due to args.save_total_limit
2024-10-22 09:29:28,651 - INFO - MoE Layer 1 Usage: Expert 0: 20.99%, Expert 1: 18.89%, Expert 2: 19.02%, Expert 3: 20.53%, Expert 4: 20.57%
2024-10-22 09:29:28,652 - INFO - MoE Layer 2 Usage: Expert 0: 20.04%, Expert 1: 19.77%, Expert 2: 19.18%, Expert 3: 22.27%, Expert 4: 18.74%
2024-10-22 09:29:28,653 - INFO - MoE Layer 3 Usage: Expert 0: 15.93%, Expert 1: 23.03%, Expert 2: 22.23%, Expert 3: 20.43%, Expert 4: 18.39%
2024-10-22 09:29:28,654 - INFO - MoE Layer 4 Usage: Expert 0: 21.63%, Expert 1: 21.22%, Expert 2: 18.69%, Expert 3: 25.31%, Expert 4: 13.16%


Logging MoE usage stats...


2024-10-22 09:29:32,550 - INFO - MoE Layer 1 Usage: Expert 0: 21.33%, Expert 1: 18.72%, Expert 2: 18.60%, Expert 3: 20.96%, Expert 4: 20.39%
2024-10-22 09:29:32,551 - INFO - MoE Layer 2 Usage: Expert 0: 20.22%, Expert 1: 19.94%, Expert 2: 19.21%, Expert 3: 21.72%, Expert 4: 18.91%
2024-10-22 09:29:32,552 - INFO - MoE Layer 3 Usage: Expert 0: 15.89%, Expert 1: 22.74%, Expert 2: 22.45%, Expert 3: 19.92%, Expert 4: 19.00%
2024-10-22 09:29:32,554 - INFO - MoE Layer 4 Usage: Expert 0: 22.13%, Expert 1: 20.81%, Expert 2: 18.98%, Expert 3: 25.08%, Expert 4: 13.01%


Logging MoE usage stats...


2024-10-22 09:29:36,450 - INFO - MoE Layer 1 Usage: Expert 0: 21.21%, Expert 1: 18.82%, Expert 2: 18.55%, Expert 3: 20.91%, Expert 4: 20.50%
2024-10-22 09:29:36,451 - INFO - MoE Layer 2 Usage: Expert 0: 20.21%, Expert 1: 19.82%, Expert 2: 19.39%, Expert 3: 21.57%, Expert 4: 19.01%
2024-10-22 09:29:36,452 - INFO - MoE Layer 3 Usage: Expert 0: 16.21%, Expert 1: 22.38%, Expert 2: 22.35%, Expert 3: 20.01%, Expert 4: 19.05%
2024-10-22 09:29:36,453 - INFO - MoE Layer 4 Usage: Expert 0: 21.96%, Expert 1: 20.97%, Expert 2: 19.16%, Expert 3: 24.84%, Expert 4: 13.07%


Logging MoE usage stats...


2024-10-22 09:29:40,351 - INFO - MoE Layer 1 Usage: Expert 0: 21.23%, Expert 1: 18.81%, Expert 2: 18.58%, Expert 3: 21.06%, Expert 4: 20.32%
2024-10-22 09:29:40,352 - INFO - MoE Layer 2 Usage: Expert 0: 20.24%, Expert 1: 19.84%, Expert 2: 19.32%, Expert 3: 21.75%, Expert 4: 18.85%
2024-10-22 09:29:40,353 - INFO - MoE Layer 3 Usage: Expert 0: 16.05%, Expert 1: 22.69%, Expert 2: 22.17%, Expert 3: 20.12%, Expert 4: 18.97%
2024-10-22 09:29:40,355 - INFO - MoE Layer 4 Usage: Expert 0: 21.93%, Expert 1: 20.83%, Expert 2: 19.26%, Expert 3: 24.86%, Expert 4: 13.12%


Logging MoE usage stats...


2024-10-22 09:29:44,286 - INFO - MoE Layer 1 Usage: Expert 0: 21.25%, Expert 1: 18.74%, Expert 2: 18.57%, Expert 3: 21.04%, Expert 4: 20.41%
2024-10-22 09:29:44,287 - INFO - MoE Layer 2 Usage: Expert 0: 20.37%, Expert 1: 19.57%, Expert 2: 19.39%, Expert 3: 21.60%, Expert 4: 19.06%
2024-10-22 09:29:44,288 - INFO - MoE Layer 3 Usage: Expert 0: 16.19%, Expert 1: 22.43%, Expert 2: 22.18%, Expert 3: 20.22%, Expert 4: 18.98%
2024-10-22 09:29:44,290 - INFO - MoE Layer 4 Usage: Expert 0: 21.87%, Expert 1: 21.03%, Expert 2: 18.91%, Expert 3: 25.11%, Expert 4: 13.08%


Logging MoE usage stats...


2024-10-22 09:29:48,232 - INFO - MoE Layer 1 Usage: Expert 0: 21.22%, Expert 1: 18.85%, Expert 2: 18.62%, Expert 3: 20.90%, Expert 4: 20.41%
2024-10-22 09:29:48,233 - INFO - MoE Layer 2 Usage: Expert 0: 20.36%, Expert 1: 19.89%, Expert 2: 19.32%, Expert 3: 21.58%, Expert 4: 18.85%
2024-10-22 09:29:48,234 - INFO - MoE Layer 3 Usage: Expert 0: 16.07%, Expert 1: 22.66%, Expert 2: 22.28%, Expert 3: 20.05%, Expert 4: 18.95%
2024-10-22 09:29:48,236 - INFO - MoE Layer 4 Usage: Expert 0: 21.99%, Expert 1: 20.93%, Expert 2: 19.10%, Expert 3: 24.92%, Expert 4: 13.06%


Logging MoE usage stats...


2024-10-22 09:29:52,158 - INFO - MoE Layer 1 Usage: Expert 0: 21.20%, Expert 1: 18.62%, Expert 2: 18.52%, Expert 3: 21.22%, Expert 4: 20.44%
2024-10-22 09:29:52,158 - INFO - MoE Layer 2 Usage: Expert 0: 20.31%, Expert 1: 19.67%, Expert 2: 19.51%, Expert 3: 21.60%, Expert 4: 18.90%
2024-10-22 09:29:52,159 - INFO - MoE Layer 3 Usage: Expert 0: 16.07%, Expert 1: 22.52%, Expert 2: 22.56%, Expert 3: 19.88%, Expert 4: 18.97%
2024-10-22 09:29:52,161 - INFO - MoE Layer 4 Usage: Expert 0: 22.04%, Expert 1: 21.12%, Expert 2: 18.65%, Expert 3: 25.32%, Expert 4: 12.87%


Logging MoE usage stats...


2024-10-22 09:29:56,132 - INFO - MoE Layer 1 Usage: Expert 0: 21.48%, Expert 1: 18.82%, Expert 2: 18.30%, Expert 3: 20.89%, Expert 4: 20.51%
2024-10-22 09:29:56,133 - INFO - MoE Layer 2 Usage: Expert 0: 20.26%, Expert 1: 19.77%, Expert 2: 19.24%, Expert 3: 21.70%, Expert 4: 19.03%
2024-10-22 09:29:56,134 - INFO - MoE Layer 3 Usage: Expert 0: 16.04%, Expert 1: 22.49%, Expert 2: 22.45%, Expert 3: 19.99%, Expert 4: 19.03%
2024-10-22 09:29:56,136 - INFO - MoE Layer 4 Usage: Expert 0: 21.93%, Expert 1: 21.22%, Expert 2: 18.93%, Expert 3: 25.01%, Expert 4: 12.90%


Logging MoE usage stats...


2024-10-22 09:30:00,040 - INFO - MoE Layer 1 Usage: Expert 0: 21.34%, Expert 1: 18.60%, Expert 2: 18.58%, Expert 3: 21.01%, Expert 4: 20.47%
2024-10-22 09:30:00,041 - INFO - MoE Layer 2 Usage: Expert 0: 20.21%, Expert 1: 19.89%, Expert 2: 19.40%, Expert 3: 21.49%, Expert 4: 19.01%
2024-10-22 09:30:00,042 - INFO - MoE Layer 3 Usage: Expert 0: 16.02%, Expert 1: 22.70%, Expert 2: 22.47%, Expert 3: 20.01%, Expert 4: 18.79%
2024-10-22 09:30:00,043 - INFO - MoE Layer 4 Usage: Expert 0: 21.95%, Expert 1: 20.90%, Expert 2: 18.67%, Expert 3: 25.35%, Expert 4: 13.13%


Logging MoE usage stats...


2024-10-22 09:30:04,003 - INFO - MoE Layer 1 Usage: Expert 0: 21.19%, Expert 1: 18.80%, Expert 2: 18.43%, Expert 3: 21.16%, Expert 4: 20.41%
2024-10-22 09:30:04,004 - INFO - MoE Layer 2 Usage: Expert 0: 20.39%, Expert 1: 19.71%, Expert 2: 19.29%, Expert 3: 21.70%, Expert 4: 18.91%
2024-10-22 09:30:04,004 - INFO - MoE Layer 3 Usage: Expert 0: 16.13%, Expert 1: 22.50%, Expert 2: 22.55%, Expert 3: 19.92%, Expert 4: 18.91%
2024-10-22 09:30:04,006 - INFO - MoE Layer 4 Usage: Expert 0: 22.12%, Expert 1: 21.04%, Expert 2: 18.60%, Expert 3: 25.36%, Expert 4: 12.88%

***** Running Evaluation *****
  Num examples = 1000
  Batch size = 30


Logging MoE usage stats...


  0%|          | 0/34 [00:00<?, ?it/s]

Saving model checkpoint to ./results/pretraining_moe/checkpoints\checkpoint-1200
Configuration saved in ./results/pretraining_moe/checkpoints\checkpoint-1200\config.json
Configuration saved in ./results/pretraining_moe/checkpoints\checkpoint-1200\generation_config.json
Model weights saved in ./results/pretraining_moe/checkpoints\checkpoint-1200\model.safetensors
tokenizer config file saved in ./results/pretraining_moe/checkpoints\checkpoint-1200\tokenizer_config.json
Special tokens file saved in ./results/pretraining_moe/checkpoints\checkpoint-1200\special_tokens_map.json


{'eval_loss': 7.256348609924316, 'eval_runtime': 0.5411, 'eval_samples_per_second': 1848.009, 'eval_steps_per_second': 62.832, 'epoch': 151.9}


Deleting older checkpoint [results\pretraining_moe\checkpoints\checkpoint-1100] due to args.save_total_limit
2024-10-22 09:30:08,866 - INFO - MoE Layer 1 Usage: Expert 0: 21.18%, Expert 1: 18.84%, Expert 2: 18.81%, Expert 3: 20.70%, Expert 4: 20.46%
2024-10-22 09:30:08,867 - INFO - MoE Layer 2 Usage: Expert 0: 19.85%, Expert 1: 19.87%, Expert 2: 18.96%, Expert 3: 22.22%, Expert 4: 19.09%
2024-10-22 09:30:08,868 - INFO - MoE Layer 3 Usage: Expert 0: 15.99%, Expert 1: 23.22%, Expert 2: 22.03%, Expert 3: 20.22%, Expert 4: 18.54%
2024-10-22 09:30:08,869 - INFO - MoE Layer 4 Usage: Expert 0: 21.81%, Expert 1: 21.08%, Expert 2: 18.94%, Expert 3: 25.16%, Expert 4: 13.00%


Logging MoE usage stats...


2024-10-22 09:30:12,798 - INFO - MoE Layer 1 Usage: Expert 0: 21.34%, Expert 1: 18.62%, Expert 2: 18.53%, Expert 3: 21.13%, Expert 4: 20.38%
2024-10-22 09:30:12,799 - INFO - MoE Layer 2 Usage: Expert 0: 20.30%, Expert 1: 19.89%, Expert 2: 19.29%, Expert 3: 21.71%, Expert 4: 18.82%
2024-10-22 09:30:12,800 - INFO - MoE Layer 3 Usage: Expert 0: 16.04%, Expert 1: 22.52%, Expert 2: 22.27%, Expert 3: 19.96%, Expert 4: 19.21%
2024-10-22 09:30:12,801 - INFO - MoE Layer 4 Usage: Expert 0: 22.30%, Expert 1: 20.95%, Expert 2: 18.72%, Expert 3: 25.12%, Expert 4: 12.92%


Logging MoE usage stats...


2024-10-22 09:30:16,725 - INFO - MoE Layer 1 Usage: Expert 0: 21.39%, Expert 1: 18.74%, Expert 2: 18.51%, Expert 3: 21.14%, Expert 4: 20.22%
2024-10-22 09:30:16,726 - INFO - MoE Layer 2 Usage: Expert 0: 20.41%, Expert 1: 19.79%, Expert 2: 19.31%, Expert 3: 21.64%, Expert 4: 18.85%
2024-10-22 09:30:16,727 - INFO - MoE Layer 3 Usage: Expert 0: 16.18%, Expert 1: 22.47%, Expert 2: 22.30%, Expert 3: 20.05%, Expert 4: 19.00%
2024-10-22 09:30:16,728 - INFO - MoE Layer 4 Usage: Expert 0: 22.05%, Expert 1: 20.97%, Expert 2: 18.75%, Expert 3: 25.35%, Expert 4: 12.88%


Logging MoE usage stats...


2024-10-22 09:30:20,733 - INFO - MoE Layer 1 Usage: Expert 0: 21.40%, Expert 1: 18.77%, Expert 2: 18.62%, Expert 3: 20.82%, Expert 4: 20.39%
2024-10-22 09:30:20,734 - INFO - MoE Layer 2 Usage: Expert 0: 20.35%, Expert 1: 19.80%, Expert 2: 19.13%, Expert 3: 21.81%, Expert 4: 18.90%
2024-10-22 09:30:20,735 - INFO - MoE Layer 3 Usage: Expert 0: 16.06%, Expert 1: 22.42%, Expert 2: 22.26%, Expert 3: 20.20%, Expert 4: 19.06%
2024-10-22 09:30:20,736 - INFO - MoE Layer 4 Usage: Expert 0: 22.28%, Expert 1: 20.84%, Expert 2: 18.78%, Expert 3: 25.23%, Expert 4: 12.87%


Logging MoE usage stats...


2024-10-22 09:30:24,637 - INFO - MoE Layer 1 Usage: Expert 0: 21.27%, Expert 1: 18.69%, Expert 2: 18.48%, Expert 3: 21.07%, Expert 4: 20.49%
2024-10-22 09:30:24,638 - INFO - MoE Layer 2 Usage: Expert 0: 20.18%, Expert 1: 19.93%, Expert 2: 19.26%, Expert 3: 21.66%, Expert 4: 18.97%
2024-10-22 09:30:24,639 - INFO - MoE Layer 3 Usage: Expert 0: 16.13%, Expert 1: 22.44%, Expert 2: 22.43%, Expert 3: 19.85%, Expert 4: 19.15%
2024-10-22 09:30:24,640 - INFO - MoE Layer 4 Usage: Expert 0: 22.27%, Expert 1: 21.16%, Expert 2: 18.69%, Expert 3: 24.86%, Expert 4: 13.03%


Logging MoE usage stats...


2024-10-22 09:30:28,545 - INFO - MoE Layer 1 Usage: Expert 0: 21.32%, Expert 1: 18.75%, Expert 2: 18.63%, Expert 3: 20.93%, Expert 4: 20.37%
2024-10-22 09:30:28,546 - INFO - MoE Layer 2 Usage: Expert 0: 20.34%, Expert 1: 19.75%, Expert 2: 19.49%, Expert 3: 21.60%, Expert 4: 18.82%
2024-10-22 09:30:28,546 - INFO - MoE Layer 3 Usage: Expert 0: 16.00%, Expert 1: 22.56%, Expert 2: 22.28%, Expert 3: 20.02%, Expert 4: 19.13%
2024-10-22 09:30:28,547 - INFO - MoE Layer 4 Usage: Expert 0: 22.32%, Expert 1: 21.15%, Expert 2: 18.58%, Expert 3: 25.07%, Expert 4: 12.89%


Logging MoE usage stats...


2024-10-22 09:30:32,506 - INFO - MoE Layer 1 Usage: Expert 0: 21.27%, Expert 1: 18.79%, Expert 2: 18.53%, Expert 3: 21.10%, Expert 4: 20.30%
2024-10-22 09:30:32,507 - INFO - MoE Layer 2 Usage: Expert 0: 20.21%, Expert 1: 19.75%, Expert 2: 19.32%, Expert 3: 21.94%, Expert 4: 18.77%
2024-10-22 09:30:32,508 - INFO - MoE Layer 3 Usage: Expert 0: 16.10%, Expert 1: 22.25%, Expert 2: 22.45%, Expert 3: 20.01%, Expert 4: 19.19%
2024-10-22 09:30:32,509 - INFO - MoE Layer 4 Usage: Expert 0: 21.99%, Expert 1: 21.27%, Expert 2: 18.91%, Expert 3: 24.74%, Expert 4: 13.09%


Logging MoE usage stats...


2024-10-22 09:30:36,448 - INFO - MoE Layer 1 Usage: Expert 0: 21.19%, Expert 1: 18.86%, Expert 2: 18.51%, Expert 3: 21.10%, Expert 4: 20.34%
2024-10-22 09:30:36,449 - INFO - MoE Layer 2 Usage: Expert 0: 20.43%, Expert 1: 19.77%, Expert 2: 19.33%, Expert 3: 21.62%, Expert 4: 18.85%
2024-10-22 09:30:36,450 - INFO - MoE Layer 3 Usage: Expert 0: 16.01%, Expert 1: 22.63%, Expert 2: 22.32%, Expert 3: 20.14%, Expert 4: 18.89%
2024-10-22 09:30:36,451 - INFO - MoE Layer 4 Usage: Expert 0: 22.35%, Expert 1: 21.22%, Expert 2: 18.61%, Expert 3: 25.08%, Expert 4: 12.75%


Logging MoE usage stats...


2024-10-22 09:30:40,417 - INFO - MoE Layer 1 Usage: Expert 0: 21.21%, Expert 1: 18.67%, Expert 2: 18.73%, Expert 3: 20.93%, Expert 4: 20.47%
2024-10-22 09:30:40,418 - INFO - MoE Layer 2 Usage: Expert 0: 20.32%, Expert 1: 19.81%, Expert 2: 19.42%, Expert 3: 21.66%, Expert 4: 18.78%
2024-10-22 09:30:40,419 - INFO - MoE Layer 3 Usage: Expert 0: 16.36%, Expert 1: 22.63%, Expert 2: 22.42%, Expert 3: 19.83%, Expert 4: 18.76%
2024-10-22 09:30:40,421 - INFO - MoE Layer 4 Usage: Expert 0: 22.47%, Expert 1: 21.03%, Expert 2: 18.37%, Expert 3: 25.18%, Expert 4: 12.95%


Logging MoE usage stats...


2024-10-22 09:30:44,414 - INFO - MoE Layer 1 Usage: Expert 0: 21.44%, Expert 1: 18.82%, Expert 2: 18.39%, Expert 3: 20.99%, Expert 4: 20.37%
2024-10-22 09:30:44,415 - INFO - MoE Layer 2 Usage: Expert 0: 20.32%, Expert 1: 20.02%, Expert 2: 19.12%, Expert 3: 21.67%, Expert 4: 18.87%
2024-10-22 09:30:44,416 - INFO - MoE Layer 3 Usage: Expert 0: 16.13%, Expert 1: 22.38%, Expert 2: 22.32%, Expert 3: 20.10%, Expert 4: 19.07%
2024-10-22 09:30:44,417 - INFO - MoE Layer 4 Usage: Expert 0: 22.37%, Expert 1: 20.98%, Expert 2: 18.53%, Expert 3: 25.35%, Expert 4: 12.78%

***** Running Evaluation *****
  Num examples = 1000
  Batch size = 30


Logging MoE usage stats...


  0%|          | 0/34 [00:00<?, ?it/s]

Saving model checkpoint to ./results/pretraining_moe/checkpoints\checkpoint-1300
Configuration saved in ./results/pretraining_moe/checkpoints\checkpoint-1300\config.json
Configuration saved in ./results/pretraining_moe/checkpoints\checkpoint-1300\generation_config.json
Model weights saved in ./results/pretraining_moe/checkpoints\checkpoint-1300\model.safetensors
tokenizer config file saved in ./results/pretraining_moe/checkpoints\checkpoint-1300\tokenizer_config.json
Special tokens file saved in ./results/pretraining_moe/checkpoints\checkpoint-1300\special_tokens_map.json


{'eval_loss': 7.439208984375, 'eval_runtime': 0.5095, 'eval_samples_per_second': 1962.531, 'eval_steps_per_second': 66.726, 'epoch': 164.56}


Deleting older checkpoint [results\pretraining_moe\checkpoints\checkpoint-1200] due to args.save_total_limit
2024-10-22 09:30:49,107 - INFO - MoE Layer 1 Usage: Expert 0: 21.22%, Expert 1: 18.88%, Expert 2: 18.74%, Expert 3: 20.64%, Expert 4: 20.51%
2024-10-22 09:30:49,108 - INFO - MoE Layer 2 Usage: Expert 0: 19.91%, Expert 1: 19.88%, Expert 2: 19.42%, Expert 3: 22.25%, Expert 4: 18.54%
2024-10-22 09:30:49,109 - INFO - MoE Layer 3 Usage: Expert 0: 15.97%, Expert 1: 23.22%, Expert 2: 22.03%, Expert 3: 20.14%, Expert 4: 18.64%
2024-10-22 09:30:49,110 - INFO - MoE Layer 4 Usage: Expert 0: 22.26%, Expert 1: 20.97%, Expert 2: 18.61%, Expert 3: 25.40%, Expert 4: 12.76%


Logging MoE usage stats...


2024-10-22 09:30:52,902 - INFO - MoE Layer 1 Usage: Expert 0: 21.24%, Expert 1: 18.89%, Expert 2: 18.30%, Expert 3: 21.12%, Expert 4: 20.44%
2024-10-22 09:30:52,903 - INFO - MoE Layer 2 Usage: Expert 0: 20.32%, Expert 1: 20.17%, Expert 2: 18.98%, Expert 3: 21.81%, Expert 4: 18.71%
2024-10-22 09:30:52,904 - INFO - MoE Layer 3 Usage: Expert 0: 16.03%, Expert 1: 22.47%, Expert 2: 22.48%, Expert 3: 20.06%, Expert 4: 18.96%
2024-10-22 09:30:52,905 - INFO - MoE Layer 4 Usage: Expert 0: 22.44%, Expert 1: 21.15%, Expert 2: 18.43%, Expert 3: 25.06%, Expert 4: 12.92%


Logging MoE usage stats...


2024-10-22 09:30:56,723 - INFO - MoE Layer 1 Usage: Expert 0: 21.34%, Expert 1: 18.72%, Expert 2: 18.50%, Expert 3: 21.06%, Expert 4: 20.39%
2024-10-22 09:30:56,724 - INFO - MoE Layer 2 Usage: Expert 0: 20.30%, Expert 1: 19.86%, Expert 2: 19.13%, Expert 3: 21.87%, Expert 4: 18.83%
2024-10-22 09:30:56,725 - INFO - MoE Layer 3 Usage: Expert 0: 16.23%, Expert 1: 22.53%, Expert 2: 22.42%, Expert 3: 19.88%, Expert 4: 18.94%
2024-10-22 09:30:56,726 - INFO - MoE Layer 4 Usage: Expert 0: 22.53%, Expert 1: 21.01%, Expert 2: 18.52%, Expert 3: 24.89%, Expert 4: 13.06%


Logging MoE usage stats...


2024-10-22 09:31:00,516 - INFO - MoE Layer 1 Usage: Expert 0: 21.37%, Expert 1: 18.77%, Expert 2: 18.45%, Expert 3: 20.93%, Expert 4: 20.48%
2024-10-22 09:31:00,517 - INFO - MoE Layer 2 Usage: Expert 0: 20.44%, Expert 1: 19.86%, Expert 2: 19.28%, Expert 3: 21.66%, Expert 4: 18.77%
2024-10-22 09:31:00,517 - INFO - MoE Layer 3 Usage: Expert 0: 16.09%, Expert 1: 22.64%, Expert 2: 22.25%, Expert 3: 19.94%, Expert 4: 19.07%
2024-10-22 09:31:00,519 - INFO - MoE Layer 4 Usage: Expert 0: 22.45%, Expert 1: 21.02%, Expert 2: 18.41%, Expert 3: 25.28%, Expert 4: 12.83%


Logging MoE usage stats...


2024-10-22 09:31:04,314 - INFO - MoE Layer 1 Usage: Expert 0: 21.47%, Expert 1: 18.68%, Expert 2: 18.28%, Expert 3: 21.09%, Expert 4: 20.48%
2024-10-22 09:31:04,315 - INFO - MoE Layer 2 Usage: Expert 0: 20.31%, Expert 1: 19.79%, Expert 2: 19.19%, Expert 3: 21.77%, Expert 4: 18.94%
2024-10-22 09:31:04,316 - INFO - MoE Layer 3 Usage: Expert 0: 16.00%, Expert 1: 22.51%, Expert 2: 22.22%, Expert 3: 20.23%, Expert 4: 19.05%
2024-10-22 09:31:04,317 - INFO - MoE Layer 4 Usage: Expert 0: 22.41%, Expert 1: 21.26%, Expert 2: 18.13%, Expert 3: 25.32%, Expert 4: 12.88%


Logging MoE usage stats...
