In [2]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import TrainingArguments, Trainer, GPT2Tokenizer, default_data_collator
from typing import Optional
import logging
from torch.utils.tensorboard import SummaryWriter
from model_hf import GPTConfig, GPTLMHeadModel, MoEUsageLoggingCallback
import numpy as np


In [4]:
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("training.log", mode='w'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger('MoELogger')


# Load the GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# Add special tokens, such as PAD token (if not already present)
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})  # Set pad token to eos token



In [5]:
# Prepare the model configuration
block_size = 128

config = GPTConfig(
    block_size=block_size,  # Small block size for debugging
    vocab_size=tokenizer.vocab_size,  # Small vocab size for debugging
    n_layer=4,      # Few layers for faster training
    n_head=4,
    n_embd=256,
    use_moe=True,  # Enable Mixture of Experts
    num_experts=5,
    num_experts_per_tok=2,
    moe_loss=True,
    moe_loss_type = "entropy_regularization",  # Type of load balancing loss "variance_penalty", "entropy_regularization", "diversity_regularization"
    moe_loss_coef = 1e0, 
)

# Initialize the model
model = GPTLMHeadModel(config)

GPT has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


Using Mixture of Experts (MoE) in MLP
Using Mixture of Experts (MoE) in MLP
Using Mixture of Experts (MoE) in MLP
Using Mixture of Experts (MoE) in MLP
Number of parameters: 24.44M


In [7]:
model = GPTLMHeadModel.from_pretrained(pretrained_model_name_or_path="./results", local_files_only=True, device_map='cuda')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

Using Mixture of Experts (MoE) in MLP
Using Mixture of Experts (MoE) in MLP
Using Mixture of Experts (MoE) in MLP
Using Mixture of Experts (MoE) in MLP
Number of parameters: 37.31M


GPTLMHeadModel(
  (transformer): ModuleDict(
    (wte): Embedding(50257, 256)
    (wpe): Embedding(128, 256)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0-3): 4 x Block(
        (ln_1): LayerNorm()
        (attn): CausalSelfAttention(
          (c_attn): Linear(in_features=256, out_features=768, bias=True)
          (c_proj): Linear(in_features=256, out_features=256, bias=True)
          (attn_dropout): Dropout(p=0.0, inplace=False)
          (resid_dropout): Dropout(p=0.0, inplace=False)
        )
        (ln_2): LayerNorm()
        (mlp): MoE(
          (experts): ModuleList(
            (0-4): 5 x MLP(
              (c_fc): Linear(in_features=256, out_features=1024, bias=True)
              (gelu): GELU(approximate='none')
              (c_proj): Linear(in_features=1024, out_features=256, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (gate): Linear(in_features=256, out_features=5, bias=False)
        )

In [2]:
import torch
from transformers import GPT2Tokenizer
import logging
from model_hf import GPTLMHeadModel

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("training.log", mode='w'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger('MoELogger')

# Load the GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Add special tokens, such as PAD token (if not already present)
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})  # Set pad token to eos token

# Load the fine-tuned model (from your results folder)
model = GPTLMHeadModel.from_pretrained(pretrained_model_name_or_path="./results", local_files_only=True, device_map='cuda')

# Set the device (use GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Input prompt for text generation
input_text = "To be, or not to be, that is the question:"

# Encode the input text to get input IDs
input_ids = torch.tensor(tokenizer.encode(input_text), device='cuda')[None,...]
# Generate text with the model
generated_ids = model.generate(
    input_ids,
    max_length=100,        # Maximum number of tokens to generate
    temperature=0.7,       # Controls randomness in generation (lower is more deterministic)
    top_k=50,              # Keep only top k tokens with highest probability
    top_p=0.95,            # Nucleus sampling: focus on the top p cumulative probability
    repetition_penalty=1.2,  # Discourage repetition
    do_sample=True,        # Sample from the distribution instead of taking argmax
    num_return_sequences=1  # Number of sequences to generate
)

# Decode the generated text back to readable format
# generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)

print("Generated Text:")
print(generated_text)


Using Mixture of Experts (MoE) in MLP
Using Mixture of Experts (MoE) in MLP
Using Mixture of Experts (MoE) in MLP
Using Mixture of Experts (MoE) in MLP
Number of parameters: 37.31M
Generated Text:
To be, or not to be, that is the question:
ClAnd will upon:Iee you Now good,A and out spirit sleep othersOf too of sovereign,But speak noble.
Second: Henry again earthIs so.
Firstman
 Citizen
ICH:Ay to up the of by own!
INUS
o have person so;For might him do thee
T to his, your,He no state as bear unto fair tongueWhich you, my,'d


In [10]:
# Load the GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Add special tokens, such as PAD token (if not already present)
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})  # Set pad token to eos token

# Load the fine-tuned model
model = GPTLMHeadModel.from_pretrained(pretrained_model_name_or_path="./results", local_files_only=True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Input prompt for text generation
input_text = "To be, or not to be, that is the question:"

# Encode the input text to get input IDs
input_ids = tokenizer.encode(input_text, return_tensors='pt').to(device)
if input_ids.size(1) > model.config.block_size:
    input_ids = input_ids[:, :model.config.block_size]
# Generate text with the model
generated_ids = model.generate(
    input_ids,
    max_length=model.config.block_size+1,        # Maximum number of tokens to generate
    temperature=0.9,       # Controls randomness in generation (lower is more deterministic)
    top_k=5000,              # Keep only top k tokens with highest probability
    top_p=0.99,            # Nucleus sampling: focus on the top p cumulative probability
    repetition_penalty=1.2,  # Discourage repetition
    do_sample=True,        # Sample from the distribution instead of taking argmax
    num_return_sequences=1  # Number of sequences to generate
)

# Decode the generated text back to readable format
generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

print("Generated Text:")
print(generated_text)


Using Mixture of Experts (MoE) in MLP
Using Mixture of Experts (MoE) in MLP
Using Mixture of Experts (MoE) in MLP
Using Mixture of Experts (MoE) in MLP
Number of parameters: 24.44M
Generated Text:
To be, or not to be, that is the question:
Thirdman
 bear jumps impartialWh her him us him Angelo
 your a and ina for head--As I.
All
Third:The for rough this Cam,. victory
 yourru,.Upon- queen but how.
 high.
Prov:This; honour no,First put another
 his, fellow
 ' to g by himself
 so and is What
ad like
.
aw.What this Can unly You thou,.3 HRY for as first part him again
Prov:It be, manily A judgmentWereI


In [11]:
# Load the GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Add special tokens, such as PAD token (if not already present)
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})  # Set pad token to eos token

# Load the fine-tuned model
model = GPTLMHeadModel.from_pretrained(pretrained_model_name_or_path="./results", local_files_only=True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Input prompt for text generation
input_text = "To be, or not to be, that is the question:"

# Encode the input text to get input IDs
input_ids = tokenizer.encode(input_text, return_tensors='pt').to(device)

# Generate text with the model using cache (past_key_values)
generated_ids = model.generate(
    input_ids,
    max_length=model.config.block_size + 50,  # Generate up to 50 tokens beyond block size
    temperature=0.9,       # Controls randomness in generation (lower is more deterministic)
    top_k=50,              # Keep only top k tokens with highest probability
    top_p=0.95,            # Nucleus sampling: focus on the top p cumulative probability
    repetition_penalty=1.2,  # Discourage repetition
    do_sample=True,        # Sample from the distribution instead of taking argmax
    num_return_sequences=1,  # Number of sequences to generate
    use_cache=True         # Enable past_key_values to allow generation beyond block_size
)

# Decode the generated text back to readable format
generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

print("Generated Text:")
print(generated_text)


Using Mixture of Experts (MoE) in MLP
Using Mixture of Experts (MoE) in MLP
Using Mixture of Experts (MoE) in MLP
Using Mixture of Experts (MoE) in MLP
Number of parameters: 24.44M


AssertionError: Cannot forward sequence of length 129, block size is only 128