In [1]:
import os
from transformers import AutoTokenizer, AutoConfig, OlmoForCausalLM, TrainingArguments, Trainer, EarlyStoppingCallback
from data import dolma
from layerwisetrain.identity_model import IdentityWrapper
from layerwisetrain.dataset import collate_fn

In [25]:
import layerwisetrain.identity_model
import inspect
src = inspect.getsource(layerwisetrain.identity_model)

OSError: could not get source code

In [2]:
HF_TOKEN = os.environ["HF_TOKEN"]
BASE_MODEL = "amd/AMD-OLMo-1B"
BATCH_SIZE = 8
GRADIENT_ACCUMULATION_STEPS = 16
TEST_SIZE = 0.01
LR = 1e-4
MAX_GRAD_NORM = 1.0
EPOCHS = 3
EVALUATION_FREQUENCY = 0.1
WARMUP_RATIO = 0.1
EARLY_STOPPING_PATIENCE = 1

In [3]:
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, token=HF_TOKEN)
config = AutoConfig.from_pretrained(BASE_MODEL, token=HF_TOKEN)

In [4]:
model = OlmoForCausalLM(config)

In [5]:
RANDOM_SEED = 42
DOLMA_DOWNLOAD_PATH = "/mnt/ssd/data/dolma"

In [6]:
dolma_urls = dolma.get_dolma_urls(RANDOM_SEED)

In [7]:
dolma_fname = dolma.download_dolma_archive(dolma_urls[0], DOLMA_DOWNLOAD_PATH)
dolma_dataset = dolma.load_dolma_dataset(dolma_fname)
dolma_dataset = dolma.tokenize_dataset(dolma_dataset, tokenizer, config.max_position_embeddings)
#dolma_dataset = dolma.shuffle_length_groups(dolma_dataset, minibatch_size=BATCH_SIZE, random_seed=RANDOM_SEED)
dolma_dataset = dolma.train_test_split(dolma_dataset, test_size=0.1, random_seed=RANDOM_SEED)

In [8]:
dolma_dataset = dolma_dataset.map(lambda x: dict(**x, labels=x["input_ids"]))

In [9]:
model_identity = IdentityWrapper(model)

In [10]:
model_identity.to("cuda")
model_identity.train()

IdentityWrapper(
  (model): OlmoForCausalLM(
    (model): OlmoModel(
      (embed_tokens): Embedding(50304, 2048, padding_idx=1)
      (layers): ModuleList(
        (0-15): 16 x IdentityLayer()
      )
      (norm): OlmoLayerNorm()
      (rotary_emb): OlmoRotaryEmbedding()
    )
    (lm_head): Linear(in_features=2048, out_features=50304, bias=False)
  )
)

In [11]:
steps_per_epoch = len(dolma_dataset['train']) // (BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS)
steps_per_eval = int(steps_per_epoch * EVALUATION_FREQUENCY)

In [12]:
args = TrainingArguments(
    output_dir="model-identity",
    overwrite_output_dir=False,
    do_train=True,
    do_eval=True,
    do_predict=False,
    eval_strategy="steps",
    eval_steps=steps_per_eval,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    eval_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    learning_rate=LR,
    max_grad_norm=MAX_GRAD_NORM,
    num_train_epochs=EPOCHS,
    lr_scheduler_type='linear',
    warmup_ratio=WARMUP_RATIO,
    logging_dir='model-identity-logs',
    logging_strategy='steps',
    logging_steps=1,
    save_strategy='steps',
    save_steps=steps_per_eval,
    bf16=True,
    fp16=False,
    bf16_full_eval=True,
    fp16_full_eval=False,
    group_by_length=True,
    length_column_name='length',
    skip_memory_metrics=False,
    metric_for_best_model='loss',
)

In [13]:
trainer = Trainer(
    model=model_identity,
    args=args,
    data_collator=collate_fn,
    train_dataset=dolma_dataset['train'],
    eval_dataset=dolma_dataset['test'],
    tokenizer=tokenizer,
    callbacks=[
        EarlyStoppingCallback(early_stopping_patience=EARLY_STOPPING_PATIENCE)
    ]
)

  trainer = Trainer(


In [14]:
trainer.train()

Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Step,Training Loss,Validation Loss
1228,0.0003,1.6e-05


KeyboardInterrupt: 