Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DeepSpeed #384

Draft
wants to merge 39 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 32 commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
d1f5eb2
Preliminary DeepSpeed
Muennighoff Nov 24, 2023
2e137cd
Use DS Trainer
Muennighoff Nov 24, 2023
0984fc8
Add optimizer
Muennighoff Nov 24, 2023
9190e3c
Do not fuse
Muennighoff Nov 24, 2023
ad624aa
Do not fuse
Muennighoff Nov 24, 2023
cc0b628
Fix param loc
Muennighoff Nov 24, 2023
9f64db3
Simplify Sched
Muennighoff Nov 24, 2023
22d503c
debug
Muennighoff Nov 24, 2023
1e59c4c
kwargs
Muennighoff Nov 24, 2023
c1afa5e
Move batch to device
Muennighoff Nov 24, 2023
1afb8d4
Stage 3
Muennighoff Nov 25, 2023
9d2bf5a
Print loss
Muennighoff Nov 25, 2023
1e5d235
Simplify
Muennighoff Nov 27, 2023
73ff9cd
Temp debug
Muennighoff Nov 27, 2023
bab943c
Fix loss
Muennighoff Nov 27, 2023
deb6a60
Simplify API
Muennighoff Nov 27, 2023
160633a
Debug
Muennighoff Nov 27, 2023
b48c28c
DS Conditions
Muennighoff Nov 27, 2023
f2bc91e
DS Conditions
Muennighoff Nov 27, 2023
4dc7dc9
Step optim directly
Muennighoff Nov 27, 2023
7ffffd6
Allow DS Lion Optimizer
Muennighoff Nov 27, 2023
0cfcfa6
Revert to torch adam
Muennighoff Nov 27, 2023
2e44211
Rmv print
Muennighoff Nov 27, 2023
b22bcee
Force Lin scheduler
Muennighoff Nov 27, 2023
f6fdc2f
Properly configure DS
Muennighoff Nov 27, 2023
400e1eb
Support DS LR Scheduler
Muennighoff Nov 27, 2023
8853b41
Adapt configs
Muennighoff Nov 27, 2023
76dd7fa
Fix typo
Muennighoff Nov 27, 2023
0334d93
Rename configs
Muennighoff Nov 27, 2023
60e384d
Fix naming
Muennighoff Nov 27, 2023
a8d99da
Fix scheduler
Muennighoff Nov 27, 2023
f5d1f89
Merge branch 'main' into Muennighoff/DeepSpeed
Muennighoff Nov 27, 2023
257cc95
Fix typo
Muennighoff Nov 27, 2023
2fa115b
Format
Muennighoff Nov 27, 2023
97aaa95
Merge branch 'Muennighoff/DeepSpeed' of github.com:allenai/LLM into M…
Muennighoff Nov 27, 2023
cb3d5cf
Fix decay
Muennighoff Nov 27, 2023
ac18a1c
Merge
Muennighoff May 2, 2024
fe6a641
Rmv double init
Muennighoff May 2, 2024
b884e94
No opt step for DS
Muennighoff May 3, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
251 changes: 251 additions & 0 deletions configs/olmo-small-ablation-lumi-deepspeed.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,251 @@
run_name: olmo-small-ablation
seed: 6198
deepspeed: true
dry_run: false
no_pre_train_checkpoint: True

wandb:
name: ${run_name}
project: c4-small

model:
d_model: 2048
n_heads: 16
n_layers: 16
mlp_ratio: 8
alibi: true
alibi_bias_max: 8.0
attention_dropout: 0.0
attention_layer_norm: true
multi_query_attention: true
block_type: sequential
layer_norm_type: low_precision # if not compiling, use 'low_precision'
activation_type: swiglu
residual_dropout: 0.0
embedding_dropout: 0.0
max_sequence_length: 2048
vocab_size: 50277
embedding_size: 50304
eos_token_id: 50276
pad_token_id: 50276
init_device: cpu
init_std: 0.02

compile: null # causes instability on AMD GPUs

optimizer:
name: adamw
learning_rate: 1.0e-4
weight_decay: 0.01
betas:
- 0.9
- 0.95

scheduler:
name: linear_with_warmup
t_warmup: 2000
t_max: null

data:
paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4_en/gpt-neox-20b/c4-train.*.npy}
pad_direction: right
num_workers: 0
drop_last: true
pin_memory: true
prefetch_factor: 16
persistent_workers: true
timeout: 0

tokenizer:
identifier: EleutherAI/gpt-neox-20b
truncate_direction: right

save_folder: ${path.choose:${oc.env:SCRATCH_DIR,no_exist}/checkpoints,/results}/${oc.env:SLURM_JOB_ID,${run_name}}
save_overwrite: false
# Sharded checkpoints (best for restarts)
save_interval: 100000
save_num_checkpoints_to_keep: 9
# Unsharded checkpoints (for final storage)
save_interval_unsharded: 100000
save_num_unsharded_checkpoints_to_keep: -1

load_path: null

# max_duration: 953674 # 2T tokens
max_duration: 95367 # 200B tokens
global_train_batch_size: 1024
device_train_microbatch_size: 8

precision: amp_bf16

max_grad_norm: 1.0

speed_monitor:
window_size: 20

eval_interval: ${save_interval}
eval_subset_num_batches: -1
device_eval_batch_size: ${device_train_microbatch_size}
evaluators:
##########################
# Perplexity evaluations #
##########################
#- label: c4-validation
# subset_num_batches: 10
# data:
# paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/c4/en/c4-validation.*.npy}
# num_workers: 2
# drop_last: true
# pin_memory: true
# persistent_workers: true
# prefetch_factor: 4

#- label: rp-validation
# subset_num_batches: 10
# data:
# paths: ${path.glob:${path.choose:${oc.env:SCRATCH_DIR,no_exist}/pretraining_data/preprocessed,/net/nfs.cirrascale/allennlp/llm-data}/redpajama/redpajama-validation.npy}
# num_workers: 2
# drop_last: true
# pin_memory: true
# persistent_workers: true
# prefetch_factor: 4

- label: 4chan-validation
data:
paths:
- ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small_gptneox20b/4chan/val.npy
drop_last: true

- label: c4_100_domains-validation
data:
paths:
- ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small_gptneox20b/c4_100_domains/val.npy
drop_last: true

- label: c4_en-validation
data:
paths:
- ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small_gptneox20b/c4_en/val.npy
drop_last: true

- label: gab-validation
data:
paths:
- ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small_gptneox20b/gab/val.npy
drop_last: true

- label: ice-validation
data:
paths:
- ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small_gptneox20b/ice/val.npy
drop_last: true

- label: m2d2_s2orc-validation
data:
paths:
- ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small_gptneox20b/m2d2_s2orc/val.npy
drop_last: true

- label: m2d2_wiki-validation
data:
paths:
- ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small_gptneox20b/m2d2_wiki/val.npy
drop_last: true

- label: manosphere-validation
data:
paths:
- ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small_gptneox20b/manosphere/val.npy
drop_last: true

- label: mc4_en-validation
data:
paths:
- ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small_gptneox20b/mc4_en/val.npy
drop_last: true

- label: pile-validation
data:
paths:
- ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small_gptneox20b/pile/val.npy
drop_last: true

- label: stack_v2_held_out
data:
paths:
- ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small_gptneox20b/stack_v2_held_out/000_00000.npy
drop_last: true

- label: openai_humaneval_test
data:
paths:
- ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small_gptneox20b/openai_humaneval_test/0_00000.npy
drop_last: true

- label: mbpp_valid
data:
paths:
- ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small_gptneox20b/mbpp_valid/0_00000.npy
drop_last: true

# Too small (not enough tokens for a single batch)
# - label: ptb-validation
# data:
# paths:
# - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small_gptneox20b/ptb/val.npy
# drop_last: true

- label: twitterAEE-validation
data:
paths:
- ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small_gptneox20b/twitterAEE/val.npy
drop_last: true

# Too small (not enough tokens for a single batch)
# - label: wikitext_103-validation
# data:
# paths:
# - ${path.choose:${oc.env:SCRATCH_DIR,no_exist},/net/nfs.cirrascale/allennlp/llm-data}/eval-data/perplexity/v2_small_gptneox20b/wikitext_103/val.npy
# drop_last: true

##########################
# Downstream evaluations #
##########################
- label: piqa
type: downstream

- label: hellaswag
type: downstream

- label: winogrande
type: downstream

- label: openbook_qa
type: downstream

# - label: boolq # requires implemention of the pmi_dc matrix
# type: downstream
#
- label: sciq
type: downstream

- label: arc_easy
type: downstream

# - label: arc_challenge # requires implemention of the pmi_dc matrix
# type: downstream
#
- label: copa
type: downstream

- label: rte
type: downstream

- label: commitment_bank
type: downstream

- label: mrpc
type: downstream

- label: sst2
type: downstream