diff --git a/README.md b/README.md
index 2f7d1bb0b..c38c9ad10 100644
--- a/README.md
+++ b/README.md
@@ -31,24 +31,19 @@ torchrun --nproc-per-node=8 scripts/train.py configs/c4-tiny.yaml \
 
 #### Running on Cirrascale via [beaker-gantry](https://github.com/allenai/beaker-gantry)
 
+Check the script at [`scripts/olmo-small-ablation-on-gantry.sh`](scripts/olmo-small-ablation-on-gantry.sh) for an example of how to run a training job on Cirrascale.
+
+After installing `beaker-gantry`, you can launch a training job like this:
+
 ```bash
-gantry run \
-  --workspace ai2/llm-testing \
-  --task-name "OLMo-tiny-c4" \
-  --priority "high" \
-  --beaker-image olmo-torch2-gantry \
-  --cluster ai2/general-cirrascale-a100-80g-ib \
-  --gpus 8 \
-  --nfs \
-  --env-secret WANDB_API_KEY=WANDB_API_KEY \
-  --env LOG_FILTER_TYPE=local_rank0_only \
-  --env OMP_NUM_THREADS=8 \
-  --shared-memory 10GiB \
-  --venv base \
-  --yes \
-  -- /bin/bash -c 'torchrun --nproc-per-node 8 scripts/train.py configs/c4-tiny.yaml --save_folder=/net/nfs.cirrascale/allennlp/llm-checkpoints/tmp --run_name=c4-tiny-test-run'
+CONFIG_PATH=configs/choose_a_config.yml \
+LOAD_PATH=/optional/path/to/checkpoint/ \
+  bash scripts/olmo-small-ablation-on-gantry.sh
 ```
 
+If `CONFIG_PATH` is not specified, the default config `configs/olmo-small-ablation.yaml` is used;
+if `LOAD_PATH` is not specified, training starts from scratch.
+
 #### Running on LUMI via Slurm
 
 First read our [LUMI](./LUMI.md) documentation, but submitting a new job essentially just boils down to running this:
diff --git a/scripts/olmo-small-ablation-on-gantry.sh b/scripts/olmo-small-ablation-on-gantry.sh
new file mode 100644
index 000000000..4e660f0fe
--- /dev/null
+++ b/scripts/olmo-small-ablation-on-gantry.sh
@@ -0,0 +1,59 @@
+#!/bin/bash
+
+set -ex
+
+# check if LOAD_PATH is provided as an environment variable; if so, create an argument
+# to pass to the training script
+if [ -z ${LOAD_PATH+x} ]; then
+  LOAD_PATH_ARG=""
+else
+  LOAD_PATH_ARG="--load_path=${LOAD_PATH}"
+fi
+
+
+# check if CONFIG_PATH is provided as an environment variable;
+# if so, use that instead of olmo-small-ablation.yaml
+if [ -z ${CONFIG_PATH+x} ]; then
+  export CONFIG_PATH=configs/olmo-small-ablation.yaml
+else
+  export CONFIG_PATH="${CONFIG_PATH}"
+fi
+
+# get the run name; we will use it as the task name in gantry
+RUN_NAME=$(cat $CONFIG_PATH | grep -ohP "^run_name\:\w*(.+)$" | sed 's/run_name:\s*//')
+
+# get a hash of the load path and config path; take the first 8 characters
+RUN_HASH=$(echo "${LOAD_PATH_ARG}-${CONFIG_PATH}" | md5sum | cut -c 1-8)
+
+# compose the two
+FULL_RUN_NAME="${RUN_NAME}-${RUN_HASH}"
+
+# if WANDB_API_KEY is set in the local environment, pass it through directly;
+# otherwise fall back to the Beaker secret named WANDB_API_KEY
+if [ -z ${WANDB_API_KEY+x} ]; then
+  WANDB_API_KEY_ARG="--env-secret WANDB_API_KEY=WANDB_API_KEY"
+else
+  WANDB_API_KEY_ARG="--env WANDB_API_KEY=${WANDB_API_KEY}"
+fi
+
+NUM_NODES=4
+
+gantry run \
+  --workspace ai2/llm-testing \
+  --task-name "${FULL_RUN_NAME}" \
+  --description "${FULL_RUN_NAME}" \
+  --priority "normal" \
+  --beaker-image olmo-torch2-gantry \
+  --cluster ai2/general-cirrascale-a100-80g-ib \
+  --gpus 8 \
+  --replicas ${NUM_NODES} \
+  --leader-selection \
+  --host-networking \
+  --nfs \
+  ${WANDB_API_KEY_ARG} \
+  --env LOG_FILTER_TYPE=local_rank0_only \
+  --env OMP_NUM_THREADS=8 \
+  --shared-memory 10GiB \
+  --venv base \
+  --yes \
+  -- /bin/bash -c "torchrun --nnodes ${NUM_NODES}:${NUM_NODES} --nproc-per-node 8 --rdzv_id=101 --rdzv_backend=c10d --rdzv_endpoint=\$BEAKER_LEADER_REPLICA_HOSTNAME:29400 scripts/train.py ${CONFIG_PATH} --run_name=${FULL_RUN_NAME} ${LOAD_PATH_ARG} --device_train_microbatch_size=8 ${@}"
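
For reference, here is a rough sketch of how the new script is meant to be invoked and how it derives the gantry task name. The checkpoint path, the example `run_name` value, and the forwarded `--save_folder` override are illustrative placeholders (the `--save_folder` flag is taken from the old README command), assuming the script is run from the repository root:

```bash
# Hypothetical invocation; paths are placeholders, not real checkpoints.
# Any extra arguments are forwarded to scripts/train.py via ${@}.
CONFIG_PATH=configs/olmo-small-ablation.yaml \
LOAD_PATH=/net/nfs.cirrascale/allennlp/llm-checkpoints/tmp \
  bash scripts/olmo-small-ablation-on-gantry.sh --save_folder=/net/nfs.cirrascale/allennlp/llm-checkpoints/tmp

# Assuming the config contains e.g. `run_name: olmo-small-ablation`, the script derives:
#   RUN_NAME=olmo-small-ablation
#   RUN_HASH=$(echo "--load_path=${LOAD_PATH}-${CONFIG_PATH}" | md5sum | cut -c 1-8)
#   FULL_RUN_NAME=${RUN_NAME}-${RUN_HASH}   # used as the gantry --task-name and --description
```

Because the hash covers both the load path and the config path, relaunching with a different checkpoint or config yields a distinct gantry task name even when `run_name` in the config stays the same.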