# Container-Root/eks/deployment/distributed-training/pytorch/pytorchjob/fsdp/.env

#!/bin/bash

# Configuration for FSDP jobs

## AWS
## AWS_REGION - AWS region; also used to build the registry address from ACCOUNT
export AWS_REGION=us-west-2
## ACCOUNT - account id of the current caller, as reported by
## `aws sts get-caller-identity`.
## The assignment is deliberately separate from `export`: combined in one
## statement, export's own exit status would mask a failed lookup. On failure
## a warning is written to stderr and sourcing continues (no exit/return), so
## ACCOUNT is still exported but will normally be empty.
ACCOUNT=$(aws sts get-caller-identity --query Account --output text) \
  || echo "WARNING: 'aws sts get-caller-identity' failed; ACCOUNT may be empty" >&2
export ACCOUNT

## Container image
## REGISTRY - registry address assembled from ACCOUNT and AWS_REGION; the trailing slash is part of the value
export REGISTRY="$ACCOUNT.dkr.ecr.$AWS_REGION.amazonaws.com/"
## IMAGE - image name
export IMAGE="fsdp"
## DOCKERFILE_EXT - supported values: nanogpt-sockets | nanogpt-efa | llama2-sockets | llama2-efa | llama2-efa-dlc
export DOCKERFILE_EXT="llama2-efa"
## TAG - image tag: a colon followed by DOCKERFILE_EXT
export TAG=":$DOCKERFILE_EXT"

## FSDP job settings
export JOB_NAME="fsdp"
## RDZV_HOST / RDZV_PORT - presumably the rendezvous endpoint for the job; confirm against the job template
export RDZV_HOST="etcd"
export RDZV_PORT="2379"
## NUM_WORKERS - number of worker nodes (default: 2)
export NUM_WORKERS="8"
## EFA_PER_WORKER - EFA adapters per node (default: 0). Use 1 for g4dn.metal, 4 for P4, 32 for P5
export EFA_PER_WORKER="32"
## GPU_PER_WORKER - GPUs per worker, i.e. the GPU count of the selected instance type
export GPU_PER_WORKER="8"
## INSTANCE_TYPE - p4de.24xlarge | p5.48xlarge | g4dn.metal | g4dn.8xlarge (default) | etc
export INSTANCE_TYPE="p5.48xlarge"
## FI_PROVIDER - sockets (default) | efa
export FI_PROVIDER="efa"

## Model
## Support is available for NanoGPT and Llama2. Uncomment only one of the sections below, and make sure it matches the DOCKERFILE_EXT value set above

## NanoGPT
## NanoGPT train command
#export CMD="python -m torch.distributed.run --nproc-per-node=$GPU_PER_WORKER fsdp_train.py"
## MODEL_NAME=10.5M (default) | 124M | 201M | 1B | 1.5B | 20B
#export MODEL_NAME="10.5M"

## Llama2
## HF_TOKEN - Hugging Face access token. Register at Hugging Face, create a token as
## described at https://huggingface.co/docs/hub/security-tokens, and paste it below.
export HF_TOKEN='<insert_your_huggingface_token_here>'
## MODEL_NAME (Llama2) - meta-llama/Llama-2-7b-hf | meta-llama/Llama-2-13b-hf | meta-llama/Llama-2-70b-hf
export MODEL_NAME="meta-llama/Llama-2-7b-hf"
## Llama2 train command
## Note: HF_TOKEN, GPU_PER_WORKER, NUM_WORKERS and MODEL_NAME are expanded at the
## moment this file is sourced, so CMD stores their values, not the variable names.
### samsum dataset
export CMD="huggingface-cli login --token $HF_TOKEN && torchrun --nproc_per_node=$GPU_PER_WORKER --nnodes=$NUM_WORKERS recipes/finetuning/finetuning.py --num_epochs=3 --batch_size_training=3 --enable_fsdp --model_name ${MODEL_NAME} --output_dir ."
### arrow dataset
#export CMD="huggingface-cli login --token ${HF_TOKEN} && torchrun --nproc_per_node=${GPU_PER_WORKER} --nnodes=${NUM_WORKERS} recipes/finetuning/finetuning.py --num_epochs=3 --batch_size_training=16 --enable_fsdp --low_cpu_fsdp --batching_strategy padding --model_name $MODEL_NAME --dataset 'custom_dataset' --custom_dataset.file 'examples/custom_dataset.py' --output_dir ."