src/fmbench/configs/llama3/8b/config-llama3-8b-instruct-g5-p4d.yml

general:
  name: "llama3-8b-p4d-g5-v1"      
  model_name: "llama3-8b-instruct"
  
# AWS and SageMaker settings
aws:
  region: {region}
  # uncomment and set the Role ARN if not running on sagemaker
  sagemaker_execution_role: {role_arn}
  ## these are the buckets/resources you will create in your account below:
  bucket: {write_bucket} ## add the name of your desired bucket

## WRITE BUCKET -- Write the results, data, metrics, endpoint.json and payloads to this bucket directory
dir_paths:
    data_prefix: data ## add the prefix for all your data management/storage
    prompts_prefix: prompts
    all_prompts_file: all_prompts.csv
    metrics_dir: metrics
    models_dir: models
    metadata_dir: metadata ## add a file here to dynamically track the metrics dir

## READ BUCKET -- Represents the section to read from scripts, source data and tokenizer for a separate s3 bucket for read/write segregation
s3_read_data:
    read_bucket: {read_bucket}
    scripts_prefix: scripts ## add your own scripts in case you are using anything that is not on jumpstart
    script_files:
    - hf_token.txt ## add your scripts files you have in s3 (including inference files, serving stacks, if any)
    configs_prefix: configs
    config_files:
    - pricing.yml # mention the name of the config files that you want to be downloaded from s3 into the configs directory locally
    source_data_prefix: source_data  ## Add a source_data folder to store your raw data in an s3 path configured by you
    source_data_files:
    # - rajpurkar/squad_v2.jsonl
    - 2wikimqa_e.jsonl
    - 2wikimqa.jsonl
    - hotpotqa_e.jsonl
    - hotpotqa.jsonl
    - narrativeqa.jsonl
    - triviaqa_e.jsonl
    - triviaqa.jsonl
    tokenizer_prefix: llama3_tokenizer ## add the tokenizer.json and config.json from your specific tokenizer type
    prompt_template_dir: prompt_template
    prompt_template_file: prompt_template_llama3.txt ## add your desired prompt template type

## section that enables container to run notebooks and python scripts automatically 
run_steps:
    0_setup.ipynb: yes
    1_generate_data.ipynb: yes
    2_deploy_model.ipynb: yes
    3_run_inference.ipynb: yes
    4_model_metric_analysis.ipynb: yes
    5_cleanup.ipynb: yes


datasets:
  prompt_template_keys:
  - input
  - context
  filters:
  - language: en    
    min_length_in_tokens: 1
    max_length_in_tokens: 500
    payload_file: payload_en_1-500.jsonl
  - language: en
    min_length_in_tokens: 500
    max_length_in_tokens: 1000
    payload_file: payload_en_500-1000.jsonl
  - language: en
    min_length_in_tokens: 1000
    max_length_in_tokens: 2000
    payload_file: payload_en_1000-2000.jsonl
  - language: en
    min_length_in_tokens: 2000
    max_length_in_tokens: 3000
    payload_file: payload_en_2000-3000.jsonl
  - language: en
    min_length_in_tokens: 3000
    max_length_in_tokens: 3840
    payload_file: payload_en_3000-3840.jsonl
 
metrics:
  dataset_of_interest: en_3000-3840
  
pricing: pricing.yml

inference_parameters:
  sagemaker:
    max_new_tokens: 100
    top_p: 0.92
    temperature: 0.1
    details: True
    stop: '<|eot_id|>'

experiments:
  - name: llama-3-8b-instruct-g5-djl-deepspeed0.12.6-cu121
    model_id: meta-textgeneration-llama-3-8b-instruct
    model_version: "*"
    model_name: llama3-8b-instruct-g5.12xl
    ep_name: llama-3-8b-instruct-g5-12xl
    instance_type: "ml.g5.12xlarge"
    image_uri: '763104351884.dkr.ecr.{region}.amazonaws.com/djl-inference:0.27.0-deepspeed0.12.6-cu121'
    deploy: yes
    instance_count: 1
    deployment_script: jumpstart.py
    inference_script: sagemaker_predictor.py
    inference_spec:
      parameter_set: sagemaker
    payload_files:
    - payload_en_1-500.jsonl
    - payload_en_500-1000.jsonl
    - payload_en_1000-2000.jsonl
    - payload_en_2000-3000.jsonl
    - payload_en_3000-3840.jsonl
    concurrency_levels:
    - 1
    - 2
    - 4
    - 6
    - 8

    accept_eula: true
    env:
      SAGEMAKER_PROGRAM: "inference.py"
      ENDPOINT_SERVER_TIMEOUT: "3600"
      MODEL_CACHE_ROOT: "/opt/ml/model"
      SAGEMAKER_ENV: "1"
      HF_MODEL_ID: "/opt/ml/model"
      MAX_INPUT_LENGTH: "4095"
      MAX_TOTAL_TOKENS: "4096"
      SM_NUM_GPUS: "1"
      SAGEMAKER_MODEL_SERVER_WORKERS: "1"
      
  - name: llama-3-8b-instruct-p4d-djl-deepspeed0.12.6-cu121
    model_id: meta-textgeneration-llama-3-8b-instruct
    model_version: "*"
    model_name: llama3-8b-instruct
    ep_name: llama-3-8b-instruct-p4d
    instance_type: "ml.p4d.24xlarge"
    image_uri: '763104351884.dkr.ecr.{region}.amazonaws.com/djl-inference:0.27.0-deepspeed0.12.6-cu121'
    deploy: yes
    instance_count: 1
    deployment_script: jumpstart.py
    inference_script: sagemaker_predictor.py
    inference_spec:
      parameter_set: sagemaker
    payload_files:
    - payload_en_1-500.jsonl
    - payload_en_500-1000.jsonl
    - payload_en_1000-2000.jsonl
    - payload_en_2000-3000.jsonl
    - payload_en_3000-3840.jsonl
    concurrency_levels:
    - 1
    - 2
    - 4
    - 6
    - 8
    - 10
    - 12
    - 15

    accept_eula: true
    env:
      SAGEMAKER_PROGRAM: "inference.py"
      ENDPOINT_SERVER_TIMEOUT: "3600"
      MODEL_CACHE_ROOT: "/opt/ml/model"
      SAGEMAKER_ENV: "1"
      HF_MODEL_ID: "/opt/ml/model"
      MAX_INPUT_LENGTH: "4095"
      MAX_TOTAL_TOKENS: "4096"
      SM_NUM_GPUS: "1"
      SAGEMAKER_MODEL_SERVER_WORKERS: "1"
report:
  latency_budget: 2
  cost_per_10k_txn_budget: 20
  error_rate_budget: 0
  per_inference_request_file: per_inference_request_results.csv
  all_metrics_file: all_metrics.csv
  txn_count_for_showing_cost: 10000
  v_shift_w_single_instance: 0.025
  v_shift_w_gt_one_instance: 0.025