### Fine-tune Llama 2 70B on prompts using PyTorch FSDP and Amazon SageMaker Training Jobs.


In [1]:
! pip install -U sagemaker boto3

Collecting sagemaker
  Using cached sagemaker-2.189.0.tar.gz (893 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting boto3
  Downloading boto3-1.28.59-py3-none-any.whl (135 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m135.8/135.8 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting botocore<1.32.0,>=1.31.59 (from boto3)
  Downloading botocore-1.31.59-py3-none-any.whl (11.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.2/11.2 MB[0m [31m46.5 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
Building wheels for collected packages: sagemaker
  Building wheel for sagemaker (setup.py) ... [?25ldone
[?25h  Created wheel for sagemaker: filename=sagemaker-2.189.0-py2.py3-none-any.whl size=1194917 sha256=a848a11efeeccd3ce8d7e57f41d66b0f0a547907c31f8374d562a83402150ca2
  Stored in directory: /Users/alokana/Library/Caches/pip/wheels/98/11/11/0ed146622a4b4485d9f3c5454fb42f07895c69b6d1d1516987
Successfully built sagemak

In [2]:
! pip install datasets



In [3]:
import sagemaker
from sagemaker.pytorch import PyTorch

sagemaker.config INFO - Not applying SDK defaults from location: /Library/Application Support/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /Users/alokana/Library/Application Support/sagemaker/config.yaml


In [4]:
# Optional override: S3 bucket used for uploading data, models and logs.
# Leave as None to use the session's default bucket (SageMaker creates that
# bucket automatically if it does not exist yet).
sagemaker_session_bucket = None

# A first session is only needed to look up the default bucket name.
sess = sagemaker.Session()
if sagemaker_session_bucket is None and sess is not None:
    sagemaker_session_bucket = sess.default_bucket()

# Execution role; handed to the estimator further down in the notebook.
role = sagemaker.get_execution_role()

# Re-create the session, this time explicitly bound to the chosen bucket.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

# Echo the resolved configuration, one item per line.
print(
    f"sagemaker role arn: {role}",
    f"sagemaker bucket: {sess.default_bucket()}",
    f"sagemaker session region: {sess.boto_region_name}",
    sep="\n",
)

sagemaker.config INFO - Not applying SDK defaults from location: /Library/Application Support/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /Users/alokana/Library/Application Support/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /Library/Application Support/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /Users/alokana/Library/Application Support/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /Library/Application Support/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /Users/alokana/Library/Application Support/sagemaker/config.yaml
sagemaker role arn: arn:aws:iam::365792799466:role/test_step_role
sagemaker bucket: sagemaker-us-west-2-365792799466
sagemaker session region: us-west-2


### Download the dataset

In [5]:
import os

from datasets import load_dataset

# Create the local output directory up front: on a fresh checkout "data/" may
# not exist, and the export below is not guaranteed to create it.
os.makedirs("data", exist_ok=True)

# Load the "all" configuration of the HC3 dataset and dump its train split
# as JSON Lines for the preparation cell that follows.
hc3 = load_dataset("Hello-SimpleAI/HC3", "all")
hc3["train"].to_json("data/all.jsonl")

  from .autonotebook import tqdm as notebook_tqdm
Found cached dataset hc3 (/Users/alokana/.cache/huggingface/datasets/Hello-SimpleAI___hc3/all/1.1.0/5af5910f9f3fe7aace30e32ad4c1ab776ca08183d00e9b2a091308549f69f683)
100%|██████████| 1/1 [00:00<00:00, 57.99it/s]
Creating json from Arrow format: 100%|██████████| 25/25 [00:00<00:00, 35.40ba/s]


73742573

In [6]:
import pandas as pd
import json
  
# Build train/validation CSVs and a held-out test file from the raw HC3 dump
# written by the previous cell (data/all.jsonl, one JSON object per line).
SOURCE_ROW_COUNT = 10000  # leading rows expanded into (question, answer) pairs
TRAIN_PAIR_COUNT = 30000  # shuffled pairs written to data/train.csv
VAL_PAIR_COUNT = 10000    # next shuffled pairs written to data/val.csv
SHUFFLE_SEED = 42         # fixed seed so the split is identical on every re-run

# Read every line up front. The context manager closes the handle (the file
# object was previously left open), and the encoding is explicit so the result
# does not depend on the platform default.
with open("data/all.jsonl", encoding="utf-8") as f1:
    Lines1 = f1.readlines()

questions, answers = [], []
print(f"Total lines in training set {len(Lines1)}")
for line in Lines1[:SOURCE_ROW_COUNT]:
    row = json.loads(line)
    # One pair per answer: the row's human answers first, then its ChatGPT
    # answers, each paired with the same question.
    for answer_field in ("human_answers", "chatgpt_answers"):
        for answer in row[answer_field]:
            questions.append(row["question"])
            answers.append(answer)

# Rows that were not expanded above are copied verbatim into the test file.
with open("data/test.jsonl", "w", encoding="utf-8") as test_file:
    test_file.writelines(Lines1[SOURCE_ROW_COUNT:])

df = pd.DataFrame({"question": questions, "answer": answers})
# Shuffle all pairs; random_state makes the shuffle (and thus the split) reproducible.
df = df.sample(frac=1, random_state=SHUFFLE_SEED)
df_train = df.iloc[:TRAIN_PAIR_COUNT, :]
df_val = df.iloc[TRAIN_PAIR_COUNT:TRAIN_PAIR_COUNT + VAL_PAIR_COUNT, :]

df_train.to_csv("data/train.csv", index=False)
df_val.to_csv("data/val.csv", index=False)

Total lines in training set 24322


In [7]:
# Both CSV splits are uploaded through the session under the same key prefix.
# The returned locations are kept for the training job's input channels.
s3_key_prefix = "alpaca/prompt"

train_data_url = sess.upload_data(path="data/train.csv", key_prefix=s3_key_prefix)
valid_data_url = sess.upload_data(path="data/val.csv", key_prefix=s3_key_prefix)

In [8]:
# Show where each uploaded split ended up, one location per line.
print(
    f"training file path {train_data_url}",
    f"validation file path {valid_data_url}",
    sep="\n",
)

training file path s3://sagemaker-us-west-2-365792799466/alpaca/prompt/train.csv
validation file path s3://sagemaker-us-west-2-365792799466/alpaca/prompt/val.csv


### Submit training job to SageMaker

In [11]:
import os

# Directory prefix for the job's input data inside the training container.
# The "train" / "valid" sub-directories below must match the channel names
# passed to estimator.fit().
SM_TRAIN_DIR = "/opt/ml/input/data"

# Hugging Face access token: read it from the HF_TOKEN environment variable so
# a real token never has to be pasted into the notebook. Falls back to the
# original placeholder when the variable is unset or empty.
# NOTE(review): values in `hyperparameters` may be visible in the training job
# description — confirm, and consider a secrets manager for real tokens.
hf_access_token = os.environ.get("HF_TOKEN") or "hf_XXXXXX"

# Arguments forwarded to the training entry point.
hyperparameters = {
    "model_name_or_path": "meta-llama/Llama-2-70b-hf",
    "model_dir": "/opt/ml/model",
    "train_file": f"{SM_TRAIN_DIR}/train/train.csv",
    "validation_file": f"{SM_TRAIN_DIR}/valid/val.csv",
    "per_device_train_batch_size": 1,
    "per_device_eval_batch_size": 1,
    "block_size": 4096,
    "num_train_epochs": 1,
    "learning_rate": 2e-4,
    "transformer_layer_cls_to_wrap": "LlamaDecoderLayer",  # provide the decoder layer
    "access_token": hf_access_token,
    "cache_dir": "/opt/ml/sagemaker/warmpoolcache",  # change this to /tmp if not using warm pools
}

In [12]:
# Environment variables handed to the estimator (via `environment=env`).
# NOTE(review): these look like EFA / NCCL networking settings for multi-node
# training — confirm the intended values against the cluster setup.
env = {
    "FI_PROVIDER": "efa",
    "NCCL_PROTO": "simple",
    "FI_EFA_USE_DEVICE_RDMA": "1",
    "RDMAV_FORK_SAFE": "1",
}

In [15]:
# Prefix from which the training job name is generated.
# NOTE(review): the prefix says "falcon" although this notebook fine-tunes
# Llama 2 — left unchanged here; rename if that is unintended.
base_job_name = "falcon-instruction-fine-tuning"

# PyTorch estimator describing the distributed training job.
estimator = PyTorch(
    # -- job identity and permissions --
    base_job_name=base_job_name,
    role=role,
    # -- training code and its inputs --
    source_dir="./scripts",
    entry_point="train_fsdp.py",
    hyperparameters=hyperparameters,
    environment=env,
    # -- container selection --
    framework_version="2.0.1",
    py_version="py310",
    # -- compute and launch mode --
    instance_count=2,
    instance_type="ml.p4de.24xlarge",
    distribution={"torch_distributed": {"enabled": True}},
    # Keep-alive period after the job ends; presumably this is what enables the
    # warm pool that the last cell of the notebook releases.
    keep_alive_period_in_seconds=600,
    # -- miscellaneous --
    disable_profiler=True,
    disable_output_compression=True,
)

sagemaker.config INFO - Not applying SDK defaults from location: /Library/Application Support/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /Users/alokana/Library/Application Support/sagemaker/config.yaml


In [16]:
# Launch the training job. The channel names ("train", "valid") correspond to
# the sub-directories used in the train_file / validation_file hyperparameters.
estimator.fit(
    {
        "train": train_data_url,
        "valid": valid_data_url,
    }
)

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: falcon-instruction-fine-tuning-2023-10-03-20-09-40-599


Using provided s3_resource
2023-10-03 20:09:42 Starting - Starting the training job
2023-10-03 20:09:42 Pending - Training job waiting for capacity......
2023-10-03 20:10:33 Pending - Preparing the instances for training........................
2023-10-03 20:14:37 Downloading - Downloading input data...
2023-10-03 20:15:02 Training - Downloading the training image........................
2023-10-03 20:19:24 Training - Training image download completed. Training in progress........bash: cannot set terminal process group (-1): Inappropriate ioctl for device
bash: no job control in this shell
2023-10-03 20:20:20,506 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training
2023-10-03 20:20:20,561 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)
2023-10-03 20:20:20,569 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.
2023-10-03 20:20:20,570 sagemaker_pytorch_container.training I

UnexpectedStatusException: Error for Training job falcon-instruction-fine-tuning-2023-10-03-20-09-40-599: Failed. Reason: AlgorithmError: ExecuteUserScriptError:
ExitCode 1
ErrorMessage "ImportError
 cannot import name 'flash_attn_func' from 'flash_attn' (/opt/conda/lib/python3.10/site-packages/flash_attn/__init__.py)
 
 The above exception was the direct cause of the following exception
 Traceback (most recent call last)
 File "/opt/ml/code/train_fsdp.py", line 7, in <module>
 from transformers import (
 File "<frozen importlib._bootstrap>", line 1075, in _handle_fromlist
 File "/opt/conda/lib/python3.10/site-packages/transformers/utils/import_utils.py", line 1273, in __getattr__
 value = getattr(module, name)
 File "/opt/conda/lib/python3.10/site-packages/transformers/utils/import_utils.py", line 1272, in __getattr__
 module = self._get_module(self._class_to_module[name])
 File "/opt/conda/lib/python3.10/site-packages/transformers/utils/import_utils.py", line 1284, in _get_module
 raise RuntimeError(
 RuntimeError: Failed to import transformers.models.llama.modeling_llama because of the following error (look up to see its trac

### Delete the warm pool if it is no longer needed

In [None]:
# Set the keep-alive period of the estimator's most recent training job to 0
# seconds; per the heading above, this is how the warm pool is released.
latest_job_name = estimator.latest_training_job.job_name
sess.update_training_job(
    latest_job_name,
    resource_config={"KeepAlivePeriodInSeconds": 0},
)