### Fine-tune Llama 2 70B on prompts using PyTorch FSDP and Amazon SageMaker Training Jobs


In [None]:
! pip install -U sagemaker

In [None]:
! pip install datasets

In [None]:
import sagemaker
from sagemaker.pytorch import PyTorch

In [None]:
# Bucket used for uploading data, models and logs. Leave it as None to use the
# session's default bucket; SageMaker creates that bucket automatically if it
# does not exist yet.
sagemaker_session_bucket = None

# A throwaway session is needed only to look up the default bucket name.
sess = sagemaker.Session()
if sess is not None and sagemaker_session_bucket is None:
    sagemaker_session_bucket = sess.default_bucket()

# Execution role that the notebook (and later the training job) runs under.
role = sagemaker.get_execution_role()

# Rebuild the session so that it is explicitly bound to the chosen bucket.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

# Echo the resolved configuration for a quick sanity check.
print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")

### Download the dataset

In [None]:
import os

from datasets import load_dataset

# Download the "all" configuration of the Hello-SimpleAI/HC3 dataset from the
# Hugging Face Hub.
hc3 = load_dataset("Hello-SimpleAI/HC3", "all")

# Make sure the output directory exists first: the export below may not create
# missing parent directories on its own.
os.makedirs("data", exist_ok=True)

# Export the train split to disk; the next cell reads this file line by line,
# one JSON record per line.
hc3["train"].to_json("data/all.jsonl")

In [None]:
import json

import pandas as pd

# Split sizes. The first TRAIN_VAL_SOURCE_LINES records of the export feed the
# train/validation CSVs; every remaining record is held out verbatim as test.
TRAIN_VAL_SOURCE_LINES = 10000
TRAIN_ROWS = 30000
VAL_ROWS = 10000

# Read the exported JSON Lines file. The context manager guarantees the handle
# is closed, and the explicit encoding avoids depending on the platform locale.
with open("data/all.jsonl", encoding="utf-8") as f1:
    Lines1 = f1.readlines()

print(f"Total lines in training set {len(Lines1)}")

# Flatten each record into one (question, answer) pair per answer, human
# answers first and ChatGPT answers second, repeating the question each time.
questions, answers = [], []
for line in Lines1[:TRAIN_VAL_SOURCE_LINES]:
    row = json.loads(line)
    for answer_key in ("human_answers", "chatgpt_answers"):
        for answer in row[answer_key]:
            questions.append(row["question"])
            answers.append(answer)

# Hold out the untouched remainder of the file as the test set.
with open("data/test.jsonl", "w", encoding="utf-8") as test_file:
    test_file.writelines(Lines1[TRAIN_VAL_SOURCE_LINES:])

df = pd.DataFrame({"question": questions, "answer": answers})

# Shuffle all rows before slicing.
# NOTE(review): no random_state is passed, so the split differs between runs.
df = df.sample(frac=1)

# NOTE(review): assumes at least TRAIN_ROWS + VAL_ROWS pairs were produced;
# with fewer rows the validation slice is silently shorter (or empty).
df_train = df.iloc[:TRAIN_ROWS]
df_val = df.iloc[TRAIN_ROWS:TRAIN_ROWS + VAL_ROWS]

df_train.to_csv("data/train.csv", index=False)
df_val.to_csv("data/val.csv", index=False)

In [None]:
# Both CSV splits are uploaded through the SageMaker session under one shared
# key prefix; the URI returned for each file is kept for the training job.
_S3_KEY_PREFIX = "alpaca/prompt"

train_data_url = sess.upload_data(path="data/train.csv", key_prefix=_S3_KEY_PREFIX)
valid_data_url = sess.upload_data(path="data/val.csv", key_prefix=_S3_KEY_PREFIX)

In [None]:
# Show where the two uploaded CSV files ended up.
print(
    f"training file path {train_data_url}",
    f"validation file path {valid_data_url}",
    sep="\n",
)

### Submit training job to SageMaker

In [None]:
import os
import warnings

# Directory under which the training data paths below are resolved; the
# "train" and "valid" sub-directories match the channel names passed to fit().
SM_TRAIN_DIR = "/opt/ml/input/data"

# Hyperparameters handed to the training entry point.
hyperparameters = {
    "model_name_or_path": "meta-llama/Llama-2-70b-hf",
    "model_dir": "/opt/ml/model",
    "train_file": f"{SM_TRAIN_DIR}/train/train.csv",
    "validation_file": f"{SM_TRAIN_DIR}/valid/val.csv",
    "per_device_train_batch_size": 1,
    "per_device_eval_batch_size": 1,
    "block_size": 4096,
    "num_train_epochs": 1,
    "learning_rate": 2e-4,
    # provide the decoder layer class that FSDP should wrap
    "transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
    # change this to /tmp if not using warm pools
    "cache_dir": "/opt/ml/sagemaker/warmpoolcache",
}

# Never commit a Hugging Face access token to the notebook: read it from the
# HF_TOKEN environment variable instead.
# NOTE(review): hyperparameters are stored with the training job metadata, so
# the token is still visible to anyone who can describe the job; consider a
# secrets store for shared accounts.
hf_access_token = os.environ.get("HF_TOKEN")
if hf_access_token:
    hyperparameters["access_token"] = hf_access_token
else:
    # NOTE(review): confirm whether the training script treats access_token as
    # optional; gated checkpoints cannot be downloaded without one.
    warnings.warn(
        "HF_TOKEN is not set: no access_token hyperparameter will be passed, "
        "so downloading a gated model such as meta-llama/Llama-2-70b-hf "
        "will fail.",
        stacklevel=2,
    )

In [None]:
# Environment variables forwarded to the training containers; these select the
# EFA provider and tune NCCL/RDMA behaviour for multi-node communication.
env = {
    "FI_PROVIDER": "efa",
    "NCCL_PROTO": "simple",
    "FI_EFA_USE_DEVICE_RDMA": "1",
    "RDMAV_FORK_SAFE": "1",
}

In [None]:
# Prefix for the generated training job name. It names the model that is
# actually being fine-tuned (Llama 2 70B) so jobs are easy to find later.
base_job_name = "llama-2-70b-fine-tuning"

# PyTorch estimator describing the distributed fine-tuning job.
estimator = PyTorch(
    base_job_name=base_job_name,
    # Training code: ./scripts is uploaded and train_fsdp.py is the entry point.
    source_dir="./scripts",
    entry_point="train_fsdp.py",
    role=role,
    framework_version="2.0.0",
    py_version="py310",
    # Cluster: four ml.p4d.24xlarge instances.
    instance_count=4,
    instance_type="ml.p4d.24xlarge",
    hyperparameters=hyperparameters,
    disable_profiler=True,
    environment=env,
    # Run the entry point through SageMaker's torch_distributed launcher.
    distribution={"torch_distributed": {"enabled": True}},
    # Keep the instances alive for 600 seconds after the job ends (warm pool)
    # so that a follow-up job can reuse them.
    keep_alive_period_in_seconds=600,
    # Upload the output uncompressed instead of as a compressed archive.
    disable_output_compression=True,
)

In [None]:
# Launch the training job. Each key is a data channel name and each value is
# the S3 location of the CSV uploaded for that channel.
estimator.fit(
    inputs={
        "train": train_data_url,
        "valid": valid_data_url,
    }
)

### Delete the warm pool if it is no longer needed

In [None]:
# Set the keep-alive period of the most recent training job to zero so that
# its warm pool is released.
latest_job_name = estimator.latest_training_job.job_name
sess.update_training_job(
    latest_job_name,
    resource_config={"KeepAlivePeriodInSeconds": 0},
)