In [None]:
! pip install -U sagemaker

In [36]:
import sagemaker
from sagemaker.pytorch import PyTorch

In [37]:
# Bucket used for uploading data, models and logs. Leave it as None to use the
# session's default bucket (SageMaker creates that bucket automatically if it
# does not exist yet).
sagemaker_session_bucket = None

sess = sagemaker.Session()
if sess is not None and sagemaker_session_bucket is None:
    # No bucket name was given: fall back to the session's default bucket.
    sagemaker_session_bucket = sess.default_bucket()

# Execution role ARN that is later handed to the estimator.
role = sagemaker.get_execution_role()

# Re-create the session so it is bound to the bucket chosen above.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

# Echo the resolved configuration for a quick sanity check.
print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")

INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials
INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials
INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials


sagemaker role arn: arn:aws:iam::365792799466:role/test_step_role
sagemaker bucket: sagemaker-us-east-1-365792799466
sagemaker session region: us-east-1


In [38]:
import pandas as pd
import json
  
# Build the train / validation / test files from data/all.jsonl.
#
# Each JSONL row is read for its "question", "human_answers" and
# "chatgpt_answers" fields; every answer becomes one (question, answer) pair.

# Split sizes and shuffle seed, named so they are easy to find and tune.
NUM_TRAIN_LINES = 10000  # leading lines of all.jsonl expanded into train/val pairs
NUM_TRAIN_PAIRS = 30000  # shuffled pairs written to train.csv; the rest go to val.csv
SHUFFLE_SEED = 42        # fixed seed so the train/val split is reproducible

# Read the raw file once. `with` guarantees the handle is closed, and the
# explicit encoding avoids depending on the platform default.
with open('data/all.jsonl', encoding="utf-8") as f1:
    Lines1 = f1.readlines()

questions, answers = [], []
# NOTE: this count is the number of lines in the whole file, not only the
# portion that ends up in the training split.
print(f"Total lines in training set {len(Lines1)}")
for line in Lines1[:NUM_TRAIN_LINES]:
    row = json.loads(line)
    # Human answers first, then ChatGPT answers, each paired with the question.
    for source in ("human_answers", "chatgpt_answers"):
        for answer in row[source]:
            questions.append(row["question"])
            answers.append(answer)

# Lines beyond the first NUM_TRAIN_LINES are written verbatim as the test set.
with open("data/test.jsonl", "w", encoding="utf-8") as test_file:
    test_file.writelines(Lines1[NUM_TRAIN_LINES:])

# Shuffle all pairs (seeded), then cut into train and validation frames.
df = pd.DataFrame({"question": questions, "answer": answers})
df = df.sample(frac=1, random_state=SHUFFLE_SEED)
df_train = df.iloc[:NUM_TRAIN_PAIRS, :]
df_val = df.iloc[NUM_TRAIN_PAIRS:, :]

df_train.to_csv("data/train.csv", index=False)
df_val.to_csv("data/val.csv", index=False)

Total lines in training set 24322


In [39]:
# Upload both CSV splits under the same key prefix via the session; each call
# returns the S3 location of the uploaded file, which is kept for the fit() call.
train_data_url = sess.upload_data(key_prefix="alpaca/prompt", path="data/train.csv")
valid_data_url = sess.upload_data(key_prefix="alpaca/prompt", path="data/val.csv")

In [40]:
# Show where the two uploaded splits live in S3 (one location per line).
print(
    f"training file path {train_data_url}",
    f"validation file path {valid_data_url}",
    sep="\n",
)

training file path s3://sagemaker-us-east-1-365792799466/alpaca/prompt/train.csv
validation file path s3://sagemaker-us-east-1-365792799466/alpaca/prompt/val.csv


In [41]:
# Root directory under which SageMaker mounts the input channels inside the
# training container: a channel named "<name>" in estimator.fit() is available
# at /opt/ml/input/data/<name>. The root must NOT include a channel name,
# otherwise the paths below would point at .../train/train/train.csv and
# .../train/valid/val.csv, which do not exist.
SM_DATA_DIR = "/opt/ml/input/data"

# Hyperparameters forwarded to the training entry point.
hyperparameters = {
    "model_name_or_path": "EleutherAI/gpt-neox-20b",
    "model_dir": "/opt/ml/model",
    # "train" and "valid" are the channel names used in the fit() call;
    # train.csv / val.csv are the file names uploaded to S3.
    "train_file": f"{SM_DATA_DIR}/train/train.csv",
    "validation_file": f"{SM_DATA_DIR}/valid/val.csv",
    "per_device_train_batch_size": 1,
    "per_device_eval_batch_size": 1,
    "block_size": 2048,
    "num_train_epochs": 1,
}


In [42]:
# Settings for the SageMaker model-parallel library ("smdistributed" ->
# "modelparallel" in the estimator's distribution argument).
smp_options = dict(
    enabled=True,
    parameters=dict(                        # Required
        pipeline_parallel_degree=1,         # Required
        ddp=True,
        ddp_dist_backend="auto",
        # Sharded data parallelism is activated by setting this degree.
        # NOTE(review): 16 equals 2 instances x 8 processes per host as
        # configured for this job - presumably intentional; confirm.
        sharded_data_parallel_degree=16,
        partitions=1,
        # "delayed_parameter_initialization" is intentionally left disabled.
        offload_activations=True,
        fp16=True,
        skip_tracing=True,
    ),
)

# MPI launcher settings: both keys are required.
mpi_options = dict(
    enabled=True,
    processes_per_host=8,
)

In [43]:

# Define the distributed PyTorch training job: 2 x ml.p4d.24xlarge running
# scripts/train.py with the model-parallel and MPI options defined earlier.
base_job_name = "gptneox20b-instruction-fine-tuning"
estimator = PyTorch(
    base_job_name=base_job_name,
    source_dir="./scripts",
    entry_point="train.py",
    role=role,
    # Reuse the notebook's session so the job uses the same bucket/region as
    # the data upload instead of a second, implicitly created default session.
    sagemaker_session=sess,
    framework_version="2.0.0",
    py_version="py310",
    instance_count=2,
    instance_type="ml.p4d.24xlarge",
    hyperparameters=hyperparameters,
    disable_profiler=True,
    distribution={
        "smdistributed": {"modelparallel": smp_options},
        "mpi": mpi_options,
    },
)

INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials


In [None]:
# Start the training job; the dict keys are the input channel names and the
# values are the S3 locations of the uploaded train / validation files.
estimator.fit(
    {
        "train": train_data_url,
        "valid": valid_data_url,
    }
)