In [None]:
import sagemaker
from sagemaker.pytorch import PyTorch

In [None]:
# A first, default-configured session is opened only so that a bucket name can
# be resolved when none has been chosen explicitly.
sess = sagemaker.Session()

# Bucket used for uploading data, models and logs. SageMaker will create the
# bucket automatically if it does not exist yet. Leave as None to fall back to
# the session's default bucket.
sagemaker_session_bucket = None
if sess is not None and sagemaker_session_bucket is None:
    sagemaker_session_bucket = sess.default_bucket()

# Execution role that is later handed to the estimator.
role = sagemaker.get_execution_role()

# Rebuild the session so that it is explicitly bound to the resolved bucket.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

# Echo the effective configuration for a quick visual sanity check.
print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")

In [None]:
import pandas as pd
import json
  
# Build the fine-tuning datasets from the raw JSONL dump:
#   * the first TRAIN_SOURCE_LINES records are flattened into
#     (question, answer) rows, shuffled, and split into train/val CSVs;
#   * the remaining records are copied unchanged to data/test.jsonl.
TRAIN_SOURCE_LINES = 10000  # leading JSONL records that feed train + val
TRAIN_ROWS = 30000          # shuffled rows kept for training; the rest is validation
SPLIT_SEED = 42             # fixed seed so the shuffle, and thus the split, is reproducible

# `with` closes the handle even if reading fails; the explicit encoding avoids
# depending on the platform's default text encoding.
with open('data/all.jsonl', encoding="utf-8") as f1:
    Lines1 = f1.readlines()

questions, answers = [], []
print(f"Total lines in training set {len(Lines1)}")
for line in Lines1[:TRAIN_SOURCE_LINES]:
    row = json.loads(line)
    # Each answer (human first, then ChatGPT) becomes its own row, paired with
    # the question it belongs to.
    for answer_key in ("human_answers", "chatgpt_answers"):
        for answer in row[answer_key]:
            questions.append(row["question"])
            answers.append(answer)

# Held-out records are written back as raw lines (no re-serialisation).
with open("data/test.jsonl", "w", encoding="utf-8") as test_file:
    test_file.writelines(Lines1[TRAIN_SOURCE_LINES:])

df = pd.DataFrame({"question": questions, "answer": answers})
df = df.sample(frac=1, random_state=SPLIT_SEED)
# NOTE: if fewer than TRAIN_ROWS rows were produced, df_val ends up empty.
df_train = df.iloc[:TRAIN_ROWS, :]
df_val = df.iloc[TRAIN_ROWS:, :]

df_train.to_csv("data/train.csv", index=False)
df_val.to_csv("data/val.csv", index=False)

In [None]:
# Upload the training CSV and then the validation CSV under the same key
# prefix. The values returned by the session are kept so they can be printed
# and later passed to the estimator as input channels.
train_data_url, valid_data_url = (
    sess.upload_data(path=local_csv, key_prefix="promptsds")
    for local_csv in ("data/train.csv", "data/val.csv")
)

In [None]:
# Show where each uploaded file ended up so the locations can be checked by eye.
for _message in (
    f"training file path {train_data_url}",
    f"validation file path {valid_data_url}",
):
    print(_message)

In [None]:
# Settings forwarded to the training entry point via the estimator.
hyperparameters = {
    # Base checkpoint to fine-tune.
    "model_name_or_path": "EleutherAI/gpt-j-6b",
    # Dataset locations as seen by the training job; the `train` / `valid`
    # path segments match the channel names passed to `estimator.fit`.
    "train_file": "/opt/ml/input/data/train/train.csv",
    "validation_file": "/opt/ml/input/data/valid/val.csv",
    "per_device_train_batch_size": 1,
    "per_device_eval_batch_size": 1,
    "block_size": 1024,
    "model_dir": "/opt/ml/model",
    "num_train_epochs": 2,
}


In [None]:
# Configuration passed to the estimator under
# distribution["smdistributed"]["modelparallel"].
smp_options = dict(
    enabled=True,
    parameters=dict(                         # Required
        pipeline_parallel_degree=1,          # Required
        ddp=True,
        ddp_dist_backend="auto",
        # Setting this activates sharded data parallelism; the degree is the
        # same number as `processes_per_host` in `mpi_options`.
        sharded_data_parallel_degree=8,
        partitions=1,
        # "delayed_parameter_initialization" is intentionally left unset.
        offload_activations=True,
        bf16=True,
        skip_tracing=True,
    ),
)

# Configuration passed to the estimator under distribution["mpi"].
mpi_options = dict(
    enabled=True,              # Required
    processes_per_host=8,      # Required
)

In [None]:

# Prefix for the training job name. It names the model that is actually being
# fine-tuned; keep it in sync with hyperparameters["model_name_or_path"]
# (EleutherAI/gpt-j-6b) so jobs are not mislabelled.
base_job_name = "gptj-6b-instruction-fine-tuning"

# Estimator describing the training job: code location, runtime versions,
# hardware, and the distributed-training configuration.
estimator = PyTorch(
    base_job_name=base_job_name,
    # Directory holding the training code and the script to run from it.
    source_dir="./scripts",
    entry_point="train.py",
    role=role,
    framework_version="1.13.1",
    py_version="py39",
    # Single-instance job; `mpi_options` launches 8 processes on that host.
    instance_count=1,
    instance_type="ml.p4d.24xlarge",
    hyperparameters=hyperparameters,
    disable_profiler=True,
    # Model-parallel library settings plus the MPI launcher settings.
    distribution={
        "smdistributed": {"modelparallel": smp_options},
        "mpi": mpi_options,
    },
)

In [None]:
# Channel name -> uploaded dataset location. The channel names correspond to
# the `train` / `valid` directories referenced by the `train_file` and
# `validation_file` hyperparameters.
training_channels = {"train": train_data_url, "valid": valid_data_url}
estimator.fit(training_channels)