## Fine-tune FLAN-T5 on a prompt dataset

In [None]:
! pip install -U sagemaker

In [None]:
! pip install transformers

### Dataset creation

We will use the HC3 dataset to fine-tune the model. The dataset is available on Hugging Face: https://huggingface.co/datasets/Hello-SimpleAI/HC3

In [None]:
import pandas as pd
import json
  
# Build the fine-tuning CSVs from the raw HC3 dump (one JSON record per line).
#
# Outputs:
#   data/test.jsonl - every raw record after the first NUM_TRAIN_RECORDS, verbatim
#   data/train.csv  - the first NUM_TRAIN_PAIRS shuffled (question, answer) pairs
#   data/val.csv    - the remaining shuffled pairs

# Number of leading records in data/all.jsonl that are expanded into
# prompt/response pairs; everything after them is held out as the test set.
NUM_TRAIN_RECORDS = 20000
# Number of shuffled pairs that go to the training CSV; the rest become validation.
NUM_TRAIN_PAIRS = 60000

# Read all records up front. The context manager closes the handle even if
# reading fails (the file was previously left open).
with open('data/all.jsonl', encoding="utf-8") as f1:
    Lines1 = f1.readlines()

questions, answers = [], []
for line in Lines1[:NUM_TRAIN_RECORDS]:
    row = json.loads(line)
    # Each record carries several human and ChatGPT answers; every answer
    # becomes its own training pair with the same prompt (human answers first).
    for answer_key in ("human_answers", "chatgpt_answers"):
        for answer in row[answer_key]:
            questions.append("Prompt: "+row["question"])
            answers.append("Response: "+answer)

# Hold out the untouched tail of the dump as the test set.
with open("data/test.jsonl", "w", encoding="utf-8") as test_file:
    test_file.writelines(Lines1[NUM_TRAIN_RECORDS:])

df = pd.DataFrame({"question": questions, "answer": answers})
# Shuffle all rows before splitting so train/val are not ordered by source record.
df = df.sample(frac=1)
# NOTE: if fewer than NUM_TRAIN_PAIRS pairs were produced, df_val is empty.
df_train = df.iloc[:NUM_TRAIN_PAIRS, :]
df_val = df.iloc[NUM_TRAIN_PAIRS:, :]

df_train.to_csv("data/train.csv", index=False)
df_val.to_csv("data/val.csv", index=False)

### Upload data to S3

In [None]:
import sagemaker
from sagemaker.pytorch import PyTorch

In [None]:
# Bucket used for uploading data, models and logs. Leave as None to use the
# session's default bucket (SageMaker creates it automatically if it does not
# exist yet), or set it to an explicit bucket name.
sagemaker_session_bucket = None

sess = sagemaker.Session()
if sess is not None and sagemaker_session_bucket is None:
    # No bucket name given: fall back to the default bucket of this session.
    sagemaker_session_bucket = sess.default_bucket()

# IAM role the training job will run under.
role = sagemaker.get_execution_role()

# Re-create the session so that it is bound to the bucket resolved above.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")

In [None]:
# Both CSVs are uploaded under the same key prefix in the session bucket;
# the returned values are used later as the training job's input channels.
s3_prefix = "promptsds"

train_data_url = sess.upload_data(path="data/train.csv", key_prefix=s3_prefix)
valid_data_url = sess.upload_data(path="data/val.csv", key_prefix=s3_prefix)

In [None]:
# Show where the training and validation files were uploaded.
print(f"training file path {train_data_url}")
print(f"validation file path {valid_data_url}")

### Fine-tune FLAN-T5 XXL (11B) as a Seq2Seq model

In [None]:
# Passed as `base_job_name` to the PyTorch estimator that launches the training job.
base_job_name="finetune-flant5-11b"

In [None]:
# Hyperparameters handed to the training entry point (scripts/train.py).
hyperparameters = {
    # Base model to fine-tune.
    "model_name_or_path": "google/flan-t5-xxl",
    # Paths of the CSVs inside the training container; the directory names
    # match the "train" / "valid" channel names passed to estimator.fit().
    "train_file": "/opt/ml/input/data/train/train.csv",
    "validation_file": "/opt/ml/input/data/valid/val.csv",
    # Batch sizes per device.
    "per_device_train_batch_size": 8,
    "per_device_eval_batch_size": 8,
    "block_size": 2048,
    # Same local directory the estimator is told to sync to S3 as checkpoints.
    "checkpoint_dir": "/opt/ml/checkpoints",
    # Training length.
    "num_train_epochs": 1,
    "max_train_steps": 250,
}

#### Store model files as checkpoints for easy deployment

In [None]:

# Local directory inside the training container where checkpoints are written.
checkpoint_dir = "/opt/ml/checkpoints"
# S3 location (in the session's default bucket) the checkpoints are stored under.
checkpoint_s3_path = f"s3://{sess.default_bucket()}/flant5-checkpoints"

In [None]:
# Settings for the SageMaker model-parallel library (the "modelparallel"
# entry of the estimator's `distribution` argument).
smp_parameters = {
    "pipeline_parallel_degree": 1,  # required
    "ddp": True,
    # Setting this activates sharded data parallelism. 16 matches the
    # 2 instances x 8 processes per host configured in this notebook.
    "sharded_data_parallel_degree": 16,
    "partitions": 1,
    "bf16": True,
    "skip_tracing": True,
}

smp_options = {
    "enabled": True,
    "parameters": smp_parameters,  # required
}

# MPI launcher settings (the "mpi" entry of `distribution`); both keys are required.
mpi_options = {
    "enabled": True,
    "processes_per_host": 8,
}

In [None]:
# launch with smp

# Configure the distributed training job: scripts/train.py run on
# 2 x ml.p4d.24xlarge with the model-parallel library launched through MPI.
estimator = PyTorch(
    # --- what to run ---
    entry_point="train.py",
    source_dir="./scripts",
    hyperparameters=hyperparameters,
    # --- job identity and permissions ---
    base_job_name=base_job_name,
    role=role,
    # --- container and hardware ---
    framework_version="1.13.1",
    py_version="py39",
    instance_type="ml.p4d.24xlarge",
    instance_count=2,
    keep_alive_period_in_seconds=600,
    # --- distributed training ---
    distribution={
        "smdistributed": {"modelparallel": smp_options},
        "mpi": mpi_options,
    },
    # --- checkpoints: the local directory is paired with the S3 location ---
    checkpoint_local_path=checkpoint_dir,
    checkpoint_s3_uri=checkpoint_s3_path,
    # --- profiler and debugger hooks are switched off ---
    disable_profiler=True,
    debugger_hook_config=False,
)

In [None]:
# Start the training job. The channel names "train" and "valid" match the
# /opt/ml/input/data/train and /opt/ml/input/data/valid directories that the
# train_file / validation_file hyperparameters point to.
estimator.fit({"train":train_data_url,"valid":valid_data_url})

#### Store the checkpoint path to reuse in the deploy notebook

In [None]:
%store checkpoint_s3_path