# Training a downstream task using the Evo model

In [None]:
# Upgrade pip itself first, then install/upgrade the packages this notebook
# relies on. `-q` keeps the installer output quiet and `-U` upgrades packages
# that are already installed.
%pip install -qU pip
%pip install -qU sagemaker boto3 awscli ipywidgets

In [None]:
import json
from pathlib import Path
import os
import sys
from typing import List

# Copy the shared helper module into evo-model/scripts, the directory that is
# passed to the estimators below as `source_dir`.
# If the following command fails you may need to modify
# it to match the notebook's CWD:
!cp utilities.py evo-model/scripts
# NOTE(review): the copy above targets ./evo-model/scripts while the line
# below adds the parent directory ('../') to the import path so that
# `scripts.utilities` can be found -- confirm both paths are correct for the
# directory this notebook is actually run from.
sys.path.append('../')
from scripts.utilities import join

import boto3
import sagemaker
from sagemaker.experiments.run import Run
from sagemaker.inputs import TrainingInput
from sagemaker.pytorch import PyTorch
from sagemaker.debugger import TensorBoardOutputConfig

In [None]:
# Notebook-wide settings.
S3_BUCKET = "sgh-misc"  # change this to a bucket in your account
EXPERIMENT_NAME = "evo-downstream-task"

# A single boto3 session backs the SageMaker session used by every job below;
# the region is taken from that session.
boto_session = boto3.session.Session()
sagemaker_session = sagemaker.Session(boto_session=boto_session)
REGION_NAME = sagemaker_session.boto_region_name

# IAM role the SageMaker jobs will run under.
SAGEMAKER_EXECUTION_ROLE = sagemaker.get_execution_role(
    sagemaker_session=sagemaker_session
)
print(f"Assumed SageMaker role is {SAGEMAKER_EXECUTION_ROLE}")

In [None]:
# Derive the dataset location from S3_BUCKET so that changing the bucket in
# one place (as its comment instructs) also redirects the data paths, rather
# than leaving them hard-wired to the original bucket.
S3_DATA_PREFIX_URI = f"s3://{S3_BUCKET}/evo-datasets/all/"
# Raw examples file used as input to the pre-processing job.
S3_DATA_URI = join("/", S3_DATA_PREFIX_URI, "examples.jsonl")
# Checkpoint identifier handed to the pre-processing script as
# `model_checkpoint`.
MODEL_ID = "togethercomputer/evo-1-8k-base"
# Base name for the training job; also used in the TensorBoard output path.
TRAINING_JOB_NAME = "evo-downstream-task-training"

# Display the resolved URI as the cell output.
S3_DATA_URI

#### Define the training container 

In [None]:
# PyTorch training image in the current region's ECR registry; the tag pins
# PyTorch 2.2.0, GPU, Python 3.10, CUDA 12.1 and Ubuntu 20.04.
pytorch_image_uri = (
    f"763104351884.dkr.ecr.{REGION_NAME}.amazonaws.com/"
    "pytorch-training:2.2.0-gpu-py310-cu121-ubuntu20.04-sagemaker"
)
pytorch_image_uri

## Pre-process the data

Here we augment the data (to mitigate the effects of category bias in the training examples)
and compute Evo embeddings for each example. The augmented data and the corresponding
embeddings are stored in S3.

In [None]:
# Note that p4 instances aren't available for ProcessingJobs so we
# shoehorn our data-preprocessing into an Estimator

# instance_type = "ml.p3.2xlarge" # useful for testing plumbing
instance_type = "ml.p4d.24xlarge"

# Give the pre-processing jobs their own name prefix (matching the project
# tag below) so they are not listed under the training job's name.
DATA_PREP_JOB_NAME = "evo-downstream-task-data-prep"

# Hyperparameters handed to the data-preprocessing.py entry point.
data_processing_hyperparameters = {
    "model_checkpoint": MODEL_ID,
    "batch_size": 1,
    "augment_datasets": 1,
    "log_level": "INFO",
    # Results are written back under the same prefix the raw data lives in.
    "output_s3_prefix_uri": S3_DATA_PREFIX_URI,
}

data_processor = PyTorch(
    base_job_name=DATA_PREP_JOB_NAME,
    entry_point="data-preprocessing.py",
    source_dir="evo-model/scripts/",
    instance_type=instance_type,
    instance_count=1,
    image_uri=pytorch_image_uri,
    role=SAGEMAKER_EXECUTION_ROLE,
    hyperparameters=data_processing_hyperparameters,
    sagemaker_session=sagemaker_session,
    tags=[{"Key": "project", "Value": "evo-model-downstream-task-data-prep"}],
    # Keep the instance provisioned for 30 minutes after the job ends so a
    # re-run does not have to wait for a fresh instance.
    keep_alive_period_in_seconds=1800,
)

In [None]:
# Launch the pre-processing job on the raw examples file, supplied through
# the "data" channel in File mode, and block until the job finishes.
data_processor.fit(
    inputs={"data": TrainingInput(s3_data=S3_DATA_URI, input_mode="File")},
    wait=True,
)

## Train the downstream task

In [None]:
# Hyperparameters handed to the downstream_task.py entry point.
downstream_hyperparameters = dict(
    epochs=500,
    batch_size=128,
    learning_rate=0.001,  # default for Adam is 0.001
    log_level="INFO",
    model_revision="main",  # alternative revision: "1.1_fix"
    train_test_split=0.8,
)

#### Define Metrics to track

You can view these metrics on the Training Job page in the SageMaker console.

In [None]:
# Regular expressions that SageMaker applies to the training job's log output
# to extract metrics; the capture group is the metric value.
# The numeric pattern accepts "+" and "E" in addition to digits, ".", "e" and
# "-", so values printed in scientific notation with a positive exponent
# (e.g. "1.5e+03") are captured whole instead of being cut off at the sign.
metric_definitions = [
    {"Name": "epoch",           "Regex": "Epoch #([0-9]+)"},
    {"Name": "train_loss",      "Regex": "Train Loss: ([0-9.eE+-]+)"},
    {"Name": "eval_loss",       "Regex": "Eval Loss: ([0-9.eE+-]+)"},
    {"Name": "train_RMS_error", "Regex": "Train RMSerr: ([0-9.eE+-]+)"},
    {"Name": "eval_RMS_error",  "Regex": "Eval RMSerr: ([0-9.eE+-]+)"},
]

#### Define the TensorBoard configuration to track the training results

In [None]:
# Directory inside the training container that holds the TensorBoard logs.
LOG_DIR = "/opt/ml/output/tensorboard"

# Build the S3 destination from its path components with the project's
# `join` helper: bucket, fixed sub-folders, job name, then "tensorboard".
output_path = join(
    "/",
    f"s3://{S3_BUCKET}",
    "sagemaker-output",
    "training",
    TRAINING_JOB_NAME,
)
s3_output_path = join("/", output_path, "tensorboard")

# Pair the container-local log directory with its S3 destination.
tensorboard_output_config = TensorBoardOutputConfig(
    container_local_output_path=LOG_DIR,
    s3_output_path=s3_output_path,
)
print(s3_output_path)

### Define Estimator

In [None]:
# Instance type for the training job; the commented-out line is the larger
# alternative.
instance_type = "ml.p3.2xlarge"
# instance_type = "ml.p4d.24xlarge"

# Estimator that runs downstream_task.py in the PyTorch training container,
# with metric extraction and TensorBoard output wired in.
evo_estimator = PyTorch(
    entry_point="downstream_task.py",
    source_dir="evo-model/scripts/",
    base_job_name=TRAINING_JOB_NAME,
    image_uri=pytorch_image_uri,
    role=SAGEMAKER_EXECUTION_ROLE,
    sagemaker_session=sagemaker_session,
    instance_type=instance_type,
    instance_count=1,
    keep_alive_period_in_seconds=1800,
    hyperparameters=downstream_hyperparameters,
    metric_definitions=metric_definitions,
    tensorboard_output_config=tensorboard_output_config,
    tags=[{"Key": "project", "Value": "evo-model-downstream-task"}],
)

In [None]:
# Show which S3 prefix the training job below will read its data from.
print(f"Training with data from {S3_DATA_PREFIX_URI}")

In [None]:
# Start the training job inside an experiment Run and block until it
# finishes. The whole data prefix is supplied through the "data" channel in
# File mode.
with Run(experiment_name=EXPERIMENT_NAME, sagemaker_session=sagemaker_session) as run:
    evo_estimator.fit(
        inputs={
            "data": TrainingInput(s3_data=S3_DATA_PREFIX_URI, input_mode="File"),
        },
        wait=True,
    )

In [None]:
# Name of the most recent training job launched by the estimator, shown as
# the cell output.
training_job_name = evo_estimator.latest_training_job.name
training_job_name

## Training Results 

The training job above wrote its logs to the S3 location `s3_output_path`, where
the SageMaker TensorBoard application can access them. You can use the TensorBoard app to see the training curves and other information about your runs. For example, you might see something like this: ![training curves](resources/training-curves.png "Training Curves")