### 0. Install dependencies

In [1]:
%pip install -qU pip
%pip install -qU sagemaker boto3 awscli boto3 ipywidgets

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
import json
from pathlib import Path
import os
from time import strftime
from functools import partial
import importlib

import utilities as u

import boto3
import sagemaker
from sagemaker.experiments.run import Run
from sagemaker.inputs import TrainingInput
from sagemaker.pytorch import PyTorch

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [3]:
# Display the SDK versions in use so the recorded outputs can be tied to them.
(sagemaker.__version__, boto3.__version__)

('2.215.0', '1.34.84')

In [4]:
# Sessions and service clients shared by the rest of the notebook.
boto_session = boto3.session.Session()
sagemaker_session = sagemaker.session.Session(boto_session=boto_session)
omics = boto3.client("omics")

# Experiment under which the training run is recorded (see the `Run(...)` cell).
EXPERIMENT_NAME = "hyenaDNA-pretraining-v2"

# Region and default bucket are taken from the SageMaker session.
REGION_NAME = sagemaker_session.boto_region_name
S3_BUCKET = sagemaker_session.default_bucket()

# Role passed to the training job and to the deployed model.
SAGEMAKER_EXECUTION_ROLE = sagemaker.session.get_execution_role(
    sagemaker_session=sagemaker_session
)
print(f"Assumed SageMaker role is {SAGEMAKER_EXECUTION_ROLE}")


Assumed SageMaker role is arn:aws:iam::111918798052:role/DevelopmentRole
S3_DATA_URI s3://sagemaker-us-east-1-111918798052/data/


### 1. Read the data from AWS HealthOmics

To train the HyenaDNA model, we will read the data directly from an AWS HealthOmics sequence store. First, retrieve the S3 URI of the read sets in that sequence store. Make sure to replace the sequence store ID in the following cell with your own.

In [49]:
# Replace with the ID of your own sequence store.
seq_store_id = "4308389581"

In [29]:
# Look up the sequence store and extract its S3 URI, S3 access point ARN and KMS key ARN.
seq_store_info = omics.get_sequence_store(id=seq_store_id)
s3_access = seq_store_info["s3Access"]
s3_uri = s3_access["s3Uri"]
s3_arn = s3_access["s3AccessPointArn"]
key_arn = seq_store_info["sseConfig"]["keyArn"]
(s3_uri, s3_arn, key_arn)

('s3://111918798052-4308389-m7r4grkrg7nkpmf5swnjwf1iqsdieuse1b-s3alias/111918798052/sequenceStore/4308389581/',
 'arn:aws:s3:us-east-1:559620149354:accesspoint/111918798052-4308389581',
 'arn:aws:kms:us-east-1:559620149354:key/ef42c6a8-5692-4a6c-9a66-a2d1058a9a41')

In [48]:
# Training input location: the "readSet/" prefix under the sequence store's S3 URI
# (the URI returned above already ends with "/").
S3_DATA_URI = s3_uri + "readSet/"
S3_DATA_URI

's3://111918798052-4308389-m7r4grkrg7nkpmf5swnjwf1iqsdieuse1b-s3alias/111918798052/sequenceStore/4308389581/readSet/'

For the training job to access the objects behind the above S3 access point (`s3_uri`), you must add a policy
to the execution role (`SAGEMAKER_EXECUTION_ROLE`). The output of the following cell is the policy that
you should attach to this role:

In [30]:
# Statement 1: read/list access to S3, restricted by condition to the
# sequence store's access point.
s3_access_statement = {
    "Sid": "S3DirectAccess",
    "Effect": "Allow",
    "Action": ["s3:GetObject", "s3:ListBucket"],
    "Resource": "*",
    "Condition": {"StringEquals": {"s3:DataAccessPointArn": s3_arn}},
}

# Statement 2: permission to decrypt with the sequence store's KMS key.
kms_decrypt_statement = {
    "Sid": "DefaultSequenceStoreKMSDecrypt",
    "Effect": "Allow",
    "Action": "kms:Decrypt",
    "Resource": key_arn,
}

policy_document = {
    "Version": "2012-10-17",
    "Statement": [s3_access_statement, kms_decrypt_statement],
}

# Print the policy as JSON so it can be copied into the execution role.
print(json.dumps(policy_document, indent=2))

{
  "Version": "2012-10-17",
  "Statement": [
    {
      "Sid": "S3DirectAccess",
      "Effect": "Allow",
      "Action": [
        "s3:GetObject",
        "s3:ListBucket"
      ],
      "Resource": "*",
      "Condition": {
        "StringEquals": {
          "s3:DataAccessPointArn": "arn:aws:s3:us-east-1:559620149354:accesspoint/111918798052-4308389581"
        }
      }
    },
    {
      "Sid": "DefaultSequenceStoreKMSDecrypt",
      "Effect": "Allow",
      "Action": "kms:Decrypt",
      "Resource": "arn:aws:kms:us-east-1:559620149354:key/ef42c6a8-5692-4a6c-9a66-a2d1058a9a41"
    }
  ]
}


### 2. Training



#### 2.1 Define the training container

In [42]:
# Training container image, resolved for the region of the current session.
pytorch_image_uri = (
    f"763104351884.dkr.ecr.{REGION_NAME}.amazonaws.com/pytorch-training:2.2.0-gpu-py310-cu121-ubuntu20.04-sagemaker"
)
pytorch_image_uri

'763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-training:2.2.0-gpu-py310-cu121-ubuntu20.04-sagemaker'

#### 2.2 Define the training job parameters

In [43]:
# Model checkpoint to start from, and the base name given to the training job.
MODEL_ID = "LongSafari/hyenadna-small-32k-seqlen-hf"
TRAINING_JOB_NAME = "hyenaDNA-pretraining"

# Hyperparameters handed to the training script through the estimator.
hyperparameters = dict(
    species="mouse",
    epochs=150,
    model_checkpoint=MODEL_ID,
    max_length=32_000,
    batch_size=4,
    learning_rate=6e-4,
    weight_decay=0.1,
    log_level="INFO",
    log_interval=100,
)


#### 2.3 Define Metrics to track


In [44]:
# (metric name, regex) pairs; the capture group in each regex holds the metric value.
_METRIC_PATTERNS = [
    ("epoch", "Epoch: ([0-9.]*)"),
    ("step", "Step: ([0-9.]*)"),
    ("train_loss", "Train Loss: ([0-9.e-]*)"),
    ("train_perplexity", "Train Perplexity: ([0-9.e-]*)"),
    ("eval_loss", "Eval Average Loss: ([0-9.e-]*)"),
    ("eval_perplexity", "Eval Perplexity: ([0-9.e-]*)"),
]

# Expanded into the Name/Regex dictionaries that are passed to the estimator.
metric_definitions = [
    {"Name": metric_name, "Regex": pattern}
    for metric_name, pattern in _METRIC_PATTERNS
]

#### 2.4 Define the tensorboard configurations to track the training results

In [45]:
from sagemaker.debugger import TensorBoardOutputConfig

# Path inside the training container that is configured as the TensorBoard log location.
LOG_DIR = "/opt/ml/output/tensorboard"

# S3 URIs always use "/" as the separator, so they are built with string
# formatting instead of os.path.join, whose separator depends on the host OS.
output_path = f"s3://{S3_BUCKET}/sagemaker-output/training/{TRAINING_JOB_NAME}"

# TensorBoard data goes to a "tensorboard" prefix under the job's output path.
tensorboard_output_config = TensorBoardOutputConfig(
    s3_output_path=f"{output_path}/tensorboard",
    container_local_output_path=LOG_DIR,
)

#### 2.4 Define Estimator

In [46]:
# Estimator describing the training job: what to run, where to run it, and
# what to record.
hyenaDNA_estimator = PyTorch(
    # Job identity and permissions
    base_job_name=TRAINING_JOB_NAME,
    role=SAGEMAKER_EXECUTION_ROLE,
    sagemaker_session=sagemaker_session,
    tags=[{"Key": "project", "Value": "genomics-model-pretraining"}],
    # Code and container
    image_uri=pytorch_image_uri,
    source_dir="scripts/",
    entry_point="train_hf_accelerate.py",
    hyperparameters=hyperparameters,
    # Compute
    instance_type="ml.g5.12xlarge",
    instance_count=1,
    distribution={"torch_distributed": {"enabled": True}},
    keep_alive_period_in_seconds=1800,
    # Metrics and TensorBoard output
    metric_definitions=metric_definitions,
    tensorboard_output_config=tensorboard_output_config,
)


#### 2.5 Start Training with Distributed Data Parallel

In [47]:
# A single input channel named "data", pointing at the sequence store read sets.
training_inputs = {
    "data": TrainingInput(s3_data=S3_DATA_URI, input_mode="File"),
}

# Start the training job inside an experiment run; wait=True blocks until the job ends.
with Run(
    experiment_name=EXPERIMENT_NAME,
    sagemaker_session=sagemaker_session,
) as run:
    hyenaDNA_estimator.fit(training_inputs, wait=True)


INFO:sagemaker:Creating training-job with name: hyenaDNA-pretraining-2024-04-16-00-12-50-506


2024-04-16 00:12:50 Starting - Starting the training job...
2024-04-16 00:13:08 Pending - Training job waiting for capacity...
2024-04-16 00:13:34 Pending - Preparing the instances for training...
2024-04-16 00:14:16 Downloading - Downloading input data...
2024-04-16 00:14:31 Downloading - Downloading the training image...............
2024-04-16 00:17:12 Training - Training image download completed. Training in progress.......[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2024-04-16 00:18:08,487 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2024-04-16 00:18:08,546 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2024-04-16 00:18:08,556 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2024-04-16 00:18:08,557 sagemaker_pytorch_container.training INFO  

In [50]:
# Keep the name of the job that was just launched; later cells use it to find
# the TensorBoard data and the trained model artifact.
latest_job = hyenaDNA_estimator.latest_training_job
training_job_name = latest_job.name
training_job_name

'hyenaDNA-pretraining-2024-04-16-00-12-50-506'

### 5. Training Results 

* During training, the results were pushed to TensorBoard. You can view them using the SageMaker TensorBoard application. To obtain the link to TensorBoard, replace the SageMaker Studio profile name below with your own.

In [51]:
# Replace with your own SageMaker Studio profile name.
user_profile = "shamika"

Execute the following cell to get a link to the TensorBoard application.

In [52]:
from sagemaker.interactive_apps.tensorboard import TensorBoardApp

# Read the Studio resource metadata to find the domain this notebook runs in.
with open("/opt/ml/metadata/resource-metadata.json", "r") as f:
    app_metadata = json.load(f)

sm_domain_id = app_metadata["DomainId"]
# The space name is not used by the call below, so it is read with .get():
# a metadata file without this key must not abort the cell.
sm_user_profile_name = app_metadata.get("SpaceName")

# Build a presigned URL that opens TensorBoard for the training job.
# NOTE: the returned URL embeds an authentication token; clear this cell's
# output before sharing or committing the notebook.
tb_app = TensorBoardApp(REGION_NAME)
tb_app.get_app_url(
    training_job_name=training_job_name,
    create_presigned_domain_url=True,
    domain_id=sm_domain_id,
    user_profile_name=user_profile,
    open_in_default_web_browser=False,
    optional_create_presigned_url_kwargs={},
)



'https://studio-d-xgpxwyumgsdh.studio.us-east-1.sagemaker.aws/auth?token=<REDACTED>'

### 6. Deploy the trained model to a real-time endpoint

In [53]:
from sagemaker.pytorch.model import PyTorchModel
from sagemaker.serializers import JSONSerializer
from sagemaker.estimator import Estimator

# Re-attach to the training job by name to locate its model artifact.
# To deploy from an earlier job instead, assign that job's name to
# `training_job_name` before running this cell.
attached_estimator = Estimator.attach(training_job_name=training_job_name)

# S3 location of the trained model archive.
model_data = attached_estimator.model_data
model_data


2024-04-16 02:44:34 Starting - Starting the training job
2024-04-16 02:44:34 Pending - Preparing the instances for training
2024-04-16 02:44:34 Downloading - Downloading the training image
2024-04-16 02:44:34 Training - Training image download completed. Training in progress.
2024-04-16 02:44:34 Uploading - Uploading generated training model
2024-04-16 02:44:34 Completed - Instances not retained as a result of warmpool resource limits being exceeded


's3://sagemaker-us-east-1-111918798052/hyenaDNA-pretraining-2024-04-16-00-12-50-506/output/model.tar.gz'

In [71]:
# Deploy the model to create a real-time endpoint
endpoint_name = 'hyenaDNA-pretrained-mouse-ep'  
pytorch_deployment_uri = f"763104351884.dkr.ecr.{REGION_NAME}.amazonaws.com/pytorch-inference:2.2.0-gpu-py310-cu118-ubuntu20.04-sagemaker"

hyenaDNAModel = PyTorchModel(
    model_data=model_data,
    role=SAGEMAKER_EXECUTION_ROLE,
    image_uri=pytorch_deployment_uri,
    entry_point="inference.py",
    source_dir="scripts/",
    sagemaker_session=sagemaker_session,
    name=endpoint_name,
    env = {
        'MMS_MAX_REQUEST_SIZE': '2000000000',
        'MMS_MAX_RESPONSE_SIZE': '2000000000',
        'MMS_DEFAULT_RESPONSE_TIMEOUT': '900',
        'TS_MAX_RESPONSE_SIZE':'2000000000',
        'TS_MAX_REQUEST_SIZE':'2000000000',
    }
)

In [72]:
#real_time_endpoint_name = "hyenaDNA-mouse-pretrained-real-ep"
env = {
    'SAGEMAKER_MODEL_SERVER_TIMEOUT':'7200', 
    'TS_MAX_RESPONSE_SIZE':'2000000000',
    'TS_MAX_REQUEST_SIZE':'2000000000',
    'MMS_MAX_RESPONSE_SIZE':'2000000000',
    'MMS_MAX_REQUEST_SIZE':'2000000000'
}

# deploy the endpoint endpoint
realtime_predictor = hyenaDNAModel.deploy(
    initial_instance_count=1,
    instance_type="ml.g5.8xlarge",
    endpoint_name=endpoint_name,
    env=env,
)

INFO:sagemaker:Repacking model artifact (s3://sagemaker-us-east-1-111918798052/hyenaDNA-pretraining-2024-04-16-00-12-50-506/output/model.tar.gz), script artifact (scripts/), and dependencies ([]) into single tar.gz file located at s3://sagemaker-us-east-1-111918798052/hyenaDNA-pretrained-mouse-ep/model.tar.gz. This may take some time depending on model size...
INFO:sagemaker:Creating model with name: hyenaDNA-pretrained-mouse-ep
INFO:sagemaker:Creating endpoint-config with name hyenaDNA-pretrained-mouse-ep
INFO:sagemaker:Creating endpoint with name hyenaDNA-pretrained-mouse-ep


---------!

### 7. Test the realtime endpoint



In [73]:
import json

# The sample file holds one JSON document per line; parse each line into a record.
with open("./sample_mouse_data.json") as file:
    sample_genome_data = [json.loads(line) for line in file]

# Number of sample records loaded.
len(sample_genome_data)

10

In [None]:
from sagemaker.deserializers import JSONDeserializer
from sagemaker.serializers import JSONSerializer

# Exchange JSON with the endpoint in both directions.
realtime_predictor.serializer = JSONSerializer()
realtime_predictor.deserializer = JSONDeserializer()

# Send the first sample record as a one-element list and display the response.
data = [sample_genome_data[0]]
realtime_predictor.predict(data=data)

{'embeddings': [[[-0.50390625,
    0.447265625,
    -1.03125,
    0.546875,
    0.50390625,
    -0.53125,
    0.59375,
    0.71875,
    -0.349609375,
    -0.404296875,
    -4.8125,
    0.84375,
    0.359375,
    1.2265625,
    1.0390625,
    -0.64453125,
    -2.0625,
    -0.416015625,
    0.34375,
    0.30078125,
    -0.53515625,
    -0.9765625,
    -1.2890625,
    -1.2734375,
    1.0234375,
    -0.16796875,
    -0.578125,
    0.64453125,
    -0.2392578125,
    -0.439453125,
    -1.0390625,
    0.330078125,
    1.25,
    0.1708984375,
    0.1494140625,
    -0.07861328125,
    0.447265625,
    0.71875,
    -1.015625,
    1.28125,
    -0.1298828125,
    -0.404296875,
    0.6484375,
    -0.1376953125,
    0.76171875,
    -0.4921875,
    -0.1142578125,
    -0.671875,
    -0.40234375,
    -0.3359375,
    -1.3984375,
    -0.1875,
    -1.46875,
    1.40625,
    -0.29296875,
    -0.01177978515625,
    -0.140625,
    -0.6875,
    -0.193359375,
    0.69140625,
    0.671875,
    -1.15625,
    -1.

### 8. Cleanup



In [70]:
# Remove the SageMaker model as well as the endpoint so no resources are left behind.
# The model is deleted first, while the endpoint (and its configuration) still exists
# for the predictor to look the model up.
realtime_predictor.delete_model()
realtime_predictor.delete_endpoint()

INFO:sagemaker:Deleting endpoint configuration with name: hyenaDNA-mouse-pretrained-ep
INFO:sagemaker:Deleting endpoint with name: hyenaDNA-mouse-pretrained-ep
