### 0. Install dependencies

In [1]:
%pip install -q --upgrade pip
%pip install -q --upgrade sagemaker boto3 awscli boto3 ipywidgets

Note: you may need to restart the kernel to use updated packages.
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
aiobotocore 2.7.0 requires botocore<1.31.65,>=1.31.16, but you have botocore 1.34.80 which is incompatible.[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
import boto3
import os
import sagemaker
from sagemaker.experiments.run import Run
from sagemaker.inputs import TrainingInput
from sagemaker.pytorch import PyTorch
from time import strftime

import json
from pathlib import Path

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [3]:
boto_session = boto3.session.Session()
sagemaker_session = sagemaker.session.Session(boto_session)

REGION_NAME = sagemaker_session.boto_region_name
S3_BUCKET = sagemaker_session.default_bucket()

EXPERIMENT_NAME = "hyenaDNA-pretraining"

SAGEMAKER_EXECUTION_ROLE = sagemaker.session.get_execution_role(sagemaker_session)
print(f"Assumed SageMaker role is {SAGEMAKER_EXECUTION_ROLE}")

Assumed SageMaker role is arn:aws:iam::111918798052:role/DevelopmentRole


### 1. Read the data from AWS HealthOmics

In [4]:
seq_store_id = "4308389581"

!aws omics get-sequence-store --id {seq_store_id} > /tmp/seq-store.json

In [19]:
seq_store_info = json.loads(Path("/tmp/seq-store.json").read_text())
s3AccessPoint = seq_store_info["s3Access"]["s3Uri"]
s3AccessPoint

's3://111918798052-4308389-m7r4grkrg7nkpmf5swnjwf1iqsdieuse1b-s3alias/111918798052/sequenceStore/4308389581/'

### 2. Training



### 2.1 Define the training container 

In [6]:
pytorch_image_uri = f"763104351884.dkr.ecr.{REGION_NAME}.amazonaws.com/pytorch-training:2.2.0-gpu-py310-cu121-ubuntu20.04-sagemaker"
pytorch_image_uri

'763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-training:2.2.0-gpu-py310-cu121-ubuntu20.04-sagemaker'

#### 2.2 Define the training job parameters

In [40]:
MODEL_ID = 'LongSafari/hyenadna-small-32k-seqlen-hf'

# Additional training parameters
hyperparameters = {
    "species" : "mouse",
    "epochs": 200,
    "model_checkpoint": MODEL_ID,
    "max_length": 32_000,
    "batch_size": 8, 
    "learning_rate": 6e-4,
    "weight_decay" : 0.1,
    "log_level" : "INFO",
    "log_interval" : 100
}


#### 2.3 Define Metrics to track


In [99]:
metric_definitions = [
    {"Name": "epoch", "Regex": "Epoch: ([0-9.]*)"},
    {"Name": "step", "Regex": "Step: ([0-9.]*)"},
    {"Name": "train_loss", "Regex": "Train Loss: ([0-9.e-]*)"},
    {"Name": "train_perplexity", "Regex": "Train perplexity: ([0-9.e-]*)"},
    {"Name": "eval_loss", "Regex": "Eval Average Loss: ([0-9.e-]*)"},
    {"Name": "eval_perplexity", "Regex": "Eval perplexity: ([0-9.e-]*)"}
]

#### 2.4 Define Estimator

In [100]:
hyenaDNA_estimator = PyTorch(
    base_job_name="hyenaDNA-pretraining",
    entry_point="train_hf.py",
    source_dir="scripts/",
    instance_type="ml.g5.8xlarge",
    instance_count=1,
    image_uri=pytorch_image_uri,
    role=SAGEMAKER_EXECUTION_ROLE,
    hyperparameters=hyperparameters,
    metric_definitions=metric_definitions,
    sagemaker_session=sagemaker_session,
    distribution={"torch_distributed": {"enabled": True}},
    tags=[{"Key": "project", "Value": "genomics-model-pretraining"}],
    keep_alive_period_in_seconds=1800
)


#### 2.5 Start Training with Distributed Data Parallel

In [127]:
with Run(
    experiment_name=EXPERIMENT_NAME,
    sagemaker_session=sagemaker_session,
) as run:
    hyenaDNA_estimator.fit(
        {
            "data": TrainingInput(
                s3_data=data_uri, input_mode="File"
            ),
        },
        wait=False,
    )


INFO:sagemaker:Creating training-job with name: hyenaDNA-pretraining-2024-04-09-03-59-01-633


### 5. Training Results 

### 6. Deploy to an Endpoint and Test

In [10]:
from sagemaker.pytorch.model import PyTorchModel
from sagemaker.serializers import JSONSerializer
from sagemaker.estimator import Estimator

training_job_name = "hyenaDNA-pretraining-2024-04-06-06-23-26-412"
attached_estimator = Estimator.attach(training_job_name)

model_data = attached_estimator.model_data
model_data


2024-04-06 17:23:11 Starting - Starting the training job
2024-04-06 17:23:11 Pending - Preparing the instances for training
2024-04-06 17:23:11 Downloading - Downloading the training image
2024-04-06 17:23:11 Training - Training image download completed. Training in progress.
2024-04-06 17:23:11 Uploading - Uploading generated training model
2024-04-06 17:23:11 Completed - Instances not retained as a result of warmpool resource limits being exceeded


's3://sagemaker-us-east-1-111918798052/hyenaDNA-pretraining-2024-04-06-06-23-26-412/output/model.tar.gz'

In [57]:
MODEL_ID

'LongSafari/hyenadna-small-32k-seqlen-hf'

In [131]:
# Deploy the model to create a real-time endpoint
endpoint_name = 'hyenaDNA-mouse-pretrained-ep'  

hyenaDNAModel = PyTorchModel(
    model_data=model_data,
    role=SAGEMAKER_EXECUTION_ROLE,
    framework_version="2.1",
    py_version="py310",
    entry_point="inference.py",
    source_dir="scripts/",
    sagemaker_session=sagemaker_session,
    name=endpoint_name,
    env={
        'SAGEMAKER_MODEL_SERVER_TIMEOUT':'3600', 
        'TS_MAX_RESPONSE_SIZE':'2000000000',
        'TS_MAX_REQUEST_SIZE':'2000000000',
        'MMS_MAX_RESPONSE_SIZE':'2000000000',
        'MMS_MAX_REQUEST_SIZE':'2000000000'
    }
)



In [None]:
predictor = hyenaDNAModel.deploy(
    initial_instance_count=1,
    instance_type='ml.g5.2xlarge',
    env={
        'SAGEMAKER_MODEL_SERVER_TIMEOUT':'3600', 
        'TS_MAX_RESPONSE_SIZE':'2000000000',
        'TS_MAX_REQUEST_SIZE':'2000000000',
        'MMS_MAX_RESPONSE_SIZE':'2000000000',
        'MMS_MAX_REQUEST_SIZE':'2000000000'
    }
)

In [132]:
from sagemaker.async_inference.async_inference_config import AsyncInferenceConfig
from sagemaker.s3 import s3_path_join
from sagemaker.utils import name_from_base

async_endpoint_name = "hyenaDNA-mouse-pretrained-aync-ep"

# create async endpoint configuration
async_config = AsyncInferenceConfig(
    output_path= s3_path_join(
        "s3://", S3_BUCKET, "async_inference/output"
    ),  # Where our results will be stored
    # Add nofitication SNS if needed
    notification_config={
        # "SuccessTopic": "PUT YOUR SUCCESS SNS TOPIC ARN",
        # "ErrorTopic": "PUT YOUR ERROR SNS TOPIC ARN",
    },  #  Notification configuration
)

In [128]:
env = {"MODEL_SERVER_WORKERS": "1"}

# deploy the endpoint endpoint
async_predictor = hyenaDNAModel.deploy(
    initial_instance_count=1,
    instance_type="ml.g5.2xlarge",
    async_inference_config=async_config,
    endpoint_name=async_endpoint_name,
    env=env,
)

INFO:sagemaker:Repacking model artifact (s3://sagemaker-us-east-1-111918798052/hyenaDNA-pretraining-2024-04-06-06-23-26-412/output/model.tar.gz), script artifact (scripts/), and dependencies ([]) into single tar.gz file located at s3://sagemaker-us-east-1-111918798052/hyenaDNA-mouse-pretrained-ep/model.tar.gz. This may take some time depending on model size...
INFO:sagemaker:Creating model with name: hyenaDNA-mouse-pretrained-ep
INFO:sagemaker:Creating endpoint-config with name hyenaDNA-mouse-pretrained-aync-ep
INFO:sagemaker:Creating endpoint with name hyenaDNA-mouse-pretrained-aync-ep


---------!

In [118]:
s3_client = boto3.client('s3')
s3_client.upload_file("./sample_mouse_data.json", S3_BUCKET, "async_inference/input/sample_mouse_data.json")


In [129]:
import json
from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer

# Assuming async_predictor is your deployed predictor object
async_predictor.serializer = JSONSerializer()
async_predictor.deserializer = JSONDeserializer()

In [130]:
response = async_predictor.predict_async([sample_genome_data[0]])


In [119]:
data = {
    "s3://", S3_BUCKET, "async_inference/input/sample_mouse_data.json"
    # "language": "pl"
}

res = async_predictor.predict_async(data=data)
print(res)

<sagemaker.async_inference.async_inference_response.AsyncInferenceResponse object at 0x7f7fbc3bdab0>


In [120]:
# Since it is async inference, get_results is looking for the output_path
# If the inference completed, you'll get the results from the output path. Otherwise, you'll get error that the output_path file doesn't exist
res.get_result()

ValueError: NumpyDeserializer cannot read content type binary/octet-stream.

In [114]:
sample_genome_data = []
with open("./sample_mouse_data.json", 'r') as file:
    for line in file:
        sample_genome_data.append(json.loads(line))
len(sample_genome_data)

10

In [115]:
predictor.serializer = sagemaker.serializers.JSONSerializer()
predictor.deserializer = sagemaker.deserializers.JSONDeserializer()

prediction = predictor.predict([sample_genome_data[0]])



ModelError: An error occurred (ModelError) when calling the InvokeEndpoint operation: Received server error (500) from primary with message "{
  "code": 500,
  "type": "InternalServerException",
  "message": "Worker died."
}
". See https://us-east-1.console.aws.amazon.com/cloudwatch/home?region=us-east-1#logEventViewer:group=/aws/sagemaker/Endpoints/hyenaDNA-mouse-pretrained-ep-2024-04-09-17-38-25-169 in account 111918798052 for more information.

In [16]:
from sagemaker.serve.builder.model_builder import ModelBuilder
from sagemaker.serve.builder.schema_builder import SchemaBuilder

In [28]:
sample_genome_data = []
with open("./sample_mouse_data.json", 'r') as file:
    for line in file:
        sample_genome_data.append(json.loads(line))
len(sample_genome_data)

10

In [31]:
len(sample_genome_data[0])

31999

In [32]:
input = sample_genome_data[0]
output = sample_genome_data[0]
schema = SchemaBuilder(input, output)

In [44]:
import torch

from sagemaker.serve import InferenceSpec
from transformers import AutoModelForCausalLM

# custom inference spec
class HyenaDNAInfSpec(InferenceSpec):
    def invoke(self, input_object: object, model: object):       
        with torch.no_grad():
            output = model(input_object)
        return output
        
    def load(self, model_dir: str):
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID, 
            torch_dtype=torch.bfloat16, 
            device_map="auto", 
            trust_remote_code=True
        )
        checkpoint = torch.load(model_dir + '/checkpoint.pt', map_location=torch.device('cpu'))
        model.load_state_dict(checkpoint['state_dict'])
        model.eval()
        return model

spec = HyenaDNAInfSpec()

In [49]:
from sagemaker.serve.mode.function_pointers import Mode
model_builder = ModelBuilder(
    mode=Mode.SAGEMAKER_ENDPOINT,  # you can change it to Mode.LOCAL_CONTAINER for local testing
    model_path=model_data,
    inference_spec=spec,
    schema_builder=schema,
    role_arn= SAGEMAKER_EXECUTION_ROLE
)

In [50]:
model = model_builder.build()




FileNotFoundError: [Errno 2] No such file or directory: 's3://sagemaker-us-east-1-111918798052/hyenaDNA-pretraining-2024-04-06-06-23-26-412/output/model.tar.gz/checkpoint.pt'