### 0. Install dependencies

In [1]:
%pip install -q --upgrade pip
%pip install -q --upgrade sagemaker boto3 awscli boto3 ipywidgets

Note: you may need to restart the kernel to use updated packages.
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
aiobotocore 2.7.0 requires botocore<1.31.65,>=1.31.16, but you have botocore 1.34.77 which is incompatible.
autovizwidget 0.21.0 requires pandas<2.0.0,>=0.20.1, but you have pandas 2.1.2 which is incompatible.
hdijupyterutils 0.21.0 requires pandas<2.0.0,>=0.17.1, but you have pandas 2.1.2 which is incompatible.
sparkmagic 0.21.0 requires pandas<2.0.0,>=0.17.1, but you have pandas 2.1.2 which is incompatible.[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
import boto3
import os
import sagemaker
from sagemaker.experiments.run import Run
from sagemaker.inputs import TrainingInput
from sagemaker.pytorch import PyTorch
from time import strftime

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [7]:
# Name under which the training run is grouped in SageMaker Experiments.
EXPERIMENT_NAME = "hyenaDNA-pretraining"

# Build the SageMaker session on top of an explicit boto3 session so both
# share the same credentials and region.
boto_session = boto3.Session()
sagemaker_session = sagemaker.Session(boto_session=boto_session)

# Values derived from the session and reused by later cells.
S3_BUCKET = sagemaker_session.default_bucket()
REGION_NAME = sagemaker_session.boto_region_name

# IAM role the training job will run under.
SAGEMAKER_EXECUTION_ROLE = sagemaker.get_execution_role(
    sagemaker_session=sagemaker_session
)
print(f"Assumed SageMaker role is {SAGEMAKER_EXECUTION_ROLE}")

Assumed SageMaker role is arn:aws:iam::111918798052:role/DevelopmentRole


### 1. Read the data from AWS HealthOmics

In [27]:
# S3 prefix that is passed to the training job as its "training" input channel.
# NOTE(review): this is a hard-coded bucket name; anyone running the notebook
# in another account presumably needs to point this at their own copy of the
# data — confirm and consider deriving it from S3_BUCKET.
data_uri = "s3://shamika-hcls/datasources/genomics-data/species"
data_uri

's3://shamika-hcls/datasources/genomics-data/species'

### 2. Training



#### 2.1 Define the training container

In [12]:
# ECR URI of the PyTorch training container, resolved for the session's region
# (REGION_NAME). The tag pins PyTorch 2.2.0, GPU build, Python 3.10, CUDA 12.1,
# Ubuntu 20.04, SageMaker variant.
pytorch_image_uri = f"763104351884.dkr.ecr.{REGION_NAME}.amazonaws.com/pytorch-training:2.2.0-gpu-py310-cu121-ubuntu20.04-sagemaker"
pytorch_image_uri

'763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-training:2.2.0-gpu-py310-cu121-ubuntu20.04-sagemaker'

#### 2.2 Define the training job parameters

In [28]:
# Hugging Face Hub identifier of the checkpoint handed to the training script.
MODEL_ID = 'LongSafari/hyenadna-small-32k-seqlen-hf'

# Hyperparameters forwarded to the training entry point. Key order is kept
# stable so the job definition is reproducible.
hyperparameters = dict(
    # Training schedule and model
    epochs=10,
    model_checkpoint=MODEL_ID,
    max_length=32_000,
    batch_size=8,
    logging_steps=2,
    # Optimizer settings
    learning_rate=6e-4,
    weight_decay=0.1,
    # Logging controls
    log_level="INFO",
    log_interval=100,
)


#### 2.3 Define Metrics to track


In [29]:
# Metric definitions handed to the estimator: each entry pairs a metric name
# with a regex whose single capture group extracts the value from a log line.
# The log-line prefixes ("Training Loss: ", ...) must match what the training
# script prints — presumably scripts/train_hf.py; verify against that script.
#
# The float capture class includes "+" as well as "-" so values printed in
# scientific notation with a positive exponent (e.g. "1.20e+03") are captured
# in full instead of being truncated at the "e".
metric_definitions = [
    {"Name": "epoch", "Regex": "Epoch: ([0-9.]*)"},
    {"Name": "step", "Regex": "Step: ([0-9.]*)"},
    {"Name": "train_loss", "Regex": "Training Loss: ([0-9.e+-]*)"},
    {"Name": "train_perplexity", "Regex": "Training Perplexity: ([0-9.e+-]*)"},
    {
        "Name": "train_samples_per_second",
        "Regex": "Training Samples/sec: ([0-9.e+-]*)",
    },
    {
        "Name": "train_tokens_per_second",
        "Regex": "Training Tokens/sec: ([0-9.e+-]*)",
    },
    {"Name": "eval_loss", "Regex": "Eval Loss: ([0-9.e+-]*)"},
    {"Name": "eval_perplexity", "Regex": "Eval Perplexity: ([0-9.e+-]*)"},
    {
        "Name": "eval_samples_per_second",
        "Regex": "Eval Samples/sec: ([0-9.e+-]*)",
    },
    {"Name": "eval_tokens_per_second", "Regex": "Eval Tokens/sec: ([0-9.e+-]*)"},
]

#### 2.4 Define Estimator

In [30]:
# Estimator describing the HyenaDNA pre-training job: what code to run, in
# which container, on which hardware, and with which settings.
hyenaDNA_estimator = PyTorch(
    # --- Code to run -------------------------------------------------------
    source_dir="scripts/",
    entry_point="train_hf.py",
    hyperparameters=hyperparameters,
    # --- Container and identity --------------------------------------------
    image_uri=pytorch_image_uri,
    role=SAGEMAKER_EXECUTION_ROLE,
    sagemaker_session=sagemaker_session,
    # --- Compute -----------------------------------------------------------
    instance_type="ml.g5.8xlarge",
    instance_count=1,
    distribution={"torch_distributed": {"enabled": True}},
    keep_alive_period_in_seconds=1800,  # 1800 s = 30 minutes
    # --- Tracking and bookkeeping ------------------------------------------
    base_job_name="hyenaDNA-pretraining",
    metric_definitions=metric_definitions,
    # NOTE(review): the tag value says "esm-benchmarking" although this job
    # trains HyenaDNA — confirm this is the intended project tag.
    tags=[{"Key": "project", "Value": "esm-benchmarking"}],
)


#### 2.5 Start Training

In [None]:
# Launch the training job inside a SageMaker Experiments run and block until
# the job finishes (wait=True).
with Run(experiment_name=EXPERIMENT_NAME, sagemaker_session=sagemaker_session) as run:
    # Single input channel named "training", fed from data_uri in File mode.
    training_channels = {
        "training": TrainingInput(s3_data=data_uri, input_mode="File"),
    }
    hyenaDNA_estimator.fit(inputs=training_channels, wait=True)


INFO:sagemaker:Creating training-job with name: hyenaDNA-pretraining-2024-04-04-07-32-31-797


2024-04-04 07:32:32 Starting - Starting the training job...
2024-04-04 07:32:50 Pending - Training job waiting for capacity...
2024-04-04 07:33:05 Pending - Preparing the instances for training...
2024-04-04 07:33:50 Downloading - Downloading input data..