# PyTorch Hyperparameter tuning with Ray Tune

***

## Prerequisites

In [None]:
# Install (and upgrade, via --upgrade) the packages listed in ./scripts/requirements.txt.
# %pip is used rather than !pip so the install targets the running kernel's environment.
%pip install -r ./scripts/requirements.txt --upgrade

In [None]:
# Copy the Ray launcher script from the scripts folder three levels up into ./scripts.
# The training job packages ./scripts as its source directory and runs launcher.py from it,
# so the launcher has to live alongside train.py.
%cp ../../../scripts/launcher.py ./scripts/

***

# Step 1 - Import Modules

Here we’ll import some libraries and define some variables.

In [None]:
import os

# os.environ["AWS_PROFILE"] = "<aws_profile>"

In [None]:
import boto3
import sagemaker

In [None]:
# Low-level boto3 clients for the SageMaker and S3 APIs, created in that order.
sagemaker_client, s3_client = (
    boto3.client(service_name) for service_name in ("sagemaker", "s3")
)

Create a SageMaker session, then store the default region and the execution role in Python variables.

In [None]:
# Create the SageMaker session once and derive both the region and the execution role
# from it, so every later lookup (e.g. the container image URI) goes through the same
# underlying boto3 session instead of a separately constructed one.
sagemaker_session = sagemaker.Session()
region = sagemaker_session.boto_session.region_name
role = sagemaker.get_execution_role(sagemaker_session=sagemaker_session)

In [None]:
# Name of the session's default S3 bucket; used in Step 2 to build the job's S3 output path.
bucket_name = sagemaker_session.default_bucket()

***

# Step 2 - Run the job

In [None]:
# Display the training script with syntax highlighting. This is the file handed to the
# launcher through the --entrypoint argument further down.
! pygmentize ./scripts/train.py

In [None]:
import sagemaker
from sagemaker.config import load_sagemaker_config

In [None]:
# `sagemaker_session` and `bucket_name` are already defined in Step 1; reuse them here
# instead of constructing a second session and looking the bucket up again.
# Optional key prefix configured for the default bucket (falsy when none is set).
default_prefix = sagemaker_session.default_bucket_prefix
# SageMaker SDK defaults configuration, loaded from its standard locations.
configs = load_sagemaker_config()

In [None]:
# Training instance settings. Override the instance type if you want to get a
# different container version (the image lookup below is keyed on it).
instance_type, instance_count = "ml.m5.2xlarge", 1

instance_type

In [None]:
# Parameters identifying the PyTorch training container to look up:
# framework and version, the session's region, and the chosen instance type.
image_lookup = {
    "framework": "pytorch",
    "region": sagemaker_session.boto_session.region_name,
    "version": "2.7.1",
    "instance_type": instance_type,
    "image_scope": "training",
}
image_uri = sagemaker.image_uris.retrieve(**image_lookup)

image_uri

In [None]:
from sagemaker.modules.configs import (
    Compute,
    OutputDataConfig,
    RemoteDebugConfig,
    SourceCode,
    StoppingCondition,
)
from sagemaker.modules.train import ModelTrainer

# Command-line options forwarded to the launcher, keyed by flag name (without the
# leading "--"). Values are strings because they are joined into a command line.
cli_options = {
    "entrypoint": "train.py",
    "l1": "64",
    "l2": "32",
    "lr": "0.01",
    "batch_size": "16",
    "max_epochs": "10",
}

# Flatten into ["--flag", "value", ...], preserving the declaration order above.
args = [
    token
    for flag, value in cli_options.items()
    for token in (f"--{flag}", value)
]

# Command run inside the training container: the launcher followed by the
# space-separated argument list built above.
launch_command = "python launcher.py " + " ".join(args)

# Source bundle for the job: the ./scripts directory, its requirements file,
# and the command to execute.
source_code = SourceCode(
    command=launch_command,
    requirements="requirements.txt",
    source_dir="./scripts",
)

# Define the compute: instance type and count come from the settings chosen earlier.
compute_configs = Compute(
    instance_type=instance_type,
    instance_count=instance_count,
    # NOTE(review): 0 presumably means instances are not kept warm after the job ends — confirm.
    keep_alive_period_in_seconds=0,
)

# Base name for the training job; also the last segment of the S3 output path.
job_name = "train-ray-tune"

# S3 location for job output: insert the default bucket prefix only when one is set.
output_key = f"{default_prefix}/{job_name}" if default_prefix else job_name
output_path = f"s3://{bucket_name}/{output_key}"

# Upper bound on job runtime: 18000 seconds (5 hours).
runtime_limit = StoppingCondition(max_runtime_in_seconds=18000)

# Write job output to the S3 path computed above, without compression.
output_config = OutputDataConfig(
    s3_output_path=output_path,
    compression_type="NONE",
)

# Environment variables passed to the training container.
# NOTE(review): <PROMETHEUS_HOST> and <GRAFANA_HOST> look like placeholders that must be
# replaced with real endpoints before launching — confirm.
ray_monitoring_env = {
    "RAY_PROMETHEUS_HOST": "<PROMETHEUS_HOST>",
    "RAY_GRAFANA_HOST": "<GRAFANA_HOST>",
    "RAY_PROMETHEUS_NAME": "prometheus",
}

# Assemble the trainer from the pieces defined above...
base_trainer = ModelTrainer(
    training_image=image_uri,
    source_code=source_code,
    base_job_name=job_name,
    compute=compute_configs,
    stopping_condition=runtime_limit,
    output_data_config=output_config,
    environment=ray_monitoring_env,
    role=role,
)

# ...then enable remote debugging on it.
model_trainer = base_trainer.with_remote_debug_config(
    RemoteDebugConfig(enable_remote_debug=True)
)

In [None]:
# Launch the training job. No input data is passed to train() here, and wait=False is set
# so the call is expected to return without waiting for the job to finish.
model_trainer.train(wait=False)