# ModelTrainer Demo - Simple Script Mode

### Prerequisites

In [None]:
!pip uninstall sagemaker-core -y

Make sure your SageMaker version is updated.

In [None]:
!pip install sagemaker ../../dist/sagemaker_core-1.0.1.tar.gz

### Imports

In [None]:
import sagemaker
import pandas as pd
import os
import boto3
import math
import numpy as np
from sklearn.preprocessing import StandardScaler
from generate_synthetic_housing_data import *

### Parameters

In [None]:
# Shared SageMaker handles used by the rest of the notebook:
# the session, its default bucket, and the execution role.
sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()

# Local directories for the train/test splits, created under the current
# working directory if they do not exist yet.
working_dir = os.getcwd()
local_train_dir = os.path.join(working_dir, "data/train")
local_test_dir = os.path.join(working_dir, "data/test")
for split_dir in (local_train_dir, local_test_dir):
    os.makedirs(split_dir, exist_ok=True)


### Prepare Synthetic Housing Data

For all the examples below, we'll be generating a synthetic housing dataset.

In [None]:
# Generate the synthetic housing dataset.
df = generate_houses(1506)

# Feature columns: every column except the last one.
# NOTE(review): assumes the last column of df is the PRICE target — confirm
# against generate_houses.
train_cols = list(df.columns)[:-1]

# Split data: the first 80% of rows are used for training, the rest for testing
# (rows are taken in order, no shuffling).
training_index = math.floor(0.8 * df.shape[0])
x_train, y_train = df[train_cols][:training_index], df.PRICE[:training_index]
x_test, y_test = df[train_cols][training_index:], df.PRICE[training_index:]

# Scale price
y_train = y_train / 100000
y_test = y_test / 100000

# Standardize features. The scaler is fitted on the training split only and the
# same mean/variance is then applied to the test split; fitting a second scaler
# on the test data would put the two splits on different scales and leak test
# statistics into evaluation.
scaler = StandardScaler()
x_train_np = scaler.fit_transform(x_train)
x_test_np = scaler.transform(x_test)

In [None]:
# Preview the first rows of the training features (the DataFrame slice, not the
# standardized x_train_np array).
x_train.head()

Rearrange the dataframes for SageMaker training, with the scaled price as the first column.

In [None]:
# Build the tabular frames: standardized features with the PRICE target as the
# first column.
#
# y_train / y_test were already divided by 100000 when the data was split, so
# the price is NOT divided again here; a second division would make these frames
# disagree with the y_*.npy targets by a factor of 1e5.
#
# The target index is reset so it lines up positionally with the fresh
# 0..n-1 index of the frames built from the NumPy arrays.
train_df = pd.DataFrame(data=x_train_np, columns=x_train.columns)
train_df.insert(0, "PRICE", y_train.reset_index(drop=True))

test_df = pd.DataFrame(data=x_test_np, columns=x_test.columns)
test_df.insert(0, "PRICE", y_test.reset_index(drop=True))

Save as both CSV and Numpy data types to demonstrate data type flexibility in model training.

In [None]:
# Write the features and targets of each split as .npy files into that split's
# local directory.
train_arrays = {"x_train.npy": x_train_np, "y_train.npy": y_train}
for file_name, array in train_arrays.items():
    np.save(os.path.join(local_train_dir, file_name), array)

print(f"Saved training data to local directory: {local_train_dir}")


test_arrays = {"x_test.npy": x_test_np, "y_test.npy": y_test}
for file_name, array in test_arrays.items():
    np.save(os.path.join(local_test_dir, file_name), array)

print(f"Saved test data to local directory: {local_test_dir}")

# PyTorch - Simple Script Mode Example

## Estimator - Cognitive Overload

Today, a new user who wants to use the Estimator class faces a steep learning curve before they can interact with and create AWS infrastructure. The Estimator class `__init__()` accepts 49 parameters. Data scientists want to be able to run their training jobs in the cloud with minimal effort.

https://sagemaker.readthedocs.io/en/stable/api/training/estimators.html

* 49 parameters
* It can be confusing to know which parameters go together conceptually if the user is new to working in the cloud/AWS
* Estimator does not do much to provide the user with reasonable defaults
   * i.e., instance_count is required, role is required, and a prior step of uploading data to S3 is required


In [None]:
## Upload data to S3
# Each local split directory is uploaded under its own key prefix in the bucket.
numpy_train_s3_uri = sess.upload_data(
    path=local_train_dir,
    bucket=bucket,
    key_prefix="data/synthetic/housing/train",
)
numpy_test_s3_uri = sess.upload_data(
    path=local_test_dir,
    bucket=bucket,
    key_prefix="data/synthetic/housing/test",
)

In [None]:
from sagemaker.pytorch import PyTorch

# Instance type for the job and the hyperparameters passed to the Estimator.
train_instance_type = "ml.c5.xlarge"
hyperparameters = {
    "epochs": 25,
    "batch_size": 128,
    "learning_rate": 0.01,
}

# Training with PyTorch is coupled to the PyTorch Estimator
# (i.e. a framework-to-framework-estimator mapping).
estimator = PyTorch(
    base_job_name="pytorch-model",
    role=role,
    # Local directory where the training script is located, and the script itself
    source_dir="pytorch-scripts",
    entry_point="train_pytorch.py",
    framework_version="1.13.1",
    py_version="py39",
    instance_count=1,
    instance_type=train_instance_type,
    hyperparameters=hyperparameters,
)


In [None]:
# Estimator requires inputs to be S3 URIs, one per channel.
inputs = {
    "train": numpy_train_s3_uri,
    "test": numpy_test_s3_uri,
}

estimator.fit(inputs)

## ModelTrainer - Cognitive Overload

Our Redesign will tackle this problem essentially with “syntactic sugar” by simplifying parameters, grouping parameters together into constructs that are easy to understand conceptually, and providing reasonable defaults for certain optional parameters.

* 13 parameters
* Model Trainer is smart enough to set some defaults for user
* Conceptual grouping with data class
* Decouple image retrieval from Framework Estimators (instead of having N Estimators, 1 ImageSpec class that can be used for all sagemaker images)
* Leverage data classes to group parameters that go together (ie, ResourceConfig, StoppingCondition, OutputDataConfig)


##### Example 1 - Model Trainer provides reasonable defaults


In [None]:
import sys

# Add the parent directory to the import path
# (presumably where the model_trainer package lives — verify for your checkout).
sys.path.append("..")

from sagemaker_core.shapes import ResourceConfig
from model_trainer.ModelTrainer import ModelTrainer, ImageSpec

# Describe the training image by framework, version, and Python version.
image = ImageSpec(
    framework_name="pytorch",
    image_scope="training",
    version="1.13.1",
    py_version="py39",
)

# Only the image is supplied: Model Trainer can pick reasonable defaults for
# Role, StoppingCondition, ResourceConfig, and OutputDataConfig.
model_trainer = ModelTrainer(training_image=image)

In [None]:
from model_trainer.ModelTrainer import SourceCodeConfig

# Local data directories are passed directly as the input channels.
data_inputs = {
    "train": local_train_dir,
    "test": local_test_dir,
}
# Hyperparameter values are given as strings here.
hyper_parameters = {
    "epochs": "25",
    "batch_size": "128",
    "learning_rate": "0.01",
}

# The training script and the local directory that contains it.
source_code_config = SourceCodeConfig(
    source_dir="pytorch-scripts",
    training_script="train_pytorch.py",
)

# Run with code related setup
model_training_job = model_trainer.run(
    source_code_config=source_code_config,
    hyper_parameters=hyper_parameters,
    inputs=data_inputs,
)


##### Example 2 - user provides all the configurations


In [None]:
from model_trainer.ModelTrainer import ModelTrainer, ImageSpec
from sagemaker_core.shapes import ResourceConfig, StoppingCondition, OutputDataConfig
from sagemaker.session import Session
from sagemaker import get_execution_role

# Session, job base name, and role used for this fully specified example.
session = Session()
base_name = "pytorch-model"
role = get_execution_role()
instance_type = "ml.c5.xlarge"

# Training image, resolved for the chosen instance type.
image = ImageSpec(
    framework_name="pytorch",
    image_scope="training",
    version="1.13.1",
    py_version="py39",
    instance_type=instance_type,
)

# Compute resources: a single instance with a 30 GB volume.
resource_config = ResourceConfig(
    instance_type=instance_type,
    instance_count=1,
    volume_size_in_gb=30,
)

# Maximum runtime of 86400 seconds (24 hours).
stopping_condition = StoppingCondition(max_runtime_in_seconds=86400)

# Output goes under <default bucket>/<base_name>/output/, with compression type "NONE".
output_data_config = OutputDataConfig(
    compression_type="NONE",
    s3_output_path=f"s3://{session.default_bucket()}/{base_name}/output/",
)

# Every configuration is provided explicitly instead of relying on defaults.
model_trainer = ModelTrainer(
    base_name=base_name,
    role=role,
    training_image=image,
    resource_config=resource_config,
    stopping_condition=stopping_condition,
    output_data_config=output_data_config,
)

In [None]:
from model_trainer.ModelTrainer import SourceCodeConfig

# Local data directories are passed directly as the input channels.
inputs = {
    "train": local_train_dir,
    "test": local_test_dir,
}
# Hyperparameter values are given as strings here.
hyper_parameters = {
    "epochs": "25",
    "batch_size": "128",
    "learning_rate": "0.01",
}

# This example supplies an explicit command line for the script
# instead of a training_script name.
source_code_config = SourceCodeConfig(
    source_dir="pytorch-scripts",
    command="python /opt/ml/input/data/code/train_pytorch.py",
)

# Run with code related setup
model_training_job = model_trainer.run(
    source_code_config=source_code_config,
    hyper_parameters=hyper_parameters,
    inputs=inputs,
)

## Estimator - Local to Remote Transition

Transitioning from local to remote in current interfaces like Estimator is not easily discoverable and requires creating new object instances to enable one or the other (i.e., you need an `Estimator(...instance_type="local")` vs. an `Estimator(...instance_type="ml.m5.xlarge")`). Additionally, users can struggle to set up a remote environment that mimics their local one. This requires the user to create a Docker image locally with the appropriate dependencies, push it to a remote ECR repository, and use the image URI when creating an `Estimator` with remote execution.


* Must create new Estimator to transition between local and remote
* hidden discoverability
* Increase overhead for setting up remote runs with dependencies users have installed locally
   * https://github.com/aruncs2005/llama2-fine-tuning-sagemaker/blob/main/2.%20Fine%20tune%20with%20FSDP.ipynb


##### Estimator Local Run

In [None]:
# Turn the local data directories into file:// URIs for the local-mode Estimator run.
numpy_train_path = f"file://{local_train_dir}"
numpy_test_path = f"file://{local_test_dir}"

In [None]:
from sagemaker.pytorch import PyTorch

# Local run: instance_type is "local" and the input channels are the file://
# paths built from the local data directories rather than S3 URIs.
train_instance_type = "local"
hyperparameters = {
    "epochs": 25,
    "batch_size": 128,
    "learning_rate": 0.01,
}
inputs = {
    "train": numpy_train_path,
    "test": numpy_test_path,
}

# Training with PyTorch is coupled to the PyTorch Estimator
# (i.e. a framework-to-framework-estimator mapping).
estimator = PyTorch(
    base_job_name="pytorch-model",
    role=role,
    # Local directory where the training script is located, and the script itself
    source_dir="pytorch-scripts",
    entry_point="train_pytorch.py",
    framework_version="1.13.1",
    py_version="py39",
    instance_count=1,
    instance_type=train_instance_type,
    hyperparameters=hyperparameters,
)

estimator.fit(inputs)

##### Estimator Remote Run

In [None]:
## Upload data to S3
# The remote run reads from S3, so each local split directory is uploaded
# under its own key prefix.
numpy_train_s3_uri = sess.upload_data(
    bucket=bucket,
    key_prefix="data/synthetic/housing/train",
    path=local_train_dir,
)
numpy_test_s3_uri = sess.upload_data(
    bucket=bucket,
    key_prefix="data/synthetic/housing/test",
    path=local_test_dir,
)

In [None]:
from sagemaker.pytorch import PyTorch

# Remote run: a new Estimator is built with an ml.* instance type.
train_instance_type = "ml.c5.xlarge"
hyperparameters = {
    "epochs": 25,
    "batch_size": 128,
    "learning_rate": 0.01,
}
# Estimator requires inputs to be S3 URIs, one per channel.
inputs = {
    "train": numpy_train_s3_uri,
    "test": numpy_test_s3_uri,
}

# Training with PyTorch is coupled to the PyTorch Estimator
# (i.e. a framework-to-framework-estimator mapping).
estimator = PyTorch(
    base_job_name="pytorch-model",
    role=role,
    # Local directory where the training script is located, and the script itself
    source_dir="pytorch-scripts",
    entry_point="train_pytorch.py",
    framework_version="1.13.1",
    py_version="py39",
    instance_count=1,
    instance_type=train_instance_type,
    hyperparameters=hyperparameters,
)
estimator.fit(inputs)


## ModelTrainer - Local to Remote Transition

Instead of defining local vs. remote at the object level, our redesign will do it at the execution-method level, so a user can easily re-use a trainer object and transition from local to remote with a more explicit enum value (i.e., `trainer.run(training_run_mode=TrainingRunMode.LOCAL)` vs. `trainer.run(training_run_mode=TrainingRunMode.REMOTE)`). To help with creating an ECR image that matches their local setup, we will also introduce a `local_snapshot()` utility method that will create a Docker image based on the current Python version and the packages installed in the local environment, and upload it to the user's private ECR.

* `TrainingRunMode.LOCAL` & `TrainingRunMode.REMOTE`
* `local_snapshot(ecr_upload=True)`

In [None]:
import sys

# Add the parent directory to the import path
# (presumably where the model_trainer package lives — verify for your checkout).
sys.path.append("..")

from sagemaker_core.shapes import ResourceConfig
from model_trainer.ModelTrainer import ModelTrainer, ImageSpec


# Framework training image, resolved for the chosen instance type.
instance_type = "ml.c5.xlarge"
framework_image = ImageSpec(
    framework_name="pytorch",
    image_scope="training",
    version="1.13.1",
    py_version="py39",
    instance_type=instance_type,
)


# Model Trainer can pick reasonable defaults - Role, StoppingCondition, and OutputDataConfig
model_trainer = ModelTrainer(training_image=framework_image)

In [None]:
from model_trainer.ModelTrainer import SourceCodeConfig

# Hyperparameter values are given as strings; the local data directories are
# used as the "train"/"test" input channels.
hyper_parameters = {"epochs": "25", "batch_size": "128", "learning_rate": "0.01"}
inputs = {"train": local_train_dir, "test": local_test_dir}

# The training script and the local directory that contains it.
# The keyword is `training_script`, matching the first ModelTrainer example;
# the previous spelling `training_srcipt` was a typo.
source_code_config = SourceCodeConfig(
    training_script="train_pytorch.py",
    source_dir="pytorch-scripts",
)

##### ModelTrainer Local Run

In [None]:
from model_trainer.ModelTrainer import TrainingRunMode

# Run with code related setup, using the LOCAL run mode on the existing trainer object.
model_training_job = model_trainer.run(
    training_run_mode=TrainingRunMode.LOCAL,
    source_code_config=source_code_config,
    hyper_parameters=hyper_parameters,
    inputs=inputs,
)

##### ModelTrainer Remote Run

In [None]:
from model_trainer.ModelTrainer import TrainingRunMode

# Run with code related setup, using the REMOTE run mode; only the mode
# argument differs from the local run.
model_training_job = model_trainer.run(
    training_run_mode=TrainingRunMode.REMOTE,
    source_code_config=source_code_config,
    hyper_parameters=hyper_parameters,
    inputs=inputs,
)

#### ModelTrainer Local Snapshot

The redesign can contain utility methods like `local_snapshot` that help the user transition from local to remote jobs more easily.

In [None]:
import sys

# Add the parent directory to the import path
# (presumably where the model_trainer package lives — verify for your checkout).
sys.path.append("..")

from sagemaker_core.shapes import ResourceConfig
from model_trainer.ModelTrainer import ModelTrainer, ImageSpec, local_snapshot


# Compute resources: a single instance with a 30 GB volume.
instance_type = "ml.c5.xlarge"
resource_config = ResourceConfig(
    instance_type=instance_type,
    instance_count=1,
    volume_size_in_gb=30,
)

# Creates an image, uploads it to ECR, and returns the image URI.
image = local_snapshot(ecr_upload=True)

# Model Trainer can pick reasonable defaults - Role, StoppingCondition, and OutputDataConfig
model_trainer = ModelTrainer(
    resource_config=resource_config,
    training_image=image,
)