# ModelTrainer Demo - Simple Script Mode

### Prerequisites

In [None]:
!pip uninstall sagemaker-core -y

Make sure your SageMaker version is updated.

In [None]:
!pip install sagemaker ../../dist/sagemaker_core-1.0.1.tar.gz

### Imports

In [None]:
import sagemaker
import pandas as pd
import os
import boto3
import math
import numpy as np
from sklearn.preprocessing import StandardScaler
from generate_synthetic_housing_data import *

### Parameters

In [None]:
# SageMaker session, its default S3 bucket, and the execution role used by the cells below
sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()

# Local data paths, resolved against the current working directory
local_train_dir, local_test_dir = (
    os.path.join(os.getcwd(), subdir) for subdir in ("data/train", "data/test")
)

# Create both directories up front; exist_ok keeps this cell safe to re-run
for data_dir in (local_train_dir, local_test_dir):
    os.makedirs(data_dir, exist_ok=True)


### Prepare Synthetic Housing Data

For all the examples below, we'll be generating a synthetic housing dataset.

In [None]:
# Generate the synthetic housing dataset (1506 rows requested)
df = generate_houses(1506)

# Get training columns: every column except the last one
# (assumes the last column is the PRICE target - TODO confirm against generate_houses)
train_cols = list(df.columns)[:-1]

# Split data: first 80% of rows for training, the remainder for testing
training_index = math.floor(0.8 * df.shape[0])
x_train, y_train = df[train_cols][:training_index], df.PRICE[:training_index]
x_test, y_test = df[train_cols][training_index:], df.PRICE[training_index:]

# Scale price
y_train = y_train / 100000
y_test = y_test / 100000

# Standardize data.
# Fit the scaler on the training split only and reuse it for the test split, so both
# splits share the same mean/variance and no test-set statistics leak into preprocessing.
feature_scaler = StandardScaler().fit(x_train)
x_train_np = feature_scaler.transform(x_train)
x_test_np = feature_scaler.transform(x_test)

In [None]:
# Preview the first rows of the training features (the unscaled DataFrame, not x_train_np)
x_train.head()

Rearrange dataframe for SageMaker training and scale price.

In [None]:
# Build DataFrames from the standardized feature arrays, with PRICE as the first column.
# y_train / y_test were already divided by 100000 in the data-preparation cell, so they
# are used as-is here; dividing again would leave PRICE on a different scale than the
# targets saved to the .npy files.
train_df = pd.DataFrame(data=x_train_np, columns=x_train.columns)
# reset_index aligns the target with the fresh 0..n-1 index of the feature frame
train_df.insert(0, "PRICE", y_train.reset_index(drop=True))

test_df = pd.DataFrame(data=x_test_np, columns=x_test.columns)
# y_test keeps its original row labels from df, so the reset is required for alignment
test_df.insert(0, "PRICE", y_test.reset_index(drop=True))

Save as both CSV and Numpy data types to demonstrate data type flexibility in model training.

In [None]:
# Save as Numpy
# Training features and targets go to the local training directory
for file_name, array in (("x_train.npy", x_train_np), ("y_train.npy", y_train)):
    np.save(os.path.join(local_train_dir, file_name), array)

print(f"Saved training data to local directory: {local_train_dir}")


# Test features and targets go to the local test directory
for file_name, array in (("x_test.npy", x_test_np), ("y_test.npy", y_test)):
    np.save(os.path.join(local_test_dir, file_name), array)

print(f"Saved test data to local directory: {local_test_dir}")

# PyTorch - Simple Script Mode Example

## Estimator
https://sagemaker.readthedocs.io/en/stable/api/training/estimators.html

https://sagemaker.readthedocs.io/en/stable/api/training/estimators.html#sagemaker.estimator.EstimatorBase.fit

Pain Points:
1. Inputs require an S3 path, so the user needs an extra step to upload data to S3
2. Overwhelming parameters, some of which conflict (i.e., entry_point vs container_entrypoint, container_arguments)
    - Can be difficult for users unfamiliar with cloud, AWS, or infra setup concepts to understand what the parameters do
3. Reliance on Framework Estimators for image URI evaluation. Makes it difficult to keep parity - need to create a new estimator for each new framework, algo, etc.
4. Script mode is confusing to users, even experienced data scientists, because of its reliance on the training toolkit
    (users don't know it runs in the container or that the entry point is redirected, and debugging issues in the toolkit itself is difficult if they are not aware of it)

In [None]:
## Upload data to S3
# The Estimator below takes S3 URIs as inputs, so each local directory is uploaded
# to the session's default bucket under its own key prefix.
numpy_train_s3_uri = sess.upload_data(
    path=local_train_dir,
    bucket=bucket,
    key_prefix="data/synthetic/housing/train",
)
numpy_test_s3_uri = sess.upload_data(
    path=local_test_dir,
    bucket=bucket,
    key_prefix="data/synthetic/housing/test",
)

In [None]:
from sagemaker.pytorch import PyTorch

# Hyperparameters handed to the estimator
hyperparameters = {"epochs": 25, "batch_size": 128, "learning_rate": 0.01}

train_instance_type = "ml.c5.xlarge"

# Estimator requires inputs to be S3 uri
inputs = {"train": numpy_train_s3_uri, "test": numpy_test_s3_uri}

# Training with Pytorch coupled with Pytorch Estimator. (ie, framework to framework estimator mapping)
estimator = PyTorch(
    entry_point="train_pytorch.py",
    # Local directory where the training script is located
    source_dir="pytorch-scripts",
    instance_type=train_instance_type,
    instance_count=1,
    hyperparameters=hyperparameters,
    role=role,
    base_job_name="pytorch-model",
    framework_version="1.13.1",
    py_version="py39",
)

# Launch the training job against the uploaded train/test channels
estimator.fit(inputs)


## ModelTrainer
https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_CreateTrainingJob.html

For a TrainingJob the following are the required parameters:

Infra, CloudSetup
- OutputDataConfig
- ResourceConfig
- RoleArn
- StoppingCondition
- TrainingJobName


Train Execution Setup
- AlgorithmSpecification
- InputDataConfig - not required in API, but needed if your data is not in the training container



What is Different:
1. User can set a local dir as input; no separate step is needed to upload to S3
2. Use encapsulation and leverage SageMakerCore shape classes to group params that go together conceptually (and in the API)
3. Decouple from 1-to-1 Framework Estimators and leverage an ImageSpec class for 1-to-all image_uri evaluations
4. Remove reliance on the training toolkit for script mode by leveraging a SourceCodeConfig class (right now using some workaround for setting environment variables in the container, but just for demo)



```python
class ModelTrainer:

    # Container specs, Cloud, Infra Setup. ModelTrainer reasonably sets some defaults if not provided by user.
    def __init__(
            self,
            training_image: Optional[Union[str, ImageSpec]] = None, # Replaces framework estimators. User provides an ImageSpec or an image_uri
            training_input_mode: Optional[str] = None,
            algorithm_name: Optional[str] = None, # Name of an algorithm resource that user created or from AWS Marketplace
            environment: Optional[Dict[str, str]] = None,
            hyper_parameters: Optional[Dict[str, str]] = None,
            distribution: Optional[DistributionConfig] = None,
            resource_config: Optional[ResourceConfig] = None, # Leverage SageMakerCore shape classes for encapsulation 
            stopping_condition: Optional[StoppingCondition] = None,
            output_data_config: Optional[OutputDataConfig] = None,
            base_name: Optional[str] = None,
            role: Optional[str] = None,
            session: Optional[Session] = None,
        ):

    # Related to code/training execution. The idea is that a user may set up a ModelTrainer() once with their cloud infra setup
    # and then reuse it for the different training workflows they have set up locally
    def run(
            self,
            inputs: Union[Dict[str, str], Dict[str, S3DataSource], Dict[str, FileSystemDataSource]], # Similar to Estimator but we upload to S3 for user
            source_code_config: Optional[SourceCodeConfig] = None, # Abstraction for user provided source code and commands to execute on container startup
            hyper_parameters: Optional[Dict[str, str]] = None,
            environment: Optional[Dict[str, str]] = None,
            distribution: Optional[DistributionConfig] = None,
            training_run_mode: Optional[TrainingRunMode] = TrainingRunMode.REMOTE, # easily switch between LocalMode and Remote (TrainingRunMode.LOCAL)
            wait: bool = True,
            logs: bool = True,
    ) -> TrainingJob:


class SourceCodeConfig(Base):
    """
    SourceCodeConfig
    
    Attributes:
        command (str): The raw command to execute in the training job container (e.g., "python train.py <args>").
        source_dir (Union[str, S3DataSource]): The directory containing the source code to be used in the training job container. This can be a local directory path or an S3 URI.
            References to files in the source_dir in the container should be relative to the source_dir and in the format "/opt/ml/input/data/code/<path_to_file>".
    """
    command: str
    source_dir: Union[str, S3DataSource]


class TrainingRunMode(Enum):
    REMOTE = "Remote"
    LOCAL = "Local"
    
class ImageSpec():
    def __init__(
        self, 
        framework_name: str,
        version: str, 
        image_scope: Optional[str] = None, 
        instance_type: Optional[str] = None, 
        py_version: Optional[str] = None, 
        region: Optional[str] = "us-west-2",
        ...
    ):
        self.framework_name = framework_name
        self.image_scope = image_scope
        self.py_version = py_version
        self.version = version
        self.instance_type = instance_type
        self.region = region
        
    def get_image_uri(self):
        return image_uris.retrieve(
            framework=self.framework_name, 
            image_scope=self.image_scope,
            instance_type=self.instance_type,
            py_version=self.py_version,
            region=self.region, 
            version=self.version,
            ...
        )

```


In [None]:
import sys

# Put the parent directory on the import path (presumably where model_trainer lives)
sys.path.append("..")

from model_trainer.ModelTrainer import ModelTrainer, ImageSpec
from sagemaker_core.shapes import ResourceConfig


instance_type = "ml.c5.xlarge"

# Compute resources for the training job, grouped in a SageMakerCore shape class
resource_config = ResourceConfig(
    instance_type=instance_type,
    instance_count=1,
    volume_size_in_gb=30,
)

# Framework image description; same PyTorch version / Python version as the Estimator example
framework_image = ImageSpec(
    framework_name="pytorch",
    version="1.13.1",
    py_version="py39",
    image_scope="training",
    instance_type=instance_type,
)

In [None]:

# Example 1: Infra and Container spec setup
# Only the training image and resource config are supplied at construction time;
# inputs, hyperparameters and source code are passed later to run().
model_trainer = ModelTrainer(
    training_image=framework_image,
    resource_config=resource_config
)

In [None]:
from model_trainer.ModelTrainer import SourceCodeConfig

# Hyperparameter values are given as strings here (the Estimator example used numbers).
hyper_parameters = {"epochs": "25", "batch_size": "128", "learning_rate": "0.01"}
# Rebinds `inputs` (previously S3 URIs for the Estimator) to the local data directories.
inputs = {"train": local_train_dir, "test": local_test_dir}

# Command to execute in the training container plus the local directory holding the script.
source_code_config = SourceCodeConfig(
    command="python /opt/ml/input/data/code/train_pytorch.py",
    source_dir="pytorch-scripts",
)

In [None]:
# Run with code related setup
# The local input directories, hyperparameters and source code config are all supplied
# per run; the ModelTrainer itself only carries the image and resource configuration.
model_training_job = model_trainer.run(
    inputs=inputs,
    hyper_parameters=hyper_parameters,
    source_code_config=source_code_config
)


In [None]:
# Example 2: Setup infra, container and some code related setup like source_code_config and hyper_parameters
# This command resolves the script location through the SM_CHANNEL_CODE environment
# variable inside `sh -c`, instead of the hard-coded /opt/ml/input/data/code path used in Example 1.
source_code_config = SourceCodeConfig(
    command="sh -c 'python $SM_CHANNEL_CODE/train_pytorch.py'",
    source_dir="pytorch-scripts"
)

# NOTE(review): the ModelTrainer.__init__ signature sketched earlier in this notebook does not
# list a source_code_config parameter (only run() does) - confirm the constructor accepts it.
model_trainer = ModelTrainer(
    training_image=framework_image,
    resource_config=resource_config,
    hyper_parameters=hyper_parameters,
    source_code_config=source_code_config
)

In [None]:
# Run with minimal params
# Only the inputs are passed here; hyperparameters and source code config were given
# to the ModelTrainer constructor in the previous cell.
model_training_job = model_trainer.run(inputs=inputs)