In [42]:
import sagemaker
import boto3
import pandas as pd
import torch
from torchvision import datasets
from sagemaker.pytorch.estimator import PyTorch
import os

# Execution role handed to the estimator and model objects further down.
role = sagemaker.get_execution_role()
boto_session = boto3.Session()
sess = sagemaker.Session(boto_session)
region = boto_session.region_name

# S3 bucket for saving code and model artifacts.
# Feel free to specify a different bucket and prefix.
# The existing `sess` is reused instead of constructing a second, throwaway
# sagemaker.Session just to look up the default bucket.
bucket = sess.default_bucket()
prefix = 'sagemaker/MNIST_demo'
job_name = 'training-pytorch'

print(f'Bucket: {bucket}')
print(f'Prefix: {prefix}')
print(f'Region: {region}')
print(f'Job Name: {job_name}')

Bucket: sagemaker-us-east-1-424632853466
Prefix: sagemaker/MNIST_demo
Region: us-east-1
Job Name: training-pytorch


# Upload data to S3 bucket

In [43]:
# Fetch MNIST through torchvision into the local 'data' folder.
# The cell output is the repr of the returned dataset object.
datasets.MNIST(root='data', download=True)

Dataset MNIST
    Number of datapoints: 60000
    Root location: data
    Split: Train

In [44]:
# Push the locally downloaded MNIST files to S3; `train_data` holds the
# resulting S3 URI and is later used as the 'training' input channel.
# NOTE(review): assumes the installed torchvision writes data/MNIST/processed;
# newer releases may only keep data/MNIST/raw -- confirm before re-running.
train_data = sess.upload_data(
    path='data/MNIST/processed',
    bucket=bucket,
    key_prefix=prefix,
)
train_data

's3://sagemaker-us-east-1-424632853466/sagemaker/MNIST_demo'

In [45]:
# S3 URI passed to the estimator as `output_path` (parent folder for job outputs).
output_dir = f"s3://{bucket}/{prefix}/output_run"
output_dir

's3://sagemaker-us-east-1-424632853466/sagemaker/MNIST_demo/output_run'

In [46]:
# S3 URI passed to the estimator as `code_location`.
source_code_dir = f"s3://{bucket}/{prefix}/source_code"
source_code_dir

's3://sagemaker-us-east-1-424632853466/sagemaker/MNIST_demo/source_code'

# Launch Training job

In [48]:
# Hyperparameters forwarded to the entry-point script.
training_hyperparameters = {'epochs': 7, 'batch_size': 64, 'use_cuda': True}

# Estimator that runs train_sagemaker.py on a single ml.p2.xlarge instance.
pytorch_estimator = PyTorch(
    entry_point='train_sagemaker.py',
    source_dir='.',  # local folder to be packed and used in entry point
    role=role,
    instance_type='ml.p2.xlarge',
    instance_count=1,
    framework_version='1.8.0',  # 1.8.1 is not supported yet
    py_version='py3',
    output_path=output_dir,  # output parent folder in S3 to store everything generated
    code_location=source_code_dir,
    hyperparameters=training_hyperparameters,
)

# Only one data folder is used here, exposed as the 'training' channel.
# Pass job_name=job_name to fit() to override the auto-generated job name.
training_channels = {'training': train_data}
pytorch_estimator.fit(training_channels)

2021-04-03 11:42:22 Starting - Starting the training job...
2021-04-03 11:42:24 Starting - Launching requested ML instancesProfilerReport-1617450099: InProgress
......
2021-04-03 11:43:51 Starting - Preparing the instances for training...............
2021-04-03 11:46:21 Downloading - Downloading input data
2021-04-03 11:46:21 Training - Downloading the training image........................
2021-04-03 11:50:22 Training - Training image download completed. Training in progress.[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2021-04-03 11:50:09,873 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2021-04-03 11:50:09,897 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2021-04-03 11:50:12,919 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2021-04-03 11:50:14,428 sage

In [49]:
# Resolve the S3 URL of the trained model artifact. Prefer the estimator fitted
# in this kernel; otherwise fall back to an artifact from an earlier run.
try:
    trained_model_file = pytorch_estimator.model_data  # S3 URL of the trained model
except Exception as err:
    # `Exception` rather than a bare `except:` so Ctrl-C / SystemExit still
    # propagate; the reason for the fallback is reported instead of being
    # silently swallowed (e.g. NameError when the training cell was not run).
    print(f'Could not read model_data from the estimator ({err!r}); using a previously trained artifact.')
    trained_model_file = 's3://sagemaker-us-east-1-424632853466/sagemaker/MNIST_demo/output_run/pytorch-training-2021-04-02-20-34-43-399/output/model.tar.gz'
trained_model_file

's3://sagemaker-us-east-1-424632853466/sagemaker/MNIST_demo/output_run/pytorch-training-2021-04-03-11-41-39-464/output/model.tar.gz'

# Deploying the model
A model can be deployed in two ways:
- From a model trained in SageMaker
- From a model trained outside SageMaker

## From a model trained in SageMaker

In [50]:
# Deploying straight from the Estimator object requires the training script
# to define model_fn.
predictor = pytorch_estimator.deploy(
    instance_type='ml.m4.xlarge',
    initial_instance_count=1,
)

-----------------!

## From a model trained outside SageMaker

In [None]:
# An inference script has to be supplied in this case, so the training script
# does not need to contain model_fn.
pytorch_model = sagemaker.pytorch.model.PyTorchModel(
    model_data=trained_model_file,
    entry_point='inference_sagemaker_simple.py',
    role=role,
    framework_version='1.8.0',  # 1.8.1 is not supported yet
    py_version='py3',
)

In [None]:
# Host the externally trained model on a single ml.m4.xlarge instance.
predictor = pytorch_model.deploy(
    instance_type='ml.m4.xlarge',
    initial_instance_count=1,
)

# Testing predictions

In [51]:
# Load the MNIST test split saved by torchvision; the second element of the
# stored tuple is not needed here.
test_images, _ = torch.load('data/MNIST/processed/test.pt')

# Keep only the first two samples, as a NumPy array, for a quick smoke test.
testing_data = test_images[:2].numpy()
testing_data.shape

(2, 28, 28)

In [52]:
# Inspect the predictor's `accept` setting (the recorded run shows ('application/x-npy',)).
predictor.accept

('application/x-npy',)

In [53]:
# Inspect which serializer the predictor uses (a NumpySerializer in the recorded run).
predictor.serializer

<sagemaker.serializers.NumpySerializer at 0x7f02c969c160>

In [54]:
# Send the two test samples to the endpoint.
# NOTE(review): the recorded run failed with a ModelError (invocation timed out
# waiting for the container); check the endpoint's CloudWatch logs and the
# inference code before relying on this cell.
predictor.predict(data=testing_data)

ModelError: An error occurred (ModelError) when calling the InvokeEndpoint operation: Received server error (0) from model with message "Your invocation timed out while waiting for a response from container model. Review the latency metrics for each container in Amazon CloudWatch, resolve the issue, and try again.". See https://us-east-1.console.aws.amazon.com/cloudwatch/home?region=us-east-1#logEventViewer:group=/aws/sagemaker/Endpoints/pytorch-training-2021-04-03-11-53-08-554 in account 424632853466 for more information.

# Delete the endpoint

In [55]:
# Delete the endpoint created by the deploy cell above.
# `predictor.endpoint` was renamed to `predictor.endpoint_name` in sagemaker>=2,
# and the session created in the setup cell is reused instead of a new one.
sess.delete_endpoint(predictor.endpoint_name)

The endpoint attribute has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [56]:
# NOTE(review): unrelated to the training workflow. In the recorded run nothing
# was staged, so no commit was made -- consider removing this cell.
!git commit

On branch main
Your branch is up to date with 'origin/main'.

Changes not staged for commit:
	[31mmodified:   requirements.txt[m
	[31mdeleted:    setup.py[m

Untracked files:
	[31mLaunch Training.ipynb[m
	[31mRecommendation System Using MXNET on AWS Sagemaker.ipynb[m
	[31mdata/[m
	[31mrecommender.py[m

no changes added to commit


In [57]:
# Print the notebook's current working directory.
!pwd

/home/ec2-user/SageMaker/MNIST-deployment
