### Imports 

In [6]:
from sagemaker.tensorflow import TensorFlow
from sagemaker.inputs import TrainingInput
from sagemaker import get_execution_role
import tensorflow as tf
import numpy as np
import sagemaker
import boto3
import os

In [7]:
print(f'Using TensorFlow version: {tf.__version__}')
print(f'Using SageMaker version: {sagemaker.__version__}')

Using TensorFlow version: 2.3.0
Using SageMaker version: 2.16.1


#### Seed for Reproducibility

In [8]:
SEED = 123
np.random.seed(SEED)
tf.random.set_seed(SEED)

### Essentials

In [3]:
role = get_execution_role()
session = boto3.Session()
sagemaker_session = sagemaker.Session()

s3 = session.resource('s3')
TF_FRAMEWORK_VERSION = '2.3.0'

BUCKET = 'cv-examples-892313895307' # USE YOUR ACCOUNT ID OR INITIALS AS SUFFIX
PREFIX = 'cifar-clf'

### Test Local Mode

In [4]:
hyperparameters = {'epochs': 1}
estimator = TensorFlow(entry_point='cifar_train.py',
                       model_dir='/opt/ml/model/1/', # Note: this will a s3 path for real run
                       instance_type='local',
                       instance_count=1,
                       hyperparameters=hyperparameters,
                       role=role,
                       framework_version=TF_FRAMEWORK_VERSION, 
                       py_version='py37',
                       script_mode=True)

In [None]:
estimator.fit({'train': 'file://./DATA/CIFAR_10/train', 'val': 'file://./DATA/CIFAR_10/validation'})

### Copy Local Train & Validation Data to S3

In [6]:
!aws s3 cp ./DATA/CIFAR_10/train s3://{BUCKET}/{PREFIX}/train --recursive

upload: DATA/CIFAR_10/train/y_train.npy to s3://cv-examples-892313895307/cifar-clf/train/y_train.npy
upload: DATA/CIFAR_10/train/X_train.npy to s3://cv-examples-892313895307/cifar-clf/train/X_train.npy


In [7]:
!aws s3 cp ./DATA/CIFAR_10/validation s3://{BUCKET}/{PREFIX}/validation --recursive

upload: DATA/CIFAR_10/validation/y_validation.npy to s3://cv-examples-892313895307/cifar-clf/validation/y_validation.npy
upload: DATA/CIFAR_10/validation/X_validation.npy to s3://cv-examples-892313895307/cifar-clf/validation/X_validation.npy


In [14]:
train_input = TrainingInput(s3_data=f's3://{BUCKET}/{PREFIX}/train', 
                            distribution='FullyReplicated', 
                            content_type='npy')

validation_input = TrainingInput(s3_data=f's3://{BUCKET}/{PREFIX}/validation', 
                                 distribution='FullyReplicated', 
                                 content_type='npy')

In [15]:
inputs = {'train': train_input, 'val': validation_input}

{'train': <sagemaker.inputs.TrainingInput at 0x7f2ad14ed630>,
 'val': <sagemaker.inputs.TrainingInput at 0x7f2ad14ed0b8>}

### SageMaker Training

In [16]:
model_name = 'cifar-model-1'
hyperparameters = {'epochs': 30}
estimator_parameters = {'entry_point':'cifar_train.py',
                        'instance_type': 'ml.m5.4xlarge',
                        'instance_count': 1,
                        'model_dir': f'/opt/ml/model',
                        'role': role,
                        'hyperparameters': hyperparameters,
                        'output_path': f's3://{BUCKET}/{PREFIX}/out',
                        'base_job_name': f'cv-{model_name}',
                        'framework_version': TF_FRAMEWORK_VERSION,
                        'py_version': 'py37',
                        'script_mode': True}

estimator = TensorFlow(**estimator_parameters)

estimator.fit(inputs)

2020-11-24 03:10:52 Starting - Starting the training job...
2020-11-24 03:10:54 Starting - Launching requested ML instances.........
2020-11-24 03:12:27 Starting - Preparing the instances for training......
2020-11-24 03:13:31 Downloading - Downloading input data......
2020-11-24 03:14:41 Training - Training image download completed. Training in progress..[34m2020-11-24 03:14:50,378 sagemaker-training-toolkit INFO     Imported framework sagemaker_tensorflow_container.training[0m
[34m2020-11-24 03:14:50,384 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2020-11-24 03:15:51,986 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2020-11-24 03:15:52,099 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2020-11-24 03:15:52,116 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2020-11-24 03:15:52,126 sagemaker-training-to

[34m1546/1546 - 39s - loss: 1.7210 - accuracy: 0.3867 - val_loss: 1.5949 - val_accuracy: 0.3920[0m
[34mEpoch 2/2[0m

2020-11-24 03:17:19 Uploading - Uploading generated training model[34m1546/1546 - 38s - loss: 1.5229 - accuracy: 0.4492 - val_loss: 1.4968 - val_accuracy: 0.4360[0m
[34m2020-11-24 03:17:16,964 sagemaker-training-toolkit INFO     Reporting training SUCCESS[0m

2020-11-24 03:17:26 Completed - Training job completed
Training seconds: 235
Billable seconds: 235


In [18]:
model_location = estimator.model_data
model_location

's3://cv-examples-892313895307/cifar-clf/out/cv-cifar-model-1-2020-11-24-03-10-52-624/output/model.tar.gz'

In [19]:
output_1 = f's3://{BUCKET}/{PREFIX}/mme/model1.tar.gz'

In [20]:
!aws s3 cp {model_location} {output_1} 

Completed 5.7 MiB/5.7 MiB (15.3 MiB/s) with 1 file(s) remainingcopy: s3://cv-examples-892313895307/cifar-clf/out/cv-cifar-model-1-2020-11-24-03-10-52-624/output/model.tar.gz to s3://cv-examples-892313895307/cifar-clf/mme/model1.tar.gz


### Deploy Model

In [28]:
from sagemaker.tensorflow import TensorFlowModel
from datetime import datetime
import time

In [29]:
current_time = datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d-%H-%M-%S')
current_time

'2020-11-24-03-23-57'

In [30]:
model = TensorFlowModel(model_data=output_1, 
                        role=role,  
                        name='cv-cifar-model', 
                        sagemaker_session=sagemaker_session, 
                        framework_version=TF_FRAMEWORK_VERSION)

In [32]:
predictor = model.deploy(initial_instance_count=1, 
                         instance_type='ml.m5.large', 
                         endpoint_name=f'cv-model-1-endpoint-{current_time}')

update_endpoint is a no-op in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
Using already existing model: cv-cifar-model


-------------!

### Test Deployed Endpoint

In [33]:
from tensorflow.keras.preprocessing import image
import numpy as np

In [35]:
img_path = './DATA/CIFAR_10/raw_images/truck.png'
img = image.load_img(img_path)
data = image.img_to_array(img)
data = data.astype('float32')/255
data = data.reshape(1, 32, 32, 3)
data.shape

(1, 32, 32, 3)

In [36]:
predictor.predict(data)

{'predictions': [[0.0102487942,
   0.00091414561,
   0.110007435,
   0.713140547,
   0.0296733547,
   0.0567950346,
   0.0268647559,
   0.0452287532,
   0.00440646755,
   0.00272068148]]}