### SETUP SAGEMAKER PERMISSIONS

In [3]:
import boto3
import sagemaker
from sagemaker import get_execution_role

# S3 destination used throughout the notebook: the session's default bucket
# plus a fixed key prefix for this demo.
bucket = sagemaker.Session().default_bucket()
prefix = 'DEMO-Tensorflow-MNIST-FASHION'

# Define IAM role
role = get_execution_role()

# Region name reported by a fresh boto3 session
session = boto3.session.Session()
region = session.region_name

### UPLOAD DATA TO S3 BUCKET

In [4]:
# Load the Fashion-MNIST train/test arrays through the Keras datasets helper

import tensorflow as tf

fashion_mnist = tf.keras.datasets.fashion_mnist
(x_train, y_train), (x_test, y_test) = fashion_mnist.load_data()

# Report the array shapes of each split; the extra newline keeps a blank
# line between the two reports.
print("Train samples Shape ------")
print(x_train.shape, y_train.shape, end="\n\n")

print("Test samples Shape ------")
print(x_test.shape, y_test.shape)

2023-06-30 07:52:15.188447: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX512F, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Train samples Shape ------
(60000, 28, 28) (60000,)

Test samples Shape ------
(10000, 28, 28) (10000,)


In [27]:
# Write each array to its own .npy file in the current working directory
import numpy as np

arrays_by_filename = {
    'x_train.npy': x_train,
    'y_train.npy': y_train,
    'x_test.npy': x_test,
    'y_test.npy': y_test,
}

for filename, array in arrays_by_filename.items():
    with open(filename, 'wb') as f:
        np.save(f, array)

In [58]:
# upload to s3 bucket

# S3 object keys always use "/" as their separator, so the keys are built
# explicitly instead of with os.path.join: that helper is OS-dependent, and
# `os` is not imported by this cell, which raises NameError on a fresh kernel.
s3_bucket = boto3.Session().resource('s3').Bucket(bucket)

# Local .npy files grouped by the sub-prefix ("train" / "test") they go under.
files_by_split = {
    'train': ('x_train.npy', 'y_train.npy'),
    'test': ('x_test.npy', 'y_test.npy'),
}

for split, filenames in files_by_split.items():
    for filename in filenames:
        # Resulting key: <prefix>/<split>/<filename>
        s3_bucket.Object(f'{prefix}/{split}/{filename}').upload_file(filename)


### CONFIGURATION

In [11]:
from datetime import datetime

# Timestamp (yymmdd-HHMMSS) that leads the training job name
date = datetime.now().strftime("%y%m%d-%H%M%S")

# TensorFlow 2.3.1 training image URIs for the current region; the CPU image
# is the one selected below.
image_uri_cpu = '763104351884.dkr.ecr.{}.amazonaws.com/tensorflow-training:2.3.1-cpu-py37-ubuntu18.04'.format(region)
image_uri_gpu = '763104351884.dkr.ecr.{}.amazonaws.com/tensorflow-training:2.3.1-gpu-py37-cu102-ubuntu18.04'.format(region)
image_uri = image_uri_cpu

epochs = 10
instance_type = 'ml.m5.4xlarge'
device = 'cpu'

# 'ml.m5.4xlarge' -> 'm5-4xlarge': dots become dashes, then the 'ml-' part is dropped
instance_label = instance_type.replace('.', '-').replace('ml-', '')

# <timestamp>-Fashion-Mnist-<instance>-<device>-<epochs>e
job_name = '{}-Fashion-Mnist-{}-{}-{}e'.format(date, instance_label, device, epochs)

### BUILD TENSORFLOW ESTIMATOR

In [12]:
# Spot-instance settings consumed by the estimator cell.
use_spot_instances = True
max_run = 3600

# max_wait is only given a value when spot instances are requested.
if use_spot_instances:
    max_wait = 7200
else:
    max_wait = None

In [13]:
from sagemaker.tensorflow import TensorFlow

# Arguments assembled from values defined in earlier cells.
training_hyperparameters = {'epochs': epochs}
model_output_uri = 's3://{}/{}/'.format(bucket, prefix)

estimator = TensorFlow(
    entry_point='train.py',
    role=role,
    instance_count=1,
    instance_type=instance_type,
    image_uri=image_uri,
    model_dir=model_output_uri,
    hyperparameters=training_hyperparameters,
    # NOTE(review): script_mode=False is carried over unchanged; confirm the
    # installed SageMaker SDK still honours this flag with the 2.3.1 image.
    script_mode=False,
    use_spot_instances=use_spot_instances,
    max_run=max_run,
    max_wait=max_wait,
)

In [72]:
#estimator.hyperparameters()

### FIT THE ESTIMATOR

In [14]:
# S3 prefixes under which the train / test .npy files were uploaded
train_data_path, test_data_path = (
    f"s3://{bucket}/{prefix}/train/",
    f"s3://{bucket}/{prefix}/test/",
)

#print(train_data_path)
#print(test_data_path)

In [15]:
#training_data_uri = "s3://sagemaker-sample-data-{}/tensorflow/mnist".format(region)
#print(training_data_uri)

In [16]:
# Input channel name -> S3 prefix holding that channel's data
input_channels = {
    "train": train_data_path,
    "test": test_data_path,
}

# wait=False leaves the notebook free after the call; True makes the notebook
# wait and logs output in real time.
estimator.fit(input_channels, wait=False, job_name=job_name)

INFO:sagemaker:Creating training-job with name: 230630-080628-Fashion-Mnist-m5-4xlarge-cpu-10e
