In [None]:
!pip install --upgrade pip 
!pip install -q sagemaker-experiments

### import sagemaker
import json
import boto3

role = sagemaker.get_execution_role()
sess = sagemaker.Session()
region = sess.boto_region_name
bucket = sess.default_bucket()
prefix = 'tfdeepmodel'

In [None]:
import numpy as np
import tensorflow as tf
import os
from tensorflow.keras.preprocessing import sequence
from tensorflow.python.keras.datasets import imdb

In [None]:
max_features = 20000
maxlen = 400

(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

In [None]:
data_dir = os.path.join(os.getcwd(), 'imdb_data')
os.makedirs(data_dir, exist_ok=True)

train_dir = os.path.join(os.getcwd(), 'imdb_data/train')
os.makedirs(train_dir, exist_ok=True)

test_dir = os.path.join(os.getcwd(), 'imdb_data/test')
os.makedirs(test_dir, exist_ok=True)

csv_test_dir = os.path.join(os.getcwd(), 'imdb_data/csv-test')
os.makedirs(csv_test_dir, exist_ok=True)

np.save(os.path.join(train_dir, 'x_train.npy'), x_train)
np.save(os.path.join(train_dir, 'y_train.npy'), y_train)
np.save(os.path.join(test_dir, 'x_test.npy'), x_test)
np.save(os.path.join(test_dir, 'y_test.npy'), y_test)
np.savetxt(os.path.join(csv_test_dir, 'csv-test.csv'), 
           np.array(x_test[:100], dtype=np.int32), fmt='%d', delimiter=",")

In [None]:
traindata_s3_prefix = f'{prefix}/imdb_data/train'
testdata_s3_prefix = f'{prefix}/imdb_data/test'

train_s3 = sess.upload_data(path='./imdb_data/train/', key_prefix=traindata_s3_prefix)
test_s3 = sess.upload_data(path='./imdb_data/test/', key_prefix=testdata_s3_prefix)

In [None]:
!mkdir code

In [None]:
%%writefile code/tensorflow_sentiment.py
import logging
logging.getLogger("tensorflow").setLevel(logging.ERROR)
import argparse
import codecs
import json
import numpy as np
import os
import tensorflow as tf

max_features = 20000
maxlen = 400
embedding_dims = 300
filters = 256
kernel_size = 3
hidden_dims = 256

def parse_args():
    parser = argparse.ArgumentParser()

    # hyperparameters sent by the client are passed as command-line arguments to the script
    parser.add_argument('--epochs', type=int, default=1)
    parser.add_argument('--batch_size', type=int, default=64)
    parser.add_argument('--learning_rate', type=float, default=0.01)
    parser.add_argument('--drop_out_rate', type=float, default=0.2)

    # data directories
    parser.add_argument('--train', type=str, default=os.environ.get('SM_CHANNEL_TRAIN'))
    parser.add_argument('--test', type=str, default=os.environ.get('SM_CHANNEL_TEST'))

    # model directory /opt/ml/model default set by SageMaker
    parser.add_argument('--model_dir', type=str, default=os.environ.get('SM_MODEL_DIR'))

    return parser.parse_known_args()


def save_history(path, history):
    history_for_json = {}
    # transform float values that aren't json-serializable
    for key in list(history.history.keys()):
        if type(history.history[key]) == np.ndarray:
            history_for_json[key] == history.history[key].tolist()
        elif type(history.history[key]) == list:
            if type(history.history[key][0]) == np.float32 or type(history.history[key][0]) == np.float64:
                history_for_json[key] = list(map(float, history.history[key]))

    with codecs.open(path, 'w', encoding='utf-8') as f:
        json.dump(history_for_json, f, separators=(',', ':'), sort_keys=True, indent=4) 


def get_train_data(train_dir):
    x_train = np.load(os.path.join(train_dir, 'x_train.npy'))
    y_train = np.load(os.path.join(train_dir, 'y_train.npy'))
    print(f'x train {x_train.shape} y train {y_train.shape}')

    return x_train, y_train


def get_test_data(test_dir):
    x_test = np.load(os.path.join(test_dir, 'x_test.npy'))
    y_test = np.load(os.path.join(test_dir, 'y_test.npy'))
    print(f'x test {x_test.shape} y test {y_test.shape}')

    return x_test, y_test


def get_model(args):
    embedding_layer = tf.keras.layers.Embedding(max_features,
                                                embedding_dims,
                                                input_length=maxlen)

    sequence_input = tf.keras.Input(shape=(maxlen,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)
    x = tf.keras.layers.Dropout(args.drop_out_rate)(embedded_sequences)
    x = tf.keras.layers.Conv1D(filters, kernel_size, padding='valid', activation='relu', strides=1)(x)
    x = tf.keras.layers.MaxPooling1D()(x)
    x = tf.keras.layers.GlobalMaxPooling1D()(x)
    x = tf.keras.layers.Dense(hidden_dims, activation='relu')(x)
    x = tf.keras.layers.Dropout(args.drop_out_rate)(x)
    preds = tf.keras.layers.Dense(1, activation='sigmoid')(x)

    model = tf.keras.Model(sequence_input, preds)
    optimizer = tf.keras.optimizers.Adam(args.learning_rate)
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    
    return model


if __name__ == "__main__":

    args, _ = parse_args()

    x_train, y_train = get_train_data(args.train)
    x_test, y_test = get_test_data(args.test)

    model = get_model(args)

    history = model.fit(x_train, y_train,
              batch_size=args.batch_size,
              epochs=args.epochs,
              validation_data=(x_test, y_test))

    save_history(args.model_dir + "/history.p", history)
    
    # create a TensorFlow SavedModel for deployment to a SageMaker endpoint with TensorFlow Serving
    model.save(args.model_dir + '/1')

In [None]:
from smexperiments.experiment import Experiment
from smexperiments.trial import Trial
from botocore.exceptions import ClientError
from time import gmtime, strftime
import time

experiment_name = 'imdb-sentiment-analysis-tfdeepmodel'

try:
    experiment = Experiment.create(
        experiment_name=experiment_name, 
        description='Training a sentiment classification model using imdb dataset.')
except ClientError as e:
    print(f'{experiment_name} experiment already exists! Reusing the existing experiment.')

In [None]:
from sagemaker.tensorflow import TensorFlow

exp_datetime = strftime('%Y-%m-%d-%H-%M-%S', gmtime())
jobname = f'imdb-tf-tfdeepmodel-{exp_datetime}'

s3_output_location = f's3://{bucket}/{prefix}/{jobname}'
code_dir = f's3://{bucket}/{prefix}/{jobname}'

train_instance_type = 'ml.c4.xlarge'
hyperparameters = {'epochs': 2, 'batch_size': 64, 
                   'learning_rate': 0.01, 'drop_out_rate': 0.2}
model_local_path = '/opt/ml/model'

estimator = TensorFlow(source_dir='code',
                       entry_point='tensorflow_sentiment.py',
                       output_path=s3_output_location,
                       code_location=code_dir,
                       model_dir=model_local_path,
                       instance_type=train_instance_type,
                       instance_count=1,
                       enable_sagemaker_metrics=True,
                       hyperparameters=hyperparameters,
                       role=role,
                       framework_version='2.1',
                       py_version='py3')

data_channels = {'train':train_s3, 'test': test_s3}
print(data_channels)

In [None]:
jobname

In [None]:
exp_trial = Trial.create(experiment_name=experiment_name, 
                         trial_name=jobname)

experiment_config={'ExperimentName': experiment_name,
                   'TrialName': exp_trial.trial_name,
                   'TrialComponentDisplayName': 'Training'}

estimator.fit(inputs=data_channels,
              job_name=jobname,
              experiment_config=experiment_config,
              logs=True)

Deployment

In [None]:
import numpy as np
import tensorflow as tf
import os

from tensorflow.keras.preprocessing import sequence
from tensorflow.python.keras.datasets import imdb

In [43]:
max_features = 2000
maxlen = 100

(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

csv_test_dir_prefix = 'imdb_data/test'
csv_test_filename = 'test.csv'
csv_test_dir = os.path.join(os.getcwd(), csv_test_dir_prefix)
os.makedirs(csv_test_dir, exist_ok=True)

np.savetxt(os.path.join(csv_test_dir, csv_test_filename), 
           np.array(x_test, dtype=np.int32), fmt='%d', delimiter=",")

test_data_s3prefix = f'{prefix}/data/csv_test'
test_data_s3 = sess.upload_data(path=csv_test_dir, 
                                key_prefix=test_data_s3prefix)
print(test_data_s3)

25000 train sequences
25000 test sequences
x_train shape: (25000,)
x_test shape: (25000, 100)
s3://sagemaker-us-east-1-104877823522/tfdeepmodel/data/csv_test


In [None]:
# In Experiments and trials, you should see your training job as a trial in the list.
training_job_name='imdb-tf-tfdeepmodel-2022-04-29-23-09-26'

# Once you have attached training_job_name and reload estimator, 
# you should see the history of the job printed in the output.

estimator_deploy = TensorFlow.attach(training_job_name) # It gives details about the train

In [None]:
training_job_name

In [None]:
exp_datetime = strftime('%Y-%m-%d-%H-%M-%S', gmtime())
jobname = f'imdb-tf-batach-transform-{exp_datetime}'

# Creating a new trial for the experiment
exp_trial = Trial.load(trial_name=training_job_name)

experiment_config={
    'ExperimentName': experiment_name,
    'TrialName': exp_trial.trial_name,
    'TrialComponentDisplayName': 'Inference-BatchTransform'}

In [44]:
exp_datetime = strftime('%Y-%m-%d-%H-%M-%S', gmtime())
jobname = f'imdb-tf-batach-transform-{exp_datetime}'

s3_output_location = f's3://{bucket}/{prefix}/{jobname}'

# Run SageMaker batch transform
# Below method creates a Transformer object with the compute resource desired for the inference.
# The max_payload argument allows us to control the size of each mini-batch 
# that SageMaker Batch Transform is splitting.
transformer = estimator_deploy.transformer(instance_count=1, 
                                    instance_type='ml.c4.xlarge',
                                    max_payload = 2,
                                    accept = 'application/jsonlines',
                                    output_path = s3_output_location,
                                    assemble_with = 'Line')

transformer.transform(test_data_s3, 
                      content_type='text/csv', 
                      split_type = 'Line', 
                      job_name = jobname,
                      experiment_config = experiment_config)
print('Waiting for transform job: ' + transformer.latest_transform_job.job_name)

INFO:sagemaker:Creating model with name: imdb-tf-tfdeepmodel-2022-04-29-23-09-26-2022-04-30-14-47-30-509
INFO:sagemaker:Creating transform job with name: imdb-tf-batach-transform-2022-04-30-14-47-30


............................[34mINFO:__main__:starting services[0m
[34mINFO:tfs_utils:using default model name: model[0m
[34mINFO:tfs_utils:tensorflow serving model config: [0m
[34mmodel_config_list: {
  config: {
    name: "model",
    base_path: "/opt/ml/model",
    model_platform: "tensorflow"
  }[0m
[34m}[0m
[34mINFO:__main__:using default model name: model[0m
[34mINFO:__main__:tensorflow serving model config: [0m
[35mINFO:__main__:starting services[0m
[35mINFO:tfs_utils:using default model name: model[0m
[35mINFO:tfs_utils:tensorflow serving model config: [0m
[35mmodel_config_list: {
  config: {
    name: "model",
    base_path: "/opt/ml/model",
    model_platform: "tensorflow"
  }[0m
[35m}[0m
[35mINFO:__main__:using default model name: model[0m
[35mINFO:__main__:tensorflow serving model config: [0m
[34mmodel_config_list: {
  config: {
    name: "model",
    base_path: "/opt/ml/model",
    model_platform: "tensorflow"
  }[0m
[34m}[0m
[34mINFO:__mai

In [None]:
# the below code for error info
#job_name = 'imdb-tf-batach-transform-2022-04-30-13-17-26'
#sage = boto3.client('sagemaker')
#sage.describe_training_job(TrainingJobName=job_name)['FailureReason']

In [45]:
output = transformer.output_path
output_prefix = 'imdb_data/test_output'
!mkdir -p {output_prefix}
!aws s3 cp --recursive {output} {output_prefix}
!head {output_prefix}/{csv_test_filename}.out

download: s3://sagemaker-us-east-1-104877823522/tfdeepmodel/imdb-tf-batach-transform-2022-04-30-14-47-30/test.csv.out to imdb_data/test_output/test.csv.out
{    "predictions": [[0.00121012202], [0.987351477], [0.988187253], [0.471521199], [0.985716701], [0.21193333], [0.746866286], [4.6839632e-06], [0.870585], [0.624816418], [0.890301466], [0.000782313466], [8.27054123e-07], [0.177196428], [0.99649626], [4.26554045e-08], [0.873272], [0.146907523], [1.71601994e-06], [5.35920153e-05], [0.783496439], [0.999953032], [0.575047851], [0.985893846], [0.861900806], [0.997783244], [0.0003040171], [0.954340935], [0.978446126], [0.00388134504], [0.997629881], [0.467636317], [0.00424363185], [2.04393189e-07], [0.0110195149], [0.00498084], [0.95458591], [0.976222336], [0.0318695195], [3.15710985e-10], [0.998732865], [0.126794934], [0.0029830914], [0.854636], [0.861298859], [0.0197699796], [1.19097228e-06], [3.78755827e-09], [2.99653493e-06], [0.732227623], [0.000793339743], [0.00520773232], [0.87400

In [46]:
results=[]
with open(f'{output_prefix}/{csv_test_filename}.out', 'r') as f:
    lines = f.readlines()
    for line in lines:
        print(line)
        json_output = json.loads(line)
        result = [float('%.3f'%(item)) for sublist in json_output['predictions'] 
                                       for item in sublist]
        results += result

print(results)

{    "predictions": [[0.00121012202], [0.987351477], [0.988187253], [0.471521199], [0.985716701], [0.21193333], [0.746866286], [4.6839632e-06], [0.870585], [0.624816418], [0.890301466], [0.000782313466], [8.27054123e-07], [0.177196428], [0.99649626], [4.26554045e-08], [0.873272], [0.146907523], [1.71601994e-06], [5.35920153e-05], [0.783496439], [0.999953032], [0.575047851], [0.985893846], [0.861900806], [0.997783244], [0.0003040171], [0.954340935], [0.978446126], [0.00388134504], [0.997629881], [0.467636317], [0.00424363185], [2.04393189e-07], [0.0110195149], [0.00498084], [0.95458591], [0.976222336], [0.0318695195], [3.15710985e-10], [0.998732865], [0.126794934], [0.0029830914], [0.854636], [0.861298859], [0.0197699796], [1.19097228e-06], [3.78755827e-09], [2.99653493e-06], [0.732227623], [0.000793339743], [0.00520773232], [0.874007881], [0.879774749], [0.986484706], [0.0773308724], [0.00234446162], [0.691360831], [0.00171622657], [0.0102049792], [0.00118135393], [0.000836284715], [0.

In [47]:
def get_sentiment(score):
    return 'positive' if score > 0.5 else 'negative' 

In [48]:
import re

regex = re.compile(r'^[\?\s]+')
word_index = imdb.get_word_index()

In [49]:
data_index=199
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])
first_decoded_review = ' '.join([reverse_word_index.get(i - 3, '?') 
                                 for i in x_test[data_index]])
regex.sub('', first_decoded_review)

"one to remember they didn't even use the hotel to it's ? potential the ? are fantastic and the hotel is ? on a ? at low ? you can walk almost 1 4 ? into the ? which is actually an ? sight first thing in the morning or late at night when the wind is ? through the ? br br the best way to see this movie is with the ? in your hand so you can fast forward through the action and i'm using that ? ? scenes and ? at the beauty of the ?"

In [50]:
print(f'Labeled sentiment for this review is {get_sentiment(y_test[data_index])}')
print(f'Predicted sentiment is {get_sentiment(results[data_index])}')

Labeled sentiment for this review is negative
Predicted sentiment is positive


In [None]:
"""
Fully managed mini-batching helps make inferences on a large dataset efficiently.
You can use a separate SageMaker-managed compute infrastructure that is different from your notebook instance. You can easily run prediction with a cluster of instances for faster prediction.
You only pay for the runtime of a batch transform job, even with a much larger compute cluster.
You can schedule and kick off a model prediction independently in the cloud with SageMaker batch transform. It is not necessary to use a Python notebook in SageMaker Studio to start a prediction job.
"""