In [53]:
# S3 bucket the raw images are pulled from (the sync cell reads s3://<bucket>/public/<dataset>)
data_bucket_name = "classifai-backend"
# Dataset folder name; also reused for local file names, S3 prefixes and the training job name
dataset_name = "test"

In [54]:
import sagemaker
from sagemaker import get_execution_role, image_uris

# IAM role and SageMaker session shared by the rest of the notebook
role = get_execution_role()
sess = sagemaker.Session()

# URI of the built-in image-classification training container for the session's region.
# The version is pinned to '1': requesting 'latest' only made the SDK log
# "Defaulting to the only supported framework/algorithm version: 1" and ignore the argument.
training_image = image_uris.retrieve('image-classification', sess.boto_region_name, version='1')

Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: latest.


In [55]:
# Find im2rec in our environment and set up some other vars in our environemnt

base_dir='/tmp'

%env BASE_DIR=$base_dir
%env S3_DATA_BUCKET_NAME = $data_bucket_name
%env DATASET_NAME = $dataset_name

import sys,os

suffix='/mxnet/tools/im2rec.py'
im2rec = list(filter((lambda x: os.path.isfile(x + suffix )), sys.path))[0] + suffix
%env IM2REC=$im2rec

env: BASE_DIR=/tmp
env: S3_DATA_BUCKET_NAME=classifai-backend
env: DATASET_NAME=test
env: IM2REC=/home/ec2-user/anaconda3/envs/mxnet_latest_p37/cpu/lib/python3.7/site-packages/mxnet/tools/im2rec.py


In [59]:
# Pull our images from S3 into the local working directory.
# S3_DATA_BUCKET_NAME, DATASET_NAME and BASE_DIR are the environment variables exported
# with %env in the setup cell, so that cell has to run first.
!aws s3 sync s3://$S3_DATA_BUCKET_NAME/public/$DATASET_NAME $BASE_DIR/$DATASET_NAME --quiet

In [60]:
%%bash
# Use the IM2REC script to convert our images into RecordIO files.
# BASE_DIR, DATASET_NAME and IM2REC are expected to be exported by the setup cell.

# Abort on the first failing command or unset variable, so a failed step never feeds stale files to the next one
set -eu

# Refuse to continue if BASE_DIR is empty or missing: the clean-up below must only ever run inside it
cd "${BASE_DIR:?BASE_DIR is not set - run the setup cell first}"

# Clean up our working dir of existing LST and REC files
# (-f keeps a first run, where nothing exists yet, from reporting errors)
rm -f -- *.rec *.lst

# First we need to create two LST files (training and test lists), noting the correct label class for each image
# We'll also save the output of the LST files command, since it includes a list of all of our label classes
echo "Creating LST files"
python "$IM2REC" --list --recursive --pass-through --test-ratio=0.3 --train-ratio=0.7 "$DATASET_NAME" "$DATASET_NAME" > "${DATASET_NAME}_classes"

echo "Label classes:"
cat "${DATASET_NAME}_classes"

# Then we create RecordIO files from the LST files
echo "Creating RecordIO files"
python "$IM2REC" --num-thread=4 "${DATASET_NAME}_train.lst" "$DATASET_NAME"
python "$IM2REC" --num-thread=4 "${DATASET_NAME}_test.lst" "$DATASET_NAME"
ls -lh *.rec

Creating LST files
Label classes:
clip 0
mouse 1
pen 2
Creating RecordIO files
Creating .rec file from /tmp/test_train.lst in /tmp
time: 0.05218958854675293  count: 0
Creating .rec file from /tmp/test_test.lst in /tmp
time: 0.048287153244018555  count: 0
-rw-rw-r-- 1 ec2-user ec2-user 787K Jul  6 20:24 test_test.rec
-rw-rw-r-- 1 ec2-user ec2-user 1.8M Jul  6 20:24 test_train.rec


In [61]:
# Upload our train and test RecordIO files to S3 in the bucket that our sagemaker session is using
bucket = sess.default_bucket()

s3train_path = 's3://{}/{}/train/'.format(bucket, dataset_name)
s3validation_path = 's3://{}/{}/validation/'.format(bucket, dataset_name)

# Clean up any existing data
!aws s3 rm s3://{bucket}/{dataset_name}/train --recursive
!aws s3 rm s3://{bucket}/{dataset_name}/validation --recursive

# Upload the rec files to the train and validation channels
!aws s3 cp /tmp/{dataset_name}_train.rec $s3train_path
!aws s3 cp /tmp/{dataset_name}_test.rec $s3validation_path

delete: s3://sagemaker-us-east-2-964137047091/test/train/test_train.rec
delete: s3://sagemaker-us-east-2-964137047091/test/validation/test_test.rec
upload: ../../../../../tmp/test_train.rec to s3://sagemaker-us-east-2-964137047091/test/train/test_train.rec
upload: ../../../../../tmp/test_test.rec to s3://sagemaker-us-east-2-964137047091/test/validation/test_test.rec


In [62]:
# Settings shared by both input channels; only the S3 location differs between them
channel_options = dict(
    distribution='FullyReplicated',
    content_type='application/x-recordio',
    s3_data_type='S3Prefix',
)

train_data = sagemaker.inputs.TrainingInput(s3train_path, **channel_options)
validation_data = sagemaker.inputs.TrainingInput(s3validation_path, **channel_options)

# Channel name -> input definition, in the form passed to Estimator.fit(inputs=...)
data_channels = dict(train=train_data, validation=validation_data)

In [63]:
# Training artifacts are written below <bucket>/<dataset>/output
s3_output_location = f's3://{bucket}/{dataset_name}/output'

# Estimator for the built-in image-classification container: one ml.p2.xlarge training instance
image_classifier = sagemaker.estimator.Estimator(
    image_uri=training_image,
    role=role,
    instance_count=1,
    instance_type='ml.p2.xlarge',
    output_path=s3_output_location,
    sagemaker_session=sess,
)

In [64]:
import os

# The number of classes is taken from the entries of the dataset directory (one folder per label).
# Count sub-directories in Python instead of parsing `ls -l | wc -l`, which relied on subtracting
# the "total" line and also counted any stray regular file as a class. Hidden entries are skipped,
# as `ls -l` skipped them too.
dataset_dir = os.path.join(base_dir, dataset_name)
num_classes = sum(
    1
    for entry in os.listdir(dataset_dir)
    if not entry.startswith('.') and os.path.isdir(os.path.join(dataset_dir, entry))
)

# The training LST file holds one line per training image
train_lst_path = os.path.join(base_dir, f'{dataset_name}_train.lst')
with open(train_lst_path) as train_lst:
    num_training_samples = sum(1 for line in train_lst if line.strip())

# Catch an empty download or a failed conversion here rather than minutes into a training job
if num_classes < 1 or num_training_samples < 1:
    raise ValueError(
        f'Nothing to train on: found {num_classes} classes in {dataset_dir} '
        f'and {num_training_samples} samples in {train_lst_path}'
    )

# Learn more about the Sagemaker built-in Image Classifier hyperparameters here: https://docs.aws.amazon.com/sagemaker/latest/dg/IC-Hyperparameter.html

# These hyperparameters we won't want to change, as they define things like
# the size of the images we'll be sending for input, the number of training classes we have, etc.
base_hyperparameters=dict(
    use_pretrained_model=1,
    image_shape='3,224,224',
    num_classes=num_classes,
    num_training_samples=num_training_samples,
)

# These are hyperparameters we may want to tune, as they can affect the model training success:
hyperparameters={
    **base_hyperparameters,
    **dict(
        learning_rate=0.001,
        mini_batch_size=5,
    )
}


image_classifier.set_hyperparameters(**hyperparameters)

# Display the final hyperparameter set as the cell output
hyperparameters

{'use_pretrained_model': 1,
 'image_shape': '3,224,224',
 'num_classes': 3,
 'num_training_samples': 21,
 'learning_rate': 0.001,
 'mini_batch_size': 5}

In [65]:
%%time

import time
now = str(int(time.time()))
training_job_name = 'IC-' + dataset_name.replace('_', '-') + '-' + now

image_classifier.fit(inputs=data_channels, job_name=training_job_name, logs=True)

job = image_classifier.latest_training_job
model_path = f"{base_dir}/{job.name}"

print(f"\n\n Finished training! The model is available for download at: {image_classifier.output_path}/{job.name}/output/model.tar.gz")

2021-07-06 20:25:12 Starting - Starting the training job...
2021-07-06 20:25:35 Starting - Launching requested ML instancesProfilerReport-1625603111: InProgress
......
2021-07-06 20:26:35 Starting - Preparing the instances for training............
2021-07-06 20:28:35 Downloading - Downloading input data
2021-07-06 20:28:35 Training - Downloading the training image......
2021-07-06 20:29:35 Training - Training image download completed. Training in progress.[34mDocker entrypoint called with argument(s): train[0m
[34m[07/06/2021 20:29:33 INFO 139781595309888] Reading default configuration from /opt/amazon/lib/python3.7/site-packages/image_classification/default-input.json: {'use_pretrained_model': 0, 'num_layers': 152, 'epochs': 30, 'learning_rate': 0.1, 'lr_scheduler_factor': 0.1, 'optimizer': 'sgd', 'momentum': 0, 'weight_decay': 0.0001, 'beta_1': 0.9, 'beta_2': 0.999, 'eps': 1e-08, 'gamma': 0.9, 'mini_batch_size': 32, 'image_shape': '3,224,224', 'precision_dtype': 'float32'}[0m
[3

[34m[07/06/2021 20:30:31 INFO 139781595309888] Epoch[16] Train-accuracy=0.950000[0m
[34m[07/06/2021 20:30:31 INFO 139781595309888] Epoch[16] Time cost=1.221[0m
[34m[07/06/2021 20:30:32 INFO 139781595309888] Epoch[16] Validation-accuracy=1.000000[0m
[34m[07/06/2021 20:30:33 INFO 139781595309888] Epoch[17] Train-accuracy=1.000000[0m
[34m[07/06/2021 20:30:33 INFO 139781595309888] Epoch[17] Time cost=1.212[0m
[34m[07/06/2021 20:30:34 INFO 139781595309888] Epoch[17] Validation-accuracy=1.000000[0m
[34m[07/06/2021 20:30:36 INFO 139781595309888] Epoch[18] Train-accuracy=1.000000[0m
[34m[07/06/2021 20:30:36 INFO 139781595309888] Epoch[18] Time cost=1.228[0m
[34m[07/06/2021 20:30:36 INFO 139781595309888] Epoch[18] Validation-accuracy=1.000000[0m
[34m[07/06/2021 20:30:38 INFO 139781595309888] Epoch[19] Train-accuracy=0.950000[0m
[34m[07/06/2021 20:30:38 INFO 139781595309888] Epoch[19] Time cost=1.272[0m
[34m[07/06/2021 20:30:38 INFO 139781595309888] Epoch[19] Validation-ac

In [66]:
%%time
# Deploying a model to an endpoint takes a few minutes to complete

deployed_endpoint = image_classifier.deploy(
    initial_instance_count = 1,
    instance_type = 'ml.t2.medium'
)

---------------!CPU times: user 239 ms, sys: 21 ms, total: 260 ms
Wall time: 7min 31s


In [67]:
# Delete the endpoint that the deploy cell created once it is no longer needed
deployed_endpoint.delete_endpoint()