# Dog breed classifier
Fine-tuning a ResNet-18 model to classify images of dogs into 133 breeds.

In [14]:
!pip install smdebug
# !pip install torchvision

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting smdebug
  Downloading smdebug-1.0.12-py2.py3-none-any.whl (270 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m270.1/270.1 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hCollecting pyinstrument==3.4.2
  Downloading pyinstrument-3.4.2-py2.py3-none-any.whl (83 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m83.3/83.3 kB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
Collecting pyinstrument-cext>=0.2.2
  Downloading pyinstrument_cext-0.2.4-cp39-cp39-manylinux2010_x86_64.whl (20 kB)
Installing collected packages: pyinstrument-cext, pyinstrument, smdebug
Successfully installed pyinstrument-3.4.2 pyinstrument-cext-0.2.4 smdebug-1.0.12


In [1]:
import sagemaker
import boto3
# IAM role that SageMaker jobs launched from this notebook will run under.
role = sagemaker.get_execution_role()

# Session handle plus the default S3 bucket associated with it.
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()

## Dataset
The data is a collection of dog images covering 133 breeds; all 133 breeds appear in each of the training, validation, and test sets. Running `find . -type f | wc -l` in each split's directory shows that there are 6680 images in the training set, 835 in the validation set, and 836 in the test set. The data comes from this S3 location: s3://udacity-aind/dog-project/dogImages.zip.

In [4]:
# Fetch and unzip the data
!aws s3 cp s3://udacity-aind/dog-project/dogImages.zip ./
!unzip dogImages.zip

download: s3://udacity-aind/dog-project/dogImages.zip to ./dogImages.zip


In [5]:
# Upload the extracted dogImages/ directory to S3 (sync only copies new or changed files).
# NOTE(review): the destination bucket name is hard-coded here instead of using the
# `bucket` variable obtained from the SageMaker session in the setup cell — confirm
# this is the bucket the training scripts actually read from.
!aws s3 sync dogImages s3://deeplearning-project/dogImages

Archive:  dogImages.zip
   creating: dogImages/
   creating: dogImages/test/
   creating: dogImages/train/
   creating: dogImages/valid/
   creating: dogImages/test/001.Affenpinscher/
  inflating: dogImages/test/001.Affenpinscher/Affenpinscher_00003.jpg  
  inflating: dogImages/test/001.Affenpinscher/Affenpinscher_00023.jpg  
  inflating: dogImages/test/001.Affenpinscher/Affenpinscher_00036.jpg  
  inflating: dogImages/test/001.Affenpinscher/Affenpinscher_00047.jpg  
  inflating: dogImages/test/001.Affenpinscher/Affenpinscher_00048.jpg  
  inflating: dogImages/test/001.Affenpinscher/Affenpinscher_00058.jpg  
  inflating: dogImages/test/001.Affenpinscher/Affenpinscher_00071.jpg  
  inflating: dogImages/test/001.Affenpinscher/Affenpinscher_00078.jpg  
   creating: dogImages/test/002.Afghan_hound/
  inflating: dogImages/test/002.Afghan_hound/Afghan_hound_00116.jpg  
  inflating: dogImages/test/002.Afghan_hound/Afghan_hound_00125.jpg  
  inflating: dogImages/test/002.Afghan_hound/Afghan_ho

## Hyperparameter Tuning

In [4]:
from sagemaker.tuner import (
    ContinuousParameter,
    CategoricalParameter,
    HyperparameterTuner,
    IntegerParameter
)

# Objective: minimise the average test loss. The metric value is scraped from
# the training job's log output with the regex below.
objective_type = "Minimize"
objective_metric_name = "average test loss"
metric_definitions = [
    {
        "Name": objective_metric_name,
        "Regex": "Test set: Average loss: ([0-9\\.]+)",
    }
]

# Search space explored by the tuner: a continuous learning rate and a
# categorical batch size.
learning_rate_range = ContinuousParameter(0.001, 0.1)
batch_size_choices = CategoricalParameter([32, 64, 128, 256])
hyperparameter_ranges = {
    "lr": learning_rate_range,
    "batch-size": batch_size_choices,
}

In [29]:
# estimators for your hyperparameters
import sagemaker
from sagemaker.pytorch import PyTorch

# Estimator template used for every training job the tuner launches.
# `role` comes from the session setup cell, matching how the final training
# estimator below is configured (avoids a second execution-role lookup).
estimator = PyTorch(
    entry_point="hpo.py",
    role=role,
    py_version="py36",
    framework_version="1.6",
    instance_count=1,
    instance_type="ml.g4dn.xlarge",
)

# Arguments are passed by keyword so the call does not depend on the
# positional order of the HyperparameterTuner signature.
# Runs at most 5 training jobs in total, one at a time.
tuner = HyperparameterTuner(
    estimator=estimator,
    objective_metric_name=objective_metric_name,
    hyperparameter_ranges=hyperparameter_ranges,
    metric_definitions=metric_definitions,
    objective_type=objective_type,
    max_jobs=5,
    max_parallel_jobs=1,
)

INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole


In [30]:
# Launch the hyperparameter tuning job and wait for it to finish.
# NOTE(review): no `inputs` argument is passed, so no S3 data channel is given to
# the training jobs from here — presumably hpo.py locates the dataset itself; confirm.
tuner.fit(wait=True)

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating hyperparameter tuning job with name: pytorch-training-230217-1428


.................................................................................................................................................................................................................................................................................................................................................!


In [31]:
# Retrieve the estimator for the tuning job's best training job and display
# its hyperparameters (the last expression is rendered as the cell output).
# These values are what the final training run below is configured with.
best_estimator = tuner.best_estimator()
best_estimator.hyperparameters()


2023-02-17 14:52:32 Starting - Found matching resource for reuse
2023-02-17 14:52:32 Downloading - Downloading input data
2023-02-17 14:52:32 Training - Training image download completed. Training in progress.
2023-02-17 14:52:32 Uploading - Uploading generated training model
2023-02-17 14:52:32 Completed - Resource reused by training job: pytorch-training-230217-1428-005-255a38f9


{'_tuning_objective_metric': '"average test loss"',
 'batch-size': '"32"',
 'lr': '0.006700332124841554',
 'sagemaker_container_log_level': '20',
 'sagemaker_estimator_class_name': '"PyTorch"',
 'sagemaker_estimator_module': '"sagemaker.pytorch.estimator"',
 'sagemaker_job_name': '"pytorch-training-2023-02-17-14-28-07-978"',
 'sagemaker_program': '"hpo.py"',
 'sagemaker_region': '"us-east-1"',
 'sagemaker_submit_directory': '"s3://sagemaker-us-east-1-232517870160/pytorch-training-2023-02-17-14-28-07-978/source/sourcedir.tar.gz"'}

## Model Profiling and Debugging
Using the best hyperparameters found by the tuning job, create and fine-tune a new model with debugging and profiling enabled.

In [2]:
# Setting up debugging and profiling rules and hooks
from sagemaker.debugger import DebuggerHookConfig, ProfilerConfig, FrameworkProfile
from sagemaker.debugger import Rule, ProfilerRule, rule_configs

# Profiler: system metrics are sampled every 500 ms, and framework-level
# profiling is configured for 10 steps.
framework_profile = FrameworkProfile(
    local_path="/opt/ml/output/profilerme/",
    num_steps=10,
)
profiler_config = ProfilerConfig(
    framework_profile_params=framework_profile,
    system_monitor_interval_millis=500,
)

# Debugger hook: save intervals for the train and eval modes.
hook_parameters = {"train.save_interval": "100", "eval.save_interval": "10"}
debugger_hook_config = DebuggerHookConfig(hook_parameters=hook_parameters)

# Built-in rules evaluated against the job: two debugger rules on the saved
# tensors, followed by two profiler rules.
debugger_rules = [
    Rule.sagemaker(rule_configs.loss_not_decreasing()),
    Rule.sagemaker(rule_configs.vanishing_gradient()),
]
profiler_rules = [
    ProfilerRule.sagemaker(rule_configs.LowGPUUtilization()),
    ProfilerRule.sagemaker(rule_configs.ProfilerReport()),
]
rules = debugger_rules + profiler_rules

In [5]:
# Create and fit an estimator
from sagemaker.pytorch import PyTorch

# Hyperparameters taken from the best tuning job shown above
# (batch size 32; learning rate rounded to 0.0067).
hyperparameters = {"batch-size": 32, "lr": 0.0067}

# Final training job: same framework and instance settings as the tuning
# estimator, with the debugger hook, profiler and rules attached.
estimator = PyTorch(
    entry_point="train_model.py",
    hyperparameters=hyperparameters,
    role=role,
    framework_version="1.6",
    py_version="py36",
    instance_type="ml.g4dn.xlarge",
    instance_count=1,
    metric_definitions=metric_definitions,
    rules=rules,
    debugger_hook_config=debugger_hook_config,
    profiler_config=profiler_config,
)

# Start training and wait for the job to finish.
estimator.fit(wait=True)

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: pytorch-training-2023-02-20-12-08-01-893


2023-02-20 12:08:02 Starting - Starting the training job...
2023-02-20 12:08:18 Starting - Preparing the instances for trainingLossNotDecreasing: InProgress
VanishingGradient: InProgress
LowGPUUtilization: InProgress
ProfilerReport: InProgress
......
2023-02-20 12:09:33 Downloading - Downloading input data
2023-02-20 12:09:33 Training - Downloading the training image.........
2023-02-20 12:11:04 Training - Training image download completed. Training in progress..[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2023-02-20 12:11:06,691 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2023-02-20 12:11:06,718 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2023-02-20 12:11:06,721 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2023-02-20 12:11:06,978 sagemaker-training

In [24]:

# Name of the training job just run, and the AWS region of the current boto3
# session; both are needed to look up the job's profiling data below.
training_job_name = estimator.latest_training_job.name
session = boto3.session.Session()
region = session.region_name

print(f"Training jobname: {training_job_name}")

Training jobname: pytorch-training-2023-02-20-12-08-01-893


In [15]:
from smdebug.trials import create_trial
from smdebug.core.modes import ModeKeys

# Open an smdebug trial on the debugger output of the most recent training job,
# using the S3 path reported by the estimator.
trial = create_trial(estimator.latest_job_debugger_artifacts_path())

[2023-02-20 12:53:45.744 ip-172-16-84-244.ec2.internal:7849 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None
[2023-02-20 12:53:45.758 ip-172-16-84-244.ec2.internal:7849 INFO s3_trial.py:42] Loading trial debug-output at path s3://sagemaker-us-east-1-232517870160/pytorch-training-2023-02-20-12-08-01-893/debug-output


INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole


In [21]:
# List every tensor the debugger hook recorded (train and eval modes combined).
print(trial.tensor_names())

# Count how many steps of the loss tensor were saved in each mode:
# first TRAIN, then EVAL.
loss_tensor = trial.tensor("CrossEntropyLoss_output_0")
for mode in (ModeKeys.TRAIN, ModeKeys.EVAL):
    print(len(loss_tensor.steps(mode=mode)))

['CrossEntropyLoss_output_0', 'gradient/ResNet_fc.0.bias', 'gradient/ResNet_fc.0.weight']
5
1


In [25]:
from smdebug.profiler.analysis.notebook_utils.training_job import TrainingJob

# Attach the smdebug profiler analysis helper to the training job by name and region.
tj = TrainingJob(training_job_name, region)
# Wait until the job's system profiling data is available before analysing it.
tj.wait_for_sys_profiling_data_to_be_available()

ProfilerConfig:{'S3OutputPath': 's3://sagemaker-us-east-1-232517870160/', 'ProfilingIntervalInMilliseconds': 500, 'ProfilingParameters': {'DataloaderProfilingConfig': '{"StartStep": 0, "NumSteps": 10, "MetricsRegex": ".*", }', 'DetailedProfilingConfig': '{"StartStep": 0, "NumSteps": 10, }', 'FileOpenFailThreshold': '50', 'HorovodProfilingConfig': '{"StartStep": 0, "NumSteps": 10, }', 'LocalPath': '/opt/ml/output/profilerme/', 'PythonProfilingConfig': '{"StartStep": 0, "NumSteps": 10, "ProfilerName": "cprofile", "cProfileTimer": "total_time", }', 'RotateFileCloseIntervalInSeconds': '60', 'RotateMaxFileSizeInBytes': '10485760', 'SMDataParallelProfilingConfig': '{"StartStep": 0, "NumSteps": 10, }'}, 'DisableProfiler': False}
s3 path:s3://sagemaker-us-east-1-232517870160/pytorch-training-2023-02-20-12-08-01-893/profiler-output


Profiler data from system is available


In [6]:
# Locate the profiler report: rule outputs are stored under
# <output_path>/<training job name>/rule-output.
# Any trailing "/" on output_path is stripped before joining, so the path is
# built correctly whether or not the estimator's output_path ends with a slash.
job_name = estimator.latest_training_job.job_name
rule_output_path = f"{estimator.output_path.rstrip('/')}/{job_name}/rule-output"
print(f"You will find the profiler report in {rule_output_path}")

You will find the profiler report in s3://sagemaker-us-east-1-232517870160/pytorch-training-2023-02-20-12-08-01-893/rule-output


In [7]:
# List every object stored under the rule-output prefix of the training job.
! aws s3 ls {rule_output_path} --recursive

2023-02-20 12:20:12     370340 pytorch-training-2023-02-20-12-08-01-893/rule-output/ProfilerReport/profiler-output/profiler-report.html
2023-02-20 12:20:11     219103 pytorch-training-2023-02-20-12-08-01-893/rule-output/ProfilerReport/profiler-output/profiler-report.ipynb
2023-02-20 12:20:06        551 pytorch-training-2023-02-20-12-08-01-893/rule-output/ProfilerReport/profiler-output/profiler-reports/BatchSize.json
2023-02-20 12:20:06      11332 pytorch-training-2023-02-20-12-08-01-893/rule-output/ProfilerReport/profiler-output/profiler-reports/CPUBottleneck.json
2023-02-20 12:20:06        126 pytorch-training-2023-02-20-12-08-01-893/rule-output/ProfilerReport/profiler-output/profiler-reports/Dataloader.json
2023-02-20 12:20:06        130 pytorch-training-2023-02-20-12-08-01-893/rule-output/ProfilerReport/profiler-output/profiler-reports/GPUMemoryIncrease.json
2023-02-20 12:20:06        782 pytorch-training-2023-02-20-12-08-01-893/rule-output/ProfilerReport/profiler-output/profiler-re

In [61]:
# Download the rule output (including the profiler report) into the current directory.
! aws s3 cp {rule_output_path} ./ --recursive

download: s3://sagemaker-us-east-1-232517870160/pytorch-training-2023-02-20-12-08-01-893/rule-output/ProfilerReport/profiler-output/profiler-report.html to ProfilerReport/profiler-output/profiler-report.html
download: s3://sagemaker-us-east-1-232517870160/pytorch-training-2023-02-20-12-08-01-893/rule-output/ProfilerReport/profiler-output/profiler-reports/BatchSize.json to ProfilerReport/profiler-output/profiler-reports/BatchSize.json
download: s3://sagemaker-us-east-1-232517870160/pytorch-training-2023-02-20-12-08-01-893/rule-output/ProfilerReport/profiler-output/profiler-reports/OverallSystemUsage.json to ProfilerReport/profiler-output/profiler-reports/OverallSystemUsage.json
download: s3://sagemaker-us-east-1-232517870160/pytorch-training-2023-02-20-12-08-01-893/rule-output/ProfilerReport/profiler-output/profiler-reports/StepOutlier.json to ProfilerReport/profiler-output/profiler-reports/StepOutlier.json
download: s3://sagemaker-us-east-1-232517870160/pytorch-training-2023-02-20-12-0

In [62]:
import os

# The profiler report is downloaded into a folder named after its rule
# configuration; collect the rule names containing "Profiler" and take the first.
profiler_rule_names = [
    summary["RuleConfigurationName"]
    for summary in estimator.latest_training_job.rule_job_summary()
    if "Profiler" in summary["RuleConfigurationName"]
]
profiler_report_name = profiler_rule_names[0]

In [63]:
import IPython

# Render the downloaded HTML profiler report inline as the cell output.
report_path = f"{profiler_report_name}/profiler-output/profiler-report.html"
IPython.display.HTML(filename=report_path)

Unnamed: 0,Description,Recommendation,Number of times rule triggered,Number of datapoints,Rule parameters
BatchSize,"Checks if GPUs are underutilized because the batch size is too small. To detect this problem, the rule analyzes the average GPU memory footprint, the CPU and the GPU utilization.","The batch size is too small, and GPUs are underutilized. Consider running on a smaller instance type or increasing the batch size.",2,1165,cpu_threshold_p95:70  gpu_threshold_p95:70  gpu_memory_threshold_p95:70  patience:1000  window:500
LowGPUUtilization,"Checks if the GPU utilization is low or fluctuating. This can happen due to bottlenecks, blocking calls for synchronizations, or a small batch size.","Check if there are bottlenecks, minimize blocking calls, change distributed training strategy, or increase the batch size.",2,1166,threshold_p95:70  threshold_p5:10  window:500  patience:1000
GPUMemoryIncrease,Measures the average GPU memory footprint and triggers if there is a large increase.,Choose a larger instance type with more memory if footprint is close to maximum available memory.,0,1166,increase:5  patience:1000  window:10
StepOutlier,"Detects outliers in step duration. The step duration for forward and backward pass should be roughly the same throughout the training. If there are significant outliers, it may indicate a system stall or bottleneck issues.","Check if there are any bottlenecks (CPU, I/O) correlated to the step outliers.",0,0,threshold:3  mode:None  n_outliers:10  stddev:3
MaxInitializationTime,Checks if the time spent on initialization exceeds a threshold percent of the total training time. The rule waits until the first step of training loop starts. The initialization can take longer if downloading the entire dataset from Amazon S3 in File mode. The default threshold is 20 minutes.,"Initialization takes too long. If using File mode, consider switching to Pipe mode in case you are using TensorFlow framework.",0,0,threshold:20
IOBottleneck,Checks if the data I/O wait time is high and the GPU utilization is low. It might indicate IO bottlenecks where GPU is waiting for data to arrive from storage. The rule evaluates the I/O and GPU utilization rates and triggers the issue if the time spent on the IO bottlenecks exceeds a threshold percent of the total training time. The default threshold is 50 percent.,"Pre-fetch data or choose different file formats, such as binary formats that improve I/O performance.",0,1167,threshold:50  io_threshold:50  gpu_threshold:10  patience:1000
CPUBottleneck,"Checks if the CPU utilization is high and the GPU utilization is low. It might indicate CPU bottlenecks, where the GPUs are waiting for data to arrive from the CPUs. The rule evaluates the CPU and GPU utilization rates, and triggers the issue if the time spent on the CPU bottlenecks exceeds a threshold percent of the total training time. The default threshold is 50 percent.",Consider increasing the number of data loaders or applying data pre-fetching.,0,1167,threshold:50  cpu_threshold:90  gpu_threshold:10  patience:1000
Dataloader,"Checks how many data loaders are running in parallel and whether the total number is equal the number of available CPU cores. The rule triggers if number is much smaller or larger than the number of available cores. If too small, it might lead to low GPU utilization. If too large, it might impact other compute intensive operations on CPU.",Change the number of data loader processes.,0,0,min_threshold:70  max_threshold:200
LoadBalancing,"Detects workload balancing issues across GPUs. Workload imbalance can occur in training jobs with data parallelism. The gradients are accumulated on a primary GPU, and this GPU might be overused with regard to other GPUs, resulting in reducing the efficiency of data parallelization.",Choose a different distributed training strategy or a different distributed training framework.,0,1166,threshold:0.2  patience:1000


## Model Deploying

In [26]:
# Deploy the trained model to an endpoint backed by one ml.t2.medium instance.
# The returned predictor is used below to send an image to the endpoint.
predictor = estimator.deploy(initial_instance_count=1, instance_type="ml.t2.medium")

INFO:sagemaker:Creating model with name: pytorch-training-2023-02-20-12-57-02-223
INFO:sagemaker:Creating endpoint-config with name pytorch-training-2023-02-20-12-57-02-223
INFO:sagemaker:Creating endpoint with name pytorch-training-2023-02-20-12-57-02-223


-----------!

In [35]:
import os
def files(directory):
    """Return the names of all entries in ``directory``, sorted alphabetically.

    ``os.listdir`` returns entries in arbitrary, filesystem-dependent order;
    sorting makes the result deterministic across machines and runs.

    Args:
        directory: Path of the directory to list.

    Returns:
        A sorted list of entry names (not full paths).

    Raises:
        OSError: If ``directory`` does not exist or cannot be read.
    """
    return sorted(os.listdir(directory))


# Build the list of breed names so that list position follows the numbered
# class folders ("001.Affenpinscher", "002.Afghan_hound", ...).
# Sorting is done on the full folder name *before* the numeric prefix is removed,
# so the order is driven by the folder numbering rather than by the breed spelling.
# NOTE(review): assumes the model's output index follows the sorted folder
# names — confirm against train_model.py.
train_dir = "dogImages/train"
class_dirs = sorted(
    entry
    for entry in files(train_dir)
    # Skip hidden entries (e.g. .ipynb_checkpoints) and stray files, which
    # would otherwise shift every class index after them.
    if not entry.startswith(".") and os.path.isdir(os.path.join(train_dir, entry))
)
# Drop the "NNN." prefix; a folder without a "." keeps its full name.
class_names = [entry.partition(".")[2] or entry for entry in class_dirs]

['Affenpinscher',
 'Afghan_hound',
 'Airedale_terrier',
 'Akita',
 'Alaskan_malamute',
 'American_eskimo_dog',
 'American_foxhound',
 'American_staffordshire_terrier',
 'American_water_spaniel',
 'Anatolian_shepherd_dog',
 'Australian_cattle_dog',
 'Australian_shepherd',
 'Australian_terrier',
 'Basenji',
 'Basset_hound',
 'Beagle',
 'Bearded_collie',
 'Beauceron',
 'Bedlington_terrier',
 'Belgian_malinois',
 'Belgian_sheepdog',
 'Belgian_tervuren',
 'Bernese_mountain_dog',
 'Bichon_frise',
 'Black_and_tan_coonhound',
 'Black_russian_terrier',
 'Bloodhound',
 'Bluetick_coonhound',
 'Border_collie',
 'Border_terrier',
 'Borzoi',
 'Boston_terrier',
 'Bouvier_des_flandres',
 'Boxer',
 'Boykin_spaniel',
 'Briard',
 'Brittany',
 'Brussels_griffon',
 'Bull_terrier',
 'Bulldog',
 'Bullmastiff',
 'Cairn_terrier',
 'Canaan_dog',
 'Cane_corso',
 'Cardigan_welsh_corgi',
 'Cavalier_king_charles_spaniel',
 'Chesapeake_bay_retriever',
 'Chihuahua',
 'Chinese_crested',
 'Chinese_shar-pei',
 'Chow_cho

In [56]:
import torch
from torchvision import transforms
from PIL import Image

def load_input_image(img_path):
    """Load an image file and turn it into a single-image batch tensor.

    The image is converted to RGB, resized to 224x224, converted to a tensor
    and normalised per channel with the mean/std values below.

    Args:
        img_path: Path to the image file.

    Returns:
        A tensor of shape (1, 3, 224, 224).
    """
    preprocess = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])
    rgb_image = Image.open(img_path).convert('RGB')
    # Add a leading batch dimension of size 1.
    return preprocess(rgb_image).unsqueeze(0)

# Preprocess one Basenji image to send to the endpoint.
# NOTE(review): this file comes from the training split (dogImages/train), so a
# correct prediction on it does not demonstrate generalisation — consider using
# an image from dogImages/test instead.
test_image = load_input_image("dogImages/train/014.Basenji/Basenji_00953.jpg")

In [60]:
# Run a prediction on the endpoint: take the outputs for the single image in
# the batch and report the class with the highest value.
response = list(predictor.predict(test_image)[0])
top_score = max(response)
index = response.index(top_score)
print(f"The dog breed is: {class_names[index]}")

The dog breed is: Basenji


In [None]:
# Tear down the endpoint created by estimator.deploy above now that the
# prediction has been made.
predictor.delete_endpoint()