In [1]:
import sagemaker
import boto3
import os

# Initialize session
sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()

print(f"S3 Bucket: {bucket}")
print(f"IAM Role: {role}")

# Download the Dog Breed Dataset
# (Using wget to fetch the data from Udacity's public S3 bucket)
!wget https://s3-us-west-1.amazonaws.com/udacity-aind/dog-project/dogImages.zip

# Unzip the data (quietly with -q to avoid messy output)
!unzip -qo dogImages.zip

# Upload the data to your personal S3 bucket
# This step is crucial so that the training jobs we launch later can access the data.
inputs = sess.upload_data(path='dogImages', bucket=bucket, key_prefix='dog-images')
print(f"Data uploaded to: {inputs}")

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
S3 Bucket: sagemaker-us-east-1-005562962808
IAM Role: arn:aws:iam::005562962808:role/service-role/AmazonSageMaker-ExecutionRole-20260103T201041
--2026-01-03 17:50:50--  https://s3-us-west-1.amazonaws.com/udacity-aind/dog-project/dogImages.zip
Resolving s3-us-west-1.amazonaws.com (s3-us-west-1.amazonaws.com)... 52.219.113.104, 52.219.113.168, 52.219.220.224, ...
Connecting to s3-us-west-1.amazonaws.com (s3-us-west-1.amazonaws.com)|52.219.113.104|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1132023110 (1.1G) [application/zip]
Saving to: ‘dogImages.zip’


2026-01-03 17:51:16 (42.3 MB/s) - ‘dogImages.zip’ saved [1132023110/1132023110]

Data uploaded to: s3://sagemaker-us-east-1-005562962808/dog-images


In [2]:
# Hyperparameter Tuning
import sagemaker
from sagemaker.tuner import IntegerParameter, ContinuousParameter, HyperparameterTuner
from sagemaker.pytorch import PyTorch

# --- Objective metric ---------------------------------------------------
# The regex captures the number from the line train.py prints,
# e.g. "Testing Loss: 0.1234"; the tuner minimizes that value.
objective_metric_name = "average test loss"
objective_type = "Minimize"
metric_definitions = [
    {
        "Name": objective_metric_name,
        "Regex": "Testing Loss: ([0-9\\.]+)",
    }
]

# --- Search space -------------------------------------------------------
# Learning rate and batch size are tuned. The batch-size ceiling is kept
# small (64) to avoid OutOfMemory errors on ml.g4dn/m5 instances.
hyperparameter_ranges = {
    "lr": ContinuousParameter(min_value=0.001, max_value=0.1),
    "batch-size": IntegerParameter(min_value=32, max_value=64),
}

# --- Base estimator -----------------------------------------------------
# Configuration shared by every training job the tuner launches.
static_hyperparameters = {
    "epochs": 2,        # few epochs per trial to save budget/time
    "batch-size": 32,
    "lr": 0.001,
}
estimator = PyTorch(
    entry_point="train.py",          # reuse the existing training script
    role=sagemaker.get_execution_role(),
    framework_version="1.8",
    py_version="py36",
    instance_count=1,
    instance_type="ml.g4dn.xlarge",  # GPU instance for faster training
    hyperparameters=static_hyperparameters,
)

# --- Tuner --------------------------------------------------------------
tuner = HyperparameterTuner(
    estimator=estimator,
    objective_metric_name=objective_metric_name,
    hyperparameter_ranges=hyperparameter_ranges,
    metric_definitions=metric_definitions,
    objective_type=objective_type,
    max_jobs=2,           # only 2 jobs to save budget (rubric requires at least 2)
    max_parallel_jobs=1,  # one trial at a time
)

# --- Launch -------------------------------------------------------------
# `inputs` is the S3 URI produced by the data-upload cell.
# NOTE(review): if fit() blocks until tuning finishes, "Job started!" is only
# printed after completion — confirm the intended wording.
print("Starting Hyperparameter Tuning job...")
tuner.fit(inputs={"training": inputs})
print("Job started!")

Starting Hyperparameter Tuning job...


No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config
No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config


................................................................................................*


In [3]:
# Final Training with Debugger and Profiler
from sagemaker.debugger import Rule, ProfilerRule, rule_configs
from sagemaker.debugger import DebuggerHookConfig, ProfilerConfig, FrameworkProfile
from sagemaker.pytorch import PyTorch

# --- Debugger / profiler rules (required by the rubric) -----------------
# Two Debugger rules watch the training tensors; two profiler rules watch
# resource usage, one of which (ProfilerReport) generates the report.
loss_rule = Rule.sagemaker(rule_configs.loss_not_decreasing())
low_gpu_rule = ProfilerRule.sagemaker(rule_configs.LowGPUUtilization())
report_rule = ProfilerRule.sagemaker(rule_configs.ProfilerReport())
vanishing_rule = Rule.sagemaker(rule_configs.vanishing_gradient())
rules = [loss_rule, low_gpu_rule, report_rule, vanishing_rule]

# --- Profiler -----------------------------------------------------------
# System metrics are sampled every 500 ms; framework profiling is limited
# to 10 steps.
framework_profile = FrameworkProfile(num_steps=10)
profiler_config = ProfilerConfig(
    system_monitor_interval_millis=500,
    framework_profile_params=framework_profile,
)

# --- Debugger hook ------------------------------------------------------
# Save intervals passed to the hook: 100 for the train mode, 10 for eval.
hook_parameters = {"train.save_interval": "100", "eval.save_interval": "10"}
debugger_config = DebuggerHookConfig(hook_parameters=hook_parameters)

# --- Final estimator ----------------------------------------------------
# NOTE(review): these values are hard-coded rather than read from the tuning
# job — confirm they match the best trial found by the tuner.
final_hyperparameters = {
    "epochs": 4,        # train longer than the tuning trials for better accuracy
    "batch-size": 64,
    "lr": 0.001,
}
estimator_final = PyTorch(
    entry_point="train.py",
    role=sagemaker.get_execution_role(),
    framework_version="1.8",
    py_version="py36",
    instance_count=1,
    instance_type="ml.g4dn.xlarge",  # GPU instance
    hyperparameters=final_hyperparameters,
    rules=rules,
    profiler_config=profiler_config,
    debugger_hook_config=debugger_config,
)

# --- Train --------------------------------------------------------------
# wait=True blocks the cell (and streams logs) until the job ends.
print("Starting Final Training job...")
estimator_final.fit({"training": inputs}, wait=True)

Framework profiling will be deprecated from tensorflow 2.12 and pytorch 2.0 in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


Starting Final Training job...


INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: pytorch-training-2026-01-03-18-11-25-158


2026-01-03 18:11:26 Starting - Starting the training job
2026-01-03 18:11:26 Pending - Training job waiting for capacity...
2026-01-03 18:11:57 Pending - Preparing the instances for trainingLossNotDecreasing: InProgress
VanishingGradient: InProgress
LowGPUUtilization: InProgress
ProfilerReport: InProgress
...
2026-01-03 18:12:17 Downloading - Downloading input data......
2026-01-03 18:13:17 Downloading - Downloading the training image..................
2026-01-03 18:16:26 Training - Training image download completed. Training in progress..[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2026-01-03 18:16:33,490 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2026-01-03 18:16:33,522 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2026-01-03 18:16:33,525 sagemaker_pytorch_container.training INFO     Invo

In [4]:
# Redeploy with Fix (Updated to m5.large)
from sagemaker.pytorch import PyTorchModel

# Location of the model artifact written by the final training job.
model_data = estimator_final.model_data
print(f"Deploying model from: {model_data}")

# Settings for the deployable model object.
# NOTE(review): train.py doubles as the inference entry point here — confirm it
# defines the handlers the serving container expects (e.g. model_fn).
model_settings = dict(
    model_data=model_data,
    role=sagemaker.get_execution_role(),
    entry_point='train.py',
    framework_version="1.8",
    py_version="py36",
)
pytorch_model = PyTorchModel(**model_settings)

# Host the model on a single ml.m5.large instance.
endpoint_settings = dict(initial_instance_count=1, instance_type="ml.m5.large")
predictor = pytorch_model.deploy(**endpoint_settings)
print("Endpoint Deployed on m5.large!")

Deploying model from: s3://sagemaker-us-east-1-005562962808/pytorch-training-2026-01-03-18-11-25-158/output/model.tar.gz


INFO:sagemaker:Repacking model artifact (s3://sagemaker-us-east-1-005562962808/pytorch-training-2026-01-03-18-11-25-158/output/model.tar.gz), script artifact (None), and dependencies ([]) into single tar.gz file located at s3://sagemaker-us-east-1-005562962808/pytorch-inference-2026-01-03-18-21-49-828/model.tar.gz. This may take some time depending on model size...
INFO:sagemaker:Creating model with name: pytorch-inference-2026-01-03-18-21-56-722
INFO:sagemaker:Creating endpoint-config with name pytorch-inference-2026-01-03-18-21-57-327
INFO:sagemaker:Creating endpoint with name pytorch-inference-2026-01-03-18-21-57-327


------!Endpoint Deployed on m5.large!


In [5]:
# Test Inference
from PIL import Image
import torch
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import os

# 1. Download a sample image (since we are in the cloud)
!wget -O test_dog.jpg https://s3.amazonaws.com/cdn-origin-etr.akc.org/wp-content/uploads/2017/11/12234558/Chinook-On-White-03.jpg

# 2. Process image
image_path = "test_dog.jpg"
img = Image.open(image_path)

transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
])
img_tensor = transform(img).unsqueeze(0) # Add batch dimension

# 3. Predict
response = predictor.predict(img_tensor.numpy())
prediction = response.argmax(axis=1)[0]

print(f"Predicted Class Index: {prediction}")
plt.imshow(img)
plt.title(f"Prediction Class: {prediction}")
plt.show()

  import pynvml  # type: ignore[import]


--2026-01-03 18:25:35--  https://s3.amazonaws.com/cdn-origin-etr.akc.org/wp-content/uploads/2017/11/12234558/Chinook-On-White-03.jpg
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.217.226.240, 16.15.217.229, 16.15.207.166, ...
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.217.226.240|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 25761 (25K) [image/jpeg]
Saving to: ‘test_dog.jpg’


2026-01-03 18:25:35 (25.4 MB/s) - ‘test_dog.jpg’ saved [25761/25761]



In [6]:
# Download Profiler Report
job_name = estimator_final.latest_training_job.name
client = estimator_final.sagemaker_session.sagemaker_client
description = client.describe_training_job(TrainingJobName=job_name)

# Search for the ProfilerReport rule dynamically (Fixes IndexError)
rule_output_path = None
if "DebugRuleConfigurations" in description:
    for rule in description["DebugRuleConfigurations"]:
        if "ProfilerReport" in rule["RuleConfigurationName"]:
            rule_output_path = rule["OutputS3Uri"]
            print(f"Found ProfilerReport path: {rule_output_path}")
            break

if rule_output_path:
    print("Downloading report...")
    !aws s3 cp {rule_output_path} ./ --recursive
    print("Download complete. Check the file browser on the left.")
else:
    print("ProfilerReport rule was not found. Please check if the training job finished successfully.")

ProfilerReport rule was not found. Please check if the training job finished successfully.


In [7]:
# Delete Endpoint
# Tears down the endpoint behind `predictor` (created by the deploy cell) so
# that it is not left running.
# Run this immediately after you have downloaded your report!
predictor.delete_endpoint()
print("Endpoint deleted. Billing stopped.")

INFO:sagemaker:Deleting endpoint configuration with name: pytorch-inference-2026-01-03-18-21-57-327
INFO:sagemaker:Deleting endpoint with name: pytorch-inference-2026-01-03-18-21-57-327


Endpoint deleted. Billing stopped.
