## Imports and Setup
Sets up the SageMaker session (region, role, bucket) and defines the S3 paths to the images and the `.lst` manifest files.

In [None]:
import sagemaker
import boto3
from sagemaker import image_uris, estimator, inputs

# --- 1. Session Setup ---
sess = sagemaker.Session()
# Take the region from the SageMaker session itself (instead of a separate
# boto3.Session()) so the region, the default bucket and the jobs launched
# through `sess` are guaranteed to refer to the same configuration.
region = sess.boto_region_name
role = sagemaker.get_execution_role()
bucket = sess.default_bucket()

# IMPORTANT: This must match the prefix used in 01-preprocessing.ipynb
prefix = "cbis-ddsm-classification"

print(f"Region: {region}")
print(f"Bucket: {bucket}")
print(f"Using Data Prefix: {prefix}")

# Path to the folder containing all images
s3_images = f"s3://{bucket}/{prefix}/images"

# Paths to the .lst files (Manifests)
s3_train_lst = f"s3://{bucket}/{prefix}/metadata/train.lst"
s3_val_lst = f"s3://{bucket}/{prefix}/metadata/validation.lst"

print(f"Training Data: {s3_images}")
print(f"Manifest: {s3_train_lst}")

## Model Setup: Retrieve Image & Define Estimator
We load the SageMaker Built-in Image Classification algorithm container.

In [None]:
# Compute settings for the training job, named so they are easy to find and tune
train_instance_type = 'ml.g4dn.xlarge'   # ~$0.736/hour
train_volume_gb = 50                     # storage on the training instance, in GB
train_timeout_s = 7200                   # timeout in seconds (2 hours)
model_output_s3 = f"s3://{bucket}/{prefix}/output"

# Look up the container image of the Built-in Image Classification algorithm
training_image = image_uris.retrieve(
    framework='image-classification',
    region=region,
    image_scope='training',
)

# Describe the training job: which image to run, under which role, on what hardware
estimator_config = estimator.Estimator(
    image_uri=training_image,
    role=role,
    sagemaker_session=sess,
    instance_type=train_instance_type,
    instance_count=1,
    volume_size=train_volume_gb,
    max_run=train_timeout_s,
    output_path=model_output_s3,
)

## Hyperparameters (ResNet-50 Config)
This is where we tell SageMaker to use ResNet50 with Transfer Learning.

In [None]:
# Download the training manifest so the number of samples it lists can be counted
s3 = boto3.client('s3')
s3.download_file(bucket, f"{prefix}/metadata/train.lst", "train_temp.lst")

# Each line of the manifest is counted as one sample. Blank lines (for example
# an empty line at the end of the file) are skipped so they do not inflate
# num_training_samples.
with open("train_temp.lst", "r") as f:
    num_training_samples = sum(1 for line in f if line.strip())

# Fail fast here rather than launching a training job with nothing to train on
if num_training_samples == 0:
    raise ValueError("train.lst contains no entries; cannot configure training.")

print(f"Number of samples: {num_training_samples}")

estimator_config.set_hyperparameters(
    # Network: 50 layers (ResNet-50), starting from pretrained weights
    num_layers=50,
    use_pretrained_model=1,
    image_shape="3,224,224",
    num_classes=2,
    num_training_samples=num_training_samples,

    # Optimization settings
    mini_batch_size=32,
    epochs=20,
    learning_rate=0.001,
    optimizer='adam',

    # Early Stopping to prevent overfitting
    early_stopping=True,
    early_stopping_patience=3,
    early_stopping_min_epochs=5,
    early_stopping_tolerance=0.0
)

## Training Execution
Map the inputs and start the job.

In [None]:
# Input configuration: four channels, all declared with the same content type
image_content_type = 'application/x-image'

# The image files themselves, read from the shared S3 prefix in File mode.
# The 'train' and 'validation' channels both point at this folder; the
# '*_lst' manifests act as the map that selects images for each split.
images_data = inputs.TrainingInput(
    s3_data=s3_images,
    s3_data_type='S3Prefix',
    input_mode='File',
    content_type=image_content_type,
)

# Manifest (.lst) inputs for the training and validation splits
train_manifest = inputs.TrainingInput(s3_train_lst, content_type=image_content_type)
validation_manifest = inputs.TrainingInput(s3_val_lst, content_type=image_content_type)

# Channel name -> input
data_channels = dict(
    train=images_data,
    validation=images_data,
    train_lst=train_manifest,
    validation_lst=validation_manifest,
)

# Launch the training job
print("Starting training job... this will take a few minutes.")
estimator_config.fit(inputs=data_channels)

## Hyperparameter Optimization (Random Search)
Define the search space, configure the tuner, and launch the tuning jobs on the same data channels used for the single training job.

In [None]:
from sagemaker.tuner import HyperparameterTuner, ContinuousParameter, IntegerParameter, CategoricalParameter

# Define Search Ranges (Search Space)
# Here we define the ranges where Random Search will "sample" values
hyperparameter_ranges = {
    # The range spans three orders of magnitude, so sample it on a log scale:
    # on a linear scale most draws would land in the top decade [0.01, 0.1].
    'learning_rate': ContinuousParameter(0.0001, 0.1, scaling_type='Logarithmic'),
    # NOTE(review): mini_batch_size is an integer hyperparameter declared here as
    # categorical to restrict it to these three sizes - confirm the tuning
    # service accepts a categorical range for it with this algorithm.
    'mini_batch_size': CategoricalParameter([16, 32, 64]), # Chooses one of these sizes
    'optimizer': CategoricalParameter(['sgd', 'adam', 'rmsprop']), # Tests different optimizers
    'momentum': ContinuousParameter(0.0, 0.9) # Only for SGD, but the tuner tests it
}

# Configure Success Metric
# We tell SageMaker: "The best model is the one with the highest validation accuracy"
objective_metric_name = 'validation:accuracy'
objective_type = 'Maximize'

# Configure the Tuner (The Conductor)
# Ranges defined above take the place of the fixed values set on the estimator
# for the same hyperparameter names; the remaining fixed values are reused.
tuner = HyperparameterTuner(
    estimator=estimator_config,              # The model we defined earlier (ResNet50)
    objective_metric_name=objective_metric_name,
    hyperparameter_ranges=hyperparameter_ranges,
    objective_type=objective_type,

    # Random Search Configurations
    strategy='Random',                   # <--- HERE we define it as Random Search
    max_jobs=5,                         # Total training jobs to run (Cost)
    max_parallel_jobs=2,                 # How many jobs run concurrently (Watch out for account limits)
    early_stopping_type='Auto'           # The Tuner itself stops bad jobs early
)

# START OPTIMIZATION!
print("Starting Hyperparameter Optimization jobs (Random Search)...")

# We pass the data channels (same as normal fit)
tuner.fit(inputs=data_channels)

print("Wait... you can monitor progress in the SageMaker console.")