# Training a YOLO implementation in PyTorch

## Step 1: Preparation

### Upgrades

In [2]:
!pip install --upgrade sagemaker

Collecting sagemaker
  Downloading sagemaker-2.239.1-py3-none-any.whl.metadata (16 kB)


Downloading sagemaker-2.239.1-py3-none-any.whl (1.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m39.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sagemaker
  Attempting uninstall: sagemaker
    Found existing installation: sagemaker 2.239.0
    Uninstalling sagemaker-2.239.0:
      Successfully uninstalled sagemaker-2.239.0
Successfully installed sagemaker-2.239.1


### Import requirements

In [10]:
import os
import boto3
import sagemaker
from sagemaker.processing import ScriptProcessor, ProcessingInput, ProcessingOutput
from sagemaker.pytorch import PyTorch
from sagemaker.debugger import DebuggerHookConfig, CollectionConfig

In [5]:
# S3 bucket used for both the raw archive and the processed dataset.
BUCKET_NAME = "sagemaker-20250212110251"
# File name of the raw dataset archive; it is looked up under datasets/ in the bucket.
VOC_TAR = 'VOCtrainval_11-May-2012.tar'

### Initialize session and role

In [6]:
# One shared SageMaker session for every job launched from this notebook.
sagemaker_session = sagemaker.Session()
# Resolve the execution role through that same session rather than letting
# get_execution_role() construct a second, implicit Session of its own.
role = sagemaker.get_execution_role(sagemaker_session=sagemaker_session)

### S3 locations

In [8]:
# Key prefix (inside the bucket) under which the processed dataset is stored.
processed_prefix = "datasets/processed/VOC"

# Full S3 URIs: the raw archive fed to preprocessing, and the processed output location.
raw_data_s3_uri = "s3://" + "/".join([BUCKET_NAME, "datasets", VOC_TAR])
processed_data_s3_uri = "s3://" + "/".join([BUCKET_NAME, processed_prefix])

## Step 2: Preprocessing Job

Create a ScriptProcessor to run a data preprocessing script.
In this example, `preprocess.py` is a script you write that:

- Downloads and extracts the tar file.
- Splits the dataset into training/validation/test sets.
- Converts the dataset into the format your training script expects (for example, preprocessed images and annotation files).

Place this script in a local folder (e.g., `preprocessing`); the `run()` call below points to it.

In [18]:
# Instance type shared by the container lookup and the processing job itself.
processing_instance_type = 'ml.m5.xlarge'

# Look up the PyTorch training container URI for the session's region;
# the processing job reuses that image to run the preprocessing script.
image_uri = sagemaker.image_uris.retrieve(
    framework='pytorch',
    version='2.1.0',
    py_version='py310',
    image_scope='training',
    instance_type=processing_instance_type,
    region=sagemaker_session.boto_region_name,
)

# Single-instance processor that executes the script with python3 inside that image.
script_processor = ScriptProcessor(
    image_uri=image_uri,
    command=['python3'],
    instance_type=processing_instance_type,
    instance_count=1,
    role=role,
    sagemaker_session=sagemaker_session,
)

### Run the processing job

In [11]:
s3 = boto3.client('s3')

# Existence check only: a single key under the prefix is enough to know the
# processed dataset is already there, so cap the listing at one key.
response = s3.list_objects_v2(Bucket=BUCKET_NAME, Prefix=processed_prefix, MaxKeys=1)
if response.get('Contents'):
    print("Processed data already exists. Skipping processing job.")
else:
    print("Processed data not found. Running processing job.")
    script_processor.run(
        # ScriptProcessor.run() takes no `source_dir` argument (that parameter
        # belongs to FrameworkProcessor), so point `code` directly at the script.
        # NOTE(review): only this one file is uploaded; if preprocess.py imports
        # sibling modules from preprocessing/, switch to a FrameworkProcessor.
        code='preprocessing/preprocess.py',
        inputs=[
            # Raw archive from S3, made available inside the container.
            ProcessingInput(
                source=raw_data_s3_uri,
                destination='/opt/ml/processing/input'
            )
        ],
        outputs=[
            # Everything written to the output directory is uploaded to the processed-data URI.
            ProcessingOutput(
                output_name='processed_data',
                source='/opt/ml/processing/output',
                destination=processed_data_s3_uri
            )
        ],
        # Command-line arguments forwarded to preprocess.py.
        arguments=[
            '--image_size', '448'
        ]
    )

    print("Data preprocessing complete. Processed data available at:", processed_data_s3_uri)

Processed data not found. Running processing job.


## Step 3: Launch a Training Job with Debugger

### Define a Debugger hook configuration

This tells SageMaker which collections to capture.

In [None]:
# Hook configuration telling SageMaker Debugger which tensor collections to save.
# Uses DebuggerHookConfig, the class this notebook imports from sagemaker.debugger;
# the previous name `PyTorchDebuggerHookConfig` is not imported anywhere here and
# raised NameError.
# "losses" (lowercase) is the name of Debugger's built-in loss collection.
debugger_hook_config = DebuggerHookConfig(
    collection_configs=[CollectionConfig(name="losses")]
)

### Create a PyTorch estimator

Ensure your training code (e.g., `train.py` and supporting modules) is available in a source directory.

In [None]:
# PyTorch training job: runs training/train.py on one ml.p3.2xlarge instance
# with the Debugger hook configuration attached.
estimator = PyTorch(
    entry_point='train.py',
    source_dir='training',  # directory containing train.py and other training modules
    role=role,
    sagemaker_session=sagemaker_session,  # same session the processing job uses
    instance_count=1,
    instance_type='ml.p3.2xlarge',
    framework_version='1.8.1',
    py_version='py3',
    debugger_hook_config=debugger_hook_config,
    hyperparameters={
        'epochs': 10,
        'batch-size': 32,
        'learning-rate': 0.001,
        # add other hyperparameters as needed
        # Last path component must match the channel name passed to fit().
        'data-dir': '/opt/ml/input/data/processed',
        'tar-file': ''  # Not needed in training if data is preprocessed
    },
    # Debugger rules are supplied through `rules`; the estimator has no
    # `debugger_rule_configs` parameter.
    rules=[
        # You can add built-in or custom debugger rules here.
    ]
)

### Define channel for preprocessed data.

The processed data is output from the processing job and will be used as input for training.

In [None]:
# Map the training channel name to the S3 location of the processed dataset.
data_channels = dict(processed=processed_data_s3_uri)

### Launch the training job

In [None]:
# Start the training job, feeding it the channel mapping defined above.
estimator.fit(data_channels)