In [48]:
import boto3
import sagemaker
from sagemaker.pytorch import PyTorch
from sagemaker.pytorch.processing import PyTorchProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput

In [49]:
# Deployment settings read by the cells below.
REGION = 'us-west-1'
ROLE_ARN = "arn:aws:iam::253490779227:role/service-role/AmazonSageMakerAdminIAMExecutionRole"
BUCKET = 'animal-classification-dss-works'

# S3 prefixes derived from BUCKET; both keep their trailing slash.
S3_INPUT_DATA = 's3://' + BUCKET + '/data/'
S3_PREPROCESSED = 's3://' + BUCKET + '/preprocessed/'


In [50]:
# Build a boto3 session pinned to REGION and wrap it in a SageMaker session.
# The processor and estimator cells below both receive `sagemaker_session`.
boto_session = boto3.Session(region_name=REGION)
sagemaker_session = sagemaker.Session(boto_session=boto_session)


INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials


In [51]:

# Echo the effective configuration, one setting per line.
print(
    f"Region: {sagemaker_session.boto_region_name}",
    f"S3 Bucket: {BUCKET}",
    f"Input data: {S3_INPUT_DATA}",
    f"Preprocessed output: {S3_PREPROCESSED}",
    sep="\n",
)

Region: us-west-1
S3 Bucket: animal-classification-dss-works
Input data: s3://animal-classification-dss-works/data/
Preprocessed output: s3://animal-classification-dss-works/preprocessed/


In [54]:
# Processing-job definition; the job itself is submitted by processor.run() in the next cell.
processor = PyTorchProcessor(
    # Identity and session
    role=ROLE_ARN,
    sagemaker_session=sagemaker_session,
    base_job_name='animal-preprocessing',
    # Framework container
    framework_version='2.1',
    py_version='py310',
    # Compute: one CPU instance.
    # NOTE(review): an earlier comment quoted $0.23/hour for this instance type;
    # that figure was not verified - confirm against current SageMaker pricing.
    instance_count=1,
    instance_type='ml.m5.2xlarge',
)

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.


In [None]:
# processor.latest_job.stop()

###### RUN THIS CELL ONLY ONCE ######
# Submits preprocess.py as a SageMaker Processing job through `processor`.
# Only the script is passed: the input/output channels and CLI arguments below
# are commented out, so SageMaker copies no data in or out of the container.
# NOTE(review): presumably preprocess.py reads from and writes to S3 on its own - confirm.

processor.run(
    code='preprocess.py',
    # Disabled: download S3_INPUT_DATA into the container before the script starts.
    # inputs=[
    #     ProcessingInput(
    #         source=S3_INPUT_DATA,           # Your S3 data folder
    #         destination='/opt/ml/processing/input'  # Where it appears in container
    #     )
    # ],
    # Disabled: upload the container's output folder to S3_PREPROCESSED afterwards.
    # outputs=[
    #     ProcessingOutput(
    #         source='/opt/ml/processing/output',     # Where script saves results
    #         destination=S3_PREPROCESSED              # Upload results here
    #     )
    # ],
    # Disabled: command-line arguments forwarded to preprocess.py.
    # arguments=[
    #     '--input-dir', '/opt/ml/processing/input',
    #     '--output-dir', '/opt/ml/processing/output'
    # ]
)
# NOTE(review): this message is printed unconditionally; with `outputs` disabled it is
# only accurate if preprocess.py itself uploads to S3_PREPROCESSED - confirm.
print(f"Preprocessed data saved to: {S3_PREPROCESSED}")

INFO:sagemaker.processing:Uploaded None to s3://sagemaker-us-west-1-253490779227/animal-preprocessing-2025-12-24-23-45-53-325/source/sourcedir.tar.gz
INFO:sagemaker.processing:runproc.sh uploaded to s3://sagemaker-us-west-1-253490779227/animal-preprocessing-2025-12-24-23-45-53-325/source/runproc.sh
INFO:sagemaker:Creating processing-job with name animal-preprocessing-2025-12-24-23-45-53-325


...........Input Directory: {INPUT_DIR}
Output Directory: {OUTPUT_DIR}
Initial RAM usage: 142.44 MB
IMAGE PREPROCESSING FOR SAGEMAKER TRAINING
1. Processing TRAINING images...
Getting image list from s3://animal-classification-dss-works/data/train_features/
Found 16488 images
Getting image list from s3://animal-classification-dss-works/data/test_features/
Found 4464 images
Processing 16488 images...
Image data/train_features/ZJ000007.jpg is grayscale, converting to RGB
Image data/train_features/ZJ000032.jpg is grayscale, converting to RGB
Image data/train_features/ZJ000041.jpg is grayscale, converting to RGB
Image data/train_features/ZJ000042.jpg is grayscale, converting to RGB
Image data/train_features/ZJ000046.jpg is grayscale, converting to RGB
Image data/train_features/ZJ000051.jpg is grayscale, converting to RGB
Image data/train_features/ZJ000052.jpg is grayscale, converting to RGB
Image data/train_features/ZJ000059.jpg is grayscale, converting to RGB
Image data/train_features/ZJ0

In [58]:

# Training-job definition; the job is launched by estimator.fit() in the next cell.
estimator = PyTorch(
    # Identity and session
    role=ROLE_ARN,
    sagemaker_session=sagemaker_session,
    base_job_name='animal-classification-training',
    # Code to run: dss_train.py, packaged together with the current directory
    source_dir='.',
    entry_point='dss_train.py',
    # Framework container
    py_version='py310',
    framework_version='2.1',
    # Compute: one GPU instance (NVIDIA T4)
    instance_type='ml.g4dn.xlarge',
    instance_count=1,
    # Values handed to the training script as hyperparameters
    hyperparameters={
        'learning-rate': 0.001,
        'batch-size': 64,  # larger batch size, chosen for the GPU
        'epochs': 10,
    },
    # Optional: managed Spot training (uncomment all three to enable)
    # use_spot_instances=True,
    # max_wait=7200,  # 2 hours
    # max_run=3600,   # 1 hour
)
# estimator.latest_training_job.stop()


In [None]:
# ========================================
# RUN THIS CELL TO START TRAINING
# You'll see ALL logs stream in real-time
# ========================================

# print("Created PyTorch Estimator:")
# print(f"  Instance: ml.g4dn.xlarge (4 vCPUs, 16GB RAM, 1x NVIDIA T4 GPU)")
# print(f"  Cost: ~$0.94/hour (~$2 for full training)")
# print(f"  Expected time: 1-2 hours")
# print(f"  Speed: 10-50x faster than CPU!")

# # Start training with PREPROCESSED data
# print("\nüöÄ Starting training job with GPU...")
# print("All print statements from dss_train.py will stream below...")
# print("(Safe to close computer - logs saved to CloudWatch)")
# print("="*70)

# Launch the training job, passing the preprocessed S3 prefix as the 'training' input channel.
estimator.fit(
    {'training': S3_PREPROCESSED},
    wait=True,      # Wait for job to complete
    logs='All'      # Stream ALL logs to notebook (shows all print statements!)
)

# print("\n" + "="*70)
# print("‚úÖ PIPELINE COMPLETE!")
# print("="*70)
# print(f"\nJob Name: {estimator.latest_training_job.name}")
# print(f"Model saved to: {estimator.model_data}")
# print(f"\nTotal cost estimate:")
# print(f"  Preprocessing (ml.m5.xlarge): ~$0.50")
# print(f"  Training (ml.g4dn.xlarge):    ~$2.00")
# print(f"  TOTAL:                        ~$2.50")
# print(f"\nVs running on CPU locally: 20-30 hours!")
# print("\nüìã To view logs again later, run the cells below ‚¨áÔ∏è")
# print("="*70)




# 📋 View Training Logs Anytime

**Use the cells below to view logs after closing/reopening your computer**

- **Cell below**: List all recent training jobs
- **Next cell**: View complete logs from any job


In [None]:
# ========================================
# RUN THIS CELL TO LIST ALL RECENT JOBS
# ========================================

import boto3
from datetime import datetime

# Ask SageMaker for the ten most recently created training jobs in REGION.
sagemaker_client = boto3.client('sagemaker', region_name=REGION)

jobs = sagemaker_client.list_training_jobs(
    MaxResults=10,
    SortBy='CreationTime',
    SortOrder='Descending'
)

# Glyph shown next to each known status; built once instead of on every row.
STATUS_SYMBOLS = {
    'InProgress': '🔄',
    'Completed': '✅',
    'Failed': '❌',
    'Stopped': '⏸️',
}
UNKNOWN_STATUS_SYMBOL = '❓'
TABLE_WIDTH = 90  # wide enough for the index, name, created and status columns

print("=" * TABLE_WIDTH)
print("RECENT TRAINING JOBS")
print("=" * TABLE_WIDTH)
print(f"\n{'#':<4} {'Job Name':<50} {'Created':<17} {'Status':<15}")
print("-" * TABLE_WIDTH)

job_summaries = jobs['TrainingJobSummaries']
if not job_summaries:
    print("(no training jobs found)")

for i, job in enumerate(job_summaries):
    job_name = job['TrainingJobName']
    status = job['TrainingJobStatus']
    # The creation time was previously computed but never shown; it now has its own column.
    created = job['CreationTime'].strftime('%Y-%m-%d %H:%M')
    status_symbol = STATUS_SYMBOLS.get(status, UNKNOWN_STATUS_SYMBOL)

    print(f"{i:<4} {job_name:<50} {created:<17} {status_symbol} {status}")

print("\n" + "=" * TABLE_WIDTH)
print("💡 Copy a job name above and paste it in the next cell to view its logs")
print("=" * TABLE_WIDTH)


In [None]:
# ========================================
# RUN THIS CELL TO VIEW LOGS FROM A JOB
# Paste the job name from above, or it will use the most recent job
# ========================================

import boto3

def get_training_logs(job_name, region='us-west-1', max_lines=None):
    """
    Print the CloudWatch logs of a SageMaker training job.

    Every log stream in the '/aws/sagemaker/TrainingJobs' log group whose name
    starts with `job_name` is read from its head, and the messages are printed
    in stream-name order.

    Args:
        job_name: Name of the training job (used as the log-stream name prefix)
        region: AWS region the CloudWatch Logs client is created in
        max_lines: Maximum number of log lines to show (None or 0 = all)

    Returns:
        None. Output is printed. Errors raised while reading the logs are
        reported as a printed message instead of being re-raised.
    """
    logs_client = boto3.client('logs', region_name=region)

    log_group = '/aws/sagemaker/TrainingJobs'

    try:
        # Collect every stream of the job. describe_log_streams is paginated,
        # so keep following nextToken until the service stops returning one.
        stream_names = []
        stream_kwargs = {
            'logGroupName': log_group,
            'logStreamNamePrefix': job_name,
            'orderBy': 'LogStreamName',
        }
        while True:
            page = logs_client.describe_log_streams(**stream_kwargs)
            stream_names.extend(s['logStreamName'] for s in page['logStreams'])
            stream_token = page.get('nextToken')
            if not stream_token:
                break
            stream_kwargs['nextToken'] = stream_token

        if not stream_names:
            print(f"❌ No logs found for job: {job_name}")
            print("   Job might still be starting, or name is incorrect")
            return

        all_logs = []
        for stream_name in stream_names:
            event_kwargs = {
                'logGroupName': log_group,
                'logStreamName': stream_name,
                'startFromHead': True,
            }
            while True:
                response = logs_client.get_log_events(**event_kwargs)
                all_logs.extend(event['message'] for event in response['events'])

                # The end of a stream is signalled by getting back the token that
                # was just sent. An empty page is NOT an end marker: the service
                # may return no events while more remain, so stopping on an empty
                # page would silently truncate the logs.
                next_token = response.get('nextForwardToken')
                if not next_token or next_token == event_kwargs.get('nextToken'):
                    break
                event_kwargs['nextToken'] = next_token

        # Print logs
        print("=" * 80)
        print(f"📋 TRAINING LOGS: {job_name}")
        print("=" * 80)
        print(f"Total log lines: {len(all_logs)}")
        if max_lines:
            print(f"Showing first {max_lines} lines (set max_lines=None for all)")
        print("=" * 80)
        print()

        logs_to_show = all_logs[:max_lines] if max_lines else all_logs
        for log in logs_to_show:
            print(log)

        if max_lines and len(all_logs) > max_lines:
            print()
            print("=" * 80)
            print(f"⚠️  Showing {max_lines} of {len(all_logs)} total lines")
            print(f"   Run: get_training_logs('{job_name}', max_lines=None) to see all")
            print("=" * 80)

    except Exception as e:
        # Deliberately best-effort: this is an interactive helper, so report and return.
        print(f"❌ Error retrieving logs: {e}")
        print("   Make sure the job name is correct")


# ========================================
# PASTE JOB NAME HERE (or leave empty for most recent)
# ========================================
JOB_NAME = ''  # Example: 'animal-classification-training-2024-12-21-12-34-56-789'
LOG_PREVIEW_LINES = 500  # number of log lines printed by the preview below

# If no job name provided, fall back to the most recently created training job
if not JOB_NAME:
    sagemaker_client = boto3.client('sagemaker', region_name=REGION)
    jobs = sagemaker_client.list_training_jobs(
        MaxResults=1,
        SortBy='CreationTime',
        SortOrder='Descending',
    )
    job_summaries = jobs['TrainingJobSummaries']
    if job_summaries:
        JOB_NAME = job_summaries[0]['TrainingJobName']
        print(f"ℹ️  No job name specified, using most recent: {JOB_NAME}\n")
    else:
        print("❌ No training jobs found")

# View logs (preview only; JOB_NAME is still empty when no job could be resolved)
if JOB_NAME:
    get_training_logs(JOB_NAME, REGION, max_lines=LOG_PREVIEW_LINES)
    
# üí° To see ALL logs without limit:
# get_training_logs(JOB_NAME, REGION, max_lines=None)


---

## 📖 Quick Reference: Which Cells to Run

### **To Start Training (First Time):**
1. ✅ Run **Cells 0-3** (Setup & Config)
2. ✅ Run **Cells 4-5** (Preprocessing) - only needs to be run once
3. ✅ Run **Cell 6** (Create Estimator)
4. ✅ Run **Cell 7** (Start Training) - **SEE ALL LOGS HERE!**

### **To View Logs After Closing Computer:**
1. ✅ Run **Cells 0-3** (Setup & Config)
2. ✅ Run **Cell 9** (List all recent jobs)
3. ✅ Run **Cell 10** (View logs from selected job)

### **Tips:**
- **Cell 7** shows ALL your `print()` statements from `dss_train.py` in real-time
- Logs are automatically saved to CloudWatch (accessible anytime!)
- Safe to close computer during training - logs persist in CloudWatch
- **Cell 10** retrieves logs from CloudWatch whenever you need them

---
