In [1]:
import boto3
import sagemaker
from sagemaker.pytorch import PyTorch
from sagemaker.pytorch.processing import PyTorchProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput

sagemaker.config INFO - Not applying SDK defaults from location: /Library/Application Support/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /Users/amannindra/Library/Application Support/sagemaker/config.yaml


In [2]:
# Configuration
REGION = 'us-west-1'
ROLE_ARN = "arn:aws:iam::253490779227:role/service-role/AmazonSageMakerAdminIAMExecutionRole"
BUCKET = 'animal-classification-dss-works'
S3_INPUT_DATA = f's3://{BUCKET}/data/'
S3_PREPROCESSED = f's3://{BUCKET}/processed'


In [5]:
boto_session = boto3.Session(region_name=REGION)
sagemaker_session = sagemaker.Session(boto_session=boto_session)


In [4]:

# print(f"Region: {sagemaker_session.boto_region_name}")
# print(f"S3 Bucket: {BUCKET}")
# print(f"Input data: {S3_INPUT_DATA}")
# print(f"Preprocessed output: {S3_PREPROCESSED}")

Region: us-west-1
S3 Bucket: animal-classification-dss-works
Input data: s3://animal-classification-dss-works/data/
Preprocessed output: s3://animal-classification-dss-works/processed


In [6]:
# processor = PyTorchProcessor(
#     framework_version='2.1',
#     py_version='py310',
#     role=ROLE_ARN,
#     instance_type='ml.m5.2xlarge',  # CPU instance: $0.23/hour
#     instance_count=1,
#     sagemaker_session=sagemaker_session,
#     base_job_name='animal-preprocessing'
# )

In [14]:
# processor.latest_job.stop()

###### RUN THIS CELL ONLY ONCE ######

# processor.run(
#     code='preprocess.py',
#     # inputs=[
#     #     ProcessingInput(
#     #         source=S3_INPUT_DATA,           # Your S3 data folder
#     #         destination='/opt/ml/processing/input'  # Where it appears in container
#     #     )
#     # ],
#     # outputs=[
#     #     ProcessingOutput(
#     #         source='/opt/ml/processing/output',     # Where script saves results
#     #         destination=S3_PREPROCESSED              # Upload results here
#     #     )
#     # ],
#     # arguments=[
#     #     '--input-dir', '/opt/ml/processing/input',
#     #     '--output-dir', '/opt/ml/processing/output'
#     # ]
# )
# print(f"Preprocessed data saved to: {S3_PREPROCESSED}")

In [27]:
!python manage_sagemaker_jobs.py --action stop-all


SAGEMAKER RUNNING JOBS
(All job types: Training, Processing, Transform, HyperparameterTuning,
 AutoML, Compilation, EdgePackaging, Labeling, Monitoring)
Scanning job types...
Error listing compilation jobs with status InProgress: An error occurred (ValidationException) when calling the ListCompilationJobs operation: 1 validation error detected: Value 'InProgress' at 'statusEquals' failed to satisfy constraint: Member must satisfy enum value set: [STARTING, COMPLETED, STOPPED, INPROGRESS, STOPPING, FAILED]
Error listing compilation jobs with status Stopping: An error occurred (ValidationException) when calling the ListCompilationJobs operation: 1 validation error detected: Value 'Stopping' at 'statusEquals' failed to satisfy constraint: Member must satisfy enum value set: [STARTING, COMPLETED, STOPPED, INPROGRESS, STOPPING, FAILED]
Error listing edge packaging jobs with status InProgress: An error occurred (ThrottlingException) when calling the ListEdgePackagingJobs operation (reached 

In [31]:

estimator = PyTorch(
    entry_point='dss_train.py',
    source_dir='.',
    role=ROLE_ARN,
    framework_version='2.1',
    py_version='py310',
    instance_count=1,
    instance_type='ml.g4dn.xlarge',  # GPU instance with NVIDIA T4
    hyperparameters={
        'epochs': 5,
        'batch-size': 64,  # Larger batch size with GPU
        'learning-rate': 0.001,
        'use-cuda': True
    },
    sagemaker_session=sagemaker_session,
    base_job_name='animal-classification-training',
    # Use Spot instances to save 70% (optional)
    # use_spot_instances=True,
    # max_wait=7200,  # 2 hours
    max_run=3600,   # 1 hour
)
# estimator.latest_training_job.stop()


In [32]:
estimator.fit(
    {'training': S3_PREPROCESSED},
    wait=True,      # ✅ Wait for job to complete
    logs='All'      # ✅ Stream ALL logs to notebook (shows all print statements!)
)

INFO:sagemaker.telemetry.telemetry_logging:SageMaker Python SDK will collect telemetry to help us better understand our user's needs, diagnose issues, and deliver additional features.
To opt out of telemetry, please disable via TelemetryOptOut parameter in SDK defaults config. For more information, refer to https://sagemaker.readthedocs.io/en/stable/overview.html#configuring-and-using-defaults-with-the-sagemaker-python-sdk.
INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: animal-classification-training-2025-12-29-06-04-53-645


2025-12-29 06:05:04 Starting - Starting the training job
2025-12-29 06:05:04 Pending - Training job waiting for capacity.........
2025-12-29 06:06:35 Pending - Preparing the instances for training...
2025-12-29 06:07:00 Downloading - Downloading input data...
2025-12-29 06:07:50 Downloading - Downloading the training image.....................
2025-12-29 06:11:33 Training - Training image download completed. Training in progress.bash: cannot set terminal process group (-1): Inappropriate ioctl for device
bash: no job control in this shell
  "cipher": algorithms.TripleDES,
  "class": algorithms.TripleDES,
2025-12-29 06:11:39,943 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training
2025-12-29 06:11:39,962 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)
2025-12-29 06:11:39,973 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.
2025-12-29 06:11:39,979 sagemaker_pytorch_conta

In [26]:
print(estimator.model_data)


# 📋 View Training Logs Anytime

**Use the cells below to view logs after closing/reopening your computer**

- **Cell below**: List all recent training jobs
- **Next cell**: View complete logs from any job


In [8]:
# ========================================
# RUN THIS CELL TO LIST ALL RECENT JOBS
# ========================================

import boto3
from datetime import datetime

sagemaker_client = boto3.client('sagemaker', region_name=REGION)

# Get recent training jobs
jobs = sagemaker_client.list_training_jobs(
    MaxResults=10, 
    SortBy='CreationTime', 
    SortOrder='Descending'
)

print("=" * 80)
print("RECENT TRAINING JOBS")
print("=" * 80)
print(f"\n{'#':<4} {'Job Name':<50} {'Status':<15}")
print("-" * 80)

for i, job in enumerate(jobs['TrainingJobSummaries']):
    job_name = job['TrainingJobName']
    status = job['TrainingJobStatus']
    created = job['CreationTime'].strftime('%Y-%m-%d %H:%M')
    
    # Color code status
    status_symbol = {
        'InProgress': '🔄',
        'Completed': '✅',
        'Failed': '❌',
        'Stopped': '⏸️'
    }.get(status, '❓')
    
    print(f"{i:<4} {job_name:<50} {status_symbol} {status}")

print("\n" + "=" * 80)
print("💡 Copy a job name above and paste it in the next cell to view its logs")
print("=" * 80)


RECENT TRAINING JOBS

#    Job Name                                           Status         
--------------------------------------------------------------------------------
0    animal-classification-training-2025-12-28-01-43-07-396 ✅ Completed
1    animal-classification-training-2025-12-26-04-34-27-562 ❌ Failed
2    animal-classification-training-2025-12-26-04-25-52-717 ❌ Failed
3    animal-classification-training-2025-12-26-03-26-11-248 ❌ Failed
4    animal-classification-training-2025-12-26-03-04-56-625 ❌ Failed
5    animal-classification-training-2025-12-26-02-42-42-932 ❌ Failed
6    animal-classification-training-2025-12-26-02-21-39-164 ❌ Failed
7    animal-classification-training-2025-12-25-15-49-11-310 ❌ Failed
8    animal-classification-training-2025-12-25-15-32-38-363 ⏸️ Stopped
9    animal-classification-training-2025-12-25-04-41-08-092 ❌ Failed

💡 Copy a job name above and paste it in the next cell to view its logs


In [9]:
# ========================================
# RUN THIS CELL TO VIEW LOGS FROM A JOB
# Paste the job name from above, or it will use the most recent job
# ========================================

import boto3

def get_training_logs(job_name, region='us-west-1', max_lines=None):
    """
    Get all CloudWatch logs for a SageMaker training job
    
    Args:
        job_name: Name of the training job
        region: AWS region
        max_lines: Maximum number of log lines to show (None = all)
    """
    logs_client = boto3.client('logs', region_name=region)
    
    log_group = '/aws/sagemaker/TrainingJobs'
    
    try:
        # List all log streams for this job
        streams = logs_client.describe_log_streams(
            logGroupName=log_group,
            logStreamNamePrefix=job_name,
            orderBy='LogStreamName'
        )
        
        if not streams['logStreams']:
            print(f"❌ No logs found for job: {job_name}")
            print("   Job might still be starting, or name is incorrect")
            return
        
        all_logs = []
        for stream in streams['logStreams']:
            stream_name = stream['logStreamName']
            
            # Get all events from this stream
            next_token = None
            while True:
                kwargs = {
                    'logGroupName': log_group,
                    'logStreamName': stream_name,
                    'startFromHead': True
                }
                if next_token:
                    kwargs['nextToken'] = next_token
                
                response = logs_client.get_log_events(**kwargs)
                
                for event in response['events']:
                    all_logs.append(event['message'])
                
                # Check if there are more logs
                next_token = response.get('nextForwardToken')
                if not response['events'] or next_token == kwargs.get('nextToken'):
                    break
        
        # Print logs
        print("=" * 80)
        print(f"📋 TRAINING LOGS: {job_name}")
        print("=" * 80)
        print(f"Total log lines: {len(all_logs)}")
        if max_lines:
            print(f"Showing first {max_lines} lines (set max_lines=None for all)")
        print("=" * 80)
        print()
        
        logs_to_show = all_logs[:max_lines] if max_lines else all_logs
        for log in logs_to_show:
            print(log)
        
        if max_lines and len(all_logs) > max_lines:
            print()
            print("=" * 80)
            print(f"⚠️  Showing {max_lines} of {len(all_logs)} total lines")
            print(f"   Run: get_training_logs('{job_name}', max_lines=None) to see all")
            print("=" * 80)
        
    except Exception as e:
        print(f"❌ Error retrieving logs: {e}")
        print(f"   Make sure the job name is correct")


# ========================================
# PASTE JOB NAME HERE (or leave empty for most recent)
# ========================================
JOB_NAME = ''  # Example: 'animal-classification-training-2024-12-21-12-34-56-789'

# If no job name provided, use most recent
if not JOB_NAME:
    sagemaker_client = boto3.client('sagemaker', region_name=REGION)
    jobs = sagemaker_client.list_training_jobs(MaxResults=1, SortBy='CreationTime', SortOrder='Descending')
    if jobs['TrainingJobSummaries']:
        JOB_NAME = jobs['TrainingJobSummaries'][0]['TrainingJobName']
        print(f"ℹ️  No job name specified, using most recent: {JOB_NAME}\n")
    else:
        print("❌ No training jobs found")

# View logs (showing first 500 lines by default)
if JOB_NAME:
    get_training_logs(JOB_NAME, REGION, max_lines=500)
    
# 💡 To see ALL logs without limit:
# get_training_logs(JOB_NAME, REGION, max_lines=None)


ℹ️  No job name specified, using most recent: animal-classification-training-2025-12-28-01-43-07-396

📋 TRAINING LOGS: animal-classification-training-2025-12-28-01-43-07-396
Total log lines: 2583
Showing first 500 lines (set max_lines=None for all)

bash: cannot set terminal process group (-1): Inappropriate ioctl for device
bash: no job control in this shell
  "cipher": algorithms.TripleDES,
  "class": algorithms.TripleDES,
2025-12-28 01:48:40,138 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training
2025-12-28 01:48:40,157 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)
2025-12-28 01:48:40,168 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.
2025-12-28 01:48:40,174 sagemaker_pytorch_container.training INFO     Invoking user training script.
2025-12-28 01:48:42,199 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)
2025-12-28 01:48

---

## 📖 Quick Reference: Which Cells to Run

### **To Start Training (First Time):**
1. ✅ Run **Cells 0-3** (Setup & Config)
2. ✅ Run **Cell 4-5** (Preprocessing) - Only once needed
3. ✅ Run **Cell 6** (Create Estimator)
4. ✅ Run **Cell 7** (Start Training) - **SEE ALL LOGS HERE!**

### **To View Logs After Closing Computer:**
1. ✅ Run **Cells 0-3** (Setup & Config)
2. ✅ Run **Cell 9** (List all recent jobs)
3. ✅ Run **Cell 10** (View logs fromselected job)

### **Tips:**
- **Cell 7** shows ALL your `print()` statements from `dss_train.py` in real-time
- Logs are automatically saved to CloudWatch (accessible anytime!)
- Safe to close computer during training - logs persist in CloudWatch
- **Cell 10** retrieves logs from CloudWatch whenever you need them

---
