In [19]:
import boto3
import sagemaker

from sagemaker.pytorch import PyTorch
from sagemaker.pytorch.processing import PyTorchProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput

In [30]:
# Configuration
REGION = 'us-east-1'
ROLE_ARN = "arn:aws:iam::253490779227:role/service-role/AmazonSageMakerAdminIAMExecutionRole"
BUCKET = 'animal-classification-dss-works'
BUCKET_VIRGINIA = 'animal-classification-virgina'
S3_INPUT_DATA = f's3://{BUCKET}/data/'
S3_PREPROCESSED = f's3://{BUCKET_VIRGINIA}/processed'
S3_SHORT_PREPROCESSED = f's3://{BUCKET}/short_processed'


In [31]:
boto_session = boto3.Session(region_name=REGION)
sagemaker_session = sagemaker.Session(boto_session=boto_session)


INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials


In [32]:

# print(f"Region: {sagemaker_session.boto_region_name}")
# print(f"S3 Bucket: {BUCKET}")
# print(f"Input data: {S3_INPUT_DATA}")
# print(f"Preprocessed output: {S3_PREPROCESSED}")

In [33]:
# processor = PyTorchProcessor(
#     framework_version='2.1',
#     py_version='py310',
#     role=ROLE_ARN,
#     instance_type='ml.m5.2xlarge',  # CPU instance: $0.23/hour
#     instance_count=1,
#     sagemaker_session=sagemaker_session,
#     base_job_name='animal-preprocessing'
# )

In [34]:
# processor.latest_job.stop()

###### RUN THIS CELL ONLY ONCE ######

# processor.run(
#     code='preprocess.py',
#     # inputs=[
#     #     ProcessingInput(
#     #         source=S3_INPUT_DATA,           # Your S3 data folder
#     #         destination='/opt/ml/processing/input'  # Where it appears in container
#     #     )
#     # ],
#     # outputs=[
#     #     ProcessingOutput(
#     #         source='/opt/ml/processing/output',     # Where script saves results
#     #         destination=S3_PREPROCESSED              # Upload results here
#     #     )
#     # ],
#     # arguments=[
#     #     '--input-dir', '/opt/ml/processing/input',
#     #     '--output-dir', '/opt/ml/processing/output'
#     # ]
# )
# print(f"Preprocessed data saved to: {S3_PREPROCESSED}")

In [35]:
train_file = "dss_transformer_train.py"
part1_my_output_path = "s3://sagemaker-us-west-1-253490779227/animal-classification-resnet18"
gpu = "ml.g4dn.2xlarge"
estimator = PyTorch(
    entry_point=train_file,
    output_path=part1_my_output_path,
    dependencies=["requirements.txt"],
    role=ROLE_ARN,
    framework_version='2.1',
    py_version='py310',
    instance_count=1,
    instance_type=gpu,  # GPU instance with NVIDIA T4
    hyperparameters={
        'epochs': 10,
        'batch-size': 64,  
        'learning-rate': 1e-5, 
        'use-cuda': True, 
        "image-size": 224,
        "weight-decay": 1e-8,
        "stochastic-depth": 0.2,
        "num-cpu": 4,
        "save-file": "resnet18_model.pth"
    },
    sagemaker_session=sagemaker_session,
    base_job_name='resnet18',    
    # max_run=3600,   
)
# estimator.latest_training_job.stop()
print(estimator)

<sagemaker.pytorch.estimator.PyTorch object at 0x11f543550>


In [None]:
train_file = "dss_transformer_train.py"
part1_my_output_path = "s3://sagemaker-us-west-1-253490779227/animal-classification-models_part1"

estimator = PyTorch(
    entry_point=train_file,
    output_path=part1_my_output_path,
    dependencies=["requirements.txt"],
    role=ROLE_ARN,
    framework_version='2.1',
    py_version='py310',
    instance_count=1,
    instance_type='ml.g4dn.xlarge',  # GPU instance with NVIDIA T4
    hyperparameters={
        'epochs': 10,
        'batch-size': 64,  
        'learning-rate': 1e-5, 
        'use-cuda': True, 
        "image-size": 224,
        "weight-decay": 1e-8,
        "stochastic-depth": 0.2,
        "num-cpu": 4,
        "save-file": "final_swin_t_model_part1.pth"
    },
    sagemaker_session=sagemaker_session,
    base_job_name='swin-stage1',    
    # max_run=3600,   
)
# estimator.latest_training_job.stop()
print(estimator)

<sagemaker.pytorch.estimator.PyTorch object at 0x128411010>


In [15]:
print("We are training using this file: ", train_file, " with this data: ", S3_PREPROCESSED)

estimator.fit(
    {'training': S3_PREPROCESSED},
    wait=True,      # ✅ Wait for job to complete
    logs='All'      # ✅ Stream ALL logs to notebook (shows all print statements!)
)


We are training using this file:  dss_transformer_train.py  with this data:  s3://animal-classification-dss-works/processed


INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: swin-stage1-2026-01-06-11-36-42-433


In [None]:
print(estimator.model_data)


In [64]:
model_location = "s3://sagemaker-us-west-1-253490779227/animal-classification-training-2026-01-01-08-24-58-245/output/model.tar.gz"

part2_my_output_path = "s3://sagemaker-us-west-1-253490779227/animal-classification-models_part2"

estimator_2 = PyTorch(
    entry_point=train_file,
    source_dir='.',
    role=ROLE_ARN,
    framework_version='2.1',
    py_version='py310',
    output_path=part2_my_output_path,
    instance_count=1,
    instance_type='ml.g4dn.2xlarge',  
    model_data = model_location,    
    hyperparameters={
        'epochs': 10,                
        'batch-size': 16,            
        'learning-rate': 1e-5,       
        'use-cuda': True, 
        "image-size": 384,            
        "weight-decay": 1e-8,   
        "stochastic-depth": 0.1,      
        "num-cpu": 4,
        "save-file": "final_swin_t_model_384.pth"
    },
    sagemaker_session=sagemaker_session,
    base_job_name='swin-stage2-again',    
# Use Spot instances to save 70% (optional)
    # use_spot_instances=True,
    # max_wait=7200,  # 2 hours
    max_run=3600,   # 1 hour
    
)

In [66]:
estimator_2.fit(
    {'training': S3_PREPROCESSED,
     'model': model_location},
    wait=True,      # ✅ Wait for job to complete
    logs='All'      # ✅ Stream ALL logs to notebook (shows all print statements!)
)

INFO:sagemaker.telemetry.telemetry_logging:SageMaker Python SDK will collect telemetry to help us better understand our user's needs, diagnose issues, and deliver additional features.
To opt out of telemetry, please disable via TelemetryOptOut parameter in SDK defaults config. For more information, refer to https://sagemaker.readthedocs.io/en/stable/overview.html#configuring-and-using-defaults-with-the-sagemaker-python-sdk.
INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: swin-stage2-again-2026-01-01-11-33-54-364


2026-01-01 11:34:47 Starting - Starting the training job
2026-01-01 11:34:47 Pending - Training job waiting for capacity...
2026-01-01 11:35:01 Pending - Preparing the instances for training...
2026-01-01 11:35:58 Downloading - Downloading the training image.bash: cannot set terminal process group (-1): Inappropriate ioctl for device
bash: no job control in this shell
  "cipher": algorithms.TripleDES,
  "class": algorithms.TripleDES,
2026-01-01 11:39:06,462 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training
2026-01-01 11:39:06,480 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)
2026-01-01 11:39:06,490 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.
2026-01-01 11:39:06,492 sagemaker_pytorch_container.training INFO     Invoking user training script.
2026-01-01 11:39:08,154 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)
2026-01

In [36]:
train_file = "dss_new_train.py"

# 📁 Storage Configuration:
# - Models (.pth files) → output_path (automatically uploaded by SageMaker)
# - Metrics (JSON/CSV) → output_data_config (training logs and metrics)
model_output_path = "s3://amazon-sagemaker-253490779227-us-east-1-cnizlxa57lpnon/animal-classification-resnet18"
metrics_output_path = "s3://animal-classification-virgina/output"

estimator_3 = PyTorch(
    entry_point=train_file,
    dependencies=["requirements.txt"],
    # source_dir='.',
    role=ROLE_ARN,
    framework_version='2.1',
    py_version='py310',
    output_data_config={
        'S3OutputPath': metrics_output_path
    },
    
    instance_count=1,
    instance_type='ml.g4dn.2xlarge',  
    # model_data = model_location,    
    hyperparameters={
        'epochs': 10,                
        'batch-size': 16,            
        'learning-rate': 1e-5,       
        'use-cuda': True, 
        "image-size": 224,            
        "weight-decay": 1e-8,   
        "stochastic-depth": 0.1,      
        "num-cpu": 4,
        "save-file": "resnet18Wieghts.pth"
    },
    sagemaker_session=sagemaker_session,
    base_job_name='resnet18',    
    
)

print(f"✓ Estimator configured:")
print(f"  Models → {model_output_path}")
print(f"  Metrics → {metrics_output_path}")


✓ Estimator configured:
  Models → s3://amazon-sagemaker-253490779227-us-east-1-cnizlxa57lpnon/animal-classification-resnet18
  Metrics → s3://animal-classification-virgina/output


In [37]:
estimator_3.fit(
    {'training': S3_PREPROCESSED},
    wait=True,      # ✅ Wait for job to complete
    logs='All'      # ✅ Stream ALL logs to notebook (shows all print statements!)
)

INFO:sagemaker.telemetry.telemetry_logging:SageMaker Python SDK will collect telemetry to help us better understand our user's needs, diagnose issues, and deliver additional features.
To opt out of telemetry, please disable via TelemetryOptOut parameter in SDK defaults config. For more information, refer to https://sagemaker.readthedocs.io/en/stable/overview.html#configuring-and-using-defaults-with-the-sagemaker-python-sdk.
INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: resnet18-2026-01-26-02-48-02-281


2026-01-26 02:48:07 Starting - Starting the training job
2026-01-26 02:48:07 Pending - Training job waiting for capacity...
2026-01-26 02:48:41 Pending - Preparing the instances for training...
2026-01-26 02:49:08 Downloading - Downloading input data...
2026-01-26 02:49:38 Downloading - Downloading the training image..................
2026-01-26 02:52:25 Training - Training image download completed. Training in progress.bash: cannot set terminal process group (-1): Inappropriate ioctl for device
bash: no job control in this shell
  "cipher": algorithms.TripleDES,
  "class": algorithms.TripleDES,
2026-01-26 02:52:41,823 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training
2026-01-26 02:52:41,843 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)
2026-01-26 02:52:41,853 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.
2026-01-26 02:52:41,855 sagemaker_pytorch_container.trai

# 📋 View Training Logs Anytime

**Use the cells below to view logs after closing/reopening your computer**

- **Cell below**: List all recent training jobs
- **Next cell**: View complete logs from any job


In [1]:
# ========================================
# RUN THIS CELL TO LIST ALL RECENT JOBS
# ========================================

import boto3
from datetime import datetime

sagemaker_client = boto3.client('sagemaker', region_name=REGION)

# Get recent training jobs
jobs = sagemaker_client.list_training_jobs(
    MaxResults=10, 
    SortBy='CreationTime', 
    SortOrder='Descending'
)

print("=" * 80)
print("RECENT TRAINING JOBS")
print("=" * 80)
print(f"\n{'#':<4} {'Job Name':<50} {'Status':<15}")
print("-" * 80)

for i, job in enumerate(jobs['TrainingJobSummaries']):
    job_name = job['TrainingJobName']
    status = job['TrainingJobStatus']
    created = job['CreationTime'].strftime('%Y-%m-%d %H:%M')
    
    # Color code status
    status_symbol = {
        'InProgress': '🔄',
        'Completed': '✅',
        'Failed': '❌',
        'Stopped': '⏸️'
    }.get(status, '❓')
    
    print(f"{i:<4} {job_name:<50} {status_symbol} {status}")

print("\n" + "=" * 80)
print("💡 Copy a job name above and paste it in the next cell to view its logs")
print("=" * 80)


: 

In [9]:
# ========================================
# RUN THIS CELL TO VIEW LOGS FROM A JOB
# Paste the job name from above, or it will use the most recent job
# ========================================

import boto3

def get_training_logs(job_name, region='us-west-1', max_lines=None):
    """
    Get all CloudWatch logs for a SageMaker training job
    
    Args:
        job_name: Name of the training job
        region: AWS region
        max_lines: Maximum number of log lines to show (None = all)
    """
    logs_client = boto3.client('logs', region_name=region)
    
    log_group = '/aws/sagemaker/TrainingJobs'
    
    try:
        # List all log streams for this job
        streams = logs_client.describe_log_streams(
            logGroupName=log_group,
            logStreamNamePrefix=job_name,
            orderBy='LogStreamName'
        )
        
        if not streams['logStreams']:
            print(f"❌ No logs found for job: {job_name}")
            print("   Job might still be starting, or name is incorrect")
            return
        
        all_logs = []
        for stream in streams['logStreams']:
            stream_name = stream['logStreamName']
            
            # Get all events from this stream
            next_token = None
            while True:
                kwargs = {
                    'logGroupName': log_group,
                    'logStreamName': stream_name,
                    'startFromHead': True
                }
                if next_token:
                    kwargs['nextToken'] = next_token
                
                response = logs_client.get_log_events(**kwargs)
                
                for event in response['events']:
                    all_logs.append(event['message'])
                
                # Check if there are more logs
                next_token = response.get('nextForwardToken')
                if not response['events'] or next_token == kwargs.get('nextToken'):
                    break
        
        # Print logs
        print("=" * 80)
        print(f"📋 TRAINING LOGS: {job_name}")
        print("=" * 80)
        print(f"Total log lines: {len(all_logs)}")
        if max_lines:
            print(f"Showing first {max_lines} lines (set max_lines=None for all)")
        print("=" * 80)
        print()
        
        logs_to_show = all_logs[:max_lines] if max_lines else all_logs
        for log in logs_to_show:
            print(log)
        
        if max_lines and len(all_logs) > max_lines:
            print()
            print("=" * 80)
            print(f"⚠️  Showing {max_lines} of {len(all_logs)} total lines")
            print(f"   Run: get_training_logs('{job_name}', max_lines=None) to see all")
            print("=" * 80)
        
    except Exception as e:
        print(f"❌ Error retrieving logs: {e}")
        print(f"   Make sure the job name is correct")


# ========================================
# PASTE JOB NAME HERE (or leave empty for most recent)
# ========================================
JOB_NAME = ''  # Example: 'animal-classification-training-2024-12-21-12-34-56-789'

# If no job name provided, use most recent
if not JOB_NAME:
    sagemaker_client = boto3.client('sagemaker', region_name=REGION)
    jobs = sagemaker_client.list_training_jobs(MaxResults=1, SortBy='CreationTime', SortOrder='Descending')
    if jobs['TrainingJobSummaries']:
        JOB_NAME = jobs['TrainingJobSummaries'][0]['TrainingJobName']
        print(f"ℹ️  No job name specified, using most recent: {JOB_NAME}\n")
    else:
        print("❌ No training jobs found")

# View logs (showing first 500 lines by default)
if JOB_NAME:
    get_training_logs(JOB_NAME, REGION, max_lines=500)
    
# 💡 To see ALL logs without limit:
# get_training_logs(JOB_NAME, REGION, max_lines=None)


ℹ️  No job name specified, using most recent: animal-classification-training-2025-12-28-01-43-07-396

📋 TRAINING LOGS: animal-classification-training-2025-12-28-01-43-07-396
Total log lines: 2583
Showing first 500 lines (set max_lines=None for all)

bash: cannot set terminal process group (-1): Inappropriate ioctl for device
bash: no job control in this shell
  "cipher": algorithms.TripleDES,
  "class": algorithms.TripleDES,
2025-12-28 01:48:40,138 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training
2025-12-28 01:48:40,157 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)
2025-12-28 01:48:40,168 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.
2025-12-28 01:48:40,174 sagemaker_pytorch_container.training INFO     Invoking user training script.
2025-12-28 01:48:42,199 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)
2025-12-28 01:48

---

## 📖 Quick Reference: Which Cells to Run

### **To Start Training (First Time):**
1. ✅ Run **Cells 0-3** (Setup & Config)
2. ✅ Run **Cell 4-5** (Preprocessing) - Only once needed
3. ✅ Run **Cell 6** (Create Estimator)
4. ✅ Run **Cell 7** (Start Training) - **SEE ALL LOGS HERE!**

### **To View Logs After Closing Computer:**
1. ✅ Run **Cells 0-3** (Setup & Config)
2. ✅ Run **Cell 9** (List all recent jobs)
3. ✅ Run **Cell 10** (View logs fromselected job)

### **Tips:**
- **Cell 7** shows ALL your `print()` statements from `dss_train.py` in real-time
- Logs are automatically saved to CloudWatch (accessible anytime!)
- Safe to close computer during training - logs persist in CloudWatch
- **Cell 10** retrieves logs from CloudWatch whenever you need them

---
