# 1. SageMaker Training for Diffusion model
---

본 모듈에서는 Amazon SageMaker API를 효과적으로 이용하기 위해 multi-GPU distributed 학습을 위한 PyTorch 프레임워크 자체 구현만으로 모델 훈련을 수행해 봅니다.

In [1]:
install_needed = True  # should only be True once
# install_needed = False

In [2]:
import sys
import IPython

# One-time environment bootstrap: when install_needed is True, upgrade the
# SageMaker-related packages, run the project's local-mode setup script and
# then ask the Jupyter kernel to shut down/restart.
if install_needed:
    print("installing deps and restarting kernel")
#     !{sys.executable} -m pip install -U split-folders tqdm albumentations crc32c wget
    # {sys.executable} pins pip to the interpreter that runs this kernel.
    !{sys.executable} -m pip install 'sagemaker[local]' --upgrade
    !{sys.executable} -m pip install -U smdebug sagemaker-experiments
    !{sys.executable} -m pip install -U sagemaker
    # Project-local script; judging by the captured output it checks
    # nvidia-docker, restarts docker and verifies routing — TODO confirm.
    !/bin/bash ./local/local_mode_setup.sh
    # Restart request (per the message printed above); presumably so the
    # freshly installed packages are importable — set install_needed to False
    # afterwards to avoid repeating this.
    IPython.Application.instance().kernel.do_shutdown(True)

installing deps and restarting kernel
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/pytorch_p36/bin/python -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/pytorch_p36/bin/python -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/pytorch_p36/bin/python -m pip install --upgrade pip' command.[0m
nvidia-docker2 already installed. We are good to go!
Stopping docker: [60G[[0;32m  OK  [0;39m]
Starting docker:	.[60G[[0;32m  OK  [0;39m]
SageMaker instance route table setup is ok. We are good to go.
SageMaker instance routing for Docker is ok. We are good to go!


## 2. 환경 설정

<p>SageMaker 학습에 필요한 기본적인 package를 import 합니다. </p>
<p>boto3는 HTTP API 호출을 숨기는 편리한 추상화 모델을 가지고 있고, Amazon EC2 인스턴스 및 S3 버킷과 같은 AWS 리소스와 동작하는 파이썬 클래스를 제공합니다. </p>
<p>SageMaker Python SDK는 Amazon SageMaker에서 기계 학습 모델을 학습 및 배포하기 위한 오픈 소스 라이브러리입니다.</p>

In [1]:
import joblib
import matplotlib.pyplot as plt
import sagemaker
# import splitfolders

import datetime
import glob
import os
import time
import warnings

from smexperiments.experiment import Experiment
from smexperiments.trial import Trial

# import wget
# import tarfile
import shutil

import boto3
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torchvision

# from tqdm import tqdm
from time import strftime
from PIL import Image
from torch.utils.data import Dataset
from torchvision import datasets, transforms

from sagemaker import get_execution_role
from sagemaker.pytorch import PyTorch

from sagemaker.debugger import (Rule,
                                rule_configs,
                                ProfilerConfig, 
                                FrameworkProfile, 
                                DetailedProfilingConfig, 
                                DataloaderProfilingConfig, 
                                PythonProfilingConfig)

warnings.filterwarnings('ignore')
%config InlineBackend.figure_format = 'retina'

In [2]:
role = get_execution_role()

In [3]:
sagemaker.__version__

'2.69.0'

In [4]:
def create_experiment(experiment_name):
    """Return the SageMaker Experiment called ``experiment_name``, creating it if needed.

    The experiment is first looked up with ``Experiment.load``. If the service
    rejects the lookup (a botocore ``ClientError``, e.g. because the experiment
    does not exist yet), a new experiment with the ``multigpu`` / ``multinode``
    tags is created instead.

    Args:
        experiment_name: Name of the experiment to load or create.

    Returns:
        The loaded or newly created ``Experiment`` object. Earlier versions
        returned ``None``; callers that ignore the result are unaffected.

    Raises:
        botocore.exceptions.ClientError: If creating the experiment fails.
    """
    # Local import keeps the function usable without touching the notebook's
    # import cell; botocore is always installed alongside boto3.
    from botocore.exceptions import ClientError

    try:
        return Experiment.load(experiment_name)
    except ClientError:
        # Only service-side failures fall through to creation. A bare
        # ``except:`` would also swallow KeyboardInterrupt and programming
        # errors, hiding the real problem.
        pass

    experiment_tags = [
        {'Key': 'multigpu', 'Value': 'yes'},
        {'Key': 'multinode', 'Value': 'yes'},
    ]
    return Experiment.create(experiment_name=experiment_name,
                             tags=experiment_tags)

In [5]:
def create_trial(experiment_name, set_param, i_type, i_cnt, spot):
    """Create a SageMaker Trial and return its name for use as the job name.

    The trial name has the form
    ``{experiment_name}-{instance tag}-{i_cnt}-{algo}-{s|d}-{MMDD}-{HHMM}{epoch}``.

    Args:
        experiment_name: Experiment the trial is attached to.
        set_param: Hyperparameter dict; a truthy ``'sagemakerdp'`` entry
            selects the ``sdp`` tag, anything else (including a missing key)
            selects ``ds``.
        i_type: Instance type string; known p3/p3dn/p4d types get a short tag,
            everything else is tagged ``test``.
        i_cnt: Instance count, embedded in the name.
        spot: Truthy for spot training (``s``), falsy for on-demand (``d``).

    Returns:
        The created trial's name as a string.
    """
    import time as _time

    # The original format string ended in "%s", a glibc-only strftime
    # extension (seconds since the epoch) that Python does not document and
    # that is unavailable on other platforms. Build the same suffix
    # explicitly from a single timestamp.
    now = _time.time()
    create_date = _time.strftime("%m%d-%H%M", _time.localtime(now)) + str(int(now))

    # The model-parallel ('smp') variant is currently disabled in this notebook.
    algo = 'sdp' if set_param.get('sagemakerdp') else 'ds'
    purchase_tag = 's' if spot else 'd'

    instance_tags = {
        'ml.p3.16xlarge': 'p3',
        'ml.p3dn.24xlarge': 'p3dn',
        'ml.p4d.24xlarge': 'p4d',
    }
    i_tag = instance_tags.get(i_type, 'test')

    trial = "-".join([i_tag, str(i_cnt), algo, purchase_tag])

    sm_trial = Trial.create(trial_name=f'{experiment_name}-{trial}-{create_date}',
                            experiment_name=experiment_name)
    return str(sm_trial.trial_name)

In [6]:
# S3 locations used by the training job: uploaded source code, job output and
# the log prefix handed to the training script as a hyperparameter.
bucket = 'diffusion-sagemaker-211011'
code_location = 's3://{}/sm_codes'.format(bucket)
output_path = 's3://{}/poc_diffusion/output'.format(bucket)
# A scheme-less variant ('<bucket>/tf_logs') was tried earlier and is disabled.
s3_log_path = 's3://{}/tf_logs'.format(bucket)

In [7]:
# Metric name -> regex pairs passed to the estimator as metric_definitions.
# NOTE(review): the two patterns differ in spacing around '-'; confirm both
# match the training script's actual log format.
metric_definitions = [
    dict(Name=metric_name, Regex=pattern)
    for metric_name, pattern in (
        ('train:lr', 'lr - (.*?),'),
        ('train:Loss', 'loss -(.*?),'),
    )
]

In [8]:
from sagemaker.debugger import Rule, ProfilerRule, rule_configs

# Built-in SageMaker Debugger rules followed by the profiler report rule.
# The estimator cell currently has `rules=rules` commented out, so this list
# is only prepared, not applied.
rules = [Rule.sagemaker(rule_configs.loss_not_decreasing())]
rules.append(Rule.sagemaker(rule_configs.overfit()))
rules.append(ProfilerRule.sagemaker(rule_configs.ProfilerReport()))

In [233]:
# hyperparameters = {
#     'schedule_sampler' : 'uniform',
#     'lr': 1e-4,
#     'weight_decay': 0.0,
#     'lr_anneal_steps' : 100,
#     'batch_size' : 32,
#     'microbatch' : -1,
#     'ema_rate' : '0.9999',
#     'log_interval' : 5,
#     'save_interval' : 5,
# #     'resume_checkpoint' : "/opt/ml/code/resume_ckt/model000100.pt",
#     'use_fp16': True,
#     'fp16_scale_growth' : 1e-3,
#     's3_log_path' : s3_log_path,
#     'sagemakerdp' : True,
#     }
#         "num_channels": 256,
#         "num_res_blocks": 3,
#         "lr": 1e-05,
#         "num_heads": 4,
#         "channel_mult": "1,1,2,4,4",
#         "learn_sigma": true,
#         "diffusion_steps": 1000,
#         "batch_size": 16,
#         "class_cond": false,
#         "weight_decay": 0.0,
#         "resblock_updown": true,
#         "image_size": 128,
#         "noise_schedule": "linear",
#         "use_scale_shift_norm": true,
#         "log_interval": 10,
#         "ema_rate": "0.999",
#         "save_interval": 2000,
#         "attention_resolutions": "32,16,8",
#         "sagemakerdp": true,
#         "s3_log_path": "s3://lgaivision-diffusion/tf_logs",
#         "use_fp16": true,
#         "resume_checkpoint": "model078000.pt"

# Hyperparameters forwarded to the training entry point. 'sagemakerdp' is also
# read by this notebook to pick the distribution strategy and the trial tag.
# NOTE(review): channel_mult contains the fractional value '1.5' — confirm the
# training script accepts non-integer multipliers.
hyperparameters = dict(
    attention_resolutions='32,16,8',
    class_cond=True,
    diffusion_steps=1000,
    image_size=128,
    channel_mult='1,1.5,2,4,5',
    learn_sigma=True,
    noise_schedule='linear',
    num_channels=256,
    num_heads=4,
    num_res_blocks=3,
    resblock_updown=True,
    use_fp16=True,
    use_scale_shift_norm=True,
    # schedule_sampler='uniform',
    lr=1e-4,
    weight_decay=1e-4,
    lr_anneal_steps=2000,
    batch_size=8,
    # microbatch=-1,
    ema_rate='0.9999',
    log_interval=10,
    save_interval=200,
    # resume_checkpoint='model259000.pt',
    # fp16_scale_growth=1e-3,
    s3_log_path=s3_log_path,  # S3 prefix added so the script can write logs
    sagemakerdp=True,
    # eps=1e-8,
)

# mp_parameters = {
#         'num_microbatches': 16,
#         'num_partitions' : 4,
#         'placement_strategy': 'cluster', # cluster , spread
#         'pipeline': 'interleaved',
#         'optimize': 'speed',
#         'memory_weight': 0.2,
#         'ddp': True,
# }

# Run settings for the remote (2 x ml.p4d.24xlarge) configuration.
# instance_type choices used in this notebook:
#   'ml.p3.16xlarge', 'ml.p3dn.24xlarge', 'ml.p4d.24xlarge', 'local_gpu'
experiment_name = 'diffusion-poc-exp2'
instance_type, instance_count = 'ml.p4d.24xlarge', 2
do_spot_training = False
max_run = 60 * 60   # value passed to the estimator's max_run
max_wait = None     # replaced by max_run later when spot training is enabled


# !gdown https://drive.google.com/uc?id=1vF8Ht0VThpobtmShD52_INhpIgy6eEXq
# !gdown https://drive.google.com/uc?id=1kaIqFwTLD7Ml3ib9NQpjoUSD4FUD21-I

# !rm -rf dataset
# !mkdir dataset
# !unzip birds.zip -d dataset/
# !tar zxvf CUB_200_2011.tgz -C dataset/

In [301]:
# hyperparameters = {
#     'schedule_sampler' : 'uniform',
#     'lr': 1e-4,
#     'weight_decay': 0.0,
#     'lr_anneal_steps' : 100,
#     'batch_size' : 32,
#     'microbatch' : -1,
#     'ema_rate' : '0.9999',
#     'log_interval' : 5,
#     'save_interval' : 5,
# #     'resume_checkpoint' : "/opt/ml/code/resume_ckt/model000100.pt",
#     'use_fp16': True,
#     'fp16_scale_growth' : 1e-3,
#     's3_log_path' : s3_log_path,
#     'sagemakerdp' : True,
#     }


# Hyperparameters for the small local-mode run (32px images, no class
# conditioning). With 'sagemakerdp' False the notebook falls back to the plain
# MPI distribution.
hyperparameters = dict(
    attention_resolutions='32,16,8',
    class_cond=False,
    diffusion_steps=1000,
    image_size=32,
    channel_mult='1,1,2,4',
    learn_sigma=True,
    noise_schedule='linear',
    num_channels=256,
    num_heads=1,
    num_res_blocks=1,
    resblock_updown=True,
    use_fp16=True,
    use_scale_shift_norm=True,
    # schedule_sampler='uniform',
    lr=1e-4,
    weight_decay=0.0,
    lr_anneal_steps=2000,
    batch_size=2,
    # microbatch=-1,
    ema_rate='0.9999',
    log_interval=10,
    save_interval=200,
    # resume_checkpoint="/opt/ml/code/resume_ckt/model000100.pt",
    # fp16_scale_growth=1e-3,
    s3_log_path=s3_log_path,  # S3 prefix added so the script can write logs
    sagemakerdp=False,
    # eps=1e-8,
)

# mp_parameters = {
#         'num_microbatches': 16,
#         'num_partitions' : 4,
#         'placement_strategy': 'cluster', # cluster , spread
#         'pipeline': 'interleaved',
#         'optimize': 'speed',
#         'memory_weight': 0.2,
#         'ddp': True,
# }

# Run settings for the local-mode configuration.
# instance_type choices used in this notebook:
#   'ml.p3.16xlarge', 'ml.p3dn.24xlarge', 'ml.p4d.24xlarge', 'local_gpu'
# (switch to one of the ml.* types to run remotely)
experiment_name = 'diffusion-poc-exp2'
instance_type, instance_count = 'local_gpu', 1
do_spot_training = False
max_run = 60 * 60   # value passed to the estimator's max_run
max_wait = None     # replaced by max_run later when spot training is enabled


# !gdown https://drive.google.com/uc?id=1vF8Ht0VThpobtmShD52_INhpIgy6eEXq
# !gdown https://drive.google.com/uc?id=1kaIqFwTLD7Ml3ib9NQpjoUSD4FUD21-I

# !rm -rf dataset
# !mkdir dataset
# !unzip birds.zip -d dataset/
# !tar zxvf CUB_200_2011.tgz -C dataset/

In [302]:
# Choose the SageMaker session, dataset location, source directory and
# checkpoint prefix depending on whether the job runs remotely or in local mode.
if instance_type != 'local_gpu':
    # Remote training: regular session, dataset on S3, checkpoints to S3.
    sess = boto3.Session()
    sagemaker_session = sagemaker.Session()
    sm = sess.client('sagemaker')
    s3_data_path = 's3://dataset-us-west-2-cyj/cifar10'
    source_dir = 'scripts'
    checkpoint_s3_bucket = 's3://{}/checkpoints'.format(bucket)
else:
    # Local mode: data is read from the notebook instance's file system and
    # no checkpoint S3 location is configured.
    from pathlib import Path
    from sagemaker.local import LocalSession

    sagemaker_session = LocalSession()
    sagemaker_session.config = {'local': {'local_code': True}}
    s3_data_path = 'file:///home/ec2-user/SageMaker/improved-diffusion-sagemaker/datasets/cifar10'
    source_dir = '{}/scripts'.format(Path.cwd())
    checkpoint_s3_bucket = None

In [303]:
# Derive the estimator's `distribution` argument from the hyperparameters and
# print a summary of the launch configuration.
image_uri = None
train_job_name = 'smp-dist'

if hyperparameters['sagemakerdp']:
    # SageMaker distributed data parallel
    distribution = {"smdistributed": {"dataparallel": {"enabled": True}}}
else:
    # Plain MPI launch. Disabled extras:
    #   "processes_per_host": 8, # Pick your processes_per_host
    #   "custom_mpi_options": "-verbose -x orte_base_help_aggregate=0 "
    distribution = {"mpi": {"enabled": True}}

# Disabled model-parallel branch (was an `elif hyperparameters['sagemakermp']`):
#     distribution['smdistributed'] = { "modelparallel": {
#         "enabled": True,
#         "parameters": {
#             "partitions": mp_parameters['num_partitions'],
#             "microbatches": mp_parameters['num_microbatches'],
#             "placement_strategy": mp_parameters['placement_strategy'],
#             "pipeline": mp_parameters['pipeline'],
#             "optimize": mp_parameters['optimize'],
#             "memory_weight": mp_parameters['memory_weight'],
#             "ddp": mp_parameters['ddp'],
#         }}}
#     distribution["mpi"] = {
#         "enabled": True,
#         "processes_per_host": 8, # Pick your processes_per_host
#         "custom_mpi_options": "-verbose -x orte_base_help_aggregate=0 -x FI_EFA_USE_DEVICE_RDMA=1 -x FI_PROVIDER=efa " #  -x SMP_SKIP_GRAPH_VALIDATION=1
#     }

# Spot jobs reuse max_run as max_wait; otherwise max_wait is left untouched.
max_wait = max_run if do_spot_training else max_wait

summary_template = "train_job_name : {} \ntrain_instance_type : {} \ntrain_instance_count : {} \nimage_uri : {} \ndistribution : {}"
print(summary_template.format(train_job_name, instance_type, instance_count, image_uri, distribution))

train_job_name : smp-dist 
train_instance_type : local_gpu 
train_instance_count : 1 
image_uri : None 
distribution : {'mpi': {'enabled': True}}


In [304]:
# Custom training image. Other candidates tried in the same ECR registry:
#   diffusion-sagemaker-smddp:smddp-1.2.2-pt-1.9.0
#   pytorch-training:pytorch-21-10
#   diffusion-sagemaker-smddp:pt110-21-10
# Note: the estimator cell currently has `image_uri=image_uri` commented out.
image_uri = (
    '322537213286.dkr.ecr.us-west-2.amazonaws.com/'
    'diffusion-sagemaker-smddp:pt110-smddp122'
)

In [305]:
# all input configurations, parameters, and metrics specified in estimator 
# definition are automatically tracked
# Build the PyTorch estimator. All input configurations, parameters and
# metrics specified here are automatically tracked once the estimator is
# fitted with an experiment_config.
estimator = PyTorch(
    entry_point='image_train.py',
    source_dir=source_dir,
    role=role,
    sagemaker_session=sagemaker_session,
    framework_version='1.9',
    py_version='py38',
#     image_uri=image_uri,
    instance_count=instance_count,
    instance_type=instance_type,
    volume_size=256,
    code_location=code_location,
    output_path=output_path,
    hyperparameters=hyperparameters,
    distribution=distribution,
    disable_profiler=True,
    debugger_hook_config=False,
    metric_definitions=metric_definitions,
#     rules=rules,
    max_run=max_run,
    use_spot_instances=do_spot_training,  # use spot instances when enabled
    max_wait=max_wait,
    subnets=['subnet-02e36c042e58264e6'],   # alternatives noted by the author: subnet-05c77affac40aa7f3 (2b), subnet-02e36c042e58264e6 (2c)
    security_group_ids=['sg-0bc738570daec9015'],
    checkpoint_s3_uri=checkpoint_s3_bucket,
    # `input_mode` is the SDK's parameter name. The previous
    # `TrainingInputMode=` keyword is not an estimator argument and appears
    # to have been silently ignored. 'File' is the default; 'FastFile' is
    # the alternative noted by the author.
    input_mode='File',
#     checkpoint_local_path=f'/opt/ml/checkpoints',
    max_retry_attempts=30
)

#### lustre preload

In [306]:
# ## https://docs.aws.amazon.com/fsx/latest/LustreGuide/preload-file-contents-hsm.html
# # sudo lfs hsm_restore path/to/file
# # sudo lfs hsm_action path/to/file
# !find /home/ec2-user/SageMaker/dstaset-2a -type f -print0 | xargs -0 -n 1 sudo lfs hsm_restore

In [307]:
# Configure FSx Input for your SageMaker Training job

from sagemaker.inputs import FileSystemInput

# FSx for Lustre channel pointing at the BIRDS dataset directory.
# Other identifiers left in the original notes: paths '/5n6znbmv', 'g4ljfbmv';
# file systems fs-0849611d06d289065, 063be12d6ca6d7862.
file_system_type, file_system_access_mode = 'FSxLustre', 'rw'
file_system_id = 'fs-0cd6d9b6c3c7f614e'
file_system_directory_path = '/hlz2pbmv/BIRDS'

train_fs = FileSystemInput(
    directory_path=file_system_directory_path,
    file_system_access_mode=file_system_access_mode,
    file_system_id=file_system_id,
    file_system_type=file_system_type,
)

In [308]:
# Configure FSx Input for your SageMaker Training job

from sagemaker.inputs import FileSystemInput

# FSx for Lustre channel pointing at the cifar10 dataset directory; running
# this cell replaces any previously defined train_fs.
# Other identifiers left in the original notes: paths '/5n6znbmv', 'g4ljfbmv';
# file systems fs-0849611d06d289065, 063be12d6ca6d7862.
file_system_type, file_system_access_mode = 'FSxLustre', 'rw'
file_system_id = 'fs-0ac78e311f71fd34a'
file_system_directory_path = '/bwh3hbmv/cifar10'

train_fs = FileSystemInput(
    directory_path=file_system_directory_path,
    file_system_access_mode=file_system_access_mode,
    file_system_id=file_system_id,
    file_system_type=file_system_type,
)

In [309]:
# # Configure FSx Input for your SageMaker Training job - cifar10-2

# from sagemaker.inputs import FileSystemInput

# file_system_directory_path= '/pwa3hbmv/cifar10'  # '/5n6znbmv'    g4ljfbmv
 
# file_system_id='fs-0a50fc761273ae496'  # fs-0849611d06d289065  063be12d6ca6d7862

# file_system_access_mode='rw'
# file_system_type='FSxLustre'
# train_fs = FileSystemInput(file_system_id=file_system_id,
#                                     file_system_type=file_system_type,
#                                     directory_path=file_system_directory_path,
#                                     file_system_access_mode=file_system_access_mode)

In [310]:
# Training-channel input. Both the local and the remote configuration read the
# dataset from s3_data_path: the former `inputs = train_fs` assignment in the
# remote branch was overwritten on the very next line, so the FSx channel was
# never actually used. To train from FSx for Lustre on remote instances,
# assign `inputs = train_fs` here instead.
inputs = s3_data_path

In [311]:
# %%time
# create_experiment(experiment_name)
# job_name = create_trial(experiment_name, hyperparameters, instance_type, instance_count, do_spot_training)

# if not instance_type =='local_gpu':
#     target_resume_checkpoint=checkpoint_s3_bucket +"/"+ job_name  ## model000001.pt만 resume_checkpoint에 추가
#     estimator.checkpoint_s3_uri=target_resume_checkpoint

# ## checkpoint가 들어있는 S3 위치 --> 새로운 checkpoint S3로 복제
# !aws s3 cp 's3://diffusion-sagemaker-211011/checkpoints/diffusion-poc-exp2-p4d-2-sdp-d-1023-10211634984488/model000365.pt' ${target_resume_checkpoint}/model.pt

# # Now associate the estimator with the Experiment and Trial
# estimator.fit(
#     inputs={'training': inputs}, 
#     job_name=job_name,
#     experiment_config={
#       'TrialName': job_name,
#       'TrialComponentDisplayName': job_name,
#     },
#     wait=False,
# )

In [312]:
checkpoint_jobname = 'diffusion-poc-exp2-p4d-2-sdp-d-1104-01181635988690'
checkpoint_id='000031'

In [313]:
# !aws s3 rm s3://diffusion-sagemaker-211011/resume_checkpoint/ --recursive
# !aws s3 cp s3://diffusion-sagemaker-211011/checkpoints/{checkpoint_jobname}/model{checkpoint_id}.pt s3://diffusion-sagemaker-211011/resume_checkpoint/
# !aws s3 cp s3://diffusion-sagemaker-211011/checkpoints/{checkpoint_jobname}/ema_0.9999_{checkpoint_id}.pt s3://diffusion-sagemaker-211011/resume_checkpoint/
# !aws s3 cp s3://diffusion-sagemaker-211011/checkpoints/{checkpoint_jobname}/opt{checkpoint_id}.pt s3://diffusion-sagemaker-211011/resume_checkpoint/                

In [314]:
# hyperparameters['resume_checkpoint']=f"model{checkpoint_id}.pt"

In [315]:
%%time
# Create (or load) the experiment, create a trial whose name doubles as the
# training job name, then launch the job without blocking the notebook.
create_experiment(experiment_name)
job_name = create_trial(experiment_name, hyperparameters, instance_type, instance_count, do_spot_training)

# For remote runs, give each job its own checkpoint prefix under the shared
# checkpoint location.
if not instance_type =='local_gpu':
    target_resume_checkpoint=checkpoint_s3_bucket +"/"+ job_name  ## add only model000001.pt to resume_checkpoint
    estimator.checkpoint_s3_uri=target_resume_checkpoint
    
# Only referenced by the disabled 'checkpoint' channel below.
inputs2 = 's3://diffusion-sagemaker-211011/resume_checkpoint/'
# Now associate the estimator with the Experiment and Trial
estimator.fit(
    inputs={'training': inputs}, 
#     inputs={'training': inputs, 'checkpoint' : inputs2}, 
    job_name=job_name,
    experiment_config={
      'TrialName': job_name,
      'TrialComponentDisplayName': job_name,
    },
    # Do not wait for the job here; logs are followed in a later cell.
    wait=False,
)

INFO:sagemaker:Creating training-job with name: diffusion-poc-exp2-test-1-ds-d-1123-09101637658611
INFO:sagemaker.local.local_session:Starting training job
INFO:sagemaker.local.image:No AWS credentials found in session but credentials from EC2 Metadata Service are available.
INFO:sagemaker.local.image:docker compose file: 
networks:
  sagemaker-local:
    name: sagemaker-local
services:
  algo-1-v8qcv:
    command: train
    container_name: yhxltv93ae-algo-1-v8qcv
    environment:
    - '[Masked]'
    - '[Masked]'
    image: 763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:1.9-gpu-py38
    networks:
      sagemaker-local:
        aliases:
        - algo-1-v8qcv
    runtime: nvidia
    stdin_open: true
    tty: true
    volumes:
    - /tmp/tmpnbb1nqek/algo-1-v8qcv/output:/opt/ml/output
    - /tmp/tmpnbb1nqek/algo-1-v8qcv/input:/opt/ml/input
    - /tmp/tmpnbb1nqek/algo-1-v8qcv/output/data:/opt/ml/output/data
    - /tmp/tmpnbb1nqek/model:/opt/ml/model
    - /opt/ml/metadata:/

Creating yhxltv93ae-algo-1-v8qcv ... 
Creating yhxltv93ae-algo-1-v8qcv ... done
Attaching to yhxltv93ae-algo-1-v8qcv
[36myhxltv93ae-algo-1-v8qcv |[0m 2021-11-23 09:10:15,145 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training
[36myhxltv93ae-algo-1-v8qcv |[0m 2021-11-23 09:10:15,222 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.
[36myhxltv93ae-algo-1-v8qcv |[0m 2021-11-23 09:10:15,224 sagemaker_pytorch_container.training INFO     Invoking user training script.
[36myhxltv93ae-algo-1-v8qcv |[0m 2021-11-23 09:10:15,225 sagemaker-training-toolkit INFO     Installing dependencies from requirements.txt:
[36myhxltv93ae-algo-1-v8qcv |[0m /opt/conda/bin/python3.8 -m pip install -r requirements.txt
[36myhxltv93ae-algo-1-v8qcv |[0m Collecting blobfile==0.11.0
[36myhxltv93ae-algo-1-v8qcv |[0m Downloading blobfile-0.11.0-py3-none-any.whl (32 kB)
[36myhxltv93ae-algo-1-v8qcv |[0m Collecting nvgpu
[36myhxl

KeyboardInterrupt: 

In [299]:
job_name=estimator.latest_training_job.name
# job_name='dalle-poc-exp5-p4d-2-d-0530-12261622377580'
# dalle-poc-exp4-p4d-2-d-0525-03071621912021 --> public
# dalle-poc-exp4-p4d-2-d-0525-03091621912148 --> another private
# job_name='dalle-poc-exp1-p4d-1-sdp-d-1006-13111633525892'

# job_name = 'diffusion-poc-exp2-p4d-2-ds-d-1119-01441637286264'

AttributeError: 'NoneType' object has no attribute 'name'

In [None]:
sagemaker_session.logs_for_job(job_name=job_name, wait=True)

2021-11-22 14:40:15 Starting - Preparing the instances for training
2021-11-22 14:40:15 Downloading - Downloading input data
2021-11-22 14:40:15 Training - Training image download completed. Training in progress.[35mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[35mbash: no job control in this shell[0m
[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2021-11-22 14:40:16,380 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2021-11-22 14:40:16,462 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[35m2021-11-22 14:40:15,912 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[35m2021-11-22 14:40:15,994 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[35m2021-11-22 14:40:19,018 sagemak

In [562]:
import os

In [666]:
# Scan the local checkpoint directory, bucket the file names by their prefix
# ('opt*', 'mod*', 'ema*') and pick the latest model checkpoint to resume from.
opt_list = []
ema_list = []
model_list = []

# Directory in which each model file was found. The original code joined the
# chosen file name with `cktroot` *after* the loop, i.e. with whatever
# directory os.walk visited last, which is wrong as soon as sub-directories
# exist (and raised NameError/IndexError when nothing was found).
model_dirs = {}
lists_by_prefix = {'opt': opt_list, 'mod': model_list, 'ema': ema_list}

for cktroot, cktdirs, cktfiles in os.walk('/home/ec2-user/SageMaker/checkpoints'):
    for cktfile in cktfiles:
        target_list = lists_by_prefix.get(cktfile[:3])
        if target_list is None:
            continue  # not a checkpoint file this notebook cares about
        target_list.append(cktfile)
        if target_list is model_list:
            model_dirs[cktfile] = cktroot

if model_list:
    # Zero-padded step numbers make the lexicographic maximum the latest
    # step; a bare 'model.pt' sorts before every numbered file.
    latest_model = max(model_list)
    resume_checkpoint = os.path.join(model_dirs[latest_model], latest_model)
else:
    # No model checkpoint available (missing or empty directory).
    resume_checkpoint = None

/home/ec2-user/SageMaker/checkpoints [] ['opt000040.pt', 'ema_0.9999_000085.pt', 'model000100.pt', 'ema_0.9999_000005.pt', 'model000090.pt', 'model000040.pt', 'ema_0.9999_000095.pt', 'opt000090.pt', 'opt000030.pt', 'opt000100.pt', 'model.pt', 'ema_0.9999_000040.pt', 'opt000025.pt', 'ema_0.9999_000035.pt', 'model000020.pt', 'model000005.pt', 'ema_0.9999_000015.pt', 'model000070.pt', 'ema_0.9999_000030.pt', 'model000095.pt', 'ema_0.9999_000065.pt', 'model000080.pt', 'model000030.pt', 'ema_0.9999_000100.pt', 'opt000095.pt', 'ema_0.9999_000055.pt', 'model000025.pt', 'opt000080.pt', 'ema_0.9999_000045.pt', 'model000000.pt', 'model000075.pt', 'opt000075.pt', 'opt000020.pt', 'model000065.pt', 'model000045.pt', 'ema_0.9999_000050.pt', 'opt000045.pt', 'model000085.pt', 'model000010.pt', 'ema_0.9999_000010.pt', 'ema_0.9999_000090.pt', 'model000050.pt', 'ema_0.9999_000060.pt', 'model000060.pt', 'opt000035.pt', 'opt000010.pt', 'opt000085.pt', 'opt000055.pt', 'opt000000.pt', 'model000055.pt', 'ema_

In [665]:
cktfiles

['opt000040.pt',
 'ema_0.9999_000085.pt',
 'model000100.pt',
 'ema_0.9999_000005.pt',
 'model000090.pt',
 'model000040.pt',
 'ema_0.9999_000095.pt',
 'opt000090.pt',
 'opt000030.pt',
 'opt000100.pt',
 'model.pt',
 'ema_0.9999_000040.pt',
 'opt000025.pt',
 'ema_0.9999_000035.pt',
 'model000020.pt',
 'model000005.pt',
 'ema_0.9999_000015.pt',
 'model000070.pt',
 'ema_0.9999_000030.pt',
 'model000095.pt',
 'ema_0.9999_000065.pt',
 'model000080.pt',
 'model000030.pt',
 'ema_0.9999_000100.pt',
 'opt000095.pt',
 'ema_0.9999_000055.pt',
 'model000025.pt',
 'opt000080.pt',
 'ema_0.9999_000045.pt',
 'model000000.pt',
 'model000075.pt',
 'opt000075.pt',
 'opt000020.pt',
 'model000065.pt',
 'model000045.pt',
 'ema_0.9999_000050.pt',
 'opt000045.pt',
 'model000085.pt',
 'model000010.pt',
 'ema_0.9999_000010.pt',
 'ema_0.9999_000090.pt',
 'model000050.pt',
 'ema_0.9999_000060.pt',
 'model000060.pt',
 'opt000035.pt',
 'opt000010.pt',
 'opt000085.pt',
 'opt000055.pt',
 'opt000000.pt',
 'model000055.p

In [593]:
model_list

['model000100.pt',
 'model000090.pt',
 'model000040.pt',
 'model.pt',
 'model000020.pt',
 'model000005.pt',
 'model000070.pt',
 'model000095.pt',
 'model000080.pt',
 'model000030.pt',
 'model000025.pt',
 'model000000.pt',
 'model000075.pt',
 'model000065.pt',
 'model000045.pt',
 'model000085.pt',
 'model000010.pt',
 'model000050.pt',
 'model000060.pt',
 'model000055.pt',
 'model000035.pt',
 'model000015.pt']

In [594]:
sorted(model_list)

['model.pt',
 'model000000.pt',
 'model000005.pt',
 'model000010.pt',
 'model000015.pt',
 'model000020.pt',
 'model000025.pt',
 'model000030.pt',
 'model000035.pt',
 'model000040.pt',
 'model000045.pt',
 'model000050.pt',
 'model000055.pt',
 'model000060.pt',
 'model000065.pt',
 'model000070.pt',
 'model000075.pt',
 'model000080.pt',
 'model000085.pt',
 'model000090.pt',
 'model000095.pt',
 'model000100.pt']

In [559]:
sorted(ckt_list)

['/home/ec2-user/SageMaker/checkpoints/ema_0.9999_000000.pt',
 '/home/ec2-user/SageMaker/checkpoints/ema_0.9999_000005.pt',
 '/home/ec2-user/SageMaker/checkpoints/ema_0.9999_000010.pt',
 '/home/ec2-user/SageMaker/checkpoints/ema_0.9999_000015.pt',
 '/home/ec2-user/SageMaker/checkpoints/ema_0.9999_000020.pt',
 '/home/ec2-user/SageMaker/checkpoints/ema_0.9999_000025.pt',
 '/home/ec2-user/SageMaker/checkpoints/ema_0.9999_000030.pt',
 '/home/ec2-user/SageMaker/checkpoints/ema_0.9999_000035.pt',
 '/home/ec2-user/SageMaker/checkpoints/ema_0.9999_000040.pt',
 '/home/ec2-user/SageMaker/checkpoints/ema_0.9999_000045.pt',
 '/home/ec2-user/SageMaker/checkpoints/ema_0.9999_000050.pt',
 '/home/ec2-user/SageMaker/checkpoints/ema_0.9999_000055.pt',
 '/home/ec2-user/SageMaker/checkpoints/ema_0.9999_000060.pt',
 '/home/ec2-user/SageMaker/checkpoints/ema_0.9999_000065.pt',
 '/home/ec2-user/SageMaker/checkpoints/ema_0.9999_000070.pt',
 '/home/ec2-user/SageMaker/checkpoints/ema_0.9999_000075.pt',
 '/home/

In [555]:
test= ['a', 'b']

In [557]:
print(sorted(test))

['a', 'b']


In [121]:
test={}

In [122]:
test['PMI_RANK'] = "1"
test['OMPI_COMM_WORLD_RANK'] = "3"

In [123]:
    for varname in ["PMI_RANK", "OMPI_COMM_WORLD_RANK"]:
        if varname in test:
            print(int(test[varname]))

1
3


In [95]:
log_suffix = log_suffix + "-rank%03i" % rank
print(log_suffix)

log-rank014


### Clean Up Amazon SageMaker Experiment Resources
- https://docs.aws.amazon.com/sagemaker/latest/dg/experiments-cleanup.html

In [None]:
import boto3
sm = boto3.Session().client('sagemaker')

In [None]:
def cleanup_boto3(experiment_name):
    """Delete an experiment together with all of its trials and trial components.

    Uses the module-level SageMaker boto3 client ``sm``. Every trial of the
    experiment is listed, each of its components is disassociated and (when
    possible) deleted, then the trial and finally the experiment are deleted.

    Args:
        experiment_name: Experiment name (not the display name).

    Raises:
        botocore.exceptions.ClientError: If listing, disassociating or
            deleting a trial/experiment fails. A failure to delete a single
            trial component is tolerated, see below.
    """
    def _list_names(operation, result_key, name_key, **filters):
        # The list_* calls return one page at a time. Walk all pages and
        # materialise the names up front so that deleting while iterating
        # cannot disturb the pagination.
        paginator = sm.get_paginator(operation)
        return [item[name_key]
                for page in paginator.paginate(**filters)
                for item in page[result_key]]

    trial_names = _list_names('list_trials', 'TrialSummaries', 'TrialName',
                              ExperimentName=experiment_name)
    print('TrialNames:')
    for trial_name in trial_names:
        print(f"\n{trial_name}")

        component_names = _list_names('list_trial_components',
                                      'TrialComponentSummaries',
                                      'TrialComponentName',
                                      TrialName=trial_name)
        print('\tTrialComponentNames:')
        for component_name in component_names:
            print(f"\t{component_name}")
            sm.disassociate_trial_component(TrialComponentName=component_name, TrialName=trial_name)
            try:
                # comment out to keep trial components
                sm.delete_trial_component(TrialComponentName=component_name)
            except sm.exceptions.ClientError:
                # component is (most likely) still associated with another
                # trial; leave it in place and carry on
                pass
            # to prevent throttling (also after a tolerated failure)
            time.sleep(.5)
        sm.delete_trial(TrialName=trial_name)
    sm.delete_experiment(ExperimentName=experiment_name)
    print(f"\nExperiment {experiment_name} deleted")

In [None]:
# Use experiment name not display name
experiment_name = "dalle-poc-exp4"
cleanup_boto3(experiment_name)

In [None]:
!pip install piexif

In [None]:
import piexif

In [None]:
image_size=256

In [None]:
image_file = '/home/ec2-user/SageMaker/lg-ai-research/dalle-sagemaker-dp-mp/test2.png'

In [None]:
from skimage import io, color

In [None]:
# Transform for images loaded as arrays (e.g. via skimage.io.imread): convert
# to PIL, take a random square crop resized to image_size, convert to tensor.
# `T` was never defined in this notebook; `transforms` (imported from
# torchvision at the top) is the module that provides these classes.
image_transform1 = transforms.Compose([
    transforms.ToPILImage(),
    transforms.RandomResizedCrop(image_size,
                                 scale=(0.8, 1.),
                                 ratio=(1., 1.)),
    transforms.ToTensor()
])

In [None]:
try:
    array_img = io.imread(image_file)
    image_tensor = image_transform1(array_img)
except (PIL.UnidentifiedImageError, OSError, ValueError) as corrupt_image_exceptions:
    print(f"An exception occurred trying to load file.")

In [None]:
image_tensor.shape

In [None]:
trans = transforms.ToPILImage()
plt.imshow(trans(image_tensor))

In [None]:
im = Image.open(image_file)
rgb_im = im.convert('RGB')
rgb_im.save('test.jpg')

In [None]:
image_file = '/home/ec2-user/SageMaker/lg-ai-research/dalle-sagemaker-dp-mp/test.jpg'
image_file = '/home/ec2-user/SageMaker/dataset/BIRDS/CUB_200_2011/images/029.American_Crow/American_Crow_0053_25203.jpg'

In [None]:
array_img = PIL.Image.open(image_file)

In [None]:
array_img.info

In [None]:
from PIL import ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True

In [None]:
import PIL

In [None]:
# Transform for images already opened as PIL images: random square crop
# resized to image_size, then conversion to a tensor.
# `T` was never defined in this notebook; `transforms` (imported from
# torchvision at the top) is the module that provides these classes.
image_transform = transforms.Compose([
    transforms.RandomResizedCrop(image_size,
                                 scale=(0.8, 1.),
                                 ratio=(1., 1.)),
    transforms.ToTensor()
])

In [None]:
try:
#     piexif.remove(image_file)
    array_img = PIL.Image.open(image_file)
    array_img = array_img.convert('RGB')
    
    image_tensor = image_transform(array_img)
except (PIL.UnidentifiedImageError, OSError, ValueError) as corrupt_image_exceptions:
    print(f"An exception occurred trying to load file.")

In [None]:
image_tensor.shape

In [None]:
trans = transforms.ToPILImage()
plt.imshow(trans(image_tensor))

In [None]:
array_img.info.get("transparency", None)

In [None]:
if array_img.info.get("transparency", None):
    print(f"[transparency] An exception occurred trying to load file.")


In [None]:
array_img = PIL.Image.open(image_file)
            img = self.img_convert(array_img)
        except (PIL.UnidentifiedImageError, OSError) as corrupt_image_exceptions:
            print(f"An exception occurred trying to load file {image_file}.")
            print(f"Skipping index {ind}")
            return self.skip_sample(ind)

        try:
            if img.info.get("transparency", None):

In [359]:
entt = {"additional_framework_parameters":{"sagemaker_distributed_dataparallel_custom_mpi_options":"","sagemaker_distributed_dataparallel_enabled":True,"sagemaker_instance_type":"local_gpu"},"channel_input_dirs":{"training":"/opt/ml/input/data/training"},"current_host":"algo-1-84g0b","framework_module":"sagemaker_pytorch_container.training:main","hosts":["algo-1-84g0b"],"hyperparameters":{"batch_size":1024,"ema_rate":"0.9999","fp16_scale_growth":0.001,"log_interval":5,"lr":0.0001,"lr_anneal_steps":5000,"microbatch":16,"sagemakerdp":True,"save_interval":5,"schedule_sampler":"uniform","use_fp16":False,"weight_decay":0.0},"input_config_dir":"/opt/ml/input/config","input_data_config":{"training":{"TrainingInputMode":"File"}},"input_dir":"/opt/ml/input","is_master":True,"job_name":"diffusion-poc-exp1-test-1-sdp-d-1023-04051634961959","log_level":20,"master_hostname":"algo-1-84g0b","model_dir":"/opt/ml/model","module_dir":"/opt/ml/code","module_name":"image_train","network_interface_name":"eth0","num_cpus":64,"num_gpus":8,"output_data_dir":"/opt/ml/output/data","output_dir":"/opt/ml/output","output_intermediate_dir":"/opt/ml/output/intermediate","resource_config":{"current_host":"algo-1-84g0b","hosts":["algo-1-84g0b"]},"user_entry_point":"image_train.py"}

In [422]:
entt

{'additional_framework_parameters': {'sagemaker_distributed_dataparallel_custom_mpi_options': '',
  'sagemaker_distributed_dataparallel_enabled': True,
  'sagemaker_instance_type': 'local_gpu'},
 'channel_input_dirs': {'training': '/opt/ml/input/data/training'},
 'current_host': 'algo-1-84g0b',
 'framework_module': 'sagemaker_pytorch_container.training:main',
 'hosts': ['algo-1-84g0b'],
 'hyperparameters': {'batch_size': 1024,
  'ema_rate': '0.9999',
  'fp16_scale_growth': 0.001,
  'log_interval': 5,
  'lr': 0.0001,
  'lr_anneal_steps': 5000,
  'microbatch': 16,
  'sagemakerdp': True,
  'save_interval': 5,
  'schedule_sampler': 'uniform',
  'use_fp16': False,
  'weight_decay': 0.0},
 'input_config_dir': '/opt/ml/input/config',
 'input_data_config': {'training': {'TrainingInputMode': 'File'}},
 'input_dir': '/opt/ml/input',
 'is_master': True,
 'job_name': 'diffusion-poc-exp1-test-1-sdp-d-1023-04051634961959',
 'log_level': 20,
 'master_hostname': 'algo-1-84g0b',
 'model_dir': '/opt/ml/

In [357]:
entt['job_name']

'diffusion-poc-exp1-test-1-sdp-d-1023-04051634961959'

In [360]:
import json

In [361]:
json.loads()

<function json.loads(s, *, encoding=None, cls=None, object_hook=None, parse_float=None, parse_int=None, parse_constant=None, object_pairs_hook=None, **kw)>