# 1.SM_Multigpu Distributed Training-ScriptMode-DALLE
---

본 모듈에서는 Amzaon SageMaker API을 효과적으로 이용하기 위해 multigpu-distributed 학습을 위한 PyTorch 프레임워크 자체 구현만으로 모델 훈련을 수행해 봅니다.

In [14]:
install_needed = True  # should only be True once
# install_needed = False

In [15]:
import sys
import IPython

if install_needed:
    print("installing deps and restarting kernel")
#     !{sys.executable} -m pip install -U split-folders tqdm albumentations crc32c wget
    !{sys.executable} -m pip install 'sagemaker[local]' --upgrade
    !{sys.executable} -m pip install -U bokeh smdebug sagemaker-experiments gdown
    !{sys.executable} -m pip install -U sagemaker torch torchvision
    !/bin/bash ./local/local_mode_setup.sh
    IPython.Application.instance().kernel.do_shutdown(True)

installing deps and restarting kernel
Collecting gdown
  Downloading gdown-3.13.0.tar.gz (9.3 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h    Preparing wheel metadata ... [?25ldone
Building wheels for collected packages: gdown
  Building wheel for gdown (PEP 517) ... [?25ldone
[?25h  Created wheel for gdown: filename=gdown-3.13.0-py3-none-any.whl size=9034 sha256=b57d1601f1c4d8eed6120500ebab712b3c9f60b06b17dcf377a7c0b29a4b62dc
  Stored in directory: /home/ec2-user/.cache/pip/wheels/6a/87/bd/09b16161b149fd6711ac76b5420d78ed58bd6a320e892117c3
Successfully built gdown
Installing collected packages: gdown
Successfully installed gdown-3.13.0
nvidia-docker2 already installed. We are good to go!
Stopping docker: [60G[[0;32m  OK  [0;39m]
Starting docker:	.[60G[[0;32m  OK  [0;39m]
SageMaker instance route table setup is ok. We are good to go.
SageMaker instance routing for Docker is ok. We are good to go!


## 2. 환경 설정

<p>Sagemaker 학습에 필요한 기본적인 package를 import 합니다. </p>
<p>boto3는 HTTP API 호출을 숨기는 편한 추상화 모델을 가지고 있고, Amazon EC2 인스턴스 및 S3 버켓과 같은 AWS 리소스와 동작하는 파이선 클래스를 제공합니다. </p>
<p>sagemaker python sdk는 Amazon SageMaker에서 기계 학습 모델을 교육 및 배포하기 위한 오픈 소스 라이브러리입니다.</p>

In [5]:
import joblib
import matplotlib.pyplot as plt
import sagemaker
# import splitfolders

import datetime
import glob
import os
import time
import warnings

from smexperiments.experiment import Experiment
from smexperiments.trial import Trial

# import wget
# import tarfile
import shutil

import boto3
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torchvision

# from tqdm import tqdm
from time import strftime
from PIL import Image
from torch.utils.data import Dataset
from torchvision import datasets, transforms

from sagemaker import get_execution_role
from sagemaker.pytorch import PyTorch

from sagemaker.debugger import (Rule,
                                rule_configs,
                                ProfilerConfig, 
                                FrameworkProfile, 
                                DetailedProfilingConfig, 
                                DataloaderProfilingConfig, 
                                PythonProfilingConfig)

warnings.filterwarnings('ignore')
%config InlineBackend.figure_format = 'retina'

In [6]:
role = get_execution_role()

In [7]:
sagemaker.__version__

'2.41.0'

In [8]:
def create_experiment(experiment_name):
    try:
        sm_experiment = Experiment.load(experiment_name)
    except:
        sm_experiment = Experiment.create(experiment_name=experiment_name,
                                          tags=[
                                              {
                                                  'Key': 'multigpu',
                                                  'Value': 'yes'
                                              },
                                              {
                                                  'Key': 'multinode',
                                                  'Value': 'yes'
                                              },
                                          ])

In [9]:
def create_trial(experiment_name, set_param, i_type, i_cnt, spot):
    create_date = strftime("%m%d-%H%M%s")
    
    spot = 's' if spot else 'd'
    i_tag = 'test'
    if i_type == 'ml.p3.16xlarge':
        i_tag = 'p3'
    elif i_type == 'ml.p3dn.24xlarge':
        i_tag = 'p3dn'
    elif i_type == 'ml.p4d.24xlarge':
        i_tag = 'p4d'    
        
    trial = "-".join([i_tag,str(i_cnt),spot])
       
    sm_trial = Trial.create(trial_name=f'{experiment_name}-{trial}-{create_date}',
                            experiment_name=experiment_name)

    job_name = f'{sm_trial.trial_name}'
    return job_name

In [10]:
bucket = 'bucket-exp-dalle-210410'
code_location = f's3://{bucket}/sm_codes'
output_path = f's3://{bucket}/poc_dalle/output/' 

In [11]:
metric_definitions=[
     {'Name': 'train:lr', 'Regex': 'lr - (.*?),'},
     {'Name': 'train:Loss', 'Regex': 'loss -(.*?),'},
]

In [12]:
from sagemaker.debugger import Rule, ProfilerRule, rule_configs

rules=[ 
    Rule.sagemaker(rule_configs.loss_not_decreasing()),
    Rule.sagemaker(rule_configs.overfit()),
    ProfilerRule.sagemaker(rule_configs.ProfilerReport()),
]

In [13]:
hyperparameters = {
        'epochs' : 15,
        'batch_size' : 96, # 24, # 32, # 48, # 36, # 96 train_batch_size is not equal to micro_batch_per_gpu * gradient_acc_step * world_size14 != 1 * 1 * 8
        'learning_rate' : 3e-4,
        'clip_grad_norm' : 0.5,
        'dim' : 512,
        'text_seq_len' : 256,
        'depth' : 2,
        'heads' : 8,
        'dim_head' : 64,
        'reversible' : True,
        'loss_img_weight' : 7,
        'reversible' : True,
        'bpe_path' : '/opt/ml/code/dalle_pytorch/data/bpe_simple_vocab_16e6.model',
#         'vae_path' : 'model/vae/vae-final.pt',
#         'dalle_path' : 'model/dalle/dalle.pt',
        'fp16' : True,
        'taming' : True,
        'num_worker' : 0,
        'sagemakermp' : True,
        'num_microbatches': 4,
        'num_partitions' : 4,
        'placement_strategy': 'cluster', # cluster , spread
        'pipeline': 'interleaved',
        'optimize': 'speed',
        'ddp': True,
    }

experiment_name = 'dalle-poc-exp3'
# instance_type = 'local_gpu'
instance_type = 'ml.p4d.24xlarge'  # 'ml.p3.16xlarge', 'ml.p3dn.24xlarge', 'ml.p4d.24xlarge', 'local_gpu'
instance_count = 1
do_spot_training = False
max_wait = None
max_run = 2*60*60


In [14]:
hyperparameters = {
        'epochs' : 1,
        'batch_size' : 16, # 96
        'learning_rate' : 3e-4,
        'clip_grad_norm' : 0.5,
        'dim' : 64,
        'text_seq_len' : 128,
        'depth' : 1,
        'heads' : 2,
        'dim_head' : 16,
        'reversible' : True,
        'loss_img_weight' : 7,
        'reversible' : True,
        'bpe_path' : '/opt/ml/code/dalle_pytorch/data/bpe_simple_vocab_16e6.model',
#         'hug' : True,
#         'vae_path' : 'model/vae/vae-final.pt',
#         'dalle_path' : 'model/dalle/dalle.pt',
#         'fp16' : True,
        'taming' : True,
        'num_worker' : 0,
        'sagemakermp' : True,
        'num_microbatches': 4,
        'num_partitions' : 4,
        'placement_strategy': 'cluster', # cluster , spread
        'pipeline': 'interleaved',
        'optimize': 'speed',
        'ddp': True,
    }

experiment_name = 'dalle-poc-exp3'
instance_type = 'local_gpu'
# instance_type = 'ml.p4d.24xlarge'  # 'ml.p3.16xlarge', 'ml.p3dn.24xlarge', 'ml.p4d.24xlarge', 'local_gpu'
instance_count = 1
do_spot_training = False
max_wait = None
max_run = 2*60*60


In [15]:
!gdown https://drive.google.com/uc?id=1vF8Ht0VThpobtmShD52_INhpIgy6eEXq
!gdown https://drive.google.com/uc?id=1kaIqFwTLD7Ml3ib9NQpjoUSD4FUD21-I

Downloading...
From: https://drive.google.com/uc?id=1vF8Ht0VThpobtmShD52_INhpIgy6eEXq
To: /home/ec2-user/SageMaker/lg-ai-research/DALLE-pytorch-sm-210513/CUB_200_2011.tgz
1.15GB [00:05, 210MB/s] 
Downloading...
From: https://drive.google.com/uc?id=1kaIqFwTLD7Ml3ib9NQpjoUSD4FUD21-I
To: /home/ec2-user/SageMaker/lg-ai-research/DALLE-pytorch-sm-210513/birds.zip
613MB [00:02, 207MB/s]  


In [20]:
!rm -rf dataset
!mkdir dataset
!unzip birds.zip -d dataset/
!tar zxvf CUB_200_2011.tgz -C dataset/

Archive:  birds.zip
   creating: dataset/birds/
  inflating: dataset/birds/readme    
  inflating: dataset/birds/example_captions.txt  
   creating: dataset/birds/train/
  inflating: dataset/birds/train/char-CNN-RNN-embeddings.pickle  
  inflating: dataset/birds/train/filenames.pickle  
  inflating: dataset/birds/train/class_info.pickle  
   creating: dataset/birds/test/
  inflating: dataset/birds/test/char-CNN-RNN-embeddings.pickle  
  inflating: dataset/birds/test/filenames.pickle  
  inflating: dataset/birds/test/class_info.pickle  
   creating: dataset/birds/text_c10/
   creating: dataset/birds/text_c10/185.Bohemian_Waxwing/
  inflating: dataset/birds/text_c10/185.Bohemian_Waxwing/Bohemian_Waxwing_0115_177724.txt  
  inflating: dataset/birds/text_c10/185.Bohemian_Waxwing/Bohemian_Waxwing_0022_177642.txt  
  inflating: dataset/birds/text_c10/185.Bohemian_Waxwing/Bohemian_Waxwing_0042_177887.txt  
  inflating: dataset/birds/text_c10/185.Bohemian_Waxwing/Bohemian_Waxwing_0048_177821.t

In [21]:
if instance_type =='local_gpu':
    from sagemaker.local import LocalSession
    from pathlib import Path

    sagemaker_session = LocalSession()
    sagemaker_session.config = {'local': {'local_code': True}}
    s3_data_path = 'file:///home/ec2-user/SageMaker/SageMaker-dalle/CUB_BIRD'
    source_dir = f'{Path.cwd()}/source_code'
else:
    sess = boto3.Session()
    sagemaker_session = sagemaker.Session()
    sm = sess.client('sagemaker')
#     bucket_name = 'dataset-cyj-coco-210410'
#     s3_data_path = f's3://{bucket_name}/dataset1'
    s3_data_path = 's3://dataset-cyj-us-east-1/CUB-BIRD'
    source_dir = 'source_code'


In [22]:
image_uri = None
distribution = None
train_job_name = 'sagemaker'


train_job_name = 'smp-dist'
distribution = {}

if hyperparameters.get('sagemakermp'):
    distribution['smdistributed'] = { "modelparallel": {
                                              "enabled":True,
                                              "parameters": {
                                                  "partitions": hyperparameters['num_partitions'],
                                                  "microbatches": hyperparameters['num_microbatches'],
                                                  "placement_strategy": hyperparameters['placement_strategy'],
                                                  "pipeline": hyperparameters['pipeline'],
                                                  "optimize": hyperparameters['optimize'],
                                                  "ddp": hyperparameters['ddp'],
                                              }
                                          }
                                      }



distribution["mpi"]={
                    "enabled": True,
                    "processes_per_host": 8, # Pick your processes_per_host
                    "custom_mpi_options": "-verbose -x orte_base_help_aggregate=0 "
              }

if do_spot_training:
    max_wait = max_run

print("train_job_name : {} \ntrain_instance_type : {} \ntrain_instance_count : {} \nimage_uri : {} \ndistribution : {}".format(train_job_name, instance_type, instance_count, image_uri, distribution))    

train_job_name : smp-dist 
train_instance_type : local_gpu 
train_instance_count : 1 
image_uri : None 
distribution : {'smdistributed': {'modelparallel': {'enabled': True, 'parameters': {'partitions': 4, 'microbatches': 4, 'placement_strategy': 'cluster', 'pipeline': 'interleaved', 'optimize': 'speed', 'ddp': True}}}, 'mpi': {'enabled': True, 'processes_per_host': 8, 'custom_mpi_options': '-verbose -x orte_base_help_aggregate=0 '}}


In [26]:
%%time

# all input configurations, parameters, and metrics specified in estimator 
# definition are automatically tracked
estimator = PyTorch(
    entry_point='train_dalle_sm.py',
    source_dir=source_dir,
    role=role,
    sagemaker_session=sagemaker_session,
    framework_version='1.8.0',
    py_version='py36',
    instance_count=instance_count,
    instance_type=instance_type,
    volume_size=1024,
    code_location = code_location,
    output_path=output_path,
    hyperparameters=hyperparameters,
    distribution=distribution,
    disable_profiler=True,
    metric_definitions=metric_definitions,
#     rules=rules,
    max_run=max_run,
    use_spot_instances=do_spot_training,  # spot instance 활용
    max_wait=max_wait,
)

CPU times: user 203 µs, sys: 0 ns, total: 203 µs
Wall time: 210 µs


In [27]:
!sudo rm -rf ./source_code/wandb

In [28]:
create_experiment(experiment_name)
job_name = create_trial(experiment_name, hyperparameters, instance_type, instance_count, do_spot_training)

# Now associate the estimator with the Experiment and Trial
estimator.fit(
    inputs={'training': s3_data_path}, 
    job_name=job_name,
    experiment_config={
      'TrialName': job_name,
      'TrialComponentDisplayName': job_name,
    },
    wait=False,
)

INFO:sagemaker:Creating training-job with name: dalle-poc-exp3-test-1-d-0519-04201621398051
INFO:sagemaker.local.local_session:Starting training job
INFO:sagemaker.local.image:No AWS credentials found in session but credentials from EC2 Metadata Service are available.
INFO:sagemaker.local.image:docker compose file: 
networks:
  sagemaker-local:
    name: sagemaker-local
services:
  algo-1-mey4t:
    command: train
    container_name: cwtzlrzlfc-algo-1-mey4t
    environment:
    - '[Masked]'
    - '[Masked]'
    image: 763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-training:1.8.0-gpu-py36
    networks:
      sagemaker-local:
        aliases:
        - algo-1-mey4t
    runtime: nvidia
    stdin_open: true
    tty: true
    volumes:
    - /tmp/tmp8x8or_61/algo-1-mey4t/output:/opt/ml/output
    - /tmp/tmp8x8or_61/algo-1-mey4t/input:/opt/ml/input
    - /tmp/tmp8x8or_61/algo-1-mey4t/output/data:/opt/ml/output/data
    - /tmp/tmp8x8or_61/model:/opt/ml/model
    - /opt/ml/metadata:/opt/m

Creating cwtzlrzlfc-algo-1-mey4t ... 
Creating cwtzlrzlfc-algo-1-mey4t ... done
Attaching to cwtzlrzlfc-algo-1-mey4t
[36mcwtzlrzlfc-algo-1-mey4t |[0m 2021-05-19 04:20:55,471 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training
[36mcwtzlrzlfc-algo-1-mey4t |[0m 2021-05-19 04:20:55,550 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.
[36mcwtzlrzlfc-algo-1-mey4t |[0m 2021-05-19 04:20:55,553 sagemaker_pytorch_container.training INFO     Invoking user training script.
[36mcwtzlrzlfc-algo-1-mey4t |[0m 2021-05-19 04:20:55,555 sagemaker-training-toolkit INFO     Installing module with the following command:
[36mcwtzlrzlfc-algo-1-mey4t |[0m /opt/conda/bin/python3.6 -m pip install . -r requirements.txt
[36mcwtzlrzlfc-algo-1-mey4t |[0m Processing /opt/ml/code
[36mcwtzlrzlfc-algo-1-mey4t |[0m Collecting wandb
[36mcwtzlrzlfc-algo-1-mey4t |[0m   Downloading wandb-0.10.30-py2.py3-none-any.whl (1.8 MB)
[K    

RuntimeError: Failed to run: ['docker-compose', '-f', '/tmp/tmp8x8or_61/docker-compose.yaml', 'up', '--build', '--abort-on-container-exit'], Process exited with code: 1