# Training Amazon SageMaker models by using the Deep Graph Library with PyTorch backend

Reference notebook: <https://github.com/aws/amazon-sagemaker-examples/blob/master/sagemaker-python-sdk/dgl_gcn/pytorch_gcn.ipynb>

## Setup

Create the SageMaker session, then define the S3 bucket and IAM execution role that are needed later in the example.

In [1]:
import sagemaker
from sagemaker import get_execution_role
from sagemaker.session import Session

# Session through which this notebook talks to Amazon SageMaker.
sess = Session()

# IAM execution role that gives Amazon SageMaker access to resources in your
# AWS account; the SageMaker Python SDK resolves it from the notebook environment.
role = get_execution_role()

# S3 bucket for saving code and model artifacts (the session's default bucket).
# Substitute a different bucket name here if you prefer.
bucket = sess.default_bucket()

In [2]:
# Display the execution role ARN that was resolved during setup.
role

'arn:aws:iam::479389006481:role/DocDB-SM-SageMakerRoleName'

## The training script

In [4]:
# !cat src/main.py

## SageMaker's estimator class

In [5]:
# Copy the CA bundle into src/ so it is packaged with the training code
# (the estimator is created with source_dir='src').
# NOTE(review): presumably main.py needs it for TLS connections to DocumentDB - confirm.
!cp rds-combined-ca-bundle.pem src/

In [6]:
import json
import boto3
# Get DocumentDB credentials stored in Secrets Manager
def get_secret(stack_name):
    """Fetch and decode the DocumentDB secret that belongs to a stack.

    The secret is read from AWS Secrets Manager under the id
    ``<stack_name>-DocDBSecret``, using a new boto3 session and that
    session's region, and its ``SecretString`` payload is parsed as JSON.

    Args:
        stack_name: Prefix of the secret id (the text before ``-DocDBSecret``).

    Returns:
        The object produced by ``json.loads`` on the secret's ``SecretString``.
    """
    boto_session = boto3.session.Session()
    secrets_client = boto_session.client(
        service_name='secretsmanager',
        region_name=boto_session.region_name,
    )
    response = secrets_client.get_secret_value(SecretId=f'{stack_name}-DocDBSecret')
    return json.loads(response['SecretString'])

In [7]:
# Load the credentials stored under the secret id 'DocDB-SM-DocDBSecret'.
secrets = get_secret('DocDB-SM')

In [8]:
# secrets

In [9]:
from sagemaker.pytorch import PyTorch

# Entry-point script, resolved relative to source_dir ('src') below.
CODE_PATH = "main.py"
# Account id and region of the credentials behind the SageMaker session.
account = sess.boto_session.client("sts").get_caller_identity()["Account"]
region = sess.boto_session.region_name

# Hyperparameters handed to the training job: training-loop settings plus the
# DocumentDB connection details read from Secrets Manager.
# NOTE(review): SageMaker stores hyperparameters in plain text with the
# training job (console, DescribeTrainingJob), so 'db-password' is exposed
# here. Prefer passing only the secret's name and reading the secret inside
# src/main.py - that needs a matching change in the training script.
params = {
    'patience': 5,
    'n-epochs': 10,
    'batch-size': 64,
    'db-host': secrets['host'],
    'db-username': secrets['username'],
    'db-password': secrets['password'],
    'db-port': secrets['port'],
}
task_tags = [{"Key": "ML Task", "Value": "DGL"}]
estimator = PyTorch(
    entry_point=CODE_PATH,
    source_dir='src',
    role=role,
    instance_count=1,
    # CPU instance; alternatives tried: "ml.p3.2xlarge" (GPU) and 'local'.
    instance_type='ml.c4.2xlarge',
    framework_version="1.7.1",
    py_version="py3",
    hyperparameters=params,
    # Tag the training job with task_tags.
    tags=task_tags,
    sagemaker_session=sess,
    # VPC placement for the training job. These ids are specific to one AWS
    # account - replace them with your own subnet and security group.
    # Other subnets tried: 'subnet-059a93a71fc411609',
    # 'subnet-0452015a17db921f5' (DocDB-SM-PrivateOne).
    subnets=['subnet-08bf5904187d12262'],  # NAT-subnet
    security_group_ids=['sg-0b571292a85eaad77'],
)

In [10]:
# Confirm the VPC settings (subnets and security groups) the estimator will use.
estimator.get_vpc_config()

{'Subnets': ['subnet-08bf5904187d12262'],
 'SecurityGroupIds': ['sg-0b571292a85eaad77']}

## Running the Training Job

In [None]:
# Launch the training job; called with default arguments, so the job's logs
# are streamed into the cell output, as shown below.
estimator.fit()

2021-08-25 23:37:05 Starting - Starting the training job...
2021-08-25 23:37:28 Starting - Launching requested ML instancesProfilerReport-1629934624: InProgress
...
2021-08-25 23:37:56 Starting - Preparing the instances for training.........
2021-08-25 23:39:35 Downloading - Downloading input data
2021-08-25 23:39:35 Training - Downloading the training image...
2021-08-25 23:39:51 Training - Training image download completed. Training in progress.[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2021-08-25 23:39:52,395 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2021-08-25 23:39:52,397 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2021-08-25 23:39:52,408 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2021-08-25 23:39:52,421 sagemaker_pytorch_container.