# Training Amazon SageMaker models by using the Deep Graph Library with the PyTorch backend

https://github.com/aws/amazon-sagemaker-examples/blob/master/sagemaker-python-sdk/dgl_gcn/pytorch_gcn.ipynb

## Setup

Define a few variables that are needed later in the example.

In [1]:
import sagemaker
from sagemaker import get_execution_role
from sagemaker.session import Session

# Create the SageMaker session used throughout this notebook.
sess = Session()

# Default S3 bucket of the session; code and model artifacts are saved here.
# Substitute another bucket name if you prefer.
bucket = sess.default_bucket()

# IAM execution role of the notebook environment. Amazon SageMaker uses it to
# access resources in your AWS account.
role = get_execution_role()

In [2]:
# Name of the CloudFormation stack. The Secrets Manager secret name and the
# security-group tag looked up further down are both derived from it.
stack_name = 'docdb-sm-2'

## The training script

In [3]:
# !cat src/main.py

In [4]:
# Copy the RDS CA certificate bundle into src/, the directory handed to the
# estimator as source_dir, so the file travels with the training code.
# NOTE(review): presumably main.py uses it for TLS connections to DocumentDB — confirm.
!cp rds-combined-ca-bundle.pem src/

## SageMaker's estimator class

In [5]:
import json
import boto3
# Get DocumentDB credentials stored in Secrets Manager
def get_secret(stack_name):
    """Return the DocumentDB secret of a stack as a parsed JSON object.

    The secret is looked up in AWS Secrets Manager under the name
    ``<stack_name>-DocDBSecret``, using the region of the default boto3 session.

    Args:
        stack_name: Name of the CloudFormation stack; used as the prefix of
            the secret name.

    Returns:
        The result of ``json.loads`` applied to the secret's ``SecretString``.
    """
    boto_session = boto3.session.Session()
    secrets_client = boto_session.client(
        service_name='secretsmanager',
        region_name=boto_session.region_name,
    )

    response = secrets_client.get_secret_value(SecretId=f'{stack_name}-DocDBSecret')
    return json.loads(response['SecretString'])

In [6]:
# Load the DocumentDB connection details for this stack. The 'host', 'username',
# 'password' and 'port' entries are read when the hyperparameters are built.
# Avoid displaying this dict: it contains the database password.
secrets = get_secret(stack_name)

In [7]:
# secrets

In [8]:
# EC2 client used below to look up the subnet and the security group by tag.
ec2 = boto3.client('ec2')

In [9]:
# Find the ID of the subnet tagged Name=NAT_subnet.
resp = ec2.describe_subnets(
    Filters=[{'Name': 'tag:Name', 'Values': ['NAT_subnet']}]
)
nat_subnets = resp['Subnets']
if not nat_subnets:
    # Fail with an actionable message instead of a bare "list index out of range".
    raise IndexError(
        "No subnet tagged Name=NAT_subnet was found; "
        "check that the stack is deployed in this account and region."
    )
# If several subnets carry the tag, the first one returned is used.
nat_subnet_id = nat_subnets[0]['SubnetId']
# print(nat_subnet_id)

In [10]:
# Find the ID of the security group tagged Name=<stack_name>-SG-DocumentDB.
sg_name = '{}-SG-DocumentDB'.format(stack_name)
resp = ec2.describe_security_groups(
    Filters=[{
        'Name': 'tag:Name',
        'Values': [sg_name]
    }])
security_groups = resp['SecurityGroups']
if not security_groups:
    # Fail with an actionable message instead of a bare "list index out of range".
    raise IndexError(
        "No security group tagged Name={} was found; "
        "check stack_name and the account/region.".format(sg_name)
    )
# If several groups carry the tag, the first one returned is used.
sg_id = security_groups[0]['GroupId']
# print(sg_id)

In [11]:
from sagemaker.pytorch import PyTorch

# Entry-point script, resolved relative to source_dir ('src').
CODE_PATH = "main.py"

# Account ID and region of the current session (not used in the cells shown here).
account = sess.boto_session.client("sts").get_caller_identity()["Account"]
region = sess.boto_session.region_name

# Hyperparameters forwarded to the entry point: training settings plus the
# DocumentDB connection details read from the secret.
# NOTE(security): hyperparameters are stored and shown in plain text with the
# training job (console, DescribeTrainingJob), so 'db-password' is exposed there.
# Prefer passing only the secret name and reading the secret inside main.py;
# that needs a matching change to the training script, so it is not done here.
params = {
    'patience': 5,
    'n-epochs': 20,
    'batch-size': 64,
    'db-host': secrets['host'],
    'db-username': secrets['username'],
    'db-password': secrets['password'],
    'db-port': secrets['port'],
}

# Tags attached to the training job.
task_tags = [{"Key": "ML Task", "Value": "DGL"}]

estimator = PyTorch(
    entry_point=CODE_PATH,
    source_dir='src',
    role=role,
    instance_count=1,
    # GPU alternative: instance_type="ml.p3.2xlarge"
    instance_type='ml.c4.2xlarge',
    framework_version="1.7.1",
    py_version="py3",
    hyperparameters=params,
    sagemaker_session=sess,
    # Pass the tags defined above so the job is actually tagged.
    tags=task_tags,
    # Run the job inside the VPC, in the looked-up subnet and security group
    # (presumably so the training container can reach DocumentDB — confirm).
    subnets=[nat_subnet_id],
    security_group_ids=[sg_id],
)

In [12]:
# estimator.get_vpc_config()

## Running the training job

In [13]:
# Launch the training job. No input data channels are passed, and the job's
# log is streamed into the cell output.
estimator.fit()

2021-09-03 20:38:54 Starting - Starting the training job...
2021-09-03 20:38:57 Starting - Launching requested ML instancesProfilerReport-1630701533: InProgress
......
2021-09-03 20:40:15 Starting - Preparing the instances for training.........
2021-09-03 20:41:40 Downloading - Downloading input data...
2021-09-03 20:42:21 Training - Training image download completed. Training in progress..[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2021-09-03 20:42:22,004 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2021-09-03 20:42:22,005 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2021-09-03 20:42:22,016 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2021-09-03 20:42:28,299 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m