# Training Amazon SageMaker models by using the Deep Graph Library with PyTorch backend

Reference notebook: https://github.com/aws/amazon-sagemaker-examples/blob/master/sagemaker-python-sdk/dgl_gcn/pytorch_gcn.ipynb

## Setup

Define the SageMaker session, the S3 bucket, and the IAM execution role that are needed later in the example.

In [1]:
import sagemaker
from sagemaker import get_execution_role
from sagemaker.session import Session

# Create the SageMaker session; later cells reuse it as `sess`
# (boto session lookups and the estimator's `sagemaker_session`).
sess = sagemaker.Session()

# S3 bucket for saving code and model artifacts.
# Defaults to the session's default bucket; feel free to specify a different bucket here.
bucket = sess.default_bucket()

# Optional (currently disabled): location to put your custom code.
# custom_code_upload_location = "customcode"

# IAM execution role that gives Amazon SageMaker access to resources in your AWS account.
# You can use the Amazon SageMaker Python SDK to get the role from the notebook environment.
role = get_execution_role()

In [2]:
# Display the execution role resolved in the setup cell.
# NOTE(review): the recorded output is a full role ARN, which includes the AWS
# account ID — clear this cell's output before sharing the notebook.
role

'arn:aws:iam::811425317877:role/JanssenMLSLSageMakerNotebookRole'

## The training script

In [2]:
!cat src/main.py

import argparse
import json
import os
import math
import random
from datetime import datetime

from pymongo import MongoClient
import dgl
from dgl.nn import GraphConv
import numpy as np
import torch
import torch.nn as nn
import torch.utils.data as data
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score

# amino acid to index mapping
# from Bio.PDB.Polypeptide import d1_to_index
d1_to_index = {
    "A": 0,
    "C": 1,
    "D": 2,
    "E": 3,
    "F": 4,
    "G": 5,
    "H": 6,
    "I": 7,
    "K": 8,
    "L": 9,
    "M": 10,
    "N": 11,
    "P": 12,
    "Q": 13,
    "R": 14,
    "S": 15,
    "T": 16,
    "V": 17,
    "W": 18,
    "Y": 19,
    "X": 20,
}


def setup(args, seed=0):
    args["device"] = "cuda" if torch.cuda.is_available() else "cpu"

    # Set random seed
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
    return args


def collate_protein_graphs(sample

## SageMaker's estimator class

In [3]:
import json

# Load the DocumentDB connection settings from a local JSON file. Later cells
# read the 'host', 'db_username' and 'db_password' keys from this dict.
# The context manager closes the file handle as soon as the JSON is parsed,
# instead of leaving it open until the object is garbage-collected.
with open('DocumentDB_secrets.json', 'r') as secrets_file:
    secrets = json.load(secrets_file)

In [4]:
from sagemaker.pytorch import PyTorch

# Entry-point script name; combined with source_dir='src' below this presumably
# resolves to the src/main.py shown earlier in the notebook.
CODE_PATH = "main.py"
# AWS account ID and region of the current session.
# NOTE(review): neither value is referenced again in this cell — confirm whether
# a later cell needs them before removing.
account = sess.boto_session.client("sts").get_caller_identity()["Account"]
region = sess.boto_session.region_name

# Hyperparameters forwarded to the training script.
# NOTE(review): the database credentials are passed as hyperparameters. SageMaker
# records hyperparameters with the training job, so they are likely readable by
# anyone who can describe the job or view it in the console — confirm, and
# consider having the script fetch them from a secrets store instead.
params = {
    'patience': 5, 
    'n-epochs': 10,
    'db-host': secrets['host'],
    'db-username': secrets['db_username'], 
    'db-password': secrets['db_password'], 
    
}
# Tags attached to the training job.
task_tags = [{"Key": "ML Task", "Value": "DGL"}]
estimator = PyTorch(
    entry_point=CODE_PATH,
    source_dir='src',
    role=role,
    instance_count=1,
    # Previously tried instance type, kept for reference:
#     instance_type="ml.p3.2xlarge",
    instance_type='ml.c4.2xlarge',
    # Previously tried framework version, kept for reference:
#     framework_version="1.3.1",
    framework_version="1.7.1",
    py_version="py3",
    debugger_hook_config=False,
    tags=task_tags,
    hyperparameters=params,
    sagemaker_session=sess,
    # Run the job inside this subnet / these security groups
    # (presumably so the job can reach the database host — confirm).
    subnets=['subnet-e008bdbf'],
    security_group_ids=['sg-069bf37128d412109', 'sg-026342aa24fe27af0'],
    # Disabled alternative carrying the same subnet and security-group values:
#     vpc_config_override={
#         'Subnets': ['subnet-e008bdbf'],
#         'SecurityGroupIds': ['sg-069bf37128d412109', 'sg-026342aa24fe27af0']
#     }
)

In [5]:
# Inspection cell: shows the repr of the session object created in the setup cell.
sess

<sagemaker.session.Session at 0x7f89a83a9748>

In [6]:
# Check that the estimator carries the subnet and security groups passed to its constructor.
estimator.get_vpc_config()

{'Subnets': ['subnet-e008bdbf'],
 'SecurityGroupIds': ['sg-069bf37128d412109', 'sg-026342aa24fe27af0']}

In [7]:
# Inspection cell: confirm the estimator's class.
type(estimator)

sagemaker.pytorch.estimator.PyTorch

## Running the Training Job

In [None]:
# Launch the training job. fit() is called without any input data channels;
# presumably the training script reads its data from the database configured
# through the 'db-*' hyperparameters — confirm against src/main.py.
estimator.fit()

2021-08-25 17:52:44 Starting - Starting the training job...
2021-08-25 17:53:02 Starting - Launching requested ML instancesProfilerReport-1629913964: InProgress
.........
2021-08-25 17:54:43 Starting - Preparing the instances for training......
2021-08-25 17:55:43 Downloading - Downloading input data...
2021-08-25 17:56:14 Training - Training image download completed. Training in progress..[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2021-08-25 17:56:15,724 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2021-08-25 17:56:15,726 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2021-08-25 17:56:15,737 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2021-08-25 17:56:16,365 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
