Use Boto3 to create a training job

https://github.com/aws/amazon-sagemaker-examples/blob/master/sagemaker-python-sdk/scikit_learn_randomforest/Sklearn_on_SageMaker_end2end.ipynb

Intro to SageMaker Container
https://medium.com/@krishna.yerramsetty/amazon-sagemaker-a-short-guide-c4040d85b54c

Extending SageMaker container
https://github.com/aws/amazon-sagemaker-examples/blob/master/advanced_functionality/pytorch_extending_our_containers/pytorch_extending_our_containers.ipynb


## Brief intro to boto3

Requirements:
    - check that your AWS secret access key and access key ID are set in your environment variables (`AWS_SECRET_ACCESS_KEY` and `AWS_ACCESS_KEY_ID`)

In [4]:
import boto3  # your Python gateway to all AWS services
import pprint  # pretty-prints nested dictionaries in a readable layout
import json


# Shared pretty-printer used to display API responses in the cells below
pp = pprint.PrettyPrinter(indent=1)
# Low-level IAM client used by the role and policy helper functions
iam = boto3.client('iam')

## Setting up IAM execution role
- What is an execution role?
- Why do we need one?

### Attach policy 

- Trust relationship
- Permissions

see some example policies

resources for policies
https://docs.aws.amazon.com/IAM/latest/UserGuide/access_policies.html

resources for ARN format
https://docs.aws.amazon.com/quicksight/latest/APIReference/qs-arn-format.html

```
arn:<partition>:<service>:<region>:<account-id>:<resource-type>/<resource-id>
```




In [39]:
def create_execution_role(role_name="basic-role"):
    """Create (or re-create) an IAM execution role that SageMaker can assume.

    If a role called ``role_name`` already exists it is deleted and created
    again from scratch, so the returned role always carries exactly the trust
    policy defined here and no permissions.

    Warning: this is destructive. Make sure ``role_name`` is not used in
    production, because the existing role is deleted before the new one is
    created.

    Args:
        role_name (str): name of the role

    Returns:
        dict: the response of ``iam.create_role``; the description of the
        new role is stored under the ``"Role"`` key.
    """
    # Look the role up by name instead of scanning iam.list_roles():
    # list_roles() only returns the first page of results, so in an account
    # with many roles an existing role could be missed and create_role()
    # below would then fail because the name is already taken.
    try:
        iam.get_role(RoleName=role_name)
    except iam.exceptions.NoSuchEntityException:
        role_exists = False
    else:
        role_exists = True

    if role_exists:
        role = boto3.resource('iam').Role(role_name)

        # IAM refuses to delete a role that still has anything attached,
        # so strip it first.
        # 1. detach managed policies
        for attached in role.attached_policies.all():
            role.detach_policy(PolicyArn=attached.arn)

        # 2. delete inline policies
        for inline in role.policies.all():
            inline.delete()

        # 3. remove the role from instance profiles
        for profile in role.instance_profiles.all():
            profile.remove_role(RoleName=role_name)

        iam.delete_role(RoleName=role_name)

    # Trust relation: only the SageMaker service may assume this role
    trust_relation_policy_doc = {
        "Version": "2012-10-17",
        "Statement": [
            {
                "Effect": "Allow",
                "Principal": {
                    "Service": [
                        "sagemaker.amazonaws.com"
                    ],
                },
                "Action": "sts:AssumeRole",
            }
        ]
    }

    res = iam.create_role(
        RoleName=role_name,
        AssumeRolePolicyDocument=json.dumps(trust_relation_policy_doc)
    )
    return res

In [40]:
# Create the execution role under its default name and show the API response
role_res = create_execution_role(role_name="basic-role")
pp.pprint(role_res)

{'ResponseMetadata': {'HTTPHeaders': {'content-length': '783',
                                      'content-type': 'text/xml',
                                      'date': 'Fri, 26 Feb 2021 02:40:28 GMT',
                                      'x-amzn-requestid': '9e0ed659-05c8-48ed-8a52-286182fe2e5c'},
                      'HTTPStatusCode': 200,
                      'RequestId': '9e0ed659-05c8-48ed-8a52-286182fe2e5c',
                      'RetryAttempts': 0},
 'Role': {'Arn': 'arn:aws:iam::688520471316:role/basic-role',
          'AssumeRolePolicyDocument': {'Statement': [{'Action': 'sts:AssumeRole',
                                                      'Effect': 'Allow',
                                                      'Principal': {'Service': ['sagemaker.amazonaws.com']}}],
                                       'Version': '2012-10-17'},
          'CreateDate': datetime.datetime(2021, 2, 26, 2, 40, 28, tzinfo=tzlocal()),
          'Path': '/',
          'RoleId': 'AROA2ATY

Attach permission

In [26]:
# Permission policy document: allows reading, writing, deleting and listing
# on every S3 resource ("arn:aws:s3:::*").
basic_s3_permission = {
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Action": [
                "s3:GetObject",
                "s3:PutObject",
                "s3:DeleteObject",
                "s3:ListBucket",
            ],
            "Resource": ["arn:aws:s3:::*"],
        },
    ],
}

def attach_permission(role_name, policy_name, policy_doc):
    """Create a managed policy from ``policy_doc`` and attach it to a role.

    If a customer managed policy called ``policy_name`` already exists it is
    detached from every group, user and role, deleted, and created again
    from ``policy_doc``.

    Warning: this is destructive. Make sure you do not rely on a policy
    named ``policy_name`` in production, because it is deleted and replaced.

    Args:
        role_name (str): name of the role the policy is attached to
        policy_name (str): name of the managed policy to (re-)create
        policy_doc (dict): policy document; serialised with ``json.dumps``

    Returns:
        dict: the response of ``iam.attach_role_policy``.
    """
    # Search only customer managed policies (Scope='Local') and walk every
    # result page. A bare iam.list_policies() call returns just the first
    # page, which is mixed with AWS managed policies, so an existing policy
    # could be missed and create_policy() below would then fail because the
    # name is already taken.
    pages = iam.get_paginator('list_policies').paginate(Scope='Local')
    existing_arn = next(
        (
            found['Arn']
            for page in pages
            for found in page['Policies']
            if found['PolicyName'] == policy_name
        ),
        None,
    )

    if existing_arn is not None:
        policy = boto3.resource('iam').Policy(existing_arn)

        # Before we delete the policy, we need to detach it
        # from all IAM entities
        # 1. detach from all groups
        for grp in policy.attached_groups.all():
            policy.detach_group(GroupName=grp.name)

        # 2. detach from all users
        for usr in policy.attached_users.all():
            policy.detach_user(UserName=usr.name)

        # 3. detach from all roles
        for rol in policy.attached_roles.all():
            policy.detach_role(RoleName=rol.name)

        # 4. delete non-default versions; IAM will not delete a policy
        #    while it still has more than its default version
        for version in policy.versions.all():
            if not version.is_default_version:
                version.delete()

        iam.delete_policy(PolicyArn=existing_arn)

    # create a new policy
    policy = iam.create_policy(
        PolicyName=policy_name,
        PolicyDocument=json.dumps(policy_doc))['Policy']

    # attach the policy to the role
    res = iam.attach_role_policy(
        RoleName=role_name,
        PolicyArn=policy['Arn']
    )
    return res

In [29]:
# Create the S3 policy, attach it to the role created earlier, and show
# the API response.
perm_res = attach_permission(
    role_res['Role']['RoleName'],
    'Basic-S3-Permission',
    basic_s3_permission,
)
pp.pprint(perm_res)

{'ResponseMetadata': {'HTTPHeaders': {'content-length': '212',
                                      'content-type': 'text/xml',
                                      'date': 'Fri, 26 Feb 2021 02:13:55 GMT',
                                      'x-amzn-requestid': '8ff8eeab-6d9d-46e6-80e1-d652ad32d79a'},
                      'HTTPStatusCode': 200,
                      'RequestId': '8ff8eeab-6d9d-46e6-80e1-d652ad32d79a',
                      'RetryAttempts': 0}}


## Test your role

Now you have created an execution role `basic-role` with the following permissions policy attached:
```
basic_s3_permission = {
    "Version": "2012-10-17",
        "Statement": [
            {
                "Effect": "Allow",
                "Action": [
                    "s3:GetObject",
                    "s3:PutObject",
                    "s3:DeleteObject", 
                    "s3:ListBucket"
                ],
                "Resource": [
                    "arn:aws:s3:::*"
                ]
            }
        ]
    }
```
Let's test that it can indeed perform those actions on your behalf.

In [45]:
import os

# Select the "default" AWS profile for clients created after this point
os.environ["AWS_PROFILE"] = "default"

In [46]:
# Try to assume the freshly created role with the current credentials.
# NOTE: when this notebook was run, the call failed with AccessDenied.
# The trust policy written by create_execution_role() names only the
# sagemaker.amazonaws.com service as a principal, so the calling identity
# is presumably not trusted to assume the role. Add the caller to the trust
# policy's "Principal" if this call needs to succeed.
sts_client = boto3.client('sts')

assumed_role_object = sts_client.assume_role(
    RoleSessionName="AssumeRoleSession1",
    RoleArn=role_res["Role"]["Arn"],
)

ClientError: An error occurred (AccessDenied) when calling the AssumeRole operation: User: arn:aws:sts::688520471316:assumed-role/RL/botocore-session-1614303096 is not authorized to perform: sts:AssumeRole on resource: arn:aws:iam::688520471316:role/basic-role

In [32]:
sts_client.get_caller_identity()

{'UserId': 'AROA2ATYEUMKIU3KQG7TC:botocore-session-1614303096',
 'Account': '688520471316',
 'Arn': 'arn:aws:sts::688520471316:assumed-role/RL/botocore-session-1614303096',
 'ResponseMetadata': {'RequestId': '9e8c9a66-c79e-40e3-aadb-0f3c515cd2e6',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '9e8c9a66-c79e-40e3-aadb-0f3c515cd2e6',
   'content-type': 'text/xml',
   'content-length': '463',
   'date': 'Fri, 26 Feb 2021 02:25:49 GMT'},
  'RetryAttempts': 0}}

## How Amazon SageMaker Runs Your Container
https://docs.aws.amazon.com/sagemaker/latest/dg/your-algorithms-training-algo-dockerfile.html


In [75]:
import boto3

# Clients: SageMaker for the training job, S3 for staging the input data
sm_boto3 = boto3.client('sagemaker')
s3 = boto3.client('s3')

# Job name with a time suffix; current_time() is not defined in this cell
# and must already exist in the session.
training_job_name = 'test-training-job-{}'.format(current_time())

bucket = '688520471316-sagemaker-us-west-2'

# Input data for the job lives under <bucket>/<job name>/data
data_path = "s3://{}/{}/data".format(bucket, training_job_name)

# Stage the local training file under the data prefix
train_file = "boston_train.csv"
s3.upload_file(
    Filename=train_file,
    Bucket=bucket,
    Key='{}/data/{}'.format(training_job_name, train_file),
)

# Prefix where SageMaker saves the model artifacts
output_path = "s3://{}/{}/output".format(bucket, training_job_name)

# Custom training image pulled from ECR; input is provided in File mode
algorithm_specification = {
    'TrainingImage': "688520471316.dkr.ecr.us-west-2.amazonaws.com/test:latest",
    'TrainingInputMode': 'File',
}

role_arn = "arn:aws:iam::688520471316:role/RL"

# The 'train' and 'test' channels both read from the same S3 prefix
input_data_config = [
    {
        'ChannelName': channel,
        'DataSource': {
            'S3DataSource': {
                'S3DataType': 'S3Prefix',
                'S3Uri': data_path,
                'S3DataDistributionType': 'FullyReplicated',
            },
        },
    }
    for channel in ('train', 'test')
]

output_data_config = {'S3OutputPath': output_path}

resource_config = {
    'InstanceType': 'ml.m5.large',
    'InstanceCount': 1,
    'VolumeSizeInGB': 10,
}

stopping_condition = {
    'MaxRuntimeInSeconds': 120,
    'MaxWaitTimeInSeconds': 123,
}

enable_network_isolation = False

# Submit the training job
res = sm_boto3.create_training_job(
    TrainingJobName=training_job_name,
    #HyperParameters=hyperparameters,
    AlgorithmSpecification=algorithm_specification,
    RoleArn=role_arn,
    InputDataConfig=input_data_config,
    OutputDataConfig=output_data_config,
    ResourceConfig=resource_config,
    StoppingCondition=stopping_condition,
    EnableNetworkIsolation=enable_network_isolation,
    EnableManagedSpotTraining=True,  # set it to False if do not want managed spot training
)



In [73]:
import pprint

# Fetch the description of the job named by training_job_name and
# pretty-print the whole response.
pp = pprint.PrettyPrinter(indent=1)
res = sm_boto3.describe_training_job(TrainingJobName=training_job_name)
pp.pprint(res)

{'AlgorithmSpecification': {'EnableSageMakerMetricsTimeSeries': False,
                            'TrainingImage': '688520471316.dkr.ecr.us-west-2.amazonaws.com/test:latest',
                            'TrainingInputMode': 'File'},
 'CreationTime': datetime.datetime(2021, 2, 24, 1, 57, 13, 385000, tzinfo=tzlocal()),
 'EnableInterContainerTrafficEncryption': False,
 'EnableManagedSpotTraining': False,
 'EnableNetworkIsolation': False,
 'InputDataConfig': [{'ChannelName': 'train',
                      'CompressionType': 'None',
                      'DataSource': {'S3DataSource': {'S3DataDistributionType': 'FullyReplicated',
                                                      'S3DataType': 'S3Prefix',
                                                      'S3Uri': 's3://688520471316-sagemaker-us-west-2/test-training-job-2021-02-24-01-57-13/data'}},
                      'RecordWrapperType': 'None'},
                     {'ChannelName': 'test',
                      'CompressionType':

In [50]:
training_job_name

'test-training-job-2021-02-24-00-08-16'

In [1]:
# describe-training-job: look at the parameters of a successful training job


res = sm.describe_training_job(
    TrainingJobName=training_
)

In [5]:
import pprint
pp = pprint.PrettyPrinter(indent=1)
pp.pprint(res)


{'AlgorithmSpecification': {'EnableSageMakerMetricsTimeSeries': False,
                            'MetricDefinitions': [{'Name': 'train:mae',
                                                   'Regex': '.*\\[[0-9]+\\].*#011train-mae:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*'},
                                                  {'Name': 'validation:aucpr',
                                                   'Regex': '.*\\[[0-9]+\\].*#011validation-aucpr:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*'},
                                                  {'Name': 'train:merror',
                                                   'Regex': '.*\\[[0-9]+\\].*#011train-merror:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*'},
                                                  {'Name': 'train:gamma-nloglik',
                                                   'Regex': '.*\\[[0-9]+\\].*#011train-gamma-nloglik:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*'},
                                         

In [76]:
# Show why a training job failed. 'FailureReason' is only expected in the
# response for jobs that failed, so fall back to the job status instead of
# raising KeyError for jobs that are still running or have succeeded.
r = sm_boto3.describe_training_job(
    TrainingJobName='my-awesome-training-job')
print(r.get(
    'FailureReason',
    'No FailureReason reported (status: {})'.format(r.get('TrainingJobStatus'))))

AlgorithmError: framework error: 
Traceback (most recent call last):
  File "/miniconda3/lib/python3.7/site-packages/sagemaker_containers/_trainer.py", line 84, in train
    entrypoint()
  File "/miniconda3/lib/python3.7/site-packages/sagemaker_sklearn_container/training.py", line 39, in main
    train(environment.Environment())
  File "/miniconda3/lib/python3.7/site-packages/sagemaker_sklearn_container/training.py", line 35, in train
    runner_type=runner.ProcessRunnerType)
  File "/miniconda3/lib/python3.7/site-packages/sagemaker_training/entry_point.py", line 92, in run
    files.download_and_extract(uri=uri, path=environment.code_dir)
  File "/miniconda3/lib/python3.7/site-packages/sagemaker_training/files.py", line 131, in download_and_extract
    s3_download(uri, dst)
  File "/miniconda3/lib/python3.7/site-packages/sagemaker_training/files.py", line 167, in s3_download
    s3.Bucket(bucket).download_file(key, dst)
  File "/miniconda3/lib/python3.7/site-packages/boto3/s3/inject.p