In [62]:
!pip install "sagemaker>=2.48.0" "transformers==4.6.1" "datasets[s3]==1.6.2" --upgrade

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting botocore==1.19.52
  Using cached botocore-1.19.52-py2.py3-none-any.whl (7.2 MB)
Collecting boto3==1.16.43
  Using cached boto3-1.16.43-py2.py3-none-any.whl (130 kB)
Collecting s3transfer<0.4.0,>=0.3.0
  Using cached s3transfer-0.3.7-py2.py3-none-any.whl (73 kB)
Collecting fsspec
  Downloading fsspec-2021.4.0-py3-none-any.whl (108 kB)
[K     |████████████████████████████████| 108 kB 7.5 MB/s eta 0:00:01
Installing collected packages: botocore, s3transfer, fsspec, boto3
  Attempting uninstall: botocore
    Found existing installation: botocore 1.23.11
    Uninstalling botocore-1.23.11:
      Successfully uninstalled

In [12]:
import sagemaker

# Bootstrap session, used below to look up the default bucket.
sess = sagemaker.Session()

# Bucket used for uploading data, models and logs.
# Leave as None to fall back to the session's default bucket
# (SageMaker creates that bucket automatically if it does not exist).
sagemaker_session_bucket = None
if sess is not None and sagemaker_session_bucket is None:
    sagemaker_session_bucket = sess.default_bucket()

# Execution role and the session that the remaining cells work with.
role = sagemaker.get_execution_role()
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

print("sagemaker role arn: {}".format(role))
print("sagemaker bucket: {}".format(sess.default_bucket()))
print("sagemaker session region: {}".format(sess.boto_region_name))

sagemaker role arn: arn:aws:iam::086613482928:role/service-role/AmazonSageMaker-ExecutionRole-20200122T153461
sagemaker bucket: sagemaker-us-west-2-086613482928
sagemaker session region: us-west-2


In [63]:
import sagemaker.huggingface

# Preprocessing

We are using the `datasets` library to download and preprocess the `imdb` dataset. After preprocessing, the dataset will be uploaded to our `sagemaker_session_bucket` to be used within our training job. The [imdb](http://ai.stanford.edu/~amaas/data/sentiment/) dataset consists of 25000 training and 25000 testing highly polar movie reviews.

## Tokenization 

In [64]:
from datasets import load_dataset
from transformers import AutoTokenizer

# --- preprocessing configuration -------------------------------------------
# Pretrained tokenizer applied to the raw text.
tokenizer_name = "distilbert-base-uncased"

# Dataset identifier passed to `load_dataset`.
dataset_name = "imdb"

# Key prefix under which the processed splits are stored in S3.
s3_prefix = "samples/datasets/imdb"

In [65]:
# Load the dataset once; the returned DatasetDict is indexed by split name.
dataset = load_dataset(dataset_name)

# download tokenizer
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)


def tokenize(batch):
    """Tokenize the ``text`` column of a batch.

    Calls the tokenizer with ``padding='max_length'`` and ``truncation=True``.
    """
    return tokenizer(batch['text'], padding='max_length', truncation=True)


# Take the splits from the dataset loaded above instead of loading it a second
# time with a hard-coded name.
train_dataset = dataset['train']
# Shrink the test split to 10k examples; the fixed seed makes the selected
# subset the same on every run.
test_dataset = dataset['test'].shuffle(seed=42).select(range(10000))

# tokenize dataset
train_dataset = train_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)

# set format for pytorch
train_dataset = train_dataset.rename_column("label", "labels")
train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
test_dataset = test_dataset.rename_column("label", "labels")
test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

Reusing dataset imdb (/home/ec2-user/.cache/huggingface/datasets/imdb/plain_text/1.0.0/4ea52f2e58a08dbc12c2bd52d0d92b30b88c00230b4522801b3636782f625c5b)
Reusing dataset imdb (/home/ec2-user/.cache/huggingface/datasets/imdb/plain_text/1.0.0/4ea52f2e58a08dbc12c2bd52d0d92b30b88c00230b4522801b3636782f625c5b)


HBox(children=(FloatProgress(value=0.0, max=25.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))




## Uploading data to `sagemaker_session_bucket`

After processing the dataset, we use the `FileSystem` [integration](https://huggingface.co/docs/datasets/filesystems.html) of the `datasets` library to upload the train and test splits to S3.

In [66]:
import botocore
from datasets.filesystems import S3FileSystem

# Filesystem handle passed to `save_to_disk` for writing to S3.
s3 = S3FileSystem()

# Destination URIs for the two splits inside the session bucket.
training_input_path = "s3://{}/{}/train".format(sess.default_bucket(), s3_prefix)
test_input_path = "s3://{}/{}/test".format(sess.default_bucket(), s3_prefix)

# Write both processed splits to their S3 locations.
train_dataset.save_to_disk(training_input_path, fs=s3)
test_dataset.save_to_disk(test_input_path, fs=s3)

In [67]:
!pip install boto3 awscli botocore --upgrade

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting boto3
  Downloading boto3-1.20.12-py3-none-any.whl (131 kB)
[K     |████████████████████████████████| 131 kB 6.7 MB/s eta 0:00:01
Collecting awscli
  Downloading awscli-1.22.12-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 37.9 MB/s eta 0:00:01
Collecting botocore
  Downloading botocore-1.23.12-py3-none-any.whl (8.2 MB)
[K     |████████████████████████████████| 8.2 MB 104.6 MB/s eta 0:00:01
Collecting s3transfer<0.6.0,>=0.5.0
  Using cached s3transfer-0.5.0-py3-none-any.whl (79 kB)
Installing collected packages: botocore, s3transfer, boto3, awscli
  Attempting uninstall: botocore
   

In [68]:
import boto3
# Low-level SageMaker API client used by the manual (non-SDK) steps below.
client = boto3.client(service_name="sagemaker")

# Steps automated by the SageMaker Python SDK that are manual with boto3 or any other AWS SDK (Java, Go, etc.)


In [69]:
# 1. Customers need to package the training code and upload it to an S3 bucket using the boto3 S3 client, e.g. to s3://sagemaker-us-west-2-086613482928/huggingface-pytorch-training-2021-11-22-20-14-36-456/source/sourcedir.tar.gz
# 2. Customers need to find the right training image name and populate it in the API along with other parameters as mentioned below. Most of the below parameters are populated by Sagemaker Python SDK automagically when using HuggingFaceEstimator() class and then using HuggingFaceEstimator.fit() which also finds the right training image
# 3. Customers need to use a training waiter to wait for the training job to complete. Sagemaker python SDK automatically implements the waiter in the HuggingFaceEstimator.fit()
# 4. Customers need to call 3 Sagemaker Hosting Cloud APIs to deploy a trained model and create an endpoint. These 3 APIs are automatically called by Sagemaker Python SDK using the HuggingFaceEstimator.deploy() method
# 5. Customers need to find the right inference image to pass in the above APIs - Sagemaker Python SDK automagically finds the correct inference image when using HuggingFaceEstimator.deploy()
# 6. Customers need to use an inference waiter to wait for the endpoint creation to complete. Sagemaker python SDK automatically implements the waiter in the HuggingFaceEstimator.deploy()
# 7. Customers need to use Sagemaker runtime APIs to call prediction on the endpoint. This is automated by Sagemaker Python SDK using predictor.predict(). predictor is the object created by Sagemaker Python SDK returned after calling HuggingFaceEstimator.deploy()


# TRAINING

# Package the training code

In [77]:
! cd scripts; tar -czvf ../sourcedir.tar.gz train.py

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
train.py


In [89]:
# `time` must be imported in this cell: the job name below needs it, and a
# fresh kernel has not imported it yet at this point.
import time

import boto3

# The timestamp suffix gives each run its own job name and S3 prefix.
training_job_name = "huggingface-pytorch-training-manual-{}".format(int(time.time()))

# Object key of the packaged training code, built once so the upload target and
# the URI handed to the training job cannot drift apart.
source_key = f'{training_job_name}/source/sourcedir.tar.gz'

s3 = boto3.resource('s3')
s3.meta.client.upload_file('sourcedir.tar.gz', sess.default_bucket(), source_key)

s3_code_directory = f's3://{sess.default_bucket()}/{source_key}'

# Create Training Job

In [90]:
import time

# Start the training job through the low-level API. The execution role, output
# bucket and region are taken from `role` / `sess` (set up earlier) instead of
# being hard-coded to one AWS account.
response = client.create_training_job(
    TrainingJobName=training_job_name,
    HyperParameters={
        # All values are strings; the ones wrapped in extra double quotes are
        # JSON-encoded string values.
        "epochs": "1",
        "model_name": '"distilbert-base-uncased"',
        "sagemaker_container_log_level": "20",
        "sagemaker_job_name": training_job_name,
        "sagemaker_program": '"train.py"',
        "sagemaker_region": f'"{sess.boto_region_name}"',
        "sagemaker_submit_directory": s3_code_directory,
        "train_batch_size": "32"
    },
    AlgorithmSpecification={
        # NOTE: this image URI points at the us-west-2 registry; it has to be
        # looked up manually and adjusted when running in another region.
        'TrainingImage': '763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-training:1.7-transformers4.6-gpu-py36-cu110-ubuntu18.04',
        'TrainingInputMode': 'File',
        'EnableSageMakerMetricsTimeSeries': True
    },
    RoleArn=role,
    InputDataConfig=[
        {
            'ChannelName': 'train',
            'DataSource': {
                'S3DataSource': {
                    'S3DataType': 'S3Prefix',
                    'S3Uri': training_input_path,
                    'S3DataDistributionType': 'FullyReplicated'
                }
            }
        },
        {
            'ChannelName': 'test',
            'DataSource': {
                'S3DataSource': {
                    'S3DataType': 'S3Prefix',
                    'S3Uri': test_input_path,
                    'S3DataDistributionType': 'FullyReplicated'
                }
            }
        },
    ],
    OutputDataConfig={
        # Same bucket the datasets and the training code were uploaded to.
        'S3OutputPath': f's3://{sess.default_bucket()}/'
    },
    ResourceConfig={
        'InstanceType': 'ml.p3.8xlarge',
        'InstanceCount': 1,
        'VolumeSizeInGB': 30
    },
    StoppingCondition={
        'MaxRuntimeInSeconds': 86400  # 24 hours
    },
    EnableNetworkIsolation=False,
    EnableInterContainerTrafficEncryption=False,
    EnableManagedSpotTraining=False
)

In [91]:
# Block until the training job is completed or stopped, polling with a
# 60-second delay between checks.
waiter = client.get_waiter("training_job_completed_or_stopped")
waiter.wait(TrainingJobName=training_job_name, WaiterConfig={"Delay": 60})

# DEPLOY MODEL

# Create Model

In [110]:
model_name = "model-{}".format(training_job_name)

# Ask SageMaker where this training job wrote its model artifact instead of
# hard-coding a path that belongs to a different job.
training_job = client.describe_training_job(TrainingJobName=training_job_name)
if training_job['TrainingJobStatus'] != 'Completed':
    # The training waiter also returns for stopped jobs, so check before deploying.
    raise RuntimeError(
        "Training job {} ended with status {}; no model to deploy.".format(
            training_job_name, training_job['TrainingJobStatus']
        )
    )
model_data_url = training_job['ModelArtifacts']['S3ModelArtifacts']

response = client.create_model(
    ModelName=model_name,
    PrimaryContainer={
        'ContainerHostname': 'Container1',
        # NOTE: this image URI points at the us-west-2 registry; it has to be
        # looked up manually and adjusted when running in another region.
        'Image': '763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-inference:1.7-transformers4.6-gpu-py36-cu110-ubuntu18.04',
        'Mode': 'SingleModel',
        'ModelDataUrl': model_data_url,
        'Environment': {
            'SAGEMAKER_CONTAINER_LOG_LEVEL': '20',
            'SAGEMAKER_REGION': sess.boto_region_name
        }
    },
    # Execution role resolved earlier via sagemaker.get_execution_role().
    ExecutionRoleArn=role
)

# Create Endpoint Config

In [111]:
# The name is capped at 63 characters. A plain [:63] slice would cut off the
# trailing timestamp that makes it unique, so re-runs would collide with an
# existing endpoint config. Shorten the middle and keep the suffix instead.
unique_suffix = model_name.rsplit("-", 1)[-1]
endpoint_config_name = "endpoint-config-{}".format(model_name)
if len(endpoint_config_name) > 63:
    name_head = endpoint_config_name[:63 - len(unique_suffix) - 1].rstrip("-")
    endpoint_config_name = "{}-{}".format(name_head, unique_suffix)

In [112]:
# Single production variant: one ml.g4dn.xlarge instance serving the model
# created above and receiving all of the traffic weight.
production_variant = {
    "VariantName": model_name,
    "ModelName": model_name,
    "InitialInstanceCount": 1,
    "InstanceType": "ml.g4dn.xlarge",
    "InitialVariantWeight": 1,
}

response = client.create_endpoint_config(
    EndpointConfigName=endpoint_config_name,
    ProductionVariants=[production_variant],
)

# Create Endpoint

In [113]:
# The name is capped at 63 characters. A plain [:63] slice would drop the
# timestamp completely, so every run would produce the same endpoint name and
# fail on re-run. Shorten the middle and keep the unique suffix instead.
unique_suffix = model_name.rsplit("-", 1)[-1]
endpoint_name = "endpoint-{}".format(endpoint_config_name)
if len(endpoint_name) > 63:
    name_head = endpoint_name[:63 - len(unique_suffix) - 1].rstrip("-")
    endpoint_name = "{}-{}".format(name_head, unique_suffix)

In [114]:
# Create the endpoint from the endpoint configuration defined above.
endpoint_request = {
    "EndpointName": endpoint_name,
    "EndpointConfigName": endpoint_config_name,
}
response = client.create_endpoint(**endpoint_request)

In [115]:
# Block until the endpoint is in service, polling with a 60-second delay
# between checks.
waiter = client.get_waiter("endpoint_in_service")
waiter.wait(EndpointName=endpoint_name, WaiterConfig={"Delay": 60})

# RUN INFERENCE

# Invoke Endpoint

In [116]:
import json
# Runtime client: separate from the "sagemaker" control-plane client.
runtime_client = boto3.client(service_name="sagemaker-runtime")

# Request payload, sent to the endpoint as a JSON document.
sentiment_input = dict(inputs="I love using the new Inference DLC.")
request_body = json.dumps(sentiment_input)

response = runtime_client.invoke_endpoint(
    EndpointName=endpoint_name,
    ContentType="application/json",
    Body=request_body,
)
print(response)

{'ResponseMetadata': {'RequestId': '6a77dce7-373e-4000-80c4-12cb009b95b7', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': '6a77dce7-373e-4000-80c4-12cb009b95b7', 'x-amzn-invoked-production-variant': 'model-huggingface-pytorch-training-manual-1637715507', 'date': 'Wed, 24 Nov 2021 02:45:38 GMT', 'content-type': 'application/json', 'content-length': '48'}, 'RetryAttempts': 0}, 'ContentType': 'application/json', 'InvokedProductionVariant': 'model-huggingface-pytorch-training-manual-1637715507', 'Body': <botocore.response.StreamingBody object at 0x7fd5407a1e50>}


In [117]:
# Read the response body stream, then decode it as UTF-8 JSON.
raw_body = response["Body"].read()
response_body = json.loads(raw_body.decode("utf-8"))
print(response_body)

[{'label': 'LABEL_1', 'score': 0.9627289175987244}]
