#### Imports 

In [1]:
from sagemaker import get_execution_role, Session
from sagemaker.huggingface import HuggingFace
import sagemaker
import logging

##### Setup logger 

In [2]:
# Name the logger after the current module. The quoted literal '__name__'
# would create a logger that is literally called "__name__".
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
# Attach the stream handler only once so that re-running this cell does not
# stack handlers and emit every message several times.
if not logger.handlers:
    logger.addHandler(logging.StreamHandler())

In [3]:
# Report which SageMaker SDK release this notebook is running against.
sdk_version = sagemaker.__version__
logger.info(f'[Using SageMaker: {sdk_version}]')

[Using SageMaker: 2.59.5]


#### Essentials 

In [4]:
# Resolve the execution role, then open a SageMaker session and look up the
# default S3 bucket associated with it.
role = get_execution_role()
session = Session()
bucket = session.default_bucket()

### Create a HuggingFace estimator and start a SageMaker training job

In [5]:
# Print the training entry-point script (./src/train.py) with syntax highlighting.
!pygmentize ./src/train.py

[34mfrom[39;49;00m [04m[36mtransformers[39;49;00m [34mimport[39;49;00m AutoModelForSequenceClassification, Trainer, TrainingArguments, AutoTokenizer
[34mfrom[39;49;00m [04m[36msklearn[39;49;00m[04m[36m.[39;49;00m[04m[36mmetrics[39;49;00m [34mimport[39;49;00m accuracy_score, precision_recall_fscore_support
[34mfrom[39;49;00m [04m[36mdatasets[39;49;00m [34mimport[39;49;00m load_from_disk
[34mimport[39;49;00m [04m[36margparse[39;49;00m
[34mimport[39;49;00m [04m[36mlogging[39;49;00m
[34mimport[39;49;00m [04m[36mrandom[39;49;00m
[34mimport[39;49;00m [04m[36mtorch[39;49;00m
[34mimport[39;49;00m [04m[36msys[39;49;00m
[34mimport[39;49;00m [04m[36mos[39;49;00m


[34mif[39;49;00m [31m__name__[39;49;00m == [33m'[39;49;00m[33m__main__[39;49;00m[33m'[39;49;00m:
    parser = argparse.ArgumentParser()
    [37m# Hyperparameters sent by the client are passed as command-line arguments to the script[39;49;00m
    parser.add_argument([3

##### Define hyperparameters

In [6]:
# Hyperparameters handed to the estimator; the training script receives them
# as command-line arguments.
hyperparameters = dict(
    epochs=3,
    train_batch_size=64,
    eval_batch_size=64,
    model_name='distilbert-base-uncased',
)

Configuration for running training on smdistributed (Model Parallelism)

In [7]:
# MPI launcher settings.
mpi_options = dict(enabled=True, processes_per_host=8)

# Settings for the smdistributed model-parallel library.
smp_parameters = dict(
    microbatches=16,
    placement_strategy="cluster",
    pipeline="interleaved",
    optimize="speed",
    partitions=2,
    ddp=True,
)
smp_options = dict(enabled=True, parameters=smp_parameters)

# Combined distribution configuration passed to the estimator.
distribution = dict(
    smdistributed=dict(modelparallel=smp_options),
    mpi=mpi_options,
)

Define metric definitions 

In [8]:
# Names of the metrics to extract from the training logs.
metric_names = (
    "epoch",
    "train_runtime",
    "train_samples_per_second",
    "train_accuracy",
    "train_loss",
    "eval_accuracy",
    "eval_loss",
    "f1",
)

# Every metric shares one pattern: the metric name, anything up to an `=`,
# any run of non-digit characters, then the captured value up to end of line.
# A raw f-string keeps `\D` as a regex token; in a plain string literal it is
# an invalid escape sequence that triggers a Deprecation/SyntaxWarning.
metric_definitions = [
    {"Name": name, "Regex": rf"{name}.*=\D*(.*?)$"}
    for name in metric_names
]

Instance configurations

In [9]:
# Training cluster configuration: number of instances, instance type and
# the storage volume size requested for each of them.
instance_count = 2
instance_type = 'ml.p3.16xlarge'
volume_size = 200

Create HuggingFace estimator

In [10]:
# Gather every estimator setting in one mapping, then build the estimator
# from it.
estimator_config = dict(
    entry_point='train.py',
    source_dir='./src',
    metric_definitions=metric_definitions,
    instance_type=instance_type,
    instance_count=instance_count,
    volume_size=volume_size,
    role=role,
    transformers_version='4.6',
    pytorch_version='1.7',
    py_version='py36',
    distribution=distribution,
    hyperparameters=hyperparameters,
)
huggingface_estimator = HuggingFace(**estimator_config)

##### Fit model

In [11]:
# Both dataset splits live under the same prefix in the default bucket.
imdb_prefix = f's3://{bucket}/imdb'
training_input_path = f'{imdb_prefix}/train'
test_input_path = f'{imdb_prefix}/test'

In [12]:
%%time

huggingface_estimator.fit({'train': training_input_path, 'test': test_input_path}, logs=False)


2021-10-14 18:50:26 Starting - Starting the training job
2021-10-14 18:50:30 Starting - Launching requested ML instances....................
2021-10-14 18:52:17 Starting - Preparing the instances for training.................................
2021-10-14 18:55:05 Downloading - Downloading input data...
2021-10-14 18:55:24 Training - Downloading the training image............
2021-10-14 18:56:28 Training - Training image download completed. Training in progress............................................................
2021-10-14 19:01:34 Uploading - Uploading generated training model........
2021-10-14 19:02:20 Completed - Training job completed
CPU times: user 675 ms, sys: 56.1 ms, total: 731 ms
Wall time: 11min 54s


##### Retrieve estimator parameters 

In [13]:
# Log where the estimator reports the trained model artifact to be stored.
model_artifact_uri = huggingface_estimator.model_data
logger.info(f'S3 uri where the trained model is located: {model_artifact_uri}')

S3 uri where the trained model is located: s3://sagemaker-us-east-1-119174016168/huggingface-pytorch-training-2021-10-14-18-50-26-037/output/model.tar.gz


In [14]:
# Log the name of the most recent training job launched by this estimator.
training_job_name = huggingface_estimator.latest_training_job.name
logger.info(f'Latest training job name for this estimator: {training_job_name}')

Latest training job name for this estimator: huggingface-pytorch-training-2021-10-14-18-50-26-037


#### Deploying the endpoint

In [None]:
# Deploy the trained model to an endpoint backed by a single instance.
predictor = huggingface_estimator.deploy(
    initial_instance_count=1,
    instance_type='ml.g4dn.xlarge',
)

-------------

Make inference using the deployed sentiment classifier model

In [None]:
# Sample request payload for the deployed sentiment classifier.
sentiment_input = dict(inputs="I love using the new Inference DLC.")

In [None]:
# Send the sample payload to the endpoint and display the returned prediction.
response = predictor.predict(data=sentiment_input)
response