Hugging Face Transformer finetuning with SageMaker SDK: https://huggingface.co/docs/sagemaker/train

In [3]:
#!pip install transformers "datasets[s3]" --upgrade
#!pip install "sagemaker>=2.31.0" "transformers==4.4.2" "datasets[s3]==1.5.0" --upgrade
#!pip install "sagemaker>=2.48.0" "transformers==4.12.3" "datasets[s3]==1.18.3" --upgrade

Set up the execution role and default bucket

In [2]:
import boto3
import os
import sagemaker
import json

# Default SageMaker session; used below to look up the default bucket.
session = sagemaker.Session()

# Execution role ARN for this notebook; the role name is the text after the
# last '/' in the ARN.
role = sagemaker.get_execution_role()
role_name = role.rsplit('/', 1)[-1]

# Bucket used throughout this walkthrough. To use a custom bucket instead,
# assign its name here, e.g.:
#sagemaker_session_bucket = "walkthrough-bucket-hf-aws"
sagemaker_session_bucket = session.default_bucket()

In [3]:
# Rebuild the session so it is explicitly bound to the chosen bucket.
session = sagemaker.Session(default_bucket=sagemaker_session_bucket)

# Echo the resolved role, bucket and region as a quick sanity check.
print(
    f"sagemaker role arn: {role}",
    f"sagemaker bucket: {session.default_bucket()}",
    f"sagemaker session region: {session.boto_region_name}",
    sep="\n",
)

sagemaker role arn: arn:aws:iam::264639154954:role/aaca-ani-cogsci-sagemaker-studio-role
sagemaker bucket: sagemaker-us-east-1-264639154954
sagemaker session region: us-east-1


In [4]:
import sagemaker.huggingface

Preprocessing: use the `datasets` library to load and tokenize the IMDb dataset, then upload the processed data to the bucket named by `sagemaker_session_bucket`.

In [5]:
from datasets import load_dataset
from transformers import AutoTokenizer

In [6]:
# Dataset identifier passed to `load_dataset`.
dataset_name = "imdb"

# Checkpoint name passed to `AutoTokenizer.from_pretrained`.
tokenizer_name = "distilbert-base-uncased"

# Key prefix (inside the session bucket) under which the processed splits are saved.
s3_prefix = "samples/datasets/imdb"

Load data and preprocess outside of training job

In [7]:
# Tokenizer for the checkpoint named in `tokenizer_name`.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

# Load the dataset named in `dataset_name`, with dataset verification
# disabled via `ignore_verifications`.
dataset = load_dataset(path=dataset_name, ignore_verifications=True)

def tokenize(batch):
    """Tokenize the 'text' field of a batch with the module-level `tokenizer`.

    Args:
        batch: Mapping with a 'text' entry holding the raw texts to encode.

    Returns:
        The tokenizer's output for those texts, padded with
        ``padding='max_length'`` and truncated with ``truncation=True``.
    """
    texts = batch['text']
    return tokenizer(texts, padding='max_length', truncation=True)

# Load the train and test splits of the dataset named in `dataset_name`
# (rather than repeating the 'imdb' literal, so the config cell stays the
# single source of truth).
train_dataset, test_dataset = load_dataset(dataset_name, split=['train', 'test'])

# Keep a 10,000-example random subset of the test split. The fixed seed makes
# the subset identical on every run, so evaluation results are reproducible.
test_dataset = test_dataset.shuffle(seed=42).select(range(10000))

Reusing dataset imdb (/root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1)


  0%|          | 0/3 [00:00<?, ?it/s]

Reusing dataset imdb (/root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1)


  0%|          | 0/2 [00:00<?, ?it/s]

In [12]:
# Tokenize each split in one pass: `batch_size` is set to the split's length,
# so `tokenize` receives the whole split as a single batch.
train_dataset = train_dataset.map(
    function=tokenize,
    batched=True,
    batch_size=len(train_dataset),
)
test_dataset = test_dataset.map(
    function=tokenize,
    batched=True,
    batch_size=len(test_dataset),
)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [56]:
#train_dataset = train_dataset.rename_column("label", "labels")
#train_dataset.set_format("torch", columns={"input_ids", "attention_mask", "label"})
#test_dataset = test_dataset.rename_column("label", "labels")
#test_dataset.set_format("torch", columns={"input_ids", "attention_mask", "label"})

In [13]:
# Display the tokenized training split (its features and row count).
train_dataset

Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 25000
})

In [14]:
# Columns exposed as torch tensors once the format is set.
TORCH_COLUMNS = ["input_ids", "attention_mask", "labels"]


def _to_torch_format(split):
    """Rename 'label' to 'labels' and expose the model inputs as torch tensors.

    The rename is skipped when the 'label' column is no longer present, so
    re-running this cell does not fail on an already-renamed split.

    Args:
        split: A dataset split providing `column_names`, `rename_column`
            and `set_format`.

    Returns:
        The split with a 'labels' column and its format set to "torch" for
        the columns in `TORCH_COLUMNS`.
    """
    if "label" in split.column_names:
        split = split.rename_column("label", "labels")
    split.set_format(type="torch", columns=TORCH_COLUMNS)
    return split


train_dataset = _to_torch_format(train_dataset)
test_dataset = _to_torch_format(test_dataset)

In [15]:
import botocore
from datasets.filesystems import S3FileSystem

In [25]:
import s3fs

# One S3 filesystem handle serves both uploads. Creating a
# `datasets.filesystems.S3FileSystem` first and immediately overwriting it
# with this one was redundant, so only this instance is created.
s3 = s3fs.S3FileSystem()

# Common S3 location for the processed splits inside the session bucket.
dataset_root = f"s3://{session.default_bucket()}/{s3_prefix}"
training_input_path = f"{dataset_root}/train"
test_input_path = f"{dataset_root}/test"

# Write both processed splits to S3.
train_dataset.save_to_disk(training_input_path, fs=s3)
test_dataset.save_to_disk(test_input_path, fs=s3)

In [17]:
# Show where each split was written.
print(
    f"train_path: {training_input_path}",
    f"test_path: {test_input_path}",
    sep="\n",
)

train_path: s3://sagemaker-us-east-1-264639154954/samples/datasets/imdb/train
test_path: s3://sagemaker-us-east-1-264639154954/samples/datasets/imdb/test


In [37]:
from datasets import load_from_disk

# Read the training split back from S3 to confirm the upload is loadable.
data = load_from_disk(dataset_path=training_input_path, fs=s3)

In [38]:
# Display the split that was read back from S3 (its features and row count).
data

Dataset({
    features: ['text', 'labels', 'input_ids', 'attention_mask'],
    num_rows: 25000
})

In [31]:
# Debugging aid: list the instance attributes of the object returned by
# `load_dataset`. NOTE(review): looks like leftover exploration — confirm
# whether this cell is still needed.
dataset.__dict__.keys()

dict_keys([])

# Finetuning with Sagemaker training job
Here we are using the HuggingFace estimator. It handles end-to-end SageMaker training and deployment tasks. entry_point is the training script.

When we create a SageMaker training job, SageMaker starts and manages all the required EC2 instances for us with the Hugging Face container, uploads the provided fine-tuning script train.py, and downloads the data from our sagemaker_session_bucket into the container at /opt/ml/input/data. Then it runs the training job.

In [21]:
from sagemaker.huggingface import HuggingFace

In [28]:
# Show the installed `datasets` version for reference.
import datasets
datasets.__version__

'2.4.0'

In [27]:
# Hyperparameters handed to the estimator below.
hyperparameters = dict(
    epochs=1,
    train_batch_size=16,
    model_name="distilbert-base-uncased",
)

In [40]:
# Show the installed `transformers` version for reference.
import transformers
transformers.__version__

'4.12.3'

In [48]:
# Settings for the training job: the entry-point script and its source
# directory, the instance type and count, the execution role, and the
# framework versions requested for the container.
estimator_settings = {
    "entry_point": 'train.py',
    "source_dir": './scripts',
    "sagemaker_session": session,
    "instance_type": 'ml.p3.2xlarge',
    "instance_count": 1,
    "role": role,
    "transformers_version": '4.12',
    "py_version": 'py38',
    "pytorch_version": '1.9',
    "hyperparameters": hyperparameters,
}

huggingface_estimator = HuggingFace(**estimator_settings)

In [49]:
# Map each input channel name to the S3 location of its processed split,
# then start the training job.
data_channels = {
    "train": training_input_path,
    "test": test_input_path,
}
huggingface_estimator.fit(inputs=data_channels)

2023-01-08 17:16:39 Starting - Starting the training job...
2023-01-08 17:17:06 Starting - Preparing the instances for trainingProfilerReport-1673198199: InProgress
......
2023-01-08 17:18:10 Downloading - Downloading input data...
2023-01-08 17:18:30 Training - Downloading the training image........................
2023-01-08 17:22:42 Training - Training image download completed. Training in progress....[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2023-01-08 17:22:56,958 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2023-01-08 17:22:56,985 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2023-01-08 17:22:56,988 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2023-01-08 17:22:57,268 sagemaker-training-toolkit INFO     Invoking user script[0m
[34mTraining En

# Deploying the model

In [50]:
# Deploy the trained model to an endpoint backed by one ml.g4dn.xlarge instance.
predictor = huggingface_estimator.deploy(
    initial_instance_count=1,
    instance_type="ml.g4dn.xlarge",
)

----------!

# Use this object to call an endpoint

In [54]:
# Send one review to the endpoint; the response is a list containing a dict
# with 'label' and 'score'.
# NOTE(review): the input text may have been meant to read "can't" — confirm.
predictor.predict({"inputs": "I can say anything bad about this movie!"})

[{'label': 'LABEL_0', 'score': 0.8966643810272217}]

In [52]:
# Example of a clearly negative review.
predictor.predict({"inputs": "Utterly terrible movie!"})

[{'label': 'LABEL_0', 'score': 0.9913119077682495}]

In [53]:
# Example of a clearly positive review.
predictor.predict({"inputs": "Best thing I've seen this year!"})

[{'label': 'LABEL_1', 'score': 0.9924923777580261}]