In [3]:
#!pip install "sagemaker>=2.48.0" "transformers==4.12.3" "datasets[s3]==1.18.3" --upgrade

In [12]:
import boto3
import os
import sagemaker
import json

# Execution role of the current SageMaker environment; it is passed to the
# estimator and model below.
role = sagemaker.get_execution_role()
# Last path segment of the role string (the part after the final '/').
role_name = role.rsplit('/', 1)[-1]

# Default session and the bucket it resolves to; the bucket name is fed back
# into an explicit Session in the next cell.
session = sagemaker.Session()
sagemaker_session_bucket = session.default_bucket()

In [13]:
# Re-create the session, this time naming its default bucket explicitly.
session = sagemaker.Session(default_bucket=sagemaker_session_bucket)

# Echo the role, bucket and region this notebook will run against.
bucket_name = session.default_bucket()
region_name = session.boto_region_name
print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {bucket_name}")
print(f"sagemaker session region: {region_name}")

sagemaker role arn: arn:aws:iam::264639154954:role/aaca-ani-cogsci-sagemaker-studio-role
sagemaker bucket: sagemaker-us-east-1-264639154954
sagemaker session region: us-east-1


In [14]:
# Key prefix, inside the session's default bucket, under which the dataset
# splits are addressed.
s3_prefix = "samples/datasets/imdb"

# Both channels share the same root; only the final path segment differs.
dataset_root = f"s3://{session.default_bucket()}/{s3_prefix}"
training_input_path = f"{dataset_root}/train"
test_input_path = f"{dataset_root}/test"

print(training_input_path)
print(test_input_path)

s3://sagemaker-us-east-1-264639154954/samples/datasets/imdb/train
s3://sagemaker-us-east-1-264639154954/samples/datasets/imdb/test


In [15]:
from sagemaker.huggingface import HuggingFace

In [16]:
# Settings handed to the HuggingFace estimator through its `hyperparameters`
# argument.
hyperparameters = dict(
    epochs=1,
    train_batch_size=16,
    model_name="distilbert-base-uncased",
)

In [17]:
# Training-job definition: the entry script lives in ./scripts and runs on a
# single ml.p3.2xlarge instance with the framework versions pinned below.
huggingface_estimator = HuggingFace(
    # what to run
    entry_point="bespoke_training.py",
    source_dir="./scripts",
    hyperparameters=hyperparameters,
    # where to run it
    sagemaker_session=session,
    role=role,
    instance_type="ml.p3.2xlarge",
    instance_count=1,
    # framework versions
    transformers_version="4.12",
    pytorch_version="1.9",
    py_version="py38",
)

In [None]:
# Map each input channel name to the S3 location of its split, then start
# the training job.
data_channels = {"train": training_input_path, "test": test_input_path}
huggingface_estimator.fit(data_channels)

2023-01-16 11:04:05 Starting - Starting the training job...
2023-01-16 11:04:32 Starting - Preparing the instances for trainingProfilerReport-1673867044: InProgress
......
2023-01-16 11:05:35 Downloading - Downloading input data...
2023-01-16 11:06:01 Training - Downloading the training image.......................
2023-01-16 11:09:57 Training - Training image download completed. Training in progress...[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2023-01-16 11:10:20,181 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2023-01-16 11:10:20,213 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2023-01-16 11:10:20,216 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2023-01-16 11:10:20,453 sagemaker-training-toolkit INFO     Installing dependencies from requirements.t

# Now deploy the model so that it can be used for inference
There are a couple of easy ways to do this:
1. Directly from the estimator itself.
2. Using the HuggingFaceModel class to deploy a model stored in S3.

## Deploy from estimator

In [58]:
# Deploy the trained estimator to a single-instance endpoint and keep the
# returned predictor for inference calls.
predictor = huggingface_estimator.deploy(
    instance_type="ml.m4.xlarge",
    initial_instance_count=1,
)

's3://sagemaker-us-east-1-264639154954/huggingface-pytorch-training-2023-01-13-17-26-31-869/output/model.tar.gz'

## Deploy from HuggingFaceModel class

In [13]:
from sagemaker.huggingface import HuggingFaceModel

# Environment passed to the model: selects the text-classification task.
container_env = {"HF_TASK": "text-classification"}

# Build a deployable model from the artifact recorded on the estimator,
# reusing the estimator's PyTorch and Python versions.
huggingface_model = HuggingFaceModel(
    model_data=huggingface_estimator.model_data,  # S3 path of the trained model archive
    role=role,  # IAM role with permissions to create an Endpoint
    transformers_version="4.12",
    pytorch_version=huggingface_estimator.pytorch_version,
    py_version=huggingface_estimator.py_version,
    env=container_env,
)

In [14]:
# NOTE(review): this rebinds `predictor`. If the estimator-based deploy cell
# was also run, that earlier endpoint is no longer reachable through this
# name and the final `predictor.delete_endpoint()` will not remove it --
# confirm it is cleaned up separately.
predictor = huggingface_model.deploy(
    instance_type="ml.m4.xlarge",
    initial_instance_count=1,
)

-----!

## Test results by passing in a dict containing an "inputs" key and a text string.
LABEL_0 == 'negative'
LABEL_1 == 'positive'

In [64]:
# Send one review to the endpoint; as the cell's last expression, the
# returned prediction is displayed directly.
predictor.predict({"inputs":"i DID NOT LIKE THIS."})

[{'label': 'LABEL_0', 'score': 0.9453563094139099}]

In [66]:
# Same request as the previous cell, but keep the response so that its
# labels can be translated below.
sample_payload = {"inputs": "i DID NOT LIKE THIS."}
results = predictor.predict(sample_payload)

Convert the predictor's labels into negative/positive classes.

In [67]:
# Human-readable class names, ordered so that position i corresponds to the
# endpoint's "LABEL_i" string.
classes = ["negative", "positive"]
label_keys = [f"LABEL_{index}" for index in range(len(classes))]
id2label = dict(zip(label_keys, classes))
id2label

{'LABEL_0': 'negative', 'LABEL_1': 'positive'}

In [68]:
# Print the readable class name for every prediction in the response.
for prediction in results:
    readable_label = id2label[prediction['label']]
    print(readable_label)

negative


# We can also look at how the model performed against individual samples from the test set 

In [68]:
# Load some data

In [31]:
from datasets import load_dataset
from transformers import AutoTokenizer

In [32]:
# Names of the dataset and tokenizer loaded in the following cell.
dataset_name = "imdb"
tokenizer_name = "distilbert-base-uncased"

# Same prefix value that was assigned earlier in the notebook.
s3_prefix = "samples/datasets/imdb"

In [33]:
# Load every split of the dataset once; the splits used below are taken from
# this object instead of calling load_dataset a second time.
dataset = load_dataset(dataset_name, ignore_verifications=True)

tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)


def tokenize(batch):
    """Tokenize the ``text`` field of a batch.

    Sequences are padded to the tokenizer's maximum length and longer ones
    are truncated.

    Args:
        batch: mapping with a ``text`` entry holding the raw strings.

    Returns:
        The tokenizer's output for ``batch['text']``.
    """
    return tokenizer(batch['text'], padding='max_length', truncation=True)


# Pick the train/test splits out of the dataset that is already loaded.
train_dataset, test_dataset = dataset["train"], dataset["test"]
# A fixed seed keeps the 10,000-review evaluation subset identical across
# runs, so the results shown below can be reproduced.
test_dataset = test_dataset.shuffle(seed=42).select(range(10000))

Reusing dataset imdb (/root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1)


  0%|          | 0/3 [00:00<?, ?it/s]

Reusing dataset imdb (/root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1)


  0%|          | 0/2 [00:00<?, ?it/s]

In [34]:
# Tokenize the whole test subset in a single batch (batch size == row count).
n_test_rows = len(test_dataset)
test_dataset = test_dataset.map(tokenize, batched=True, batch_size=n_test_rows)

  0%|          | 0/1 [00:00<?, ?ba/s]

In [35]:
# Rename the target column only while it still carries its original name, so
# that re-running this cell does not fail on the already-renamed column.
if "label" in test_dataset.column_names:
    test_dataset = test_dataset.rename_column("label", "labels")

# Return the listed columns as torch tensors when rows are indexed.
test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

## Decode test set tokens using our tokenizer

In [38]:
# Turn the first test row's token ids back into text, dropping special tokens.
first_review_ids = test_dataset[0]['input_ids']
tokenizer.decode(first_review_ids, skip_special_tokens=True)

"one of the worse surfing movies i've ever seen. this movie is so bad i don't know where to begin - - okay, let's start with the premise - some dude from the mainland who barely knows how to surf travels to hawaii and enters a big wave contest which he more or less expects to win. a good analogy for those who don't surf would be a that of a grossly overweight chain smoker slapping on a pairs of running shoes and entering the la marathon with expectations of winning. no way! and, the contest is held on the north shore which conjures up images of 15 + foot waves, but contest day the waves are maybe 6 foot. the acting? what acting? if you must see this woof see it on tv, don't waste your money renting it. if you want to see a pretty good surfing movie - granted it is flawed, but that's another story - rent big wednesday."

In [88]:
import numpy as np

# Pick 10 row indices in [0, 10_000) for spot-checking the endpoint.
# A seeded generator makes the selection repeatable across runs, and sampling
# without replacement guarantees that no row is checked twice.
sample_rng = np.random.default_rng(42)
sample_mask = sample_rng.choice(10_000, size=10, replace=False).tolist()

int

In [89]:
# Collect the true label of every sampled row as a plain Python number.
ground_truth = []
for row_index in sample_mask:
    label_tensor = test_dataset[row_index]['labels']
    ground_truth.append(label_tensor.cpu().numpy().item())
ground_truth

[1, 1, 0, 0, 0, 1, 0, 0, 0, 0]

In [91]:
# Readable class name of the first sampled row's true label.
first_label = test_dataset[sample_mask[0]]['labels'].cpu().numpy().item()
id2label[f"LABEL_{first_label}"]

'positive'

## Pass the predictor a list of strings

In [92]:
# Decode each sampled row back to text, then send all of them to the endpoint
# in one request.
sample_texts = [
    tokenizer.decode(test_dataset[i]['input_ids'], skip_special_tokens=True)
    for i in sample_mask
]
results = predictor.predict({"inputs": sample_texts})
results

[{'label': 'LABEL_1', 'score': 0.959917426109314},
 {'label': 'LABEL_1', 'score': 0.9544893503189087},
 {'label': 'LABEL_0', 'score': 0.9959618449211121},
 {'label': 'LABEL_0', 'score': 0.9893727898597717},
 {'label': 'LABEL_0', 'score': 0.970529317855835},
 {'label': 'LABEL_1', 'score': 0.9841547012329102},
 {'label': 'LABEL_0', 'score': 0.96929931640625},
 {'label': 'LABEL_0', 'score': 0.8777941465377808},
 {'label': 'LABEL_0', 'score': 0.9888051748275757},
 {'label': 'LABEL_0', 'score': 0.9791433811187744}]

## Convert model outputs to informative strings and compare

In [93]:
# Walk predictions and true labels in parallel, reporting both as readable
# class names together with whether they agree.
for prediction, true_id in zip(results, ground_truth):
    expected = id2label[f'LABEL_{true_id}']
    predicted = id2label[prediction['label']]
    print(f"Ground Truth: {expected} | Prediction: {predicted} -- Correct: {expected == predicted}")

Ground Truth: positive | Prediction: positive -- Correct: True
Ground Truth: positive | Prediction: positive -- Correct: True
Ground Truth: negative | Prediction: negative -- Correct: True
Ground Truth: negative | Prediction: negative -- Correct: True
Ground Truth: negative | Prediction: negative -- Correct: True
Ground Truth: positive | Prediction: positive -- Correct: True
Ground Truth: negative | Prediction: negative -- Correct: True
Ground Truth: negative | Prediction: negative -- Correct: True
Ground Truth: negative | Prediction: negative -- Correct: True
Ground Truth: negative | Prediction: negative -- Correct: True


## Delete the endpoint if it is not going to be used.

In [94]:
# Delete the endpoint behind the current `predictor` object.
# NOTE(review): `predictor` is assigned by two deploy cells above; this call
# only covers whichever one was assigned last -- confirm no other endpoint is
# left running.
predictor.delete_endpoint()