In [1]:
%load_ext autoreload
%autoreload 2
import warnings
warnings.simplefilter(action="ignore")

### Kernel specs
* Uses the `Python 3 (Data Science)` kernel out of the box

In [2]:
%%capture
!pip3 install hydra-core==1.1.1

In [3]:
!pip3 show sagemaker

Name: sagemaker
Version: 2.70.0
Summary: Open source library for training and deploying models on Amazon SageMaker.
Home-page: https://github.com/aws/sagemaker-python-sdk/
Author: Amazon Web Services
Author-email: 
License: Apache License 2.0
Location: /opt/conda/lib/python3.7/site-packages
Requires: attrs, boto3, google-pasta, importlib-metadata, numpy, packaging, pandas, pathos, protobuf, protobuf3-to-dict, smdebug-rulesconfig
Required-by: 


In [32]:
# Standard library
import os
import tarfile
import time

# Third-party
import boto3
import pandas as pd
import sagemaker
from IPython.display import display
from omegaconf import OmegaConf
from sagemaker.huggingface import HuggingFace, HuggingFaceModel

# Project settings are read from conf/config.yaml, located in the parent of
# the current working directory.
parent_dir = os.path.dirname(os.getcwd())
conf = OmegaConf.load(os.path.join(parent_dir, 'conf/config.yaml'))

In [5]:
# AWS handles used throughout the notebook: the SageMaker session, its default
# bucket, the execution role and the region of the default boto3 session.
sagemaker_session = sagemaker.Session()
s3_bucket = sagemaker_session.default_bucket()
iam_role = sagemaker.get_execution_role()
aws_region = boto3.Session().region_name

# Key prefix under which the dataset is uploaded (read from the config).
data_prefix = conf['ag-news']['data_prefix']

### [AWS Open Data Registry](https://registry.opendata.aws/fast-ai-nlp/)

* AG news dataset for text classification 

In [6]:
!wget -nc https://s3.amazonaws.com/fast-ai-nlp/ag_news_csv.tgz
tf = tarfile.open('ag_news_csv.tgz')
tf.extractall()
!rm -fr ag_news_csv.tgz

--2022-04-16 02:58:54--  https://s3.amazonaws.com/fast-ai-nlp/ag_news_csv.tgz
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.177.21
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.177.21|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 11784419 (11M) [application/x-tar]
Saving to: ‘ag_news_csv.tgz’


2022-04-16 02:58:54 (44.4 MB/s) - ‘ag_news_csv.tgz’ saved [11784419/11784419]



In [7]:
# Column names and the dataset folder come from the `ag-news` config section.
std_cols = conf['ag-news']['std_cols']
data_folder = conf['ag-news']['data_folder']
data_dir = f'{parent_dir}/notebooks/{data_folder}'

# Read both splits, assigning the configured column names.
train = pd.read_csv(f'{data_dir}/train.csv', names=std_cols)
test = pd.read_csv(f'{data_dir}/test.csv', names=std_cols)

# Write copies with a header row and without the index column.
# Despite the `_dir` suffix, these two variables hold file paths.
cleaned_train_dir = f'{data_dir}/ag-train.csv'
cleaned_test_dir = f'{data_dir}/ag-test.csv'
train.to_csv(cleaned_train_dir, index=False)
test.to_csv(cleaned_test_dir, index=False)

In [8]:
# Upload the rewritten train/test CSVs to the session bucket, one key prefix
# per split, then show the resulting S3 URIs.
train_s3 = sagemaker_session.upload_data(
    cleaned_train_dir,
    bucket=s3_bucket,
    key_prefix=f"{data_prefix}/train",
)
test_s3 = sagemaker_session.upload_data(
    cleaned_test_dir,
    bucket=s3_bucket,
    key_prefix=f"{data_prefix}/test",
)
print(train_s3, test_s3, sep="\n")

s3://sagemaker-us-east-1-937500302629/news-hf/train/ag-train.csv
s3://sagemaker-us-east-1-937500302629/news-hf/test/ag-test.csv


In [9]:
# Load the dataset's class names, using the column names from the config.
class_cols = conf['ag-news']['class_cols']
class_labels = pd.read_csv(
    f"{parent_dir}/notebooks/{data_folder}/classes.txt",
    names=class_cols,
)

### Amazon BORT Model

* [https://huggingface.co/amazon/bort](https://huggingface.co/amazon/bort)

In [10]:
# Training settings from the `amazon-bort` config section:
#   hyperparameters - handed to the estimator as its `hyperparameters`
#   git_config      - git settings used to download the fine-tuning script
bort_section = conf['amazon-bort']
amazon_bort_hyperparams = bort_section['hyperparameters']
git_config = bort_section['git_config']

In [11]:
# Build the HuggingFace estimator. entry_point and source_dir refer to the
# repository described by git_config, not to local files.
# NOTE: the instance type/size is taken from the config and may need changing.
estimator_conf = conf['amazon-bort']['HuggingFace_estimator']

hf_estimator_bort = HuggingFace(
    role=iam_role,
    git_config=git_config,
    hyperparameters=amazon_bort_hyperparams,
    entry_point=estimator_conf['entry_point'],
    source_dir=estimator_conf['source_dir'],
    instance_type=estimator_conf['instance_type'],
    instance_count=estimator_conf['instance_count'],
    transformers_version=estimator_conf['transformers_version'],
    pytorch_version=estimator_conf['pytorch_version'],
    py_version=estimator_conf['py_version'],
    disable_profiler=True,
)

In [13]:
%%capture
# start the training job
# the wait=True parameter allows the cell to execute with the training job not complete
hf_estimator_bort.fit(
    {
        "train": os.path.dirname(train_s3),
        "test": os.path.dirname(test_s3)
    },
    wait=True
)

In [19]:
# Look up the most recent training job and print its status.
# This only reports the status; it does not raise if the job is unfinished.
client = boto3.client("sagemaker")
describe_resp = client.describe_training_job(
    TrainingJobName=hf_estimator_bort.latest_training_job.name
)

timestamp = time.strftime("%H:%M", time.localtime())
print('Time - TrainingJobStatus - SecondaryStatus')
print('-' * 42)
print(f"{timestamp} - {describe_resp['TrainingJobStatus']} - {describe_resp['SecondaryStatus']}")

Time - TrainingJobStatus - SecondaryStatus
------------------------------------------
03:25 - Completed - Completed


### Inference Endpoint

In [24]:
# Wrap the trained artifact in a deployable model. The framework versions are
# read from the HuggingFace_estimator config section, i.e. the same settings
# the training estimator was created with.
bort_conf = conf['amazon-bort']
training_versions = bort_conf['HuggingFace_estimator']

hf_model_bort = HuggingFaceModel(
    model_data=hf_estimator_bort.model_data,
    role=iam_role,
    env=bort_conf['HuggingFace_model']['env'],
    transformers_version=training_versions['transformers_version'],
    pytorch_version=training_versions['pytorch_version'],
    py_version=training_versions['py_version'],
)

In [33]:
# Deploy the model to an endpoint; the instance count and instance type come
# from the `deploy` settings in the config.
deploy_conf = conf['amazon-bort']['HuggingFace_model']['deploy']

hf_endpoint_bort = hf_model_bort.deploy(
    initial_instance_count=deploy_conf['initial_instance_count'],
    instance_type=deploy_conf['instance_type'],
)

------!

In [36]:
# Display the class-name table that the prediction cell uses to turn a
# predicted label id into a readable name.
class_labels

Unnamed: 0,label
0,World
1,Sports
2,Business
3,Sci/Tech


In [43]:
# Send one sample sentence to the deployed endpoint and map the predicted
# label to its class name.
test_input_data = {
    "inputs": "Stocks went up 30% after yesterday's market closure"
}
pred_resp = hf_endpoint_bort.predict(test_input_data)
top_pred = pred_resp[0]

# The endpoint returns labels of the form "LABEL_<n>". Parse the whole integer
# after the last underscore. Taking only the final character (label[-1:])
# would silently mis-map any label id with two or more digits.
pred_label_idx = int(top_pred['label'].rsplit('_', 1)[-1])

# Positional lookup into the class-name table.
# NOTE(review): assumes label id n corresponds to row n of classes.txt - confirm.
pred_label_name = class_labels['label'].iloc[pred_label_idx]

print(f"label: {top_pred['label']}")
print(f"score: {top_pred['score']}")
print(f"label_name: {pred_label_name}")

label: LABEL_0
score: 0.852148711681366
label_name: World


### Delete Endpoint

In [47]:
# Delete the inference endpoint so it stops incurring charges.
# Use `endpoint_name`: the old `endpoint` attribute was renamed in
# sagemaker>=2, and accessing it emits a rename warning.
client.delete_endpoint(EndpointName=hf_endpoint_bort.endpoint_name)

The endpoint attribute has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


{'ResponseMetadata': {'RequestId': '2ab344ee-5b6e-4f68-b149-0388f81a6cf5',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '2ab344ee-5b6e-4f68-b149-0388f81a6cf5',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '0',
   'date': 'Sat, 16 Apr 2022 04:10:25 GMT'},
  'RetryAttempts': 0}}