In [2]:
%load_ext autoreload
%autoreload 2
import warnings
warnings.simplefilter(action="ignore")

### Kernel specs
* Uses the `Python 3 (Data Science)` kernel out of the box

In [3]:
%%capture
!pip3 install hydra-core==1.1.1

In [4]:
!pip3 show sagemaker

Name: sagemaker
Version: 2.70.0
Summary: Open source library for training and deploying models on Amazon SageMaker.
Home-page: https://github.com/aws/sagemaker-python-sdk/
Author: Amazon Web Services
Author-email: 
License: Apache License 2.0
Location: /opt/conda/lib/python3.7/site-packages
Requires: attrs, boto3, google-pasta, importlib-metadata, numpy, packaging, pandas, pathos, protobuf, protobuf3-to-dict, smdebug-rulesconfig
Required-by: 


In [28]:
import sagemaker, boto3, os, time, tarfile
from sagemaker.huggingface import HuggingFace
import pandas as pd
from IPython.display import display
from omegaconf import OmegaConf

# Project root: the parent of the current working directory.
parent_dir = os.path.split(os.getcwd())[0]

# Project settings are read from conf/config.yaml under the project root.
conf = OmegaConf.load(
    os.path.join(parent_dir, 'conf/config.yaml')
)

In [6]:
# IAM role and region are resolved from the notebook's own AWS context.
iam_role = sagemaker.get_execution_role()
aws_region = boto3.Session().region_name

# SageMaker session and its default bucket (used below as the upload target).
sagemaker_session = sagemaker.Session()
s3_bucket = sagemaker_session.default_bucket()

# S3 key prefix for the dataset, taken from the project config.
data_prefix = conf['ag-news']['data_prefix']

### [AWS Open Data Registry](https://registry.opendata.aws/fast-ai-nlp/)

* AG News dataset for text classification

In [7]:
!wget -nc https://s3.amazonaws.com/fast-ai-nlp/ag_news_csv.tgz
tf = tarfile.open('ag_news_csv.tgz')
tf.extractall()
!rm -fr ag_news_csv.tgz

--2022-04-04 03:48:56--  https://s3.amazonaws.com/fast-ai-nlp/ag_news_csv.tgz
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.84.35
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.84.35|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 11784419 (11M) [application/x-tar]
Saving to: ‘ag_news_csv.tgz’


2022-04-04 03:48:56 (45.6 MB/s) - ‘ag_news_csv.tgz’ saved [11784419/11784419]



In [9]:
# Column names and the extracted data folder both come from the project config.
std_cols = conf['ag-news']['std_cols']
data_folder = conf['ag-news']['data_folder']

# Every input and output file of this step lives in the same folder.
raw_dir = f'{parent_dir}/notebooks/{data_folder}'

# names= supplies the header, so pandas treats every row of the file as data.
train = pd.read_csv(f'{raw_dir}/train.csv', names=std_cols)
test = pd.read_csv(f'{raw_dir}/test.csv', names=std_cols)

# Re-save both splits with the header row included and without the index column.
cleaned_train_dir = f'{raw_dir}/ag-train.csv'
cleaned_test_dir = f'{raw_dir}/ag-test.csv'
train.to_csv(cleaned_train_dir, index=False)
test.to_csv(cleaned_test_dir, index=False)

In [10]:
# Push the cleaned CSVs to the S3 bucket, one key prefix per split.
train_s3 = sagemaker_session.upload_data(
    path=cleaned_train_dir,
    bucket=s3_bucket,
    key_prefix=f"{data_prefix}/train",
)
test_s3 = sagemaker_session.upload_data(
    path=cleaned_test_dir,
    bucket=s3_bucket,
    key_prefix=f"{data_prefix}/test",
)

# Show the resulting S3 URIs, one per line.
print(train_s3, test_s3, sep="\n")

s3://sagemaker-us-east-1-937500302629/news-hf/train/ag-train.csv
s3://sagemaker-us-east-1-937500302629/news-hf/test/ag-test.csv


In [15]:
# Column names for the class list come from the project config; names= means
# every line of classes.txt is read as data rather than as a header.
class_cols = conf['ag-news']['class_cols']
classes_path = f"{parent_dir}/notebooks/{data_folder}/classes.txt"
class_labels = pd.read_csv(classes_path, names=class_cols)

### Amazon BORT Model

* [https://huggingface.co/amazon/bort](https://huggingface.co/amazon/bort)

In [29]:
# All BORT-specific settings sit under the 'amazon-bort' section of the config.
bort_conf = conf['amazon-bort']

# git config to download the fine-tuning script
git_config = bort_conf['git_config']
# Hyperparameters handed to the training job.
amazon_bort_hyperparams = bort_conf['hyperparameters']

In [30]:
# Build the Hugging Face estimator. entry_point and source_dir refer to paths
# inside the repository described by git_config.
# TODO: the instance type (set in the config) still needs to be resized.
estimator_conf = conf['amazon-bort']['HuggingFace_estimator']

hf_estimator_bort = HuggingFace(
    # Training script location (within the git repository).
    entry_point=estimator_conf['entry_point'],
    source_dir=estimator_conf['source_dir'],
    git_config=git_config,
    # Compute resources and permissions.
    instance_type=estimator_conf['instance_type'],
    instance_count=estimator_conf['instance_count'],
    role=iam_role,
    # Framework versions.
    transformers_version=estimator_conf['transformers_version'],
    pytorch_version=estimator_conf['pytorch_version'],
    py_version=estimator_conf['py_version'],
    # Training settings.
    hyperparameters=amazon_bort_hyperparams,
    disable_profiler=True,
)

In [32]:
# Each input channel points at the S3 prefix that holds the uploaded CSV,
# i.e. the URI with the file name stripped off.
input_channels = {
    "train": os.path.dirname(train_s3),
    "test": os.path.dirname(test_s3),
}

# Launch the training job; wait=False means the call does not block until the
# job completes.
hf_estimator_bort.fit(input_channels, wait=False)