# Test1: HuggingFace and AWS Sagemaker Training

## Intro

This notebook tests the HuggingFace `transformers` and `datasets` libraries together with the HuggingFace extension of the Amazon SageMaker SDK to fine-tune a pre-trained transformer for token classification (named-entity recognition).

The pre-trained model will be fine-tuned using the govuk labelled dataset.

In [None]:
# !pip3 install datasets
# !pip3 install transformers
#!pip3 install sagemaker
#!pip3 install torch
#!pip3 install tensorflow
#!pip3 install boto3

In [None]:
import os
import secrets

import boto3
import sagemaker
from datasets import load_dataset, load_from_disk
from transformers import AutoTokenizer

## Permissions

In [None]:
# Notebook-wide configuration.
# `system` selects which branch of the session-setup cell below is executed.
system = "AWS"  # ['LOCAL', 'AWS']
# S3 bucket handed to the SageMaker session as its default bucket.
s3_bucket = "govuk-data-infrastructure-integration"
s3_prefix = "model-data/huggingface_transformer_models"  # s3 key prefix for the data

tokenizer_name = "bert-base-uncased"  # tokenizer used in preprocessing
dataset_name = "hf_data"  # dataset used

In [None]:
if system == "AWS":
    # Role ARN later passed to the HuggingFace estimator as `role`.
    role = sagemaker.get_execution_role()

    # A single session is enough: when `s3_bucket` is None, SageMaker falls back to
    # its own default bucket, so no throw-away session is needed to look it up.
    # The bucket is used for uploading data, models and logs.
    sess = sagemaker.Session(default_bucket=s3_bucket)
    sagemaker_session_bucket = sess.default_bucket()

    print(f"sagemaker role arn: {role}")
    print(f"sagemaker bucket: {sagemaker_session_bucket}")
    print(f"sagemaker session region: {sess.boto_region_name}")

elif system == "LOCAL":
    # Credentials are read from the standard AWS environment variables rather than
    # being written into the notebook. Any variable that is unset is passed as None,
    # which lets boto3 fall back to its normal credential/region resolution.
    # NOTE(review): `sess` and `role` are only created in the AWS branch, but later
    # cells use both — the LOCAL path cannot run the rest of the notebook as-is.
    s3 = boto3.resource(
        service_name="s3",
        region_name=os.environ.get("AWS_DEFAULT_REGION"),
        aws_access_key_id=os.environ.get("AWS_ACCESS_KEY_ID"),
        aws_secret_access_key=os.environ.get("AWS_SECRET_ACCESS_KEY"),
    )

## Preprocessing

We are using the datasets library to download and preprocess the gov.uk labelled dataset. After preprocessing, the dataset will be uploaded to our sagemaker_session_bucket to be used within our training job. The gov.uk dataset consists of 16000 training examples, 2000 validation examples, and 2000 testing examples.

### Tokenization

In [None]:
# download the tokenizer named by `tokenizer_name` in the config cell
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

# Sentence-level tokenization helper (reads the module-level `tokenizer`).
def tokenize(batch):
    """Tokenize the "text" column of a batch, padding to max length and truncating."""
    texts = batch["text"]
    encoded = tokenizer(texts, padding="max_length", truncation=True)
    return encoded

In [None]:
import botocore
from datasets.filesystems import S3FileSystem

# File-system handle used to read the dataset straight from S3.
s3 = S3FileSystem()

# Build the location from the config cell's `dataset_name` instead of repeating the
# "hf_data" literal, so the configured name and the path cannot drift apart.
dataset_path = f"s3://{sess.default_bucket()}/{s3_prefix}/{dataset_name}"

# load dataset
ner_data = load_from_disk(dataset_path, fs=s3)

In [None]:
# Rename the dataset columns to the names the rest of the notebook reads
# ("tokens" and "ner_tags").
column_renames = {
    "text_token": "tokens",
    "new_label_list_id": "ner_tags",
}
for old_name, new_name in column_renames.items():
    ner_data = ner_data.rename_column(
        original_column_name=old_name, new_column_name=new_name
    )
ner_data

In [None]:
# Load the public CoNLL-2003 dataset; the next cells display it side by side with `ner_data`.
raw_datasets = load_dataset("conll2003")

In [None]:
# First training example of the CoNLL-2003 reference data.
raw_datasets["train"][0]

In [None]:
# First training example of `ner_data`, for comparison with the CoNLL example.
ner_data["train"][0]

In [None]:
# Column schema of the CoNLL-2003 training split.
raw_datasets["train"].features

In [None]:
# Feature definition of the CoNLL-2003 "ner_tags" column.
raw_datasets["train"].features["ner_tags"]

In [None]:
# Column schema of the `ner_data` training split.
ner_data["train"].features

In [None]:
# Feature definition of the `ner_data` "ner_tags" column.
ner_data["train"].features["ner_tags"]

In [None]:
# `ner_feature` was never defined anywhere in the notebook, and `id` on a `datasets`
# feature is a plain attribute rather than a method, so the previous
# `ner_feature.feature.id()` call could not run. Label names live on `.names`.
ner_feature = ner_data["train"].features["ner_tags"]
# Only class-label features carry `.names`; plain integer tags do not, in which case
# this is None and the explicit label map defined further down is the source of truth.
label_names = getattr(ner_feature.feature, "names", None)
label_names

In [None]:
# Mapping from entity label to the integer id stored in the "ner_tags" column.
new_label_map = {
    "O": 0,
    "CONTACT": 1,
    "DATE": 2,
    "EVENT": 3,
    "FINANCE": 4,
    "FORM": 5,
    "LOCATION": 6,
    "MISC": 7,
    "MONEY": 8,
    "ORGANIZATION": 9,
    "PERSON": 10,
    "SCHEME": 11,
    "STATE": 12,
}

# `label_names[i]` is indexed by label id everywhere below, so the ids must be exactly
# 0..N-1 and the list must be ordered by id — not by the order the dict happens to be
# written in.
assert sorted(new_label_map.values()) == list(range(len(new_label_map)))
label_names = sorted(new_label_map, key=new_label_map.get)
label_names

In [None]:
# Print one training example with each label directly underneath its word.
example = ner_data["train"][2]
words = example["tokens"]
labels = example["ner_tags"]

word_cells = []
label_cells = []
for word, label in zip(words, labels):
    full_label = label_names[label]
    # Each column is as wide as the longer of word/label, plus one separating space.
    cell_width = max(len(word), len(full_label)) + 1
    word_cells.append(word.ljust(cell_width))
    label_cells.append(full_label.ljust(cell_width))

line1 = "".join(word_cells)
line2 = "".join(label_cells)

print(line1)
print(line2)

## Processing the data

In [None]:
from transformers import AutoTokenizer

# Checkpoint used for the NER tokenizer here and for the model loaded further down.
model_checkpoint = "bert-base-cased"
# NOTE: this rebinds the module-level `tokenizer` created in the Tokenization section,
# so every later use of `tokenizer` (including the `tokenize` helper) sees this one.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
# Check that a fast tokenizer was loaded (the `word_ids()` calls below presumably rely on it).
tokenizer.is_fast

In [None]:
# Tokenize the already word-split first training example and show the resulting tokens.
inputs = tokenizer(ner_data["train"][0]["tokens"], is_split_into_words=True)
inputs.tokens()

In [None]:
# Word index for each token; these are what the label-alignment function below consumes.
inputs.word_ids()

In [None]:
def align_labels_with_tokens(labels, word_ids, b_to_i=None):
    """Expand word-level labels to one label per tokenizer token.

    The previous version bumped every odd label id by one on continuation tokens,
    which is only valid for label sets where each odd id is a "B-XXX" tag followed by
    its "I-XXX" tag. This notebook's label map has no such pairs, so that rule turned
    e.g. CONTACT (1) into DATE (2). By default the word's label is now propagated
    unchanged; an IOB-style remap is opt-in through ``b_to_i``.

    Args:
        labels: Label ids, one per word of the example.
        word_ids: For each token, the index of the word it belongs to, or ``None``
            for special tokens.
        b_to_i: Optional mapping from a "begin" label id to the "inside" label id to
            use on the second and later tokens of a word. Ids missing from the
            mapping are kept as-is.

    Returns:
        A list with one label id per entry of ``word_ids``; special tokens get -100.
    """
    continuation_label = b_to_i or {}
    new_labels = []
    previous_word = None
    for word_id in word_ids:
        if word_id is None:
            # Special token
            new_labels.append(-100)
        elif word_id != previous_word:
            # First token of a new word
            new_labels.append(labels[word_id])
        else:
            # Further token of the same word
            label = labels[word_id]
            new_labels.append(continuation_label.get(label, label))
        previous_word = word_id

    return new_labels

In [None]:
# Demonstrate the alignment on the first training example: print the word-level
# labels and their count, then the token count and the token-level labels.
labels = ner_data["train"][0]["ner_tags"]
word_ids = inputs.word_ids()
print(len(labels))
print(labels)
print(len(word_ids))
print(align_labels_with_tokens(labels, word_ids))

In [None]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of word-split examples and attach token-level labels.

    Reads the "tokens" and "ner_tags" columns of ``examples`` and returns the
    tokenizer output with an added "labels" entry, one aligned label list per example.
    """
    encodings = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    encodings["labels"] = [
        align_labels_with_tokens(word_labels, encodings.word_ids(row))
        for row, word_labels in enumerate(examples["ner_tags"])
    ]
    return encodings

In [None]:
# Tokenize and align labels in batches, dropping the original columns (taken from the
# train split's column names) so only the tokenizer outputs and "labels" remain.
tokenized_ner_data = ner_data.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=ner_data["train"].column_names,
)

In [None]:
from transformers import DataCollatorForTokenClassification

# Collator that assembles tokenized examples into batches; passed to the Trainer below.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [None]:
# Collate the first two training examples and inspect the resulting label batch.
batch = data_collator([tokenized_ner_data["train"][i] for i in range(2)])
batch["labels"]

In [None]:
!pip install seqeval

In [None]:
from datasets import load_metric

# seqeval metric object; `compute_metrics` below calls `metric.compute(...)` on it.
# NOTE(review): `load_metric` may be unavailable in newer `datasets` releases — confirm
# against the version installed in this environment.
metric = load_metric("seqeval")

In [None]:
import numpy as np


def compute_metrics(eval_preds):
    """Turn (logits, labels) into overall seqeval precision/recall/F1/accuracy.

    Positions whose label is -100 are skipped; the remaining ids are converted to
    label names through the module-level ``label_names`` before scoring.
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Reference label names, ignoring masked (-100) positions
    true_labels = []
    for label_row in labels:
        true_labels.append(
            [label_names[tag] for tag in label_row if tag != -100]
        )

    # Predicted label names at exactly the same (unmasked) positions
    true_predictions = []
    for prediction_row, label_row in zip(predictions, labels):
        kept = []
        for predicted, tag in zip(prediction_row, label_row):
            if tag != -100:
                kept.append(label_names[predicted])
        true_predictions.append(kept)

    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)

    summary = {}
    summary["precision"] = all_metrics["overall_precision"]
    summary["recall"] = all_metrics["overall_recall"]
    summary["f1"] = all_metrics["overall_f1"]
    summary["accuracy"] = all_metrics["overall_accuracy"]
    return summary

In [None]:
# Model configs map integer id -> label name and label name -> integer id.
# The ids were previously stringified, which left `label2id` pointing at "0", "1", ...
# instead of the integer ids stored in the dataset.
id2label = dict(enumerate(label_names))
label2id = {label: idx for idx, label in id2label.items()}

In [None]:
from transformers import AutoModelForTokenClassification

# Load the pre-trained checkpoint with a token-classification head, passing the
# label <-> id mappings built above.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

In [None]:
# Sanity check: presumably this should equal len(label_names) — verify on the output.
model.config.num_labels

In [None]:
from transformers import TrainingArguments

# Settings for the in-notebook Trainer run below.
args = TrainingArguments(
    "bert-finetuned-ner",  # first positional argument (presumably the output directory — confirm)
    evaluation_strategy="epoch",  # evaluate once per epoch
    save_strategy="epoch",  # checkpoint once per epoch
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=False,  # keep the model local; nothing is uploaded to the Hub
)

In [None]:
# NOTE(review): this installs torch in the middle of the notebook, after the model has
# already been loaded, and the pinned 1.5.0 differs from the `pytorch_version="1.7.1"`
# requested for the SageMaker estimator further down — confirm which version is
# intended and whether a kernel restart is needed after installing.
!pip install torch==1.5.0

In [None]:
from transformers import Trainer

# Fine-tune inside the notebook kernel: train on the "train" split, evaluate on
# "validation", batching with `data_collator` and scoring with `compute_metrics`.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_ner_data["train"],
    eval_dataset=tokenized_ner_data["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer.train()

In [None]:
# Reuse the splits that were already tokenized and label-aligned above.
# This cell previously read `train_dataset` / `test_dataset` before anything in the
# notebook defined them, and re-tokenized with the sentence-level `tokenize` helper
# (which reads a "text" column) and renamed a "label" column — neither matches the
# token-level "tokens" / "ner_tags" data prepared in this notebook.
train_dataset = tokenized_ner_data["train"]

# NOTE(review): "validation" is the only held-out split this notebook references;
# switch to a dedicated "test" split if the dataset has one — TODO confirm.
test_dataset = tokenized_ner_data["validation"]

# No torch formatting is applied here: the examples were tokenized without padding,
# so they are kept as plain variable-length lists.

### Uploading data to sagemaker_session_bucket

After processing the datasets, we use the `datasets` FileSystem integration to upload them to S3.

In [None]:
import botocore
from datasets.filesystems import S3FileSystem

# File-system handle used to write the datasets straight to S3.
s3 = S3FileSystem()

# save train_dataset to s3 (under the session's default bucket and `s3_prefix`)
training_input_path = f"s3://{sess.default_bucket()}/{s3_prefix}/train"
train_dataset.save_to_disk(training_input_path, fs=s3)

# save test_dataset to s3; both paths are handed to the training job further down
test_input_path = f"s3://{sess.default_bucket()}/{s3_prefix}/test"
test_dataset.save_to_disk(test_input_path, fs=s3)

## Fine-tuning & starting Sagemaker Training Job

To create a SageMaker training job we need a HuggingFace Estimator. The Estimator handles end-to-end Amazon SageMaker training and deployment tasks. In the Estimator we define which fine-tuning script is used as the entry_point, which instance_type is used, which hyperparameters are passed in, and so on.

In [None]:
#!pygmentize ./scripts/ner_training.py

In [None]:
#!python ./scripts/ner_training.py

In [None]:
import time

from sagemaker.huggingface import HuggingFace

# hyperparameters, which are passed into the training job
hyperparameters = {
    "epochs": 1,  # number of training epochs
    "train_batch_size": 32,  # batch size for training
    "eval_batch_size": 64,  # batch size for evaluation
    "learning_rate": 3e-5,  # learning rate used during training
    # Must be the checkpoint whose tokenizer produced the uploaded `input_ids`: the
    # data is encoded with `model_checkpoint`'s vocabulary, so fine-tuning a different
    # model (previously "distilbert-base-uncased") would misread every token id.
    "model_id": model_checkpoint,  # pre-trained model
    "fp16": True,  # Whether to use 16-bit (mixed) precision training
}

In [None]:
# define Training Job Name (timestamped so repeated runs get distinct names)
job_name = f'huggingface-test-{time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())}'

# create the Estimator
# NOTE(review): the commented-out cells above refer to ./scripts/ner_training.py while
# `entry_point` is "train.py" — confirm which script is meant to run.
huggingface_estimator = HuggingFace(
    entry_point="train.py",  # fine-tuning script used in the training job
    source_dir="./scripts",  # directory where fine-tuning script is stored
    instance_type="ml.p3.2xlarge",  # instance type used for the training job
    instance_count=1,  # the number of instances used for training
    base_job_name=job_name,  # base name of the training job
    role=role,  # IAM role used in training job to access AWS resources, e.g. S3
    transformers_version="4.6.1",  # the transformers version used in the training job
    pytorch_version="1.7.1",  # the pytorch version used in the training job
    py_version="py36",  # the python version used in the training job
    hyperparameters=hyperparameters,  # the hyperparameters used for running the training job
)

In [None]:
# define a data input dictionary mapping channel names to our uploaded s3 uris
data = {"train": training_input_path, "test": test_input_path}

# start the training job with our uploaded datasets as input; wait=True blocks this
# cell until the job has finished
huggingface_estimator.fit(data, wait=True)

## Deploying the endpoint

To deploy our endpoint, we call deploy() on our HuggingFace estimator object, passing in our desired number of instances and instance type.

In [None]:
# predictor = huggingface_estimator.deploy(1,"ml.g4dn.xlarge")

Then, we use the returned predictor object to call the endpoint.

In [None]:
# sentences = [{"inputs": "I get so nervous before a demo"}, #fear
#              {"inputs": "I am shocked that the API works so well "}, #suprise
#              {"inputs": "It's a shame that I havent learned this sooner"}, #sadness
#              {"inputs": "It's a disgrace that AWS is not free"}, #anger
#              {"inputs": "I am delighted to have learned this amazing new technology"}, #joy
#              {"inputs": "I was so shocked at my suprise party. I also hated every minute of it."} #suprise/anger
#             ]

# for sentence in sentences:
#     prediction = predictor.predict(sentence)
#     print(prediction)

**IMPORTANT** Finally, we delete the inference endpoint.



In [None]:
# `predictor` is only created by the deploy cell above, which is currently commented
# out; guard the clean-up so a fresh Restart & Run All does not end in a NameError.
if "predictor" in globals():
    predictor.delete_endpoint()