## BERT Binary Classification (Distributed Data Parallelism)
##### using SageMaker HuggingFace Estimators 

##### Prerequisites

In [None]:
%%capture
!pip install "sagemaker>=2.31.0" "transformers>=4.4.2" "datasets[s3]>=1.5.0" --upgrade

In [None]:
%%capture
# Install ipywidgets from the conda-forge channel, then ask the kernel to shut down.
import IPython
!conda install -c conda-forge ipywidgets -y
# do_shutdown is called with True (presumably the "restart" flag — confirm against the
# ipykernel API); any cell queued after this one will not run in the same kernel session.
IPython.Application.instance().kernel.do_shutdown(True)

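**Note:** The cell above requests a kernel restart automatically once the prerequisites are installed. After the kernel is back, continue from the Imports section below; if the kernel did not restart, restart it manually before continuing.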

#### Imports 

In [None]:
from sagemaker import get_execution_role, Session
from sagemaker.huggingface import HuggingFace
from datasets.filesystems import S3FileSystem
from transformers import AutoTokenizer
from datasets import Dataset
import pandas as pd
import transformers 
import sagemaker
import datasets
import logging
import torch

#### Setup Logger

In [None]:
# Notebook-level logger. Use the module's real name (`__name__`) rather than the literal
# string '__name__', which would create an unrelated logger called "__name__".
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
# Attach the stream handler only once so that re-running this cell does not cause every
# message to be emitted multiple times.
if not logger.handlers:
    logger.addHandler(logging.StreamHandler())

In [None]:
# Record the version of every library the notebook depends on, one log line per library.
for library_label, library_module in (
    ('SageMaker', sagemaker),
    ('Transformers', transformers),
    ('Datasets', datasets),
    ('Torch', torch),
):
    logger.info(f'[Using {library_label}: {library_module.__version__}]')

#### Essentials

In [None]:
# Shared handles used by the rest of the notebook.
# SageMaker session; its default bucket is used for the dataset upload paths below.
session = Session()
# Execution role that is later handed to the HuggingFace estimator.
role = get_execution_role()
bucket = session.default_bucket()
# Filesystem object passed as `fs=` when the datasets are saved to S3.
s3 = S3FileSystem() 

In [None]:
# Log the session's default bucket so the reader can see where the datasets will be written.
logger.info(f'Default bucket = {bucket}')

### Data Preparation

#### Load dataset into train and test splits

In [None]:
# Show up to 400 characters per column when displaying DataFrames so the text samples are
# readable. The fully qualified option name is used instead of the abbreviated pattern,
# which pandas only resolves by pattern matching and may become ambiguous.
pd.set_option('display.max_colwidth', 400)

In [None]:
# Read the training split from the local CSV file and preview its first five rows.
train_df = pd.read_csv('./data/train.csv')

train_df.head(5)

In [None]:
# Read the test split from the local CSV file.
test_df = pd.read_csv('./data/test.csv')

In [None]:
# Convert both pandas DataFrames into `datasets.Dataset` objects for tokenization.
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

In [None]:
# Display the shape of the training dataset.
train_dataset.shape

In [None]:
# Display the shape of the test dataset.
test_dataset.shape

#### Tokenize

In [None]:
# Load the tokenizer for 'distilbert-base-uncased', the same checkpoint name that is passed
# to the training job as the `model_name` hyperparameter.
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

In [None]:
def tokenize(batch):
    """Tokenize the 'text' field of a batch with the notebook-level `tokenizer`.

    The tokenizer is called with padding='max_length' and truncation=True, and
    its result is returned unchanged.
    """
    texts = batch['text']
    encoded = tokenizer(texts, padding='max_length', truncation=True)
    return encoded

In [None]:
# Apply `tokenize` to both splits in batched mode. batch_size is set to the full dataset
# length, so each split is handed to `tokenize` as one single batch.
train_dataset = train_dataset.map(tokenize, batched=True, batch_size=len(train_dataset))
test_dataset = test_dataset.map(tokenize, batched=True, batch_size=len(test_dataset))

In [None]:
# Expose only the model inputs and the label, formatted as 'torch', on both splits.
model_columns = ['input_ids', 'attention_mask', 'label']
for split_dataset in (train_dataset, test_dataset):
    split_dataset.set_format('torch', columns=model_columns)

In [None]:
# Display the summary of the tokenized training dataset.
train_dataset

#### Upload tokenized train and test sets to S3

In [None]:
# S3 destinations for the two tokenized splits, both under the session's default bucket.
training_input_path = f's3://{bucket}/imdb/train'
test_input_path = f's3://{bucket}/imdb/test'

# Write each split to its destination through the S3 filesystem handle.
train_dataset.save_to_disk(training_input_path, fs=s3)
test_dataset.save_to_disk(test_input_path, fs=s3)

### Create Training Script

In [None]:
%%file ./src/train.py

from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from datasets import load_from_disk
import argparse
import logging
import random
import torch
import sys
import os


if __name__ == '__main__':
    # Training entry point: parses hyperparameters and data/model directories, fine-tunes a
    # sequence-classification model with the Hugging Face Trainer, evaluates it on the test
    # set, writes the evaluation metrics to a text file and saves the model.
    parser = argparse.ArgumentParser()

    # Hyperparameters sent by the client are passed as command-line arguments to the script.
    # The launching notebook sends the key `train_batch_size` (underscores), so the
    # underscore spelling must be accepted here; otherwise parse_known_args() silently
    # discards the value and the default is used. The hyphenated spelling is kept as an
    # alias for backward compatibility. In both cases the value lands in
    # args.train_batch_size / args.eval_batch_size.
    parser.add_argument('--epochs', type=int, default=3)
    parser.add_argument('--train_batch_size', '--train-batch-size', type=int, default=32)
    parser.add_argument('--eval_batch_size', '--eval-batch-size', type=int, default=64)
    parser.add_argument('--warmup_steps', type=int, default=500)
    parser.add_argument('--model_name', type=str)
    # Parsed as a float so the declared type matches the numeric default.
    parser.add_argument('--learning_rate', type=float, default=5e-5)

    # Data, model, and output directories (defaults come from SM_* environment variables;
    # a missing variable raises KeyError when the parser is built).
    parser.add_argument('--output-data-dir', type=str, default=os.environ['SM_OUTPUT_DATA_DIR'])
    parser.add_argument('--model-dir', type=str, default=os.environ['SM_MODEL_DIR'])
    parser.add_argument('--n_gpus', type=str, default=os.environ['SM_NUM_GPUS'])
    parser.add_argument('--training_dir', type=str, default=os.environ['SM_CHANNEL_TRAIN'])
    parser.add_argument('--test_dir', type=str, default=os.environ['SM_CHANNEL_TEST'])

    # Unknown arguments are tolerated rather than treated as errors.
    args, _ = parser.parse_known_args()

    # Set up logging: INFO level, written to stdout
    logger = logging.getLogger(__name__)

    logging.basicConfig(
        level=logging.getLevelName('INFO'),
        handlers=[logging.StreamHandler(sys.stdout)],
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    )

    # Load train and test datasets from the channel directories
    train_dataset = load_from_disk(args.training_dir)
    test_dataset = load_from_disk(args.test_dir)

    logger.info(f'[Loaded train_dataset length is: {len(train_dataset)}]')
    logger.info(f'[Loaded test_dataset length is: {len(test_dataset)}]')

    # Compute metrics function for binary classification
    def compute_metrics(pred):
        """Return accuracy, F1, precision and recall for a prediction object.

        Predicted classes are the argmax over the last axis of `pred.predictions`;
        they are compared against `pred.label_ids` using binary averaging.
        """
        labels = pred.label_ids
        preds = pred.predictions.argmax(-1)
        precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
        acc = accuracy_score(labels, preds)
        return {'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall}

    # Download model from model hub
    model = AutoModelForSequenceClassification.from_pretrained(args.model_name)

    # Define training args
    training_args = TrainingArguments(
        output_dir=args.model_dir,
        num_train_epochs=args.epochs,
        per_device_train_batch_size=args.train_batch_size,
        per_device_eval_batch_size=args.eval_batch_size,
        warmup_steps=args.warmup_steps,
        evaluation_strategy='epoch',
        logging_dir=f'{args.output_data_dir}/logs',
        learning_rate=args.learning_rate,
    )

    # Create Trainer instance; the test set doubles as the per-epoch evaluation set
    trainer = Trainer(
        model=model,
        args=training_args,
        compute_metrics=compute_metrics,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
    )

    # Train model
    trainer.train()

    # Evaluate model
    eval_result = trainer.evaluate(eval_dataset=test_dataset)

    # Write evaluation results to a file which can be accessed later in S3 output
    with open(os.path.join(args.output_data_dir, 'eval_results.txt'), 'w') as writer:
        for key, value in sorted(eval_result.items()):
            writer.write(f'{key} = {value}\n')

    # Save the model into the local model directory (args.model_dir)
    trainer.save_model(args.model_dir)

### Create an Estimator and start training job

In [None]:
# Hyperparameters forwarded to the training script.
hyperparameters = {
    'epochs': 10,
    'train_batch_size': 32,
    'model_name': 'distilbert-base-uncased',
}

Configuration for running training on smdistributed Data Parallel

In [None]:
# Enable the smdistributed data-parallel option for the training job.
dataparallel_options = {'enabled': True}
distribution = {'smdistributed': {'dataparallel': dataparallel_options}}

In [None]:
# Estimator that runs ./src/train.py on two instances with smdistributed data parallelism.
# The instance type is ml.p3.16xlarge because the smdistributed data-parallel library only
# supports a fixed set of instance types (ml.p3.16xlarge, ml.p3dn.24xlarge, ml.p4d.24xlarge);
# ml.p3.8xlarge is rejected when `distribution` enables dataparallel.
huggingface_estimator = HuggingFace(entry_point='train.py',
                                    source_dir='./src',
                                    instance_type='ml.p3.16xlarge',
                                    instance_count=2,
                                    role=role,
                                    transformers_version='4.4.2',
                                    pytorch_version='1.6.0',
                                    py_version='py36',
                                    hyperparameters=hyperparameters,
                                    distribution=distribution)

In [None]:
%%time

huggingface_estimator.fit({'train': training_input_path, 'test': test_input_path})

#### Estimator Parameters 

In [None]:
# Display every instance attribute currently set on the estimator.
vars(huggingface_estimator)

In [None]:
# Display the estimator's `model_data` attribute (presumably the S3 location of the trained
# model artifact — confirm against the SageMaker SDK documentation).
huggingface_estimator.model_data