# Benchmark PyTorch BERT on Inferentia
Please ensure that you are running this notebook with reference to https://github.com/aws/aws-neuron-sdk/tree/master/src/examples/pytorch/bert_tutorial/README.md, or you may miss key installation steps and fail to produce required artifacts in prior steps.

In previous steps in our tutorial we adapted the Hugging Face BERT model to the MRPC dataset for sequence classification. We used the PyTorch Neuron trace command to compile for Inferentia hardware, and uploaded the resulting PyTorch model files to S3.

In this tutorial we will fetch the previously trained model archives and benchmark their performance.

### Import some modules

In [None]:
import torch
import torch.neuron
import os
import sys
import pandas as pd

import shutil
import boto3
import botocore

from urllib.parse import urlparse
from urllib.parse import urlsplit

from transformers import BertTokenizer
from concurrent import futures

import time
import datetime
from datetime import date
import csv
import boto3
import botocore

import numpy as np
import multiprocessing

### PyTorch DataSet based on a TSV file
This will read in test data and enable some standard PyTorch infrastructure

In [None]:
class BertTestDataset(torch.utils.data.Dataset):
    """Dataset of tokenized sentence pairs read from a tab-separated file.

    The first row of the file is treated as a header and dropped. For every
    remaining row, column 0 is returned as the label ('quality') and
    columns 3 and 4 are the two sentences handed to the tokenizer.
    """

    def __init__(self, tsv_file, tokenizer, max_length=128, transform=None):
        """
        Args:
            tsv_file (string): Path to the tab-separated file with annotations.
            tokenizer (hugging face tokenizer): Object whose ``encode_plus``
                takes the two sentences and encodes them to the input tensor set
            max_length (int): Maximum length that all input tensors will be padded to
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        with open(tsv_file, "r", encoding="utf-8") as f:
            reader = csv.reader(f, delimiter="\t", quotechar=None)
            # Drop the header row; slicing keeps an empty file from raising
            rows = list(reader)[1:]

        self.sentence_frame = pd.DataFrame(rows)
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.transform = transform

    def __len__(self):
        """Return the number of data rows (header excluded)."""
        return len(self.sentence_frame)

    @staticmethod
    def _as_text(value):
        """Decode bytes to str (ignoring undecodable bytes); pass anything else through."""
        if isinstance(value, bytes):
            return value.decode("utf-8", "ignore")
        return value

    def __getitem__(self, idx):
        """Return ``{'encoded': <tokenizer output>, 'quality': <label>}`` for row idx."""
        if torch.is_tensor(idx):
            idx = idx.tolist()

        # Each sentence is decoded independently; the original decoded the
        # first sentence again when the second one was bytes.
        s1_raw = self._as_text(self.sentence_frame.iloc[idx, 3])
        s2_raw = self._as_text(self.sentence_frame.iloc[idx, 4])

        quality = self.sentence_frame.iloc[idx, 0]

        encoded = self.tokenizer.encode_plus(
            s1_raw, s2_raw, add_special_tokens=True,
            return_tensors='pt', max_length=self.max_length, pad_to_max_length=True)

        sample = {'encoded': encoded, 'quality': quality}

        if self.transform:
            sample = self.transform(sample)

        return sample


### BERT Results class
We'll collect results on accuracy, latency and throughput here

In [None]:
class BertResults():
    """Accumulates accuracy, latency and throughput measurements from all workers."""

    def __init__(self, parallel, batch_size):
        # Run configuration, echoed back in the report
        self.parallel = parallel
        self.batch_size = batch_size
        # Running totals, filled in by add_result()
        self.correct_count = 0
        self.inference_count = 0
        self.total_latency_parallel = 0.0
        # One entry per batch: its latency and the wall-clock time it finished
        self.latency_array = []
        self.end_times = []

    def add_result(self, correct_count, inference_count, latency_array, end_times, total_latency):
        """Fold one worker's measurements into the running totals."""
        self.correct_count += correct_count
        self.inference_count += inference_count
        self.total_latency_parallel += total_latency
        self.latency_array.extend(latency_array)
        self.end_times.extend(end_times)

    def report(self, f):
        """Write throughput, latency percentiles, accuracy and a sanity summary to f.

        f is any object with a ``write`` method. At least one latency must
        have been recorded. Zero totals are replaced by 1 (and stored back on
        the instance) so the divisions below cannot fail.
        """
        assert(len(self.latency_array) != 0)
        emit = f.write

        p50_latency, p90_latency, p95_latency, p99_latency, p100_latency = np.percentile(
            self.latency_array, [50, 90, 95, 99, 100])

        if self.total_latency_parallel == 0.0:
            self.total_latency_parallel = 1.0

        ## Bin the batch end time-stamps into a histogram over time
        hist, bin_edges = np.histogram(self.end_times)

        overall_throughput = self.parallel * self.inference_count / float(self.total_latency_parallel)

        def clock(stamp):
            # Wall-clock time of day with millisecond precision
            return datetime.datetime.fromtimestamp(stamp).strftime('%H:%M:%S.%f')[:-3]

        emit("\n")
        emit("Histogram throughput (UTC times):\n")
        emit("===\n")
        max_throughput = 0.0
        for count, left, right in zip(hist, bin_edges[:-1], bin_edges[1:]):
            ## Each end time-stamp stands for batch_size inferences
            throughput = self.batch_size * count / (right - left)
            if throughput > max_throughput:
                max_throughput = throughput
            emit("{} - {} => {} sentences/sec\n".format(clock(left), clock(right), int(throughput)))
        emit("\n")
        emit("Maximum throughput (histogram) = {} sentences/sec\n".format(int(max_throughput)))
        emit("Overall throughput (aggregate stats * parallel) = {} sentences/sec\n".format(int(overall_throughput)))

        emit("\n")
        emit("Latency Percentiles:\n")
        emit("===\n")
        emit("P50  = {} milliseconds\n".format(int(1000*p50_latency)))
        emit("P90  = {} milliseconds\n".format(int(1000*p90_latency)))
        emit("P95  = {} milliseconds\n".format(int(1000*p95_latency)))
        emit("P99  = {} milliseconds\n".format(int(1000*p99_latency)))
        emit("P100 = {} milliseconds\n".format(int(1000*p100_latency)))
        emit("\n")
        emit("Accuracy:\n")
        emit("===\n")
        if self.inference_count == 0:
            self.inference_count = 1
        accuracy = float(self.correct_count) / float(self.inference_count)
        emit("Accuracy = {}% \n".format(round(100*accuracy,2)))
        emit("\n")
        emit("Sanity test:\n")
        emit("===\n")
        emit("Processed - num batches {}\n".format(len(self.latency_array)))
        emit("          - batch size {}\n".format(self.batch_size))
        emit("          - with {} workers\n".format(self.parallel))



### BERT Runner 
This class will run parallel BERT threads on our Inferentia hardware

In [None]:
class BertRunner():
    """Loads one compiled model per worker and runs the test set through them.

    ``invoke(idx)`` pushes every batch of the test set through model ``idx``
    and records correctness, latencies and end time-stamps in ``self.results``.
    """

    def __init__(self, model_files, pretrained_model_dir, tsv_file, batch_size, max_length, parallel):
        """
        Args:
            model_files (list of string): Paths of the TorchScript models to load;
                there must be exactly ``parallel`` of them.
            pretrained_model_dir (string): Directory the tokenizer is loaded from.
            tsv_file (string): Path of the tab-separated test data.
            batch_size (int): Number of sentence pairs fed to a model per call.
            max_length (int): Length every input tensor is padded to.
            parallel (int): Number of models / workers.
        """
        self.models = []

        assert( parallel == len(model_files) )
        print("BertRunner: Load {} models".format(parallel))
        for m in model_files:
            model = torch.jit.load(m)

            # Prime with the same input shape invoke() feeds the model,
            # rather than a hard-coded [4, 128]
            zeros = torch.zeros( [batch_size, max_length], dtype=torch.long )
            prime_inputs = ( zeros, zeros, zeros )

            print("Priming {} ...".format(m))
            model(*prime_inputs)
            print(" ...done")

            self.models.append(model)

        self.results = BertResults(parallel, batch_size)
        self.batch_size = batch_size
        self.max_length = max_length
        self.parallel = parallel

        self.tokenizer = BertTokenizer.from_pretrained(pretrained_model_dir)
        self.ds = BertTestDataset( tsv_file=tsv_file, tokenizer=self.tokenizer, max_length=max_length )
        self.dataloader = torch.utils.data.DataLoader(self.ds, batch_size=batch_size, shuffle=True, num_workers=parallel)

    def invoke( self, idx ):
        """Run the whole test set through model ``idx`` and record the measurements.

        Batches smaller than ``batch_size`` are zero-padded up to the full
        batch size before being passed to the model. Only the real
        (unpadded) sentences are counted as inferences.
        """
        # Start timing
        inference_count = 0
        correct_count = 0
        total_start = time.time()
        all_latencies = []
        all_end_times = []
        for batch in self.dataloader:
            ## Reformulate the batch into three batch tensors
            encoded = batch['encoded']
            inputs = torch.squeeze(encoded['input_ids'], 1)
            attention = torch.squeeze(encoded['attention_mask'], 1)
            token_type = torch.squeeze(encoded['token_type_ids'], 1)

            # Count before padding: padded rows have no label and are never
            # scored, so counting them would understate the accuracy
            real_count = inputs.size()[0]
            inference_count += real_count

            if real_count != self.batch_size:
                print("Input size = {} - padding".format(inputs.size()))
                remainder = self.batch_size - real_count
                zeros = torch.zeros( [remainder, self.max_length], dtype=torch.long )
                inputs = torch.cat( [inputs, zeros] )
                attention = torch.cat( [attention, zeros] )
                token_type = torch.cat( [token_type, zeros] )

            assert(inputs.size()[0] == self.batch_size and inputs.size()[1] == self.max_length)
            assert(attention.size()[0] == self.batch_size and attention.size()[1] == self.max_length)
            assert(token_type.size()[0] == self.batch_size and token_type.size()[1] == self.max_length)

            input_args = (inputs, attention, token_type)

            start = time.time()
            output = self.models[idx](*input_args)[0]
            batch_predictions = [ row.argmax().item() for row in output ]
            end = time.time()
            all_latencies.append( end - start )
            all_end_times.append( end )

            quality_batch = batch['quality']

            # zip stops at the shorter sequence, so predictions for padded
            # rows are ignored
            for a, b in zip(batch_predictions, quality_batch):
                if int(a)==int(b):
                    correct_count += 1

        overall_time = time.time() - total_start

        self.results.add_result( correct_count, inference_count, all_latencies, all_end_times, overall_time )

### Get pretrained models
We'll use this function to fetch pretrained models from previous steps.  This will include:
* Our adapted MRPC model (for the tokenizer)
* Our previously compiled neuron PyTorch models

In [None]:
# Derive the S3 locations used by this notebook from the caller's AWS account.
bucket_prefix="inferentia-test-"

try:
    boto3_sess = boto3.session.Session()
except botocore.exceptions.NoCredentialsError:
    print("No credentials:  Use 'aws configure' to setup credentials or configure isengard (Amazon internal)")
    raise

# Look up the account id; it becomes part of the bucket name below.
try:
    sts_client = boto3.client('sts')
    response = sts_client.get_caller_identity()
except botocore.exceptions.ClientError as e:
    # ClientError is referenced through botocore because it is not imported by name
    print(e)
    raise

ACCOUNT=response['Account']
FOLDER="bert_tutorial"

bucket_name=bucket_prefix + ACCOUNT
bucket_path=FOLDER
filename="bert-large-uncased-mrpc.tar.gz"
key = bucket_path + "/" + filename

# s3_base is the folder prefix; s3_location is the full path of the model archive
s3_base = "s3://" + bucket_name + "/" + bucket_path + "/"
s3_location = "s3://" + bucket_name + "/" + key

print("Using S3 base location: {}".format(s3_location))
mrpc_filename="bert-large-uncased-mrpc.tar.gz"

In [None]:
def get_pretrained_bert_mrpc(WORKSPACE, s3_base, pretrained_model_name, extension=""):
    """Make a pretrained artifact available under WORKSPACE, downloading it from S3 if absent.

    Args:
        WORKSPACE (string): Local directory the artifact is stored in
            (created if missing).
        s3_base (string): ``s3://bucket/prefix/`` URL the object name is appended to.
        pretrained_model_name (string): Name of the artifact inside WORKSPACE;
            together with ``extension`` it also names the S3 object.
        extension (string): Suffix of the S3 object. ``".tar.gz"`` archives are
            unpacked into WORKSPACE after download.

    Returns:
        string: Path of the artifact inside WORKSPACE. For ``".tar.gz"``
        downloads this is the unpacked directory, the same path that is
        returned when the artifact was already present.

    Raises:
        FileNotFoundError: If the unpacked archive did not produce
            ``WORKSPACE/pretrained_model_name``.
    """
    saved_model = os.path.join(WORKSPACE, pretrained_model_name)
    print()
    # exists() rather than isdir(): single-file artifacts count as present too
    if os.path.exists(saved_model):
        print("-- Using existing model in {}/{}".format(WORKSPACE, pretrained_model_name))
        return saved_model

    print("-- Pretrained model is not present - download from S3")
    os.makedirs(WORKSPACE, exist_ok=True)

    try:
        boto3_sess = boto3.session.Session()
    except botocore.exceptions.NoCredentialsError:
        print(
            "No credentials:  Use 'aws configure' to setup credentials or configure isengard (Amazon internal)")
        raise

    s3 = boto3_sess.resource('s3')
    s3_object = s3_base + pretrained_model_name + extension
    parsed = urlparse(s3_object)
    # netloc is the bucket, the path without its leading '/' is the object key
    object_key = parsed.path.lstrip('/')
    download_path = os.path.join(WORKSPACE, os.path.basename(object_key))
    s3.Bucket(parsed.netloc).download_file(object_key, download_path)

    if extension != ".tar.gz":
        print("-- Pretrained model downloaded om {}/{}".format(
            WORKSPACE, pretrained_model_name))
        return download_path

    shutil.unpack_archive(download_path, WORKSPACE)
    if not os.path.isdir(saved_model):
        raise FileNotFoundError(
            "Archive {} did not unpack to {}".format(download_path, saved_model))
    print("-- Pretrained model downloaded and decompressed in {}/{}".format(
        WORKSPACE, pretrained_model_name))
    return saved_model

## Benchmark Code
This code uses the tools set up above:
* Pull down the pretrained models
* Set up the test runner with multiple pytorch models running on the four neuron cores on this inf1.2xlarge
* Use a thread pool executor to run the test
* Collate our results

In [None]:
def benchmark_accuracy_bert_large_mrpc_pytorch( s3_base, parallel ):
    """Benchmark the compiled BERT-large MRPC models and write the report.

    Fetches the tokenizer archive and ``parallel`` compiled model files into
    the workspace if they are not already there, runs one worker thread per
    model over the test set, then writes the report to stdout and to
    ``benchmark.txt``.

    Args:
        s3_base (string): ``s3://bucket/prefix/`` URL the artifacts are fetched from.
        parallel (int): Number of compiled models / worker threads.

    The NEURONCORE_GROUP_SIZES and NEURON_MAX_NUM_INFERS environment
    variables are overridden for the duration of the run and restored
    afterwards, including when the run fails. An exception raised by a worker
    is printed and re-raised; a failure while writing the report is printed
    and not raised.
    """
    # Constants for this test - these have to match what we compiled!
    batch_size = 4
    max_length = 128

    pretrained_model_name = "bert-large-uncased-mrpc"
    WORKSPACE = './ws_bert_benchmark'
    test_tsv_file = 'glue_mrpc_dev.tsv'

    # Setup a workspace
    os.makedirs(WORKSPACE, exist_ok=True)

    # Get and load pretrained models
    pretrained_model_dir = os.path.join(
        WORKSPACE, pretrained_model_name)

    if not os.path.exists( pretrained_model_dir ):
        get_pretrained_bert_mrpc( WORKSPACE, s3_base, pretrained_model_name, extension=".tar.gz" )

    neuron_model_names = []

    for i in range(parallel):
        neuron_file = "bert_large_mrpc_pytorch_batch" + str(batch_size) + '_' + str(i) + ".pt"
        neuron_file_path=os.path.join(
            WORKSPACE, neuron_file)

        if not os.path.exists(neuron_file_path):
            print(" -- Download {}".format( neuron_file ))
            get_pretrained_bert_mrpc( WORKSPACE, s3_base, neuron_file, extension="" )

        assert( os.path.exists(neuron_file_path) )

        neuron_model_names.append( neuron_file_path )

    # Get local evaluation data - load into a data set
    eval_data_path = os.path.join(os.path.dirname("./"), test_tsv_file)

    # Remember the current values so they can be put back afterwards
    old_ng_sizes = os.environ.get('NEURONCORE_GROUP_SIZES', None)
    old_ninfer = os.environ.get('NEURON_MAX_NUM_INFERS', None)
    os.environ['NEURONCORE_GROUP_SIZES'] = '1,1,1,1'
    os.environ['NEURON_MAX_NUM_INFERS'] = '-1'

    try:
        runner = BertRunner(
            neuron_model_names,
            pretrained_model_dir,
            eval_data_path,
            batch_size,
            max_length,
            parallel )

        assert(len(neuron_model_names) == parallel)

        with futures.ThreadPoolExecutor(max_workers=parallel) as executor:

            running = {executor.submit(runner.invoke, idx): idx for idx in range(parallel)}
            for future in futures.as_completed(running):
                idx = running[future]
                try:
                    # result() re-raises anything the worker raised
                    future.result()
                except Exception as exc:
                    print('Worker {} generated an exception: {}'.format(idx, exc))
                    raise
                print("Worker {} completed".format(idx))

        # Reporting stays best-effort, as before, but now says what failed
        try:
            runner.results.report(sys.stdout)

            with open("benchmark.txt", "w") as f:
                runner.results.report(f)
        except Exception as exc:
            print('Failed to report benchmark results: {}'.format(exc))
    finally:
        # Replace environment variables, even when the run failed
        if old_ng_sizes is None:
            os.environ.pop('NEURONCORE_GROUP_SIZES', None)
        else:
            os.environ['NEURONCORE_GROUP_SIZES'] = old_ng_sizes
        if old_ninfer is None:
            os.environ.pop('NEURON_MAX_NUM_INFERS', None)
        else:
            os.environ['NEURON_MAX_NUM_INFERS'] = old_ninfer

### Run it!

In [None]:
## s3_base is set in a cell above, and needed here!
assert s3_base is not None

## We can configure up to 4 neuron cores on an inf1.2xlarge
parallel = 4

benchmark_accuracy_bert_large_mrpc_pytorch(s3_base, parallel)

In [None]:
!cat benchmark.txt

### Upload our benchmark results

In [None]:
def upload_and_check_file( s3_location, filename ):
    """Upload a local file to S3 and check that the object is listed afterwards.

    The object is stored in the bucket named by ``s3_location``, under the
    directory part of ``s3_location``'s path followed by ``filename``.

    Args:
        s3_location (string): ``s3://bucket/path/object`` URL; only its bucket
            and the directory part of its path are used.
        filename (string): Local file to upload; also used as the object name.

    Raises:
        FileNotFoundError: If ``filename`` does not exist locally.
        botocore.exceptions.ClientError: If the upload fails (printed, then re-raised).
    """
    try:
        boto3_sess = boto3.session.Session()
    except botocore.exceptions.NoCredentialsError:
        print("No credentials:  Use 'aws configure' to setup credentials or configure isengard (Amazon internal)")
        raise

    o = urlsplit(s3_location, allow_fragments = True)
    mod_path = os.path.dirname( o.path )
    mod_path = mod_path.lstrip('/')
    # Same key is used for the upload and for the existence check below
    key = mod_path + "/" + filename

    print()
    print("Copy model to: s3://" + o.netloc + "/" + mod_path + "/")

    if not os.path.exists(filename):
        raise FileNotFoundError(filename)

    try:
        s3_client = boto3_sess.client('s3')
        print("Uploading ...")
        response = s3_client.upload_file(filename, o.netloc, key)
        if response is None:
            print(" ... no errors")
        else:
            print("Response: {}".format(response))
    except botocore.exceptions.ClientError as e:
        # ClientError is referenced through botocore because it is not imported by name
        print(e)
        raise

    print()
    print("Check the file uploaded OK ...")
    s3_resource = boto3_sess.resource('s3')
    bucket = s3_resource.Bucket(o.netloc)
    full_name = "s3://" + bucket.name + "/" + key

    # Prefix filtering may also match longer keys, so compare the first hit exactly
    objs = list(bucket.objects.filter(Prefix=key))

    print()
    if len(objs) > 0 and objs[0].key == key:
        print("{} exists!".format(full_name))
    else:
        print("{} doesn't exist".format(full_name))

In [None]:

# s3_location is set in an earlier cell; the report is uploaded next to it
assert s3_location is not None
upload_and_check_file( s3_location, "benchmark.txt" )
