# ray_baseline.ipynb

Baseline model serving implementation from the [benchmark notebook](./benchmark.ipynb).

In [10]:
# Initialization and import code goes in this cell.

# Imports: Python core, then third-party, then local.
# Try to keep each block in alphabetical order, or the linter may get angry.

import requests
import starlette
import time
import os
import json

import scipy.special

import ray
from ray import serve
import torch
import transformers

# Fix silly warning messages about parallel tokenizers
os.environ['TOKENIZERS_PARALLELISM'] = 'False'


# Reduce the volume of warning messages from `transformers`
transformers.logging.set_verbosity_error()


def reboot_ray():
    if ray.is_initialized():
        ray.shutdown()

    if torch.cuda.is_available():
        return ray.init(num_gpus=1)
    else:
        return ray.init()

In [11]:
# Constants go here
INTENT_MODEL_NAME = 'mrm8488/t5-base-finetuned-e2m-intent'
SENTIMENT_MODEL_NAME = 'cardiffnlp/twitter-roberta-base-sentiment'
QA_MODEL_NAME = 'deepset/roberta-base-squad2'
GENERATE_MODEL_NAME = 'gpt2'


INTENT_INPUT = {
    'context':
        ("I came here to eat chips and beat you up, "
         "and I'm all out of chips.")
}

SENTIMENT_INPUT = {
    'context': "We're not happy unless you're not happy."
}

QA_INPUT = {
    'question': 'What is 1 + 1?',
    'context': 
        """Addition (usually signified by the plus symbol +) is one of the four basic operations of 
        arithmetic, the other three being subtraction, multiplication and division. The addition of two 
        whole numbers results in the total amount or sum of those values combined. The example in the
        adjacent image shows a combination of three apples and two apples, making a total of five apples. 
        This observation is equivalent to the mathematical expression "3 + 2 = 5" (that is, "3 plus 2 
        is equal to 5").
        """
}

GENERATE_INPUT = {
    'prompt_text': 'All your base are'
}

In [12]:
serve.shutdown()
reboot_ray()
serve.start()

2022-02-25 17:22:22,792	INFO services.py:1374 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m
[2m[36m(ServeController pid=80034)[0m 2022-02-25 17:22:27,325	INFO checkpoint_path.py:16 -- Using RayInternalKVStore for controller checkpoint and recovery.
[2m[36m(ServeController pid=80034)[0m 2022-02-25 17:22:27,437	INFO http_state.py:98 -- Starting HTTP proxy with name 'SERVE_CONTROLLER_ACTOR:vxzeES:SERVE_PROXY_ACTOR-node:127.0.0.1-0' on node 'node:127.0.0.1-0' listening on '127.0.0.1:8000'
2022-02-25 17:22:27,924	INFO api.py:475 -- Started Serve instance in namespace '0ba90253-709e-4241-bae1-436aa41f0d8c'.


<ray.serve.api.Client at 0x7ff46849b7c0>

In [13]:
class Intent:
    def __init__(self):
        # Tokenizer loading code from the model zoo doesn't work, so we 
        # explicitly specify the t5-base tokenizer.
        self._tokenizer = transformers.AutoTokenizer.from_pretrained('t5-base')
        self._model = transformers.AutoModelForSeq2SeqLM.from_pretrained(
            INTENT_MODEL_NAME)
        self._max_length = 128  # Max sequence length, input + output, in tokens

    async def __call__(self, request: starlette.requests.Request):
        json_request = await request.json()

        # Preprocessing
        input_text = f'{json_request["context"]} </s>'
        features = self._tokenizer([input_text], return_tensors='pt')

        # Inference
        output = self._model.generate(
            input_ids=features['input_ids'], 
            attention_mask=features['attention_mask'],
            max_length=self._max_length)

        # Postprocessing
        result_string = self._tokenizer.decode(output[0])
        result_string = result_string.replace('<pad>', '')
        result_string = result_string[len(' '):-len('</s>')]

        return {
            "intent": result_string
        }


class Sentiment:
    def __init__(self):
        self._tokenizer = transformers.AutoTokenizer.from_pretrained(
            SENTIMENT_MODEL_NAME)
        self._model = (transformers.AutoModelForSequenceClassification
                       .from_pretrained(SENTIMENT_MODEL_NAME))

    async def __call__(self, request: starlette.requests.Request):
        json_request = await request.json()

        # Preprocessing
        encoded_input = self._tokenizer(json_request['context'],
                                        return_tensors='pt')

        # Inference
        output = self._model(**encoded_input)

        # Postprocessing
        scores = output[0][0].detach().numpy()
        scores = scipy.special.softmax(scores)
        scores = [float(s) for s in scores]
        scores = {k: v for k, v in zip(['positive', 'neutral', 'negative'],
                                       scores)}
        return scores


class QA:
    def __init__(self):
        self._pipeline = transformers.pipeline(
            'question-answering', model=QA_MODEL_NAME)

    async def __call__(self, request: starlette.requests.Request):
        json_request = await request.json()

        # Preprocessing (returns a Python generator)
        qa_pre = self._pipeline.create_sample(**json_request)
        qa_pre = self._pipeline.preprocess(qa_pre)

        # Inference
        qa_output = (self._pipeline.forward(example) for example in qa_pre)

        # Postprocessing
        qa_result = self._pipeline.postprocess(qa_output)

        return qa_result


class Generate:
    def __init__(self):
        self._pipeline = transformers.pipeline(
            'text-generation', model=GENERATE_MODEL_NAME)
        self._pad_token_id = self._pipeline.tokenizer.eos_token_id

    async def __call__(self, request: starlette.requests.Request):
        json_request = await request.json()

        # Preprocessing
        generate_pre = self._pipeline.preprocess(**json_request)

        # Inference
        generate_output = self._pipeline.forward(
            generate_pre, pad_token_id=self._pad_token_id)

        # Postprocessing
        generate_result = self._pipeline.postprocess(generate_output)

        return generate_result[0]

Now we can deploy all of these pipelines as Serve endpoints.

In [14]:
# Define endpoints
LANGUAGES = ['en', 'es', 'zh']
MAX_CONCURRENT_QUERIES = 1

deployments = {}
for lang in LANGUAGES:
    deployments[(lang, 'intent')] = (
        serve.deployment(Intent, f'intent_{lang}', 
                         route_prefix=f'/predictions/intent_{lang}',
                         max_concurrent_queries=MAX_CONCURRENT_QUERIES))
    deployments[(lang, 'sentiment')] = (
        serve.deployment(Sentiment, f'sentiment_{lang}',
                         route_prefix=f'/predictions/sentiment_{lang}',
                         max_concurrent_queries=MAX_CONCURRENT_QUERIES))
    deployments[(lang, 'qa')] = (
        serve.deployment(QA, f'qa_{lang}',
                         route_prefix=f'/predictions/qa_{lang}',
                         max_concurrent_queries=MAX_CONCURRENT_QUERIES))
    deployments[(lang, 'generate')] = (
        serve.deployment(Generate, f'generate_{lang}', 
                         route_prefix=f'/predictions/generate_{lang}',
                         max_concurrent_queries=MAX_CONCURRENT_QUERIES))


for d in deployments.values():
    d.deploy(_blocking=False)

# Wait a moment so log output doesn't go to the next cell's output
time.sleep(10.)

2022-02-25 17:22:27,963	INFO api.py:249 -- Updating deployment 'intent_en'. component=serve deployment=intent_en
2022-02-25 17:22:27,971	INFO api.py:249 -- Updating deployment 'sentiment_en'. component=serve deployment=sentiment_en
2022-02-25 17:22:27,980	INFO api.py:249 -- Updating deployment 'qa_en'. component=serve deployment=qa_en
2022-02-25 17:22:27,989	INFO api.py:249 -- Updating deployment 'generate_en'. component=serve deployment=generate_en
2022-02-25 17:22:27,999	INFO api.py:249 -- Updating deployment 'intent_es'. component=serve deployment=intent_es
2022-02-25 17:22:28,010	INFO api.py:249 -- Updating deployment 'sentiment_es'. component=serve deployment=sentiment_es
[2m[36m(HTTPProxyActor pid=80044)[0m INFO:     Started server process [80044]
2022-02-25 17:22:28,022	INFO api.py:249 -- Updating deployment 'qa_es'. component=serve deployment=qa_es
[2m[36m(ServeController pid=80034)[0m 2022-02-25 17:22:28,025	INFO deployment_state.py:920 -- Adding 1 replicas to deployment

In [15]:
intent_result = requests.put(
    'http://127.0.0.1:8000/predictions/intent_en', 
    json.dumps(INTENT_INPUT)).json()
print(f'Intent result: {intent_result}')

sentiment_result = requests.put(
    'http://127.0.0.1:8000/predictions/sentiment_en', 
    json.dumps(SENTIMENT_INPUT)).json()
print(f'Sentiment result: {sentiment_result}')

qa_result = requests.put(
    'http://127.0.0.1:8000/predictions/qa_en', 
    json.dumps(QA_INPUT)).json()
print(f'Question answering result: {qa_result}')

generate_result = requests.put(
    'http://127.0.0.1:8000/predictions/generate_en', 
    json.dumps(GENERATE_INPUT)).json()
print(f'Natural language generation result: {generate_result}')

Intent result: {'intent': 'to eat'}
Sentiment result: {'positive': 0.5419477820396423, 'neutral': 0.38251084089279175, 'negative': 0.07554134726524353}
Question answering result: {'score': 4.278938831703272e-06, 'start': 483, 'end': 484, 'answer': '5'}
Natural language generation result: {'generated_text': 'All your base are in the red zone\n\n\nI had a bit of trouble finding him, but I ended up putting a red card on him because of his green.\n\n\nHe will probably be given a lot of respect for his blue, but'}


In [9]:
ray.shutdown()