In [None]:
# @title ###### Licensed to the Apache Software Foundation (ASF), Version 2.0 (the "License")

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.


# Bring your own Machine Leanring (ML) model to Beam RunInference

<button>
  <a href="https://beam.apache.org/documentation/sdks/python-machine-learning/">
    <img src="https://beam.apache.org/images/favicon.ico" alt="Open the docs" height="16"/>
    Beam RunInference
  </a>
</button>

In this notebook, we walk through a simple example to show how to customize your own ML model handler using
[ModelHandler](https://beam.apache.org/releases/pydoc/current/apache_beam.ml.inference.base.html#apache_beam.ml.inference.base.ModelHandler).

Named-Entity Recognition (NER) is one of the most common tasks for Natural Language Processing (NLP), 
which locates and classifies named entities in unstructured text into pre-defined labels such as person name, organization, date, etc. 
In this example, we illustrate how to use the popular spaCy package to load a ML model and apply it inside a Beam pipeline.


## Package Dependencies

The RunInference library is available in Apache Beam version <b>2.40</b> or later.

`spaCy` and `pandas` need to be installed. Here, a small NER model (`en_core_web_sm`) is also installed but any valid spaCy model could be used.

In [24]:
%pip install spacy pandas
%pip install apache-beam[gcp]
!python -m spacy download en_core_web_sm

You should consider upgrading via the '/Users/xqhu/Dev/beam/venv/bin/python -m pip install --upgrade pip' command.[0m[33m
[0mNote: you may need to restart the kernel to use updated packages.
zsh:1: no matches found: apache-beam[gcp]
Note: you may need to restart the kernel to use updated packages.
Collecting en-core-web-sm==3.4.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.0/en_core_web_sm-3.4.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
You should consider upgrading via the '/Users/xqhu/Dev/beam/venv/bin/python -m pip install --upgrade pip' command.[0m[33m
[0m[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


## Let us play with spaCy first

In [25]:
# create a spaCy language

import spacy

nlp = spacy.load("en_core_web_sm")


In [26]:
# some text strings for fun
text_strings = [
    "The New York Times is an American daily newspaper based in New York City with a worldwide readership.",
    "It was founded in 1851 by Henry Jarvis Raymond and George Jones, and was initially published by Raymond, Jones & Company."
]


In [28]:
# check what entities spaCy can recognize
doc = nlp(text_strings[0])


In [29]:
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)


The New York Times 0 18 ORG
American 25 33 NORP
daily 34 39 DATE
New York City 59 72 GPE


In [30]:
# visualize the results
from spacy import displacy
displacy.render(doc, style="ent")


In [31]:
# another example
displacy.render(nlp(text_strings[1]), style="ent")

## Now time to create our own `ModelHandler` to use spaCy for inference

In [34]:
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions

pipeline = beam.Pipeline(options=PipelineOptions(runner="DirectRunner",))

# only print the results to check
with pipeline as p:
    (p 
    | "CreateSentences" >> beam.Create(text_strings)
    | beam.Map(print)
    )




The New York Times is an American daily newspaper based in New York City with a worldwide readership.
It was founded in 1851 by Henry Jarvis Raymond and George Jones, and was initially published by Raymond, Jones & Company.


In [11]:
# Now define SpacyModelHandler to load the model and perform the inference

from apache_beam.ml.inference.base import RunInference
from apache_beam.ml.inference.base import ModelHandler
from apache_beam.ml.inference.base import PredictionResult
from spacy import Language
from typing import Any
from typing import Dict
from typing import Iterable
from typing import Optional
from typing import Sequence

class SpacyModelHandler(ModelHandler[str,
                                     PredictionResult,
                                     Language]):
    def __init__(
        self,
        model_name: str = "en_core_web_sm",
    ):
        """ Implementation of the ModelHandler interface for spaCy using text as input.

        Example Usage::

          pcoll | RunInference(SpacyModelHandler())

        Args:
          model_name: The spaCy model name. Default is en_core_web_sm.
        """
        self._model_name = model_name

    def load_model(self) -> Language:
        """Loads and initializes a model for processing."""
        return spacy.load(self._model_name)

    def run_inference(
        self,
        batch: Sequence[str],
        model: Language,
        inference_args: Optional[Dict[str, Any]] = None
    ) -> Iterable[PredictionResult]:
        """Runs inferences on a batch of text strings.

        Args:
          batch: A sequence of examples as text strings. 
          model: A spaCy language model
          inference_args: Any additional arguments for an inference.

        Returns:
          An Iterable of type PredictionResult.
        """
        # loop each text string and use tuple to store the inference results
        predictions = []
        for one_text in batch:
            doc = model(one_text)
            predictions.append(
                [(ent.text, ent.start_char, ent.end_char, ent.label_) for ent in doc.ents])
        return [PredictionResult(x, y) for x, y in zip(batch, predictions)]


In [20]:
with pipeline as p:
    (p 
    | "CreateSentences" >> beam.Create(text_strings)
    | "RunInferenceSpacy" >> RunInference(SpacyModelHandler("en_core_web_sm"))
    | beam.Map(print)
    )




PredictionResult(example='The New York Times is an American daily newspaper based in New York City with a worldwide readership.', inference=[('The New York Times', 0, 18, 'ORG'), ('American', 25, 33, 'NORP'), ('daily', 34, 39, 'DATE'), ('New York City', 59, 72, 'GPE')])
PredictionResult(example='It was founded in 1851 by Henry Jarvis Raymond and George Jones, and was initially published by Raymond, Jones & Company.', inference=[('1851', 18, 22, 'DATE'), ('Henry Jarvis', 26, 38, 'PERSON'), ('Raymond', 39, 46, 'PERSON'), ('George Jones', 51, 63, 'PERSON'), ('Raymond, Jones & Company', 96, 120, 'ORG')])
The New York Times is an American daily newspaper based in New York City with a worldwide readership.
It was founded in 1851 by Henry Jarvis Raymond and George Jones, and was initially published by Raymond, Jones & Company.
