In [None]:
# @title ###### Licensed to the Apache Software Foundation (ASF), Version 2.0 (the "License")

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License

# Generate Text Embeddings by using Hugging Face Hub models


## Text Embeddings

Text embeddings are a way of representing text as numerical vectors. This allows computers to understand and process text data, which is essential for many natural language processing (NLP) tasks.

### Uses of text embeddings
By converting text into numerical vectors, text embeddings make it possible for computers to process and analyze text data. This enables a wide range of NLP tasks, including:

* Semantic search: Finding documents or passages that are relevant to a query, even if the query doesn't use the exact same words as the documents.
* Text classification: Categorizing text data into different classes, such as spam or not spam, or positive sentiment or negative sentiment.
* Machine translation: Translating text from one language to another while preserving the meaning.
* Text summarization: Creating shorter summaries of longer pieces of text.

In this notebook, we will use Apache Beam's `MLTransform` to generate embeddings from the text data.

Hugging Face's [`SentenceTransformers`](https://huggingface.co/sentence-transformers) framework uses Python to generate sentence, text, and image embeddings.

To generate text embeddings that use Hugging Face models and `MLTransform`, use `SentenceTransformerEmbeddings` to specify the model configuration.

To use `SentenceTransformerEmbeddings`, first install the `sentence-transformers` package.

# Install dependencies
 Install Apache Beam and the dependencies needed to work with Hugging Face embeddings.

In [None]:
! git clone https://github.com/apache/beam.git
! cd beam/sdks/python
! pip install beam/sdks/python
! pip install sentence-transformers

In [None]:
import tempfile
import apache_beam as beam
from apache_beam.ml.transforms.base import MLTransform
from apache_beam.ml.transforms.embeddings.huggingface import SentenceTransformerEmbeddings

## Use MLTransform in write mode

In `write` mode, `MLTransform` saves the transforms and their attributes to an artifact location. These transforms are used when you run `MLTransform` in `read` mode.

For more information about using `MLTransform`, see [Preprocess data with MLTransform](https://beam.apache.org/documentation/ml/preprocess-data/) in the Apache Beam documentation.

To generate text embeddings with `MLTransform`, the following pipeline uses the model `sentence-transformers/all-MiniLM-L6-v2` and the text inputs from the Hugging Face blog [Getting Started With Embeddings](https://huggingface.co/blog/getting-started-with-embeddings).

In [None]:
# Sample questions used as pipeline input. Each record is a dict that keeps
# its text under the 'x' key.
content = [
    {'x': question}
    for question in (
        'How do I get a replacement Medicare card?',
        'What is the monthly premium for Medicare Part B?',
        'How do I terminate my Medicare Part B (medical insurance)?',
        'How do I sign up for Medicare?',
        'Can I sign up for Medicare Part B if I am working and have health insurance through an employer?',
        'How do I sign up for Medicare Part B if I already have Part A?',
        'What are Medicare late enrollment penalties?',
        'What is Medicare and who can get it?',
        'How can I get help with my Medicare Part A and Part B premiums?',
        'What are the different parts of Medicare?',
        'Will my Medicare premiums be higher because of my higher income?',
        'What is TRICARE ?',
        "Should I sign up for Medicare Part B if I have Veterans' Benefits?",
    )
]

In [None]:
# MLTransform is given a fresh temporary directory as its write artifact
# location for this run.
text_embedding_model_name = 'sentence-transformers/all-MiniLM-L6-v2'
artifact_location_minilm = tempfile.mkdtemp(prefix='huggingface_')

# Configure the embedding transform for the text stored under column 'x'.
embedding_transform = SentenceTransformerEmbeddings(
    model_name=text_embedding_model_name,
    columns=['x'],
)

with beam.Pipeline() as pipeline:
  data_pcoll = pipeline | "CreateData" >> beam.Create(content)

  transformed_pcoll = data_pcoll | "MLTransform" >> (
      MLTransform(write_artifact_location=artifact_location_minilm)
      .with_transform(embedding_transform)
  )

  # Print every transformed element.
  transformed_pcoll | 'LogOutput' >> beam.Map(print)

  # Print the length of the value stored under 'x' for every element.
  transformed_pcoll | "PrintEmbeddingShape" >> beam.Map(
      lambda row: print(f"Embedding shape: {len(row['x'])}"))

Embedding shape: 384
{'x': [-0.023889463394880295, 0.05525851249694824, -0.011654896661639214, -0.03341428190469742, -0.012260555289685726, -0.024872763082385063, -0.01266342680901289, 0.025345895439386368, 0.01850851997733116, -0.08350814878940582, -0.09301996231079102, 0.01448627095669508, -0.017410913482308388, -0.08834369480609894, -0.004479047376662493, -0.046325888484716415, -0.013193873688578606, 0.03538179770112038, 0.062311142683029175, 0.048589665442705154, -0.05911841243505478, 0.05413544178009033, -0.06439691036939621, 0.03402404487133026, 0.006636372767388821, 0.03591703996062279, -0.0678376704454422, -0.017735281959176064, -0.01272181048989296, 0.046462394297122955, 0.10864365845918655, 0.023821430280804634, -0.02699640579521656, 0.03717399388551712, 0.09759815782308578, -0.027030128985643387, -0.045429863035678864, 0.031817372888326645, -0.03374629095196724, -0.015198523178696632, -0.02153564803302288, 0.014811225235462189, -0.020891893655061722, 0.06885719299316406, 0.0

Pass additional arguments that are supported by `sentence-transformers` models, such as `convert_to_numpy=False`. These arguments are passed as a `dict` to the `SentenceTransformerEmbeddings` transform by using the `inference_args` parameter.

By passing `convert_to_numpy=False`, the output will contain `torch.Tensor`s.

In [None]:
# A separate temporary artifact directory is used for this run.
artifact_location_minilm_with_inference_args = tempfile.mkdtemp(
    prefix='huggingface_')

# Same model name and column as configured earlier in the notebook, with
# extra options handed over through `inference_args`.
embedding_transform = SentenceTransformerEmbeddings(
    model_name=text_embedding_model_name,
    columns=['x'],
    inference_args={'convert_to_numpy': False},
)

with beam.Pipeline() as pipeline:
  data_pcoll = pipeline | "CreateData" >> beam.Create(content)

  transformed_pcoll = data_pcoll | "MLTransform" >> (
      MLTransform(
          write_artifact_location=artifact_location_minilm_with_inference_args)
      .with_transform(embedding_transform)
  )

  # Print the Python type of the value stored under 'x' for every element.
  transformed_pcoll | 'LogOutput' >> beam.Map(
      lambda row: print(type(row['x'])))

  # Print the length of the value stored under 'x' for every element.
  transformed_pcoll | "PrintEmbeddingShape" >> beam.Map(
      lambda row: print(f"Embedding shape: {len(row['x'])}"))


<class 'torch.Tensor'>
Embedding shape: 384
<class 'torch.Tensor'>
Embedding shape: 384
<class 'torch.Tensor'>
Embedding shape: 384
<class 'torch.Tensor'>
Embedding shape: 384
<class 'torch.Tensor'>
Embedding shape: 384
<class 'torch.Tensor'>
Embedding shape: 384
<class 'torch.Tensor'>
Embedding shape: 384
<class 'torch.Tensor'>
Embedding shape: 384
<class 'torch.Tensor'>
Embedding shape: 384
<class 'torch.Tensor'>
Embedding shape: 384
<class 'torch.Tensor'>
Embedding shape: 384
<class 'torch.Tensor'>
Embedding shape: 384
<class 'torch.Tensor'>
Embedding shape: 384


Use the model `sentence-transformers/sentence-t5-large` to generate text embeddings. The model uses only the encoder from a `T5-large` model. The weights are stored in FP16. For more information about the model, see [Sentence-T5: Scalable Sentence Encoders from Pre-trained Text-to-Text Models](https://arxiv.org/abs/2108.08877).

In [None]:
# MLTransform is given a dedicated temporary directory as its write artifact
# location for the T5 run.
text_embedding_model_name = 'sentence-transformers/sentence-t5-large'
artifact_location_t5 = tempfile.mkdtemp(prefix='huggingface_t5_')

# Configure the embedding transform for the text stored under column 'x'.
embedding_transform = SentenceTransformerEmbeddings(
    model_name=text_embedding_model_name,
    columns=['x'],
)

with beam.Pipeline() as pipeline:
  data_pcoll = pipeline | "CreateData" >> beam.Create(content)

  transformed_pcoll = data_pcoll | "MLTransform" >> (
      MLTransform(write_artifact_location=artifact_location_t5)
      .with_transform(embedding_transform)
  )

  # Print every transformed element.
  transformed_pcoll | 'LogOutput' >> beam.Map(print)

  # Print the length of the value stored under 'x' for every element.
  transformed_pcoll | "PrintEmbeddingShape" >> beam.Map(
      lambda row: print(f"Embedding shape: {len(row['x'])}"))

Embedding shape: 768
{'x': [-0.0317193828523159, -0.005265399813652039, -0.012499183416366577, 0.00018130357784684747, -0.005592408124357462, 0.06207558885216713, -0.01656288281083107, 0.0167048592120409, -0.01239298190921545, 0.03041897714138031, 0.039960071444511414, -0.03737572953104973, 0.0037162182852625847, 0.021203506737947464, -0.01774030551314354, 0.05141901224851608, -0.026910526677966118, 0.0044160946272313595, -0.03774929791688919, -0.008197496645152569, 0.066464863717556, -0.010413877665996552, -0.014248563908040524, 0.01351633109152317, -0.011204755865037441, -0.030143024399876595, 0.006300915032625198, -0.006915290839970112, 0.05997888371348381, -0.03908957913517952, 0.010871915146708488, -0.041436873376369476, -0.04041919484734535, -0.020206045359373093, -0.003629533341154456, 0.029160160571336746, 0.024502694606781006, 0.0024494838435202837, 0.032406035810709, 0.05126721039414406, -0.06762303411960602, 0.015054339542984962, 0.011992530897259712, 0.009902890771627426, 0

## Use MLTransform in read mode

In `read` mode, `MLTransform` uses the artifacts generated during `write` mode. In this case, the transform and its attributes are loaded from the saved artifacts. You don't need to specify the transform again during `read` mode; you only pass the artifact location.

In this way, `MLTransform` provides consistent preprocessing steps for training and inference workloads.

In [None]:
# Input records for the read-mode run. Each record is a dict that keeps its
# text under the 'x' key.
test_content = [
    {'x': sentence}
    for sentence in (
        'This is a test sentence',
        'The park is full of dogs',
        "Should I sign up for Medicare Part B if I have Veterans' Benefits?",
    )
]

# Read mode: MLTransform receives only the artifact location written by the
# T5 run; no transform is attached in this cell.
with beam.Pipeline() as pipeline:
  data_pcoll = pipeline | "CreateData" >> beam.Create(test_content)

  transformed_pcoll = data_pcoll | "MLTransform" >> MLTransform(
      read_artifact_location=artifact_location_t5)

  # Print every transformed element.
  transformed_pcoll | 'LogOutput' >> beam.Map(print)

{'x': [0.00036313451710157096, -0.03929319977760315, -0.03574873134493828, 0.05015222355723381, 0.04295048117637634, 0.04800170287489891, 0.006883862894028425, -0.02567591704428196, -0.048067063093185425, 0.036534328013658524, 0.02857070229947567, 0.009494246914982796, 0.018997641280293465, 0.018654372543096542, -0.04606235399842262, 0.0009032735251821578, -0.027347039431333542, -0.022223154082894325, -0.009926173835992813, -0.051440637558698654, 0.05921361967921257, 0.002421777229756117, -0.005073009990155697, 0.03668772429227829, -0.007695269305258989, -0.0027709712740033865, -0.03715380281209946, -0.003768067341297865, 0.035790298134088516, -0.03584091737866402, 0.011895176954567432, -0.014409150928258896, -0.013329439796507359, -0.008629214018583298, 0.008584667928516865, 0.05202161893248558, 0.030300768092274666, 0.018855834379792213, -0.011420823633670807, 0.027753762900829315, 0.007431956939399242, 0.023358862847089767, -0.006045830901712179, -0.0013208928285166621, 0.0521053858