In [None]:
# use Data Science 2.0 Image
# %pip (instead of !pip) installs into the environment of the running kernel,
# so the packages are importable by the cells below.
# NOTE(review): versions are not pinned; later cells import from
# `langchain.schema` and `langchain.vectorstores` - confirm the installed
# langchain release still provides these import paths.
%pip install transformers --quiet
%pip install langchain --quiet
%pip install opensearch-py --quiet
%pip install beautifulsoup4 --quiet
%pip install elasticsearch --quiet

# Amazon OpenSearch Service Setup and Secret Retrieval
This cell collects everything needed to connect to Amazon OpenSearch Service: it retrieves the credentials from AWS Secrets Manager and the endpoint names from AWS Systems Manager Parameter Store.

- First, we import the necessary libraries
- Next, we create a boto3 session and look up the ID of the current AWS account

**get_credentials()**
We define a function to retrieve secret credentials from AWS Secrets Manager.
The function uses a client for the AWS Secrets Manager to retrieve the secret value and parse it into a Python object using json.loads().
Args:
- secret_id: The identifier for the secret in AWS Secrets Manager.
- region_name: The AWS region where the secret is stored.
Returns:
- dict: The secret value, parsed from its JSON string (for example with the keys `user` and `password`)


**get_parameter_value**
Retrieve a parameter value from AWS Systems Manager Parameter Store.
    
Args:
- parameter_name (str): The name of the parameter you want to retrieve.
- decrypt (bool): Whether to decrypt the parameter value if it's encrypted. Default is True.

Returns:
- str: The parameter value.

- Finally, we set a few additional parameters

In [None]:
import boto3
import botocore
import json
from botocore.config import Config
import time

# Region used by the region-aware clients and names in this notebook.
config = Config(region_name="eu-west-1")

# Resolve the AWS account of the current caller through STS; the account id
# is part of the S3 bucket name built further down.
session = boto3.Session()
sts_client = session.client("sts")
account_id = sts_client.get_caller_identity()["Account"]


def get_credentials(secret_id: str, region_name: str) -> dict:
    """Fetch a secret from AWS Secrets Manager and parse it as JSON.

    Args:
        secret_id: Identifier of the secret in AWS Secrets Manager.
        region_name: AWS region in which the secret is stored.

    Returns:
        The object produced by ``json.loads`` on the secret's ``SecretString``.
        This notebook indexes the result with the keys ``"user"`` and
        ``"password"``, i.e. the secret is expected to be a JSON object.
    """
    client = boto3.client("secretsmanager", region_name=region_name)
    response = client.get_secret_value(SecretId=secret_id)
    return json.loads(response["SecretString"])


def get_parameter_value(parameter_name, decrypt=True):
    """Read a single parameter from AWS Systems Manager Parameter Store.

    The SSM client is created with the notebook-level ``config`` object.

    Args:
        parameter_name (str): Name of the parameter to look up.
        decrypt (bool): Passed as ``WithDecryption`` to the service call.
            Defaults to True.

    Returns:
        The ``Value`` field of the returned parameter.
    """
    parameter = boto3.client("ssm", config=config).get_parameter(
        Name=parameter_name,
        WithDecryption=decrypt,
    )["Parameter"]
    return parameter["Value"]


# getting OpenSearch credentials
app_prefix = "genie"
# Fetch the secret once (a single Secrets Manager call) and read both fields
# from it; the region comes from the shared `config` instead of a repeated literal.
os_credentials = get_credentials(app_prefix + "_opensearch_pw", config.region_name)
user = os_credentials["user"]
secret = os_credentials["password"]
os_http_auth = (user, secret)

# get the OpenSearch domain name from parameter
os_domain_ep = get_parameter_value("opensearch_endpoint")
os_index_name = "admin-ch-pressreleases-de"  # opensearch index

# huggingface predictor endpoint for embeddings
hf_predictor_endpoint_name = get_parameter_value("hf_predictor_endpoint_name")

# S3 bucket and path to crawler results
# (s3_key starts with "/" because it is appended directly to the bucket name
# when the s3:// URLs are built in the cells below)
s3_bucket = f"sagemaker-gen-ai-{account_id}-{config.region_name}"
s3_key = "/crawlers/admin_ch_press_releases_de.json"

## Read the JSON file, containing the crawler result into data frame

We also display the HTML content of one example page.

In [None]:
import pandas as pd
import json

# `display` and `HTML` are imported from IPython.display;
# importing `display` from IPython.core.display is deprecated.
from IPython.display import HTML, display

# Location of the crawler output in S3
crawl_results_uri = "s3://" + s3_bucket + s3_key
print(crawl_results_uri)

# read the crawled pages as dataframe
df = pd.read_json(crawl_results_uri)

# A Styler is only rendered when it is the cell's last expression or is passed
# to display(); show a left-aligned preview of the first rows explicitly.
display(df.head().style.set_properties(**{"text-align": "left"}))

# Display the content of 1 document
# show example html content
display(HTML(df["content"].iloc[2]))

## Split the documents by H html tag

This section processes the HTML content from the DataFrame (`df`): it splits each page into sections at its headings, extracts the paragraphs of these sections, and then saves the processed DataFrame as a JSON file. The details are as follows:

1. **Import Required Libraries**: BeautifulSoup and SoupStrainer from the `bs4` module and the `re` (regular expressions) module are imported.

2. **Define Function to Convert Paragraphs**: A function `convert_paragraphs` is defined, which takes a DataFrame row as an argument. The function extracts the HTML content and plain text content from the row, and then uses BeautifulSoup to parse the HTML. It creates a list of sections by finding all HTML headings (tags from h1 to h6). It then slices the plain text content into paragraphs based on the positions of these section headings. Paragraphs are then cleaned up (leading and trailing spaces are removed), and empty paragraphs are discarded. The function returns a list of clean paragraphs.

3. **Apply Function to DataFrame**: The `convert_paragraphs` function is applied to every row in the DataFrame. The result (a Series of lists of paragraphs) is then converted to a list and stored back into a new column "paragraphs" in the DataFrame.

4. **Store DataFrame**: The DataFrame, including the new "paragraphs" column, is then saved as JSON to the S3 bucket, under the key of the crawler result with the suffix `_paragraphs.json` (instead of `.json`).

5. **Generate SKU DataFrame**: A new DataFrame `df_skus` is created from the "title" and "paragraphs" columns of the original DataFrame. The `explode` function is used to transform each element of a list-like to a row, replicating the index values. In the resulting DataFrame, each paragraph has its own row with the corresponding title.

In [None]:
from bs4 import BeautifulSoup, SoupStrainer
import re


def convert_paragraphs(row):
    """Split a page's plain text into chunks at the page's HTML headings.

    The headings (``h1`` to ``h6``) are taken from the HTML in
    ``row["content"]``; their text is then searched, in document order, in
    ``row["textContent"]``, and the plain text is cut at every position where
    a heading is found.

    Args:
        row: Mapping-like row (e.g. a DataFrame row) with the keys
            ``"content"`` (HTML) and ``"textContent"`` (plain text).

    Returns:
        list of str: The stripped, non-empty text chunks. Each chunk that
        follows a matched heading starts with that heading's text.
    """
    html = row["content"]
    text = row["textContent"]
    # Name the parser explicitly so the result does not depend on which
    # parsers happen to be installed (and no parser-guessing warning is raised).
    soup = BeautifulSoup(html, "html.parser")
    headings = [h.text for h in soup.find_all(re.compile("^h[1-6]$"))]

    chunks = []
    chunk_start = 0  # where the chunk currently being collected begins
    search_from = 0  # headings are only searched after the previous match
    for heading in headings:
        if not heading.strip():
            # An empty or whitespace-only heading cannot mark a split position.
            continue
        split_pos = text.find(heading, search_from)
        if split_pos == -1:
            # The heading text does not occur in the remaining plain text.
            # Using -1 as a slice bound / new cursor would cut off the last
            # character and break all following splits, so skip the heading.
            continue
        chunks.append(text[chunk_start:split_pos])
        chunk_start = split_pos
        # Continue after this heading so that a repeated heading text matches
        # its next occurrence instead of the same position again.
        search_from = split_pos + len(heading)
    chunks.append(text[chunk_start:])

    return [chunk.strip() for chunk in chunks if chunk.strip()]


# Run the heading-based splitter row by row and keep the resulting lists in a
# new "paragraphs" column.
paragraphs = df.apply(convert_paragraphs, axis=1)
df["paragraphs"] = paragraphs.tolist()

# Store the enriched frame in S3 for later analysis, under the crawler key
# with "_paragraphs.json" in place of ".json".
df.to_json(f"s3://{s3_bucket}{s3_key.replace('.json', '_paragraphs.json')}")

# One row per (title, paragraph) pair.
df_skus = df[["title", "paragraphs"]].explode("paragraphs")

## Setting up the Environment

The first line of the script %env PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python is setting up the environment to use the Python implementation of Protocol Buffers. Protocol Buffers, often abbreviated as protobuf, is Google's language-neutral, platform-neutral, extensible mechanism for serializing structured data.

Next, we import the HuggingFacePredictor from the sagemaker.huggingface.model module.

Then, we define a CustomEmbeddings class. This class is used to work with embeddings of documents and queries. 

In [None]:
%env PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
from sagemaker.huggingface.model import HuggingFacePredictor

# Predictor bound to the embeddings endpoint whose name was read with
# get_parameter_value("hf_predictor_endpoint_name") earlier in the notebook.
predictor = HuggingFacePredictor(endpoint_name=hf_predictor_endpoint_name)


class CustomEmbeddings:
    """Embedding adapter around a predictor object.

    An instance is passed as ``embedding_function`` to the vector store in
    this notebook; it offers ``embed_documents`` for a list of texts and
    ``embed_query`` for a single text.
    """

    def __init__(self, embeddings_predictor):
        """Store the predictor used for all embedding requests.

        Args:
            embeddings_predictor: Object with a ``predict(data=...)`` method
                that accepts ``{"texts": [...]}`` and returns a mapping with
                a ``"vectors"`` entry.
        """
        self.embeddings_predictor = embeddings_predictor

    def embed_documents(self, input_texts):
        """Return the ``"vectors"`` entry of the prediction for ``input_texts``.

        An empty input yields an empty list without calling the predictor.
        """
        return self._embed_docs(input_texts, False)

    def embed_query(self, query_text):
        """Return the first vector of the prediction for the single ``query_text``."""
        return self._embed_docs([query_text], True)[0]

    def _embed_docs(self, texts, isQuery=False):
        """Send ``texts`` to the predictor and return its ``"vectors"`` entry.

        Args:
            texts: Texts to embed.
            isQuery: Currently unused; kept so that existing calls keep working.

        Returns:
            ``[]`` for empty input, otherwise ``res["vectors"]`` of the
            predictor response.
        """
        if not texts:
            # Nothing to embed: skip the endpoint round trip.
            return []

        res = self.embeddings_predictor.predict(data={"texts": texts})
        return res["vectors"]

## Creating a list of Document objects

This code creates a list of `Document` objects, one per paragraph, from the DataFrame (`df`) that holds the split website documents.

In [None]:
from langchain.schema import Document

# One Document per paragraph; every paragraph carries the source and title of
# the page it was taken from as metadata.
docs = [
    Document(
        page_content=paragraph,
        metadata={"source": row["source"], "title": row["title"]},
    )
    for _, row in df.iterrows()
    for paragraph in row["paragraphs"]
]
len(docs)

## Upload documents into OpenSearch Index

The below section uploads a set of documents, processed as embeddings, into an OpenSearch index. The tasks are accomplished with the help of several libraries, including the `elasticsearch` client, the `tqdm` progress bar, and a `CustomEmbeddings` class that you've previously defined.

Here is a step-by-step walkthrough:

1. **Import Required Libraries and Modules**: The necessary libraries and modules are imported. `Elasticsearch` is the Python client for Elasticsearch (from which OpenSearch was forked); it is imported here, although the code below only uses the LangChain class. `tqdm` provides a fast, extensible progress bar for Python. `OpenSearchVectorSearch` from the `langchain.vectorstores` module is LangChain's vector store class for storing and searching embeddings in an OpenSearch index.

2. **Initialize CustomEmbeddings and OpenSearchVectorSearch**: An instance of `CustomEmbeddings` is initialized with the predictor. After that, an instance of `OpenSearchVectorSearch` is created, taking several arguments including the OpenSearch index name, the `CustomEmbeddings` instance, the OpenSearch domain endpoint, HTTP authorization details, and SSL settings.

3. **Upload Documents**: It iterates over the `docs` list (a list of `Document` objects). For each `doc`, it calls the `add_documents` method of the `OpenSearchVectorSearch` instance to add the document to the OpenSearch index. This operation is wrapped in a `tqdm` function call to show a progress bar.

In [None]:
# looks like only langchain is required
from elasticsearch import Elasticsearch

from tqdm import tqdm
from langchain.vectorstores import OpenSearchVectorSearch

# Adapter that produces embeddings through the predictor created earlier.
custom_embeddings = CustomEmbeddings(predictor)
# Vector store bound to the target index; it uses the adapter above as its
# embedding function and authenticates with the user/password pair.
docsearch = OpenSearchVectorSearch(
    index_name=os_index_name,
    embedding_function=custom_embeddings,
    opensearch_url=os_domain_ep,
    http_auth=os_http_auth,
    use_ssl=True,
    # NOTE(review): certificate and hostname verification are switched off
    # (and the related warning is silenced) while credentials are sent over
    # this connection - confirm this is intended outside of a demo setup.
    verify_certs=False,
    ssl_assert_hostname=False,
    ssl_show_warn=False,
)
# Number of documents that are going to be uploaded.
print(len(docs))

In [None]:
# adding document to open search index with progress bar
for doc in tqdm(docs):
    docsearch.add_documents(documents=[doc])

## Query the newly created index

You can test the embedding and query in this section

In [None]:
q = "When Digital Vignette is available?"

# Fetch the 10 best matching paragraphs for the question.
# Optional arguments left out here: search_type="script_scoring", space_type="cosinesimil"
docs_response = docsearch.similarity_search(q, k=10)

# One record per hit: the paragraph text plus its metadata dict.
df_res = pd.DataFrame(
    [
        {"page_content": hit.page_content, "metadata": hit.metadata}
        for hit in docs_response
    ]
)

# Expand the metadata dicts into columns of their own and drop the raw
# "metadata" column afterwards.
df_res2 = df_res.join(pd.json_normalize(df_res.metadata)).drop("metadata", axis=1)
df_res2.page_content[0]