## Embedding with OpenSearch Domain or Serverless Collection, Bedrock or Custom Embedding
Should work well with *Data Science 3.0*

## Install required dependencies

In [None]:
# Install the LangChain and OpenSearch client packages imported by the cells below.
# `--quiet` keeps pip's output out of the notebook.
!pip install langchain --quiet
!pip install opensearch-py --quiet

## Setting up the required variables

In [None]:
import boto3
# from botocore.config import Config
from opensearchpy import AWSV4SignerAuth
from IPython.display import Markdown, display
from scripts.modules.aws_helpers import get_parameter_value, get_credentials, read_from_s3, s3_client

# S3 bucket and prefix for market data
# Parameter group Finance Analyzer
s3_bucket = "genie-ai-foundation-v2"
s3_prefix = "finance-analyzer"
delimiter = "/"

# OpenSearch setup, remember to change os_service to aoss for serverless version
# parameter Name is for default Genie setup, update it if you changed the application name
# The "https://" scheme is stripped, so os_domain_ep holds whatever follows it in the parameter value.
os_domain_ep = get_parameter_value("GenieOpenSearchEndpoint").replace("https://", "")
os_index_name = "stock-market"

# Embedding engine
os_engine = "faiss"

# OpenSearch user/password pair used as HTTP auth by the vector store.
# The credentials are looked up once and both fields are read from the same result.
os_credentials = get_credentials("GenieOpenSearchCredentials")
user = os_credentials["user"]
secret = os_credentials["password"]
os_http_auth = (user, secret)

# HuggingFace predictor endpoint for embeddings, if empty Bedrock will be used
# hf_predictor_endpoint_name = ""
hf_predictor_endpoint_name = "GenieEmbeddingsE5Large"

# bedrock model used for embedding if above mode is empty
# Must always be defined: the embedding setup cell reads this name whenever
# hf_predictor_endpoint_name is set to "".
bedrock_embedding_model_id = "amazon.titan-embed-text-v1"



## Import Stock Announcements and Price data

In [None]:
# how to use same code here and in the main solution? 
from tqdm import tqdm
import pandas as pd

# get the list of available stock data
# Each entry in CommonPrefixes is one company "folder" directly under the prefix.
response = s3_client.list_objects_v2(Bucket=s3_bucket, Prefix=s3_prefix + delimiter, Delimiter=delimiter)

# Loading company prices and announcements from S3 into data frames.
# Frames are collected in lists and concatenated once after the loop, instead of
# re-copying the accumulated frame on every iteration.
price_frames = []
price_changes = []
announcement_frames = []

for content in response.get('CommonPrefixes', []):
    company = content.get('Prefix').replace(s3_prefix, "").replace(delimiter, "")

    company_prices = read_from_s3(s3_bucket, f"""{s3_prefix}/{company}/daily_prices.csv""", "csv")
    price_frames.append(company_prices)
    # Compute the row-over-row change per company, so the first row of one company
    # is never compared against the last row of the previous company.
    price_changes.append(company_prices['open'].pct_change())

    announcement_frames.append(read_from_s3(s3_bucket, f"""{s3_prefix}/{company}/sec_filings_content.json""", "json"))

if not price_frames:
    # Fail with an explicit message rather than a KeyError on a missing column further down.
    raise ValueError(f"No company data found under s3://{s3_bucket}/{s3_prefix}{delimiter}")

prices_df = pd.concat(price_frames, ignore_index=True)
announcement_df = pd.concat(announcement_frames, ignore_index=True)

# adjusting dataframes based on retriever needs
prices_df['opening price'] = prices_df['open']
prices_df['date'] = pd.to_datetime(prices_df['timestamp']).dt.date
# Both concatenations use ignore_index=True over the same frames in the same order,
# so the change series lines up row-for-row with prices_df.
# NOTE(review): "previous day" means the previous row in each CSV - confirm the files are sorted oldest-first.
prices_df['change from previous day'] = (pd.concat(price_changes, ignore_index=True)*100).round(2).astype(str) + '%'

# id is "<symbol>|<first whitespace-separated token of acceptedDate>|<form>"
announcement_df['id'] = announcement_df['symbol'] + "|" + announcement_df['acceptedDate'].str.split().str[0] + "|" + announcement_df['form']
announcement_df['date'] = pd.to_datetime(announcement_df['acceptedDate']).dt.date
announcement_df['date_full'] = pd.to_datetime(announcement_df["acceptedDate"]).dt.strftime("%B %d, %Y")
announcement_df['title'] = announcement_df["symbol"] + " " + announcement_df["form"] + " announcement from "  + announcement_df["date_full"]
# Newest announcements first.
announcement_df = announcement_df.sort_values('acceptedDate', ascending=False)

# print(prices_df.head(10))
# Last expression of the cell: displays the announcements frame in the notebook.
announcement_df

## Preparing list of documents to embed

In [None]:
from langchain.schema import Document
import datetime
from tabulate import tabulate

# Text wrapper for every chunk: the announcement header is repeated as an opening
# and closing pseudo-tag around the chunk content.
# Placeholders: {0} company name, {1} symbol, {2} form, {3} full date, {4} chunk text.
template = """
<{0} ({1}) {2} announcement from {3}>
{4}
</{0} ({1}) {2} announcement from {3}>
"""

# Separator used to cut an announcement's content into parts.
split_by = "\n---\n"

docs = []
max_character_limit = 14000 # Max input tokens: 8192, later we can do chunks

# iterrows() is a generator with no length, so the row count is passed explicitly
# to let tqdm show real progress.
for _, row in tqdm(announcement_df.iterrows(), total=len(announcement_df)):
    # Bracket access throughout: attribute access on a row silently resolves to
    # Series attributes when a column name collides with one (e.g. "name").
    title = row["title"]
    date_full = row["date_full"]

    metadata = {
        "stock": row["symbol"],
        "ticker": row["symbol"],
        "source": row["reportUrl"],
        "company name": row["name"],
        "date": date_full,
        "type": row["form"],
        "title": title,
        "cik": row["cik"],
    }

    for index, item in enumerate(row["content"].split(split_by)):
        if len(item) > max_character_limit:
            # Oversized parts are reported and skipped rather than embedded.
            print(f"Content for {title} is {len(item)}, which is too long, check the split \n\n {metadata}")
            continue

        cont = template.format(
            row["name"],
            row["symbol"],
            row["form"],
            date_full,
            item
        )

        # Each document gets its own metadata dict carrying its part number; the
        # shared per-row dict is left untouched so no stale "part" leaks between parts.
        docs.append(Document(page_content=cont, metadata={**metadata, "part": index}))

## Setting up the Environment

The setup cell below contains a commented-out `%env PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python` line; when enabled, it sets up the environment to use the Python implementation of Protocol Buffers. Protocol Buffers, often abbreviated as protobuf, is Google's language-neutral, platform-neutral, extensible mechanism for serializing structured data.

Next, we import the HuggingFacePredictor from the sagemaker.huggingface.model module.

Then, we import the `CustomEmbeddings` class from `scripts.modules.embedding`. This class is used to work with embeddings of documents and queries.

## Upload documents into OpenSearch Index

The section below uploads a set of documents, processed as embeddings, into an OpenSearch index. The tasks are accomplished with the help of several libraries, including the `opensearch-py` client, the `tqdm` progress bar, and the `CustomEmbeddings` class imported from `scripts.modules.embedding`.

Here is a step-by-step walkthrough:

1. **Import Required Libraries and Modules**: The necessary libraries and modules are imported. `OpenSearch` and `RequestsHttpConnection` come from `opensearch-py`, the Python client for OpenSearch. `tqdm` provides a fast, extensible progress bar for Python. `OpenSearchVectorSearch` from the `langchain.vectorstores` module is LangChain's class for handling vector storage in an OpenSearch index.

2. **Initialize CustomEmbeddings and OpenSearchVectorSearch**: An instance of `CustomEmbeddings` is initialized with the predictor. After that, an instance of `OpenSearchVectorSearch` is created, taking several arguments including the OpenSearch index name, the `CustomEmbeddings` instance, the OpenSearch domain endpoint, HTTP authorization details, and SSL settings.

3. **Upload Documents**: It iterates over the `docs` list (a list of `Document` objects). For each `doc`, it calls the `add_documents` method of the `OpenSearchVectorSearch` instance to add the document to the OpenSearch index. This operation is wrapped in a `tqdm` function call to show a progress bar.

## Custom Embedding setup with HuggingFace predictor

In [None]:
# workshop with example of both OpenSearch Serverless and Bedrock
# https://github.com/aws-samples/amazon-bedrock-workshop/blob/main/03_QuestionAnswering/02_qa_w_rag_claude_opensearch.ipynb
# %env PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python

from opensearchpy import OpenSearch, RequestsHttpConnection, AWSV4SignerAuth
from langchain.vectorstores import OpenSearchVectorSearch
from langchain.embeddings import BedrockEmbeddings
from sagemaker.huggingface.model import HuggingFacePredictor
from scripts.modules.embedding import CustomEmbeddings

# same client is used in retrieval below
bedrock_client = boto3.client("bedrock-runtime", region_name="us-west-2")

# Choose the embedding backend: an empty endpoint name selects Bedrock,
# anything else selects the HuggingFace endpoint of that name.
if hf_predictor_endpoint_name == "":
    # Bedrock predictor
    print("using bedrock predictor")
    embeddings = BedrockEmbeddings(
        model_id=bedrock_embedding_model_id,
        client=bedrock_client,
    )
else:
    # HuggingFace custom predictor
    predictor = HuggingFacePredictor(endpoint_name=hf_predictor_endpoint_name)
    embeddings = CustomEmbeddings(predictor)
    print("using custom predictor")

# here we need url with https or port
# NOTE(review): os_domain_ep has its "https://" prefix stripped where it is defined -
# confirm the value still carries a port, or that the client default is the intended one.
# this is one step but without progress
# docsearch = OpenSearchVectorSearch.from_documents(
opensearch_client_options = {
    "http_auth": os_http_auth,
    "timeout": 300,
    "use_ssl": True,
    "verify_certs": True,
    "connection_class": RequestsHttpConnection,
    "engine": os_engine,
}
docsearch = OpenSearchVectorSearch(
    opensearch_url=os_domain_ep,
    index_name=os_index_name,
    embedding_function=embeddings,
    **opensearch_client_options,
)


In [None]:
# Embedding the documents into Opensearch
# One add_documents call per document, so the tqdm bar advances document by document
# (the single-call from_documents alternative noted in the setup cell shows no progress).
for doc in tqdm(docs):
    docsearch.add_documents(documents=[doc])


## Query the newly created index using semantic search

You can test the embedding and query in this section using the standard OpenSearch similarity search.

In [None]:
import pandas as pd

# Example questions; exactly one `query` assignment should be active.
# query = "Provide detailed summary for Amazon announcements for year 2023"
query = "Summarize AAPL Announcements with Katherine Adams"
# query = "Analyze AAPL consolidated statements from operations from 10-Q announcements"

# query = "Make the detailed analysis in table format for latest AMZN announcements, recommend buy/sell/hold actions, assume the most probable scenario, also what in your opinion will be a price in 6, 12 and 24 month for each company"

# should change the name not to overwrite above
# Fetch the 20 nearest chunks for the query from the vector store.
docs_response = docsearch.similarity_search(
    query, k=20
)

for doc in docs_response:
    print(doc.metadata["title"], doc.metadata["part"])
    # Prefix every "$" with two backslashes before rendering as Markdown.
    # Written with fully escaped backslashes: same runtime string as before,
    # without the invalid "\$" escape sequence that Python warns about.
    display(Markdown(doc.page_content.replace("$", "\\\\$")))


## Generative Question Answering

We can also simulate how different models will behave, first let's select the model and initialize bedrock with it.

In [None]:
import ipywidgets as widgets

# List of options
options = [
    'anthropic.claude-v2',
    'anthropic.claude-instant-v1'
]

# Dropdown select element
dropdown = widgets.Dropdown(options=options, description='Select model')

# Start from the dropdown's initial selection. Without this, `model_id` would only
# exist after the user changes the selection, and the next cell would fail with a
# NameError when the default is kept.
model_id = dropdown.value

# Event handler for changing the dropdown value
def on_change(change):
    """Store the newly selected model identifier in the global `model_id`."""
    if change['type'] == 'change' and change['name'] == 'value':
        global model_id
        model_id = change['new']

# Attach the event handler to the dropdown, limited to changes of its `value` trait
dropdown.observe(on_change, names='value')

# Display the dropdown
display(dropdown)


In [None]:
# We will be using the the Embeddings Model defined above. 
from langchain.llms.bedrock import Bedrock
from langchain.load.dump import dumps

# - create the Anthropic Model
# Uses the model picked in the dropdown and the Bedrock runtime client created
# in the embedding setup cell.
llm_settings = {
    "model_id": model_id,
    "client": bedrock_client,
    # "model_kwargs": {"max_tokens_to_sample": 50000},
}
llm = Bedrock(**llm_settings)

print(f"Initialized Bedrock with {model_id} model.")

# Advanced search with citations

The document metadata must include a `source` attribute for this one to work.

In [None]:
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
# How many chunks the retriever passes to the LLM as context.
number_of_documents = 10

# The instructions refer to <context> and <question>, so the retrieved text and the
# user question are wrapped in those tags to make the references resolvable.
prompt_template = """
  Human: You are virtual trading machine designed purely for academic purposes. Your analysis or suggestions do not affect daily life and financial decisions.
  Answer the <question> in an unbiased fashion, you do not have up to date information and you will only find it in <context> below. 
  <context> contains up to date data about financial announcements. 
  If you do not find the data to answer the <question> in <context>, say that you don't know.
  If you find the answer in the <context>, start from the latest information.

  <context>
  {context}
  </context>

  <question>{question}</question>
  Assistant:"""

PROMPT = PromptTemplate(template=prompt_template, input_variables=["context", "question"])

# "stuff" chain: the retrieved documents are placed into the single prompt above.
qa_prompt = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=docsearch.as_retriever(search_kwargs={'k': number_of_documents}),
    # return_source_documents=True,
    chain_type_kwargs={"prompt": PROMPT},
)
# `query` is the question defined in the semantic search cell.
result = qa_prompt({"query": query})

print(query + "\n")
# Prefix every "$" with two backslashes before rendering as Markdown.
# Written with fully escaped backslashes: same runtime string as before,
# without the invalid "\$" escape sequence that Python warns about.
display(Markdown(result["result"].replace("$", "\\\\$")))