# Environment

## Load environment

In [None]:
import os
from dotenv import load_dotenv


load_dotenv(override=True)

## Import modules

In [None]:
import ollama
from opensearchpy import OpenSearch
from dataclasses import dataclass
from retry import retry
from typing import Union, Literal, Optional, Generator
import json
from IPython.display import Markdown, clear_output, display

## List models

In [None]:
# Print a numbered (1-based) list of the model names returned by ollama.list().
print("Available models")
installed_models = ollama.list()["models"]
for position, entry in enumerate(installed_models, start=1):
    model_name = entry["name"]
    print(f"{position}: {model_name}")

## Configurable parameters

In [None]:
# Chat model used by every prompt handler; alternatives are kept commented out.
# MODEL = "llama3:8b-instruct-q6_K"
MODEL = "qwen2:7b-instruct-q6_K"
# MODEL = "phi:latest"
# Embedding model name passed to the Ollama embeddings call.
EMBEDDING_MODEL = "all-minilm:latest"
# Host the explicit Ollama client is created with.
OLLAMA_HOST = "http://localhost:11434"
# Number of tries given to the @retry-decorated LLM calls.
RETRY_COUNT = 5
# How many search hits are kept per OpenSearch query.
SELECT_TOP_RESULTS = 3
# OpenSearch index that is queried for topics and passages.
INDEX_NAME = "sfc_code_preprocess"

# OLLAMA client

In [None]:
# Explicit client bound to OLLAMA_HOST; get_embedding sends its requests through it.
ollama_client: ollama.Client = ollama.Client(host=OLLAMA_HOST)

def get_embedding(text: str, embedding_model: str) -> list[float]:
    """Return the embedding vector of ``text`` computed by Ollama.

    Args:
        text: Text to embed.
        embedding_model: Name of the Ollama embedding model to use.

    Returns:
        The value of the ``"embedding"`` field of the embeddings response.
    """
    # Use the model requested by the caller; the previous implementation
    # ignored this argument and always used the EMBEDDING_MODEL constant.
    response = ollama_client.embeddings(model=embedding_model, prompt=text)
    return response["embedding"]

# Opensearch client

In [None]:
# OpenSearch connection settings are read from the environment; a missing
# variable raises KeyError right here rather than later at connection time.
OPENSEARCH_USERNAME = os.environ["OPENSEARCH_USERNAME"]
OPENSEARCH_PASSWORD = os.environ["OPENSEARCH_PASSWORD"]
OPENSEARCH_URL = os.environ["OPENSEARCH_URL"]
# NOTE(review): presumably silences the HuggingFace tokenizers parallelism
# warning — confirm this is still needed, no tokenizer is used in this notebook.
os.environ["TOKENIZERS_PARALLELISM"] = "false"


def get_open_search(
    cluster_url: str, username: str, password: str, verify_certs: bool = False
) -> "OpenSearch":
    """Create an OpenSearch client that authenticates with HTTP basic auth.

    Args:
        cluster_url: Cluster URL, passed as the single entry of ``hosts``.
        username: User name for ``http_auth``.
        password: Password for ``http_auth``.
        verify_certs: Whether the server's TLS certificate is verified.
            Defaults to ``False`` to preserve the previous behaviour; pass
            ``True`` whenever the cluster presents a trusted certificate.

    Returns:
        The configured ``OpenSearch`` client.
    """
    return OpenSearch(
        hosts=[cluster_url],
        http_auth=(username, password),
        verify_certs=verify_certs,
    )

# Shared client used by every search in this notebook.
open_search_client: OpenSearch = get_open_search(
    OPENSEARCH_URL, OPENSEARCH_USERNAME, OPENSEARCH_PASSWORD
)

# Get distinct topics

In [None]:
# NOTE: this re-declares the helper already defined in the "Opensearch client"
# cell; the two definitions are intentionally kept equivalent.
def get_open_search(
    cluster_url: str, username: str, password: str, verify_certs: bool = False
) -> "OpenSearch":
    """Create an OpenSearch client that authenticates with HTTP basic auth.

    Args:
        cluster_url: Cluster URL, passed as the single entry of ``hosts``.
        username: User name for ``http_auth``.
        password: Password for ``http_auth``.
        verify_certs: Whether the server's TLS certificate is verified.
            Defaults to ``False`` to preserve the previous behaviour; pass
            ``True`` whenever the cluster presents a trusted certificate.

    Returns:
        The configured ``OpenSearch`` client.
    """
    return OpenSearch(
        hosts=[cluster_url],
        http_auth=(username, password),
        verify_certs=verify_certs,
    )


# Re-creates the shared client with the same URL and credentials as the
# "Opensearch client" cell, so this cell rebinds `open_search_client`.
open_search_client: OpenSearch = get_open_search(
    OPENSEARCH_URL, OPENSEARCH_USERNAME, OPENSEARCH_PASSWORD
)


# Ask OpenSearch for every distinct (topic_title, file_url) pair in the index.
# "size": 0 suppresses the hits, so only the aggregation buckets come back.
results = open_search_client.search(
    index=INDEX_NAME,
    body={
        "size": 0,
        "aggs": {
            "distinct_sources": {
                "composite": {
                    "sources": [
                        {"topic_title": {"terms": {"field": "topic_title.keyword"}}},
                        {"file_url": {"terms": {"field": "file_url.keyword"}}},
                    ],
                    "size": 10000,
                }
            }
        },
    },
)

buckets = results["aggregations"]["distinct_sources"]["buckets"]
# Map each topic title to its file URL (a later bucket with the same title
# overwrites an earlier one).
buckets_topic_to_url = dict(
    (entry["key"]["topic_title"], entry["key"]["file_url"]) for entry in buckets
)
topic_list = list(buckets_topic_to_url)
# Human-readable, 1-based numbered menu of the topics.
topic_choices: str = "\n".join(
    f"{number}. {title}" for number, title in enumerate(topic_list, start=1)
)

In [None]:
# Show the numbered topic menu built in the previous cell.
print(topic_choices)

# Prompt handlers
## Topic selector

In [None]:
@dataclass
class TopicSelector:
    """Ask the chat model which indexed document best matches a question.

    Attributes:
        verbose: Verbosity flag; not read by any method of this class.
        header: Instruction placed at the top of the user prompt.
        system_prompt: System message sent with every request.
        topic_choices: Candidate topic titles. The model is asked to answer
            with a 1-based position in this tuple, or 0 for "none of them".
    """

    verbose: int = 0
    header: str = (
        "Pick an index of document that you think that it can help answer the following question or pick 0 if you think they are not helpful. Please answer only as a number and do not include prologue, prefix or suffix"
    )
    system_prompt: str = (
        "Pick a choice, please answer only a number and do not include prologue, prefix or suffix"
    )
    topic_choices: tuple = tuple(topic_list)

    def generate(self, prompt: str) -> str:
        """Send ``prompt`` to the chat model and return the complete reply text."""
        # Go through the client configured with OLLAMA_HOST so chat requests
        # reach the same server as the embedding requests.
        stream = ollama_client.chat(
            model=MODEL,
            messages=[
                {"role": "system", "content": self.system_prompt},
                {"role": "user", "content": prompt},
            ],
            stream=True,
        )
        try:
            return "".join(chunk["message"]["content"] for chunk in stream)
        finally:
            # Always release the streaming response, even if iteration failed.
            stream.close()

    def construct_prompt(self, question: str) -> str:
        """Build the user prompt: header, numbered topic list, then the question."""
        numbered_topics = "\n".join(
            f"{i}. {topic}" for i, topic in enumerate(self.topic_choices, 1)
        )
        return f"""{self.header}

# Available source:
{numbered_topics}

# question:
{question}
"""

    @retry(tries=RETRY_COUNT, exceptions=ValueError)
    def pick_a_choice(self, question) -> int:
        """Return the topic position chosen by the model for ``question``.

        Returns:
            0 when the model considers no topic helpful, otherwise a 1-based
            position into ``topic_choices``.

        Raises:
            ValueError: If, after ``RETRY_COUNT`` tries, the reply is still
                not an integer within ``0..len(topic_choices)``.
        """
        prompt = self.construct_prompt(question)
        # int() raises ValueError on a non-numeric reply, which triggers a retry.
        choice = int(self.generate(prompt))
        # Raise ValueError (not assert) so an out-of-range answer is retried
        # too, and so the check survives `python -O`.
        if not 0 <= choice <= len(self.topic_choices):
            raise ValueError(
                f"Invalid generated result {choice!r}, Regenerating..."
            )
        return choice

In [None]:
# Quick manual check: the cell output is the topic position picked for a sample question.
TopicSelector().pick_a_choice("I want to invest in real-estate")

## Query builder

In [None]:
## Query builder

@dataclass
class TextQueryBuilder:
    """Ask the chat model to turn a user question into an OpenSearch text query.

    Attributes:
        verbose: Verbosity flag; not read by any method of this class.
        header: Instruction placed at the top of the user prompt.
        system_prompt: System message sent with every request.
        topic_choices: Known topic titles; not read by any method of this
            class, kept so existing constructor calls keep working.
    """

    verbose: int = 0
    header: str = (
        "Based on the following question, what keywords should be queried in Opensearch"
    )
    system_prompt: str = (
        "We have an Opensearch instant storing docuements about code of conduct."
        " You are a data engineer who expertise Opensearch query."
        " Please suggest text query based on user's question"
        " return your answer only  and do not include prologue, prefix or suffix"
    )
    topic_choices: tuple = tuple(topic_list)

    def generate(self, prompt: str) -> str:
        """Send ``prompt`` to the chat model and return the complete reply text."""
        # Go through the client configured with OLLAMA_HOST so chat requests
        # reach the same server as the embedding requests.
        stream = ollama_client.chat(
            model=MODEL,
            messages=[
                {"role": "system", "content": self.system_prompt},
                {"role": "user", "content": prompt},
            ],
            stream=True,
        )
        try:
            return "".join(chunk["message"]["content"] for chunk in stream)
        finally:
            # Always release the streaming response, even if iteration failed.
            stream.close()

    def construct_prompt(self, question: str) -> str:
        """Build the user prompt: the header followed by the question."""
        return f"""{self.header}

# question:
{question}
"""

    @retry(tries=RETRY_COUNT)
    def build(self, question) -> str:
        """Return the query text suggested by the model for ``question``.

        Any exception raised while generating is retried up to
        ``RETRY_COUNT`` times before it propagates.
        """
        prompt = self.construct_prompt(question)
        return self.generate(prompt)


def get_topic(question: str, verbose: int = 0) -> Optional[str]:
    """Return the topic title the selector picks for ``question``.

    Args:
        question: The user's question.
        verbose: When truthy, print the decision that was taken.

    Returns:
        The selected topic title, or ``None`` when the selector answered that
        no topic is helpful.
    """
    topic_selected_index = topic_selector.pick_a_choice(question)
    # Anything below 1 means "no helpful topic"; checking `< 1` instead of
    # truthiness keeps a stray negative value from indexing from the end.
    if topic_selected_index < 1:
        if verbose:
            print("Provided sources are not seem related to the question")
        return None

    selected_topic = topic_selector.topic_choices[topic_selected_index - 1]
    if verbose:
        print(
            f'THE QUESTION: "{question}" \nSELECTED TOPIC: {topic_selected_index}. "{selected_topic}"\nFROM {buckets_topic_to_url[selected_topic]}'
        )
    return selected_topic

## Source summarizer

In [None]:
@dataclass
class SourceSummarizer:
    """Ask the chat model whether a passage helps answer a question.

    The model is instructed to reply with a JSON object holding
    ``"is_useful"`` and ``"summarize"``.

    Attributes:
        system_prompt: System message sent with every request.
    """

    system_prompt: str = (
        "You are an expert in lawfirm who are assigned to consider whether a text data source "
        "is useful to answer a user question or not. If yes, you will summarize the text "
        "which corespond user's question for another expert to write answer the user , otherwise, do nothing. "
        "You answer must be in JSON format with field:\n"
        '"is_useful": boolean determining whether the source is useful,\n'
        # The closing quote of the "summarize" key was missing, which showed
        # the model a malformed field name.
        '"summarize": string your summarization refering the part for the text or empty string if not useful\n'
        "    "
        " return your answer only  and do not include prologue, prefix or suffix"
    )

    def generate(self, prompt: str) -> str:
        """Send ``prompt`` to the chat model and return the complete reply text."""
        # Go through the client configured with OLLAMA_HOST so chat requests
        # reach the same server as the embedding requests.
        stream = ollama_client.chat(
            model=MODEL,
            messages=[
                {"role": "system", "content": self.system_prompt},
                {"role": "user", "content": prompt},
            ],
            stream=True,
        )
        try:
            return "".join(chunk["message"]["content"] for chunk in stream)
        finally:
            # Always release the streaming response, even if iteration failed.
            stream.close()

    def construct_prompt(self, question: str, text_source: str) -> str:
        """Build the user prompt: the source passage followed by the question."""
        return f"""# Source:
{text_source}
# question:
{question}
"""

    # json.JSONDecodeError is a subclass of ValueError, so malformed JSON and
    # well-formed JSON with the wrong shape are both retried.
    @retry(tries=RETRY_COUNT, exceptions=ValueError)
    def summarize(self, question: str, text_source: str) -> dict:
        """Return the model's verdict on ``text_source`` for ``question``.

        Returns:
            A dict with ``"is_useful"`` and, when that is truthy,
            ``"summarize"``.

        Raises:
            ValueError: If, after ``RETRY_COUNT`` tries, the reply is still
                not a JSON object of the expected shape
                (``json.JSONDecodeError`` for unparsable replies).
        """
        prompt = self.construct_prompt(question, text_source)
        reply = self.generate(prompt).strip()
        # Tolerate a reply wrapped in a Markdown code fence (```json ... ```).
        if reply.startswith("```"):
            reply = reply.strip("`").removeprefix("json").strip()
        parsed = json.loads(reply)
        if not isinstance(parsed, dict) or "is_useful" not in parsed:
            raise ValueError("Summarizer reply is not an object with 'is_useful'")
        if parsed["is_useful"] and "summarize" not in parsed:
            raise ValueError("Summarizer reply is missing 'summarize'")
        return parsed
    

## User interaction

In [None]:
@dataclass
class UserInteractor:
    """Write the final answer from the summarized contexts of one document.

    Attributes:
        system_prompt: System message sent with every request.
    """

    system_prompt: str = (
        "You are an humble expert in lawfirm, and your secretary already "
        "prepared gists from the related document for "
        "you to answer user's question. "
        "Your duty is to answer the question "
        "with confidence using the prepared "
        "data source as a reference."
        "Please also add the reference of data source with URL to PDF file with page number "
        "and encourage user to find out more information with it"
    )

    def generate(self, prompt: str, stream: bool = False):
        """Send ``prompt`` to the chat model.

        Returns:
            Whatever the chat call returns: a response whose text is at
            ``["message"]["content"]`` when ``stream`` is false, or an
            iterable of such chunks when ``stream`` is true.
        """
        # Go through the client configured with OLLAMA_HOST so chat requests
        # reach the same server as the embedding requests.
        return ollama_client.chat(
            model=MODEL,
            messages=[
                {"role": "system", "content": self.system_prompt},
                {"role": "user", "content": prompt},
            ],
            stream=stream,
        )

    def stream_text(
        self,
        generator: Generator[dict, None, None],
        additional_text: str = "",
    ) -> Generator[str, None, None]:
        """Yield the text of every chunk, then ``additional_text`` if non-empty."""
        for chunk in generator:
            yield chunk["message"]["content"]
        # Emit the trailer as one piece instead of one character at a time.
        if additional_text:
            yield additional_text

    def construct_prompt(
        self, question: str, topic: str, contexts: list[str], source_url: str
    ) -> str:
        """Build the user prompt from the question, contexts and source URL."""
        context_prompt = "- " + "\n- ".join(contexts)
        return f"""# question:
{question}

# Prepared data source:
Document: {topic}
{context_prompt}
URL: {source_url}
"""

    @staticmethod
    def _page_sort_key(context: str) -> tuple:
        """Sort key: the integer after the last space, unparsable entries last."""
        try:
            return (False, int(context.rsplit(" ", 1)[-1]))
        except ValueError:
            return (True, 0)

    def answer(
        self, question: str, topic: str, contexts: list[str], stream: bool = False
    ) -> Union[str, Generator[str, None, None]]:
        """Answer ``question`` and append a reference list.

        Args:
            question: The user's question.
            topic: Topic title; must be a key of ``buckets_topic_to_url``.
            contexts: Context lines, expected to end with a page number.
            stream: Return a generator of text pieces instead of one string.

        Returns:
            The answer followed by the references, as a string, or as a
            generator of string pieces when ``stream`` is true.
        """
        source_url = buckets_topic_to_url[topic]
        prompt = self.construct_prompt(question, topic, contexts, source_url)
        response = self.generate(prompt, stream)
        references = (
            "<br><br>**Reference**\n"
            f"> From: {source_url}"
            "\n* "
            + "\n* ".join(sorted(contexts, key=self._page_sort_key))
        )
        if not stream:
            return response["message"]["content"] + references

        return self.stream_text(response, references)

# Useful functions

In [None]:
def search_data_in_opensearch(
    query: str,
    search_method: Union[Literal["text"], Literal["vector"]],
    topic_title: Optional[str],
) -> dict:
    """Search ``INDEX_NAME`` for passages matching ``query``.

    Args:
        query: Text to search for.
        search_method: ``"text"`` for a ``match`` query on the ``text`` field,
            ``"vector"`` for a k-NN query (k=5) on the ``embedding`` field.
        topic_title: When not ``None``, only passages whose ``topic_title``
            matches are returned.

    Returns:
        The raw OpenSearch response.

    Raises:
        ValueError: If ``search_method`` is neither ``"text"`` nor ``"vector"``.
    """
    if search_method == "vector":
        # Embed the `query` argument (previously an unrelated module-level
        # `question` was embedded), and only when a vector search is requested.
        query_embedding = get_embedding(query, EMBEDDING_MODEL)
        must = [{"knn": {"embedding": {"vector": query_embedding, "k": 5}}}]
    elif search_method == "text":
        must = [{"match": {"text": {"query": query}}}]
    else:
        raise ValueError("Invalid search method")

    # The signature allows topic_title=None, so skip the topic clause then
    # instead of sending a match clause with a null query.
    if topic_title is not None:
        must.append({"match": {"topic_title": {"query": topic_title}}})

    query_body = {
        "query": {"bool": {"must": must}},
        # Return only the listed fields, not the whole stored document.
        "_source": False,
        "fields": ["id", "topic_title", "text", "file_url", "page_number"],
    }
    return open_search_client.search(body=query_body, index=INDEX_NAME)


def extract_search_results(
    search_results_raw: dict, limit: Optional[int] = None
) -> list[dict]:
    """Flatten the first hits of a raw OpenSearch response.

    Args:
        search_results_raw: Response containing ``["hits"]["hits"]``, each hit
            carrying ``fields`` with ``text``, ``topic_title``, ``file_url``
            and ``page_number`` lists.
        limit: Maximum number of hits to keep. Defaults to
            ``SELECT_TOP_RESULTS`` when omitted.

    Returns:
        One dict per kept hit with the keys ``text``, ``topic``, ``url`` and
        ``page``, each holding the first value of the matching field.
    """
    if limit is None:
        limit = SELECT_TOP_RESULTS
    kept_hits = search_results_raw["hits"]["hits"][:limit]
    flattened = []
    for hit in kept_hits:
        fields = hit["fields"]
        flattened.append(
            {
                "text": fields["text"][0],
                "topic": fields["topic_title"][0],
                "url": fields["file_url"][0],
                "page": fields["page_number"][0],
            }
        )
    return flattened


def summarize_into_contexts(
    source_summarizer: "SourceSummarizer",
    search_results: list,
    question: Optional[str] = None,
) -> list[str]:
    """Summarize each search result and keep the ones judged useful.

    Args:
        source_summarizer: Object whose ``summarize(question, text)`` returns
            a dict with ``"is_useful"`` and ``"summarize"``.
        search_results: Dicts with at least the keys ``"text"`` and ``"page"``.
        question: The question the summaries are written for. When omitted,
            the module-level ``question`` variable is used, which is what
            this function always did before the parameter existed.

    Returns:
        One line per useful result, formatted as ``"<summary>" - Page: <page>``.
    """
    if question is None:
        # Backward compatibility with callers that pass only two arguments.
        question = globals()["question"]

    contexts = []
    for search_result in search_results:
        summarized = source_summarizer.summarize(question, search_result["text"])
        if not summarized.get("is_useful"):
            continue
        summarized_text = summarized.get("summarize", "")
        if not summarized_text:
            # A "useful" verdict without a summary contributes no context.
            continue
        page = search_result["page"]
        contexts.append(f'"{summarized_text}" - Page: {page}')
    return contexts


def apologize(question: str, stream: bool = False) -> Union[str, Generator[dict, None, None]]:
    """Ask the chat model to apologize for not being able to answer.

    Args:
        question: The user's question that cannot be answered.
        stream: When true, return the streaming chat response instead of text.

    Returns:
        The apology text, or — when ``stream`` is true — the iterable of
        response chunks exactly as returned by the chat call.
    """
    # Go through the client configured with OLLAMA_HOST so chat requests
    # reach the same server as the embedding requests.
    response = ollama_client.chat(
        model=MODEL,
        messages=[
            # NOTE(review): this first message reads like a system
            # instruction but is sent with role "user" — confirm intent.
            {
                "role": "user",
                "content": (
                    "Apologize the inqueriing user "
                    "because we're don't have any information or duty to "
                    "answer user's question due to you are built for lawfirm tasks. "
                    "Remind the user that the topics available for you to answer are:\n"
                    # The trailing newline keeps the last topic from being
                    # glued to the following sentence.
                    f"{topic_choices}\n"
                    "If possible, suggest any another helpful resource that "
                    "he may find the answer."
                ),
            },
            {"role": "user", "content": f'Let the user know that you cannot the question "{question}" due to you don\'t have information which user just enqueried.'},
        ],
        stream=stream,
    )
    if not stream:
        return response["message"]["content"]
    return response


# Testing

In [None]:
# Module-level handler instances with default prompts; the functions in this
# notebook (get_topic, answer_the_question) read them as globals.
topic_selector = TopicSelector()
text_query_builder = TextQueryBuilder()
source_summarizer = SourceSummarizer()
user_interactor = UserInteractor()



def _collect_contexts(question: str, hits: list) -> list:
    """Summarize each distinct hit for ``question`` and keep the useful ones."""
    contexts = []
    seen = set()
    for hit in hits:
        # The text and vector searches can return the same passage; summarize
        # it only once to avoid a redundant LLM call and a duplicate reference.
        fingerprint = (hit["page"], hit["text"])
        if fingerprint in seen:
            continue
        seen.add(fingerprint)
        summary = source_summarizer.summarize(question, hit["text"])
        if summary["is_useful"]:
            # Must end with the page number: the reference list is sorted by it.
            contexts.append(f'"{summary["summarize"]}" - Page: {hit["page"]}')
    return contexts


def answer_the_question(question: str, stream: bool = False, debug: bool = False) -> Union[Generator[str, None, None], str]:
    """Run the responding flow to answer user's question.

    The flow is: pick a topic, build a text query, search by text and by
    vector, summarize the hits, then write the answer. An apology is
    returned when no topic fits or no hit is useful.

    Args:
        question (str): User's question
        stream (bool): Choose how to emitting the answer
        debug (bool): Print the intermediate steps while running
    Returns:
        - Generator[str, None, None] if stream = True (the apology path
          returns the chat stream of response chunks instead)
        - str if stream = False
    """
    topic_selected_index = topic_selector.pick_a_choice(question)
    if topic_selected_index < 1:
        # We don't have information to answer the question.
        return apologize(question, stream)
    topic_title = topic_selector.topic_choices[topic_selected_index - 1]
    if debug:
        print(f"{topic_title=}")
    query_text = text_query_builder.build(question)
    if debug:
        print(f"{query_text=}")

    # Query
    search_text_results_raw = search_data_in_opensearch(
        query_text, search_method="text", topic_title=topic_title
    )
    search_vector_results_raw = search_data_in_opensearch(
        query_text, search_method="vector", topic_title=topic_title
    )
    search_text_results = extract_search_results(search_text_results_raw)
    search_vector_results = extract_search_results(search_vector_results_raw)

    # Summarize useful resources
    if debug:
        print("summarizing contexts")
    # Summaries are produced here with this call's `question` passed
    # explicitly, so they never depend on a module-level variable.
    contexts = _collect_contexts(
        question, search_text_results + search_vector_results
    )
    if not contexts:
        # The retrieved resources are not useful.
        return apologize(question, stream)

    # Generate the answer
    if debug:
        print("start generate the answer")
    return user_interactor.answer(
        question, topic=topic_title, contexts=contexts, stream=stream
    )


def display_answer(answer: Union[Generator[str, None, None], str]):
    """Render an answer as Markdown in the notebook output.

    A plain answer is displayed directly. A generator is echoed piece by
    piece as it arrives (dict chunks contribute their
    ``["message"]["content"]``), then the echoed text is cleared and the
    accumulated answer is displayed as Markdown.
    """
    if not isinstance(answer, Generator):
        display(Markdown(answer))
        return

    collected = ""
    for piece in answer:
        text = piece["message"]["content"] if isinstance(piece, dict) else piece
        print(text, end="", flush=True)
        collected += text
    clear_output(wait=True)
    display(Markdown(collected))
        
        
# Display output


# Example
## Example 1

In [18]:
# Run the whole flow with streaming and debug prints enabled, then render the answer.
question: str = "I want to invest in real estates. What detail should I know?"
answer = answer_the_question(question, stream=True, debug=True)
display_answer(answer)

When investing in real estate, there are several critical details you should consider to ensure a successful and informed investment. Here’s an overview based on the provided document:

1. **Transaction History**: Understanding past transactions can provide insights into how properties have been managed and valued over time.

2. **Proposed Renovation Plans**: These plans are crucial for anticipating future improvements that could increase property value or attract better tenants, potentially increasing rental income.

3. **Operating Data**:
   - **Occupancy Rate**: High occupancy indicates stable cash flow.
   - **Tenant Mix**: Diversification of tenant types can mitigate risks if a particular industry experiences downturns.
   - **Lease Details**: Terms like duration, renewal options, and clauses for rent adjustments are important.

4. **Rental Data**:
   - **Average Rental per Square Foot**: This helps gauge the market value of the property relative to similar assets in the area.
   - **Lease Expirations**: Managing lease terminations can influence cash flow and planning future investments or renovations.

5. **Borrowing Policy**: The way a company finances its properties can affect its financial flexibility and stability, influencing dividends and overall investment performance.

6. **Risk Mitigation Measures**: Understanding how the entity manages risks such as market fluctuations, property damage, or regulatory changes is crucial for long-term planning.

7. **Dividend Policy**: Knowing how distributions are made from net income can help in planning returns on your investment.

8. **Insurance Arrangements**: Adequate insurance coverage protects against various risks and ensures that potential losses do not impact the investment significantly.

9. **Divestment Strategy**: Understanding the plan for selling assets can give insights into liquidity, which is essential when considering the timing of your investment exit.

10. **Administrative Requirements & Governing Laws**:
    - **Repossession and Tenant Protection**: These laws vary by region and impact your rights as an investor.
    - **Exchange Rate Fluctuations**: International investments are sensitive to exchange rate changes, affecting property valuation.
    - **Regulatory Frameworks for Overseas Properties**: Different jurisdictions have distinct rules regarding real estate investment.

11. **Property Valuation Factors**:
    - **Property Tax**: This can significantly impact the cost of ownership and cash flow.
    - **Depreciation & Amortization**: Understanding these accounting methods affects reported income and asset values.
    - **Qualified Minority-owned Property**: Check if investments comply with this requirement for specific benefits or regulations.

12. **Investment Structure**:
    - Ensure that your investment aligns with Hong Kong standards, especially regarding regulatory comparability when investing in overseas properties.
    - Review the minimum requirements for the percentage of gross asset value invested in real estate and ensure full disclosure from the management company.

13. **Details from Offering Document**: Look for specific information like the name, address, creation date of the scheme, investment policy, business plan, character of real estate investments, and competitive conditions to assess fit with your investment goals and risk tolerance.

To explore these points further or get a detailed understanding, I recommend reviewing the full document available [here](https://www.sfc.hk/-/media/EN/files/COM/Reports-and-surveys/REIT-Code_Aug2022_en.pdf?rev=572cff969fc344fe8c375bcaab427f3b). This resource provides comprehensive guidance on real estate investment trusts (REITs) and should be a valuable tool in your research.<br><br>**Reference**
> From: https://www.sfc.hk/-/media/EN/files/COM/Reports-and-surveys/REIT-Code_Aug2022_en.pdf?rev=572cff969fc344fe8c375bcaab427f3b
* "To invest in real estate, you need to know that assets must be disposed under normal market conditions, with transparent pricing. At least 75% of the gross asset value should be invested in real estate generating recurrent rental income consistently. The management company is required to publish the full investment portfolio of relevant real estate investments on their website monthly and include this information in annual and interim reports." - Page: 37
* "When considering investing in real estate, it's important to understand the details of property valuation including factors like property tax, depreciation, and amortization. Additionally, you should consider whether an investment in another listed real estate investment trust can be viewed as a Qualified Minority-owned Property. This depends on its structure, underlying investments, and regulatory regime comparability with Hong Kong standards. If the target's regulatory environment is similar to that of Hong Kong, such investments may not need to strictly adhere to all requirements set out by local codes." - Page: 43
* "This text details how a specific entity, through special purpose vehicles, distributes income from real estate investments according to local laws and regulations. It also mentions that distributions from minority-owned properties are part of the net income for distribution." - Page: 45
* "The text discusses administrative requirements, governing laws for repossession and tenant protection, exchange rate fluctuations impact on property valuation, regulatory frameworks for overseas properties, legal governance of conveyance in relevant overseas markets." - Page: 75
* "To invest in real estates, you should carefully review the following details listed in the offering document: 
- The name, registered address, and creation date of the scheme. 
- The investment policy and strategy of the scheme.
- The business plan for property investments, including how monies raised will be used and any intended type of real estate investments (residential, commercial, industrial).
- A discussion on the general character and competitive conditions of all real estate currently held or intended to be acquired by the scheme." - Page: 78
* "To invest in real estate, you should consider the transaction history, proposed renovation plans, operating data including occupancy rate and tenant mix, lease details, average rental per square foot, lease expirations, borrowing policy, risk mitigation measures, dividend policy, insurance arrangements, and divestment strategy." - Page: 79

## Example 2

In [19]:
# Run the whole flow with streaming and debug prints enabled, then render the answer.
question: str = "I want to design product about pooled Retirement funds, any suggestion?"
answer = answer_the_question(question, stream=True, debug=True)
display_answer(answer)

To design a product about pooled retirement funds, it's essential to have a thorough understanding of the regulatory framework and key terms associated with these types of investment vehicles. Here are some suggestions based on your prepared data source:

1. **Compliance**: Ensure that your product adheres to the Securities and Futures Ordinance (Cap. 571) in Hong Kong, which governs collective investment schemes like pooled retirement funds. This includes understanding the roles and responsibilities of 'principal brochure' or 'offering document', ensuring compliance with fair valuation, fee structures, investment strategy restrictions, and transfer/withdrawal policies.

2. **Authorization Process**: Review the process mentioned on pages 6 and 45 for obtaining authorization to operate a pooled retirement fund under section 399(1) of the Securities and Futures Ordinance (Cap. 571). This will help you understand the steps required for legal registration and ongoing compliance.

3. **Investment Objectives**: Clearly define your product's investment strategy, as outlined in Page 26. This includes specifying the percentage of total net asset value that must be invested according to indicated objectives or strategies. Ensure that these investments align with the regulatory requirements and cater to the risk profiles of potential investors.

4. **Regulatory Compliance**: Familiarize yourself with terms such as 'regulated activity' (referred on Page 12), which can guide you in structuring your product offerings without inadvertently violating Hong Kong's financial regulations.

5. **Product Offering Document**: Develop a comprehensive and clear principal brochure or offering document that includes all necessary information for investors, such as investment objectives, risk factors, fees, and any other details required by the regulatory body (as mentioned on Page 12).

6. **Penalties and Compliance with Advertisements**: Be aware of potential offenses related to unauthorized advertisements as per sections covered in your reference document. Ensure that marketing materials are compliant with regulatory guidelines.

7. **Product Code and Product Provider**: Understand how these terms apply within the context of pooled retirement funds (PRFs) as defined by the Hong Kong Securities and Futures Commission, which could influence branding and identification strategies for your product.

For a more in-depth understanding and to ensure all aspects of your product design are thoroughly compliant with the law, I encourage you to download the full document provided below:
URL: [https://www.sfc.hk/-/media/EN/assets/components/codes/files-current/web/codes/code-on-pooled-retirement-funds/code-on-pooled-retirement-funds.pdf?rev=9badf81950734ee08c799832be6ff92b](https://www.sfc.hk/-/media/EN/assets/components/codes/files-current/web/codes/code-on-pooled-retirement-funds/code-on-pooled-retirement-funds.pdf?rev=9badf81950734ee08c799832be6ff92b)

This resource should provide a comprehensive guide to help you navigate the complexities of designing a pooled retirement fund product in Hong Kong, ensuring legal and regulatory compliance.<br><br>**Reference**
> From: https://www.sfc.hk/-/media/EN/assets/components/codes/files-current/web/codes/code-on-pooled-retirement-funds/code-on-pooled-retirement-funds.pdf?rev=9badf81950734ee08c799832be6ff92b
* "This source provides guidance on the authorization of a collective investment scheme that is a pooled retirement fund under the Securities and Futures Ordinance (Cap. 571) in Hong Kong. It covers the powers of the Securities and Futures Commission to authorize, review, modify or withdraw such funds' authorizations and penalties for unauthorized advertisements related to these funds." - Page: 6
* "This text provides guidance on the authorization process of a collective investment scheme that is a pooled retirement fund under section 399(1) of the Securities and Futures Ordinance (Cap. 571). It also mentions potential offenses related to unauthorized advertisements for such funds." - Page: 6
* "The text defines key terms related to Pooled Retirement Funds (PRFs) in the context of Hong Kong's financial regulations. Terms such as 'principal brochure' or 'offering document', 'Product Code', and 'Product Provider' are explained, which could be helpful for designing a product about PRFs." - Page: 12
* "The text provides definitions related to pooled retirement funds in Hong Kong, including terms like 'pooled retirement fund' (PRF), 'principal brochure', and mentions codes and providers regulated by the Commission. It also refers to the meaning of 'regulated activity'. These definitions can provide a foundational understanding for designing a product about pooled retirement funds." - Page: 12
* "This text provides guidelines for named constituent and pooled investment funds regarding the percentage of total net asset value they must invest according to their indicated objectives or strategies. It also specifies restrictions on naming when investing in certain types of funds. These rules could be relevant for designing a product about pooled retirement funds, particularly concerning branding and investment strategy clarity." - Page: 26
* "The document outlines various aspects that are crucial for understanding and designing a pooled Retirement fund product, including fair valuation, fees, investment strategy and restrictions, termination conditions, and transfer or withdrawal policies. These points can be essential reference for creating a comprehensive and compliant retirement fund." - Page: 45

## Example 3 (Unrelated question)

In [22]:
# Unrelated question: checks what the flow returns when the question is off-topic.
question: str = "How to cook fried chicken?"
answer = answer_the_question(question, stream=True, debug=True)
display_answer(answer)

I'm sorry, but I can't assist with that. It seems like there might be a misunderstanding as I'm an AI text-based model and currently unable to provide or look up specific recipes, cooking instructions, or answer questions about preparing food like fried chicken. However, you might find plenty of helpful resources online by searching on platforms such as YouTube, food blogs, or websites dedicated to recipes where many detailed guides on how to cook fried chicken are available.

If you need help with another topic or have any other questions, feel free to ask!