# Set up environment

In [1]:
from dotenv import load_dotenv

# Load variables from a .env file into os.environ; override=True lets values
# from the file replace variables that are already set in the environment.
load_dotenv(override=True)

In [2]:
import os


# OpenSearch connection settings are read from the environment; indexing
# os.environ directly raises KeyError as soon as one of them is missing.
OPENSEARCH_USERNAME = os.environ["OPENSEARCH_USERNAME"]
OPENSEARCH_PASSWORD = os.environ["OPENSEARCH_PASSWORD"]
OPENSEARCH_URL = os.environ["OPENSEARCH_URL"]
# NOTE(review): presumably set to silence the tokenizers fork/parallelism
# warning from an embedding dependency — confirm it is still needed.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Useful functions

In [3]:
from opensearchpy import OpenSearch
import ollama
from ollama import Client
import requests


# Scrape and URL loader
from utils.sfc_look_up import extract_table, headers

# Opensearch
from opensearchpy.exceptions import RequestError, NotFoundError
import pdfplumber
from typing import Optional
import io
from tqdm import tqdm
import string


def get_open_search(cluster_url: str, username: str, password: str):
    """Build an OpenSearch client for a single cluster using basic auth.

    Args:
        cluster_url (str): URL of the cluster; passed as the only host.
        username (str): User name for HTTP authentication.
        password (str): Password for HTTP authentication.

    Returns:
        The constructed ``OpenSearch`` client.
    """
    credentials = (username, password)
    # NOTE: certificate verification is switched off (verify_certs=False).
    return OpenSearch(
        hosts=[cluster_url],
        http_auth=credentials,
        verify_certs=False,
    )


def get_embedding(text: str, embedding_model: str) -> list[float]:
    """Embed ``text`` with the given Ollama embedding model.

    Args:
        text (str): Text to embed.
        embedding_model (str): Name of the Ollama model to use.

    Returns:
        list[float]: The value stored under ``"embedding"`` in the response
        returned by the module-level ``ollama_client``.
    """
    # Honour the caller-supplied model instead of the notebook-level
    # EMBEDDING_MODEL constant, which is only defined in a later cell.
    response = ollama_client.embeddings(
        model=embedding_model,
        prompt=text,
    )
    return response["embedding"]


def create_index(
    index_name: str, embedding_dim: int, force_create: bool = False
) -> bool:
    """Create a new OpenSearch k-NN index.

    The index has a single mapped field, ``embedding``, stored as a
    ``knn_vector`` (HNSW, l2 distance, nmslib engine).

    Args:
        index_name (str): Index name
        embedding_dim (int): Dimension of the embedding vector
        force_create (bool): If True, an existing index with the same name is
            deleted first so that a fresh one is created. If False, an
            existing index is left untouched and the resulting
            "already exists" error is raised to the caller.
    Raises:
        RequestError: If index creation fails, or if the index already exists
            and ``force_create`` is False

    Returns:
        bool: Create succeeded
    """

    # Only drop an existing index when the caller explicitly asked for it;
    # deleting unconditionally would silently destroy data.
    if force_create:
        try:
            open_search_client.indices.delete(index=index_name)
        except NotFoundError:
            # Nothing to delete: the index does not exist yet.
            pass

    index_body = {
        "settings": {"index": {"knn": True, "knn.algo_param.ef_search": 100}},
        "mappings": {
            "properties": {
                "embedding": {
                    "type": "knn_vector",
                    "dimension": embedding_dim,
                    "method": {
                        "name": "hnsw",
                        "space_type": "l2",
                        "engine": "nmslib",
                        "parameters": {"ef_construction": 128, "m": 24},
                    },
                }
            }
        },
    }
    try:
        open_search_client.indices.create(index=index_name, body=index_body)
    except RequestError as err:
        # The second positional argument of the exception carries the error
        # type string reported by the cluster.
        error_notice = err.args[1] if len(err.args) > 1 else None
        already_exists = error_notice == "resource_already_exists_exception"
        if not already_exists or not force_create:
            # Bare raise re-raises the caught error with its traceback intact.
            raise
        print(f"Index {index_name} already exists")
    return True


def read_pdf_given_url(
    url: str, timeout: Optional[float] = 60.0
) -> pdfplumber.pdf.PDF:
    """Download a PDF from an accessible URL and open it with pdfplumber.

    Args:
        url (str): Location of the PDF file.
        timeout (Optional[float]): Seconds passed to ``requests.get`` as its
            timeout, so that an unresponsive server cannot stall the caller
            forever. Pass None to wait indefinitely.

    Raises:
        requests.HTTPError: Raised by ``raise_for_status`` for an error
            status code.

    Returns:
        pdfplumber.pdf.PDF: The PDF opened from an in-memory buffer holding
        the downloaded bytes.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    # Building the buffer straight from the bytes leaves its cursor at offset
    # 0; writing into an empty buffer would leave it positioned at the end.
    return pdfplumber.open(io.BytesIO(response.content))

# Set up clients

In [4]:
# Shared clients: the helper functions defined above look these names up as
# module-level globals.
open_search_client: OpenSearch = get_open_search(
    cluster_url=OPENSEARCH_URL,
    username=OPENSEARCH_USERNAME,
    password=OPENSEARCH_PASSWORD,
)
ollama_client: ollama.Client = ollama.Client(host="http://localhost:11434")

# Configurable parameters

In [5]:
# Embedding model served by Ollama.
EMBEDDING_MODEL = "all-minilm"  # Better for KNN
# EMBEDDING_MODEL = "mxbai-embed-large"

# The vector dimension is measured by embedding a sample sentence once, so it
# always matches whichever model is selected above.
EMBEDDING_DIM = len(
    get_embedding("Llamas are members of the camelid family", EMBEDDING_MODEL)
)

# Preprocessed text goes to its own index so both variants can coexist.
PREPROCESS = False
INDEX_NAME = "sfc_code_preprocess" if PREPROCESS else "sfc_code"

# Chunking and page-filter thresholds (all counted in characters or words).
CHUNK_SIZE = 1024
MINIMUM_TEXT_LENGTH_PER_PAGE = 32
MINIMUM_WORD_COUNT_PER_PAGE = 16

In [None]:
# Show which index the cells below will write to and query.
print(INDEX_NAME)

# Data ingestion

In [6]:
# (Re)build the target index before any document is loaded.
create_result = create_index(
    index_name=INDEX_NAME,
    embedding_dim=EMBEDDING_DIM,
    force_create=True,
)
if create_result:
    print(f"Successfully create a new index {INDEX_NAME}")

# Collect the documents to ingest; each entry is read below through its
# "title" and "url" keys.
print("Scrape PDF files")
table_of_contents = extract_table()


def skip(text: str) -> bool:
    """Decide whether a page's text should be left out of the index.

    A page is skipped when it has no text, is shorter than
    ``MINIMUM_TEXT_LENGTH_PER_PAGE`` characters, has fewer than
    ``MINIMUM_WORD_COUNT_PER_PAGE`` whitespace-separated words, or carries the
    "This page is intended to be blank" notice.

    Args:
        text (str): Input text
    Returns:
        bool: True if the page should be skipped, False otherwise
    """
    # Guard against an empty (or missing) extraction result up front.
    if not text:
        return True
    if len(text) < MINIMUM_TEXT_LENGTH_PER_PAGE:
        return True
    if len(text.split()) < MINIMUM_WORD_COUNT_PER_PAGE:
        return True
    # Always return a real bool rather than falling off the end with None.
    return "This page is intended to be blank" in text

def preprocess_text(text: str) -> str:
    """Normalise page text before it is chunked and embedded.

    Removes every character in ``string.punctuation``, then collapses runs of
    spaces into one space and runs of newlines into one newline.

    Args:
        text (str): Raw text
    Returns:
        str: Cleaned text
    """
    # str methods return new strings, so the result has to be assigned back;
    # translate() drops all punctuation characters in a single pass.
    text = text.translate(str.maketrans("", "", string.punctuation))
    # Collapse whitespace after punctuation removal, since deleting a
    # character such as "-" in "a - b" leaves two adjacent spaces behind.
    # Looping handles runs of any length.
    while "  " in text:
        text = text.replace("  ", " ")
    while "\n\n" in text:
        text = text.replace("\n\n", "\n")
    return text

print("Start ingestion")
for content in tqdm(table_of_contents):
    # Extract metadata
    topic_title = content["title"]
    file_url = content["url"]

    # Read file; the context manager closes the PDF once its pages have been
    # processed, so memory does not accumulate across documents.
    with read_pdf_given_url(file_url) as pdf:
        # Extract pdf data
        for page in pdf.pages:
            page_number = page.page_number
            text = page.extract_text_simple()
            if PREPROCESS:
                text = preprocess_text(text)
            if skip(text):
                continue

            # Split the page into fixed-size character windows; chunk_index
            # counts windows, including any that turn out to be blank.
            for chunk_index, start in enumerate(range(0, len(text), CHUNK_SIZE)):
                chunk_text = text[start : start + CHUNK_SIZE].strip()
                if not chunk_text:
                    continue

                doc = {
                    "file_url": file_url,
                    "topic_title": topic_title,
                    "page_number": page_number,
                    "chunk_index": chunk_index,
                    "text": chunk_text,
                    "embedding": get_embedding(chunk_text, EMBEDDING_MODEL),
                }
                # A deterministic id means re-running the cell overwrites
                # existing chunks instead of duplicating them.
                open_search_client.index(
                    index=INDEX_NAME,
                    body=doc,
                    id=f"{topic_title}_{page_number}_{chunk_index}",
                )

# Refresh once at the end rather than after every chunk, so all documents are
# searchable by the next cell without a per-document refresh.
open_search_client.indices.refresh(index=INDEX_NAME)

Successfully create a new index sfc_code
Scrape PDF files
Start ingestion


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15/15 [03:05<00:00, 12.35s/it]


In [7]:
user_query = "Share Buy-backs condition"
# Restrict results to chunks whose topic title matches this document.
topic_title_filter = "The Codes on Takeovers and Mergers and Share Buy-backs"

query_embedding = get_embedding(user_query, EMBEDDING_MODEL)

# Both clauses must match: a k-NN search on the embedding and a text match on
# the topic title. To additionally require a lexical match on the chunk text,
# add {"match": {"text": {"query": user_query}}} to the "must" list.
query_body = {
    "query": {
        "bool": {
            "must": [
                {"knn": {"embedding": {"vector": query_embedding, "k": 50}}},
                {"match": {"topic_title": {"query": topic_title_filter}}},
            ]
        }
    },
    # Return only the listed fields instead of the full stored document.
    "_source": False,
    "fields": ["id", "topic_title", "text", "file_url"],
}

# Run the search once and display the stored response, rather than issuing
# the same request a second time just to show it.
results = open_search_client.search(body=query_body, index=INDEX_NAME)
results

{'took': 3,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 176, 'relation': 'eq'},
  'max_score': 10.819632,
  'hits': [{'_index': 'sfc_code',
    '_id': 'The Codes on Takeovers and Mergers and Share Buy-backs_185_0',
    '_score': 10.819632,
    'fields': {'file_url': ['https://www.sfc.hk/-/media/EN/assets/components/codes/files-current/web/codes/the-codes-on-takeovers-and-mergers-and-share-buy-backs/English-TO-Code-20230928-FINAL.pdf?rev=ce39f30e56784e6ea97fd512e1bfc9cd'],
     'topic_title': ['The Codes on Takeovers and Mergers and Share Buy-backs'],
     'text': ['RULE 32 \n32.   Share buy-backs  \n32.1   Code implications of share buy-backs  \nIf as a result of a share buy-back a shareholder’s proportionate interest in the voting \nrights of  the  repurchasing company increases, such increase will be treated as an \nacquisition of voting rights for purposes of the Takeovers Code. As a result, a shareholder, \