###############################################################################
# SRSWTISearchEngine
###############################################################################

In [None]:
# ---------------------------  Imports ------------------------------
from srswti_axis import SRSWTISearchEngine
from utils import *

  from .autonotebook import tqdm as notebook_tqdm
2025-03-10 20:12:43.462643: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1741617763.481219 1096918 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1741617763.486697 1096918 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-10 20:12:43.506719: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
INFO:datasets:PyTorch version 2.5.0 available.
INFO:datasets:TensorFlow ver

# ---------------------------  Basic Demos ------------------------


In [2]:
search_engine = SRSWTISearchEngine()
index_docs = [
    "Advances in AI for drug discovery",
    "AI ethics and regulatory considerations",
    "Computer vision in autonomous vehicles",
    "Neural style transfer for digital art"
]

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cuda
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-mpnet-base-v2
INFO:SRSWTI-IR:SRSWTI Search Engine initialized successfully


In [None]:
# Perform hybrid search combining BM25, semantic and proximity scoring
# Query "AI regulation" is searched against our index_docs collection
search_res_1 = search_engine.hybrid_search("AI regulation", index_docs)
print("== SearchEngine 1 ==")
# Results will be returned as a list of (document, score) tuples
# where documents are ranked by relevance to the query
print(search_res_1, "\n")

Batches: 100%|██████████| 1/1 [00:00<00:00,  2.85it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 82.20it/s]

== SearchEngine 1 ==
[(np.int64(1), 0.8807970779778823), (np.int64(0), 0.3946837275773923), (np.int64(2), 0.062380325510358256), (np.int64(3), 0.05960146101105877)] 






In [None]:
# Define custom weights for the hybrid search components
# - bm25: Term-based relevancy scoring (70% weight)
# - semantic: Meaning-based matching (20% weight)
# - proximity: Term closeness in document (10% weight)
custom_weights = {'bm25':0.7, 'semantic':0.2, 'proximity':0.1}

# Perform hybrid search with custom weights on query "AI for art"
# This gives us more control over which search aspects we prioritize
search_res_2 = search_engine.hybrid_search("AI for art", index_docs, weights=custom_weights)

# Display the search results with custom weights
print("== SearchEngine 2 ==")
print(search_res_2, "\n")

Batches: 100%|██████████| 1/1 [00:00<00:00, 49.57it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 60.04it/s]
ERROR:SRSWTI-IR:Error in hybrid search: 'cross_encoder'


== SearchEngine 2 ==
[(0, 0.0), (1, 0.0), (2, 0.0), (3, 0.0)] 



# ------------------- RAG Demos ---------------------


In [None]:

def rag_demo_a(user_query, docs=SAMPLE_DOCS):
    """
    Demonstrates a RAG (Retrieval-Augmented Generation) system using hybrid search.
    
    This function retrieves the most relevant document for the user query using 
    hybrid search, then passes the query and retrieved document to an LLM to 
    generate a response.
    
    Args:
        user_query (str): The question or query from the user
        docs (list): Collection of documents to search through (default: SAMPLE_DOCS)
        
    Returns:
        None: Prints the LLM response to the console
    """
    if not docs:
        print("[Theorem12 RAG Demo A] No docs.")
        return
    res = search_engine.hybrid_search(user_query, docs)
    if res:
        top_doc, score = max(res, key=lambda x: x[1])
    else:
        top_doc, score = "No doc", 0
    prompt = f"User query: {user_query}\nTop doc: {top_doc} (score={score})\nAnswer the query."
    resp = call_groq_llm(prompt)
    print("[Theorem12 RAG Demo A] LLM:\n", resp, "\n")

def rag_demo_b(user_query, docs=SAMPLE_DOCS):
    """
    Alternative RAG implementation focusing on document relevance explanation.
    
    This function retrieves the most relevant document for the user query,
    then asks the LLM to specifically explain how the document relates to 
    the user's query.
    
    Args:
        user_query (str): The question or query from the user
        docs (list): Collection of documents to search through (default: SAMPLE_DOCS)
        
    Returns:
        None: Prints the LLM response to the console
    """
    if not docs:
        print("[Theorem12 RAG Demo B] No docs.")
        return
    res = search_engine.hybrid_search(user_query, docs)
    if res:
        top_doc, sc = max(res, key=lambda x: x[1])
    else:
        top_doc, sc = "No doc", 0
    prompt = (
        f"User query: {user_query}\n \nTop doc: {top_doc}\n"
        "Explain how it addresses the user query."
    )
    resp = call_groq_llm(prompt)
    print("[Theorem12 RAG Demo B] LLM:\n", resp, "\n")


In [None]:
rag_demo_a("blockchain for medical records", SAMPLE_DOCS)

Batches: 100%|██████████| 1/1 [00:00<00:00, 51.32it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 23.31it/s]


[Theorem12 RAG Demo A] LLM:
 [Groq LLM] The use of blockchain technology for medical records is a growing trend in the healthcare industry. Blockchain, also known as distributed ledger technology, allows for secure, transparent, and tamper-proof storage and sharing of medical records.

Here are some key benefits of using blockchain for medical records:

1. **Security**: Blockchain technology uses advanced cryptography to secure medical records, making it difficult for unauthorized parties to access or alter them.
2. **Interoperability**: Blockchain enables the creation of a unified, decentralized system for storing and sharing medical records, making it easier for healthcare providers to access and share patient information.
3. **Data integrity**: Blockchain ensures that medical records are accurate and tamper-proof, reducing the risk of errors or alterations.
4. **Patient control**: Blockchain gives patients control over their medical records, allowing them to grant access to specific

In [None]:
rag_demo_b("crypto", SAMPLE_DOCS)

Batches: 100%|██████████| 1/1 [00:00<00:00, 90.28it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 23.83it/s]


[Theorem12 RAG Demo B] LLM:
 [Groq LLM] The top doc with a score of 2 likely addresses the user query "crypto" by providing information related to cryptocurrency. Here's how it might address the query:

1. **Relevance**: The document probably contains relevant information about cryptocurrency, such as its definition, types (e.g., Bitcoin, Ethereum), uses, and trends.
2. **Key terms**: The document may include key terms related to cryptocurrency, like "blockchain," "mining," "wallet," and "exchange," which are essential concepts in the crypto space.
3. **Introduction or overview**: The document might serve as an introduction or overview of the crypto world, covering its history, benefits, and risks, making it a useful resource for users new to cryptocurrency.
4. **Answers to common questions**: The document may answer common questions about crypto, such as "What is cryptocurrency?", "How does it work?", or "Is it a good investment?", providing a solid foundation for users to understand 