In [6]:
import os
import json
import time
from pprint import pprint

import requests
import meilisearch
from bs4 import BeautifulSoup
from nltk import sent_tokenize
from tqdm.notebook import tqdm
from fastembed.embedding import FlagEmbedding

In [2]:
# Base URL of the MeiliSearch server that QuranIndexer connects to.
MEILI_SEARCH_URL = 'http://127.0.0.1:7700'
# Name of the MeiliSearch index that holds the verse documents.
INDEX_NAME = 'Sahih-International-English-Interpretation-of-the-Holy-Quran'
# Path of the XML file read by QuranIndexer.load_xml_to_index.
XML_PATH = 'en.sahih.xml'
# Base URL of the chat-completions API called by get_response_from_model.
API_BASE = "https://api.endpoints.anyscale.com/v1"
# Bearer token for that API; raises KeyError here if the variable is not set.
TOKEN = os.environ['ANYSCALE_AUTH_TOKEN']

In [37]:
class QuranIndexer:
    """
    Index, search, and rank Quran verses with MeiliSearch and sentence embeddings.

    The class relies on the module-level constants MEILI_SEARCH_URL, INDEX_NAME
    and XML_PATH. Verses are stored as documents whose id is
    "<sura_index>-<aya_index>".
    """

    def __init__(self):
        """
        Create the MeiliSearch client, enable the experimental features the
        indexer needs, and load the "BAAI/bge-small-en-v1.5" embedding model.
        """
        self.client = meilisearch.Client(MEILI_SEARCH_URL)
        self._patch_meilisearch_features()
        self.embedding_model = FlagEmbedding(
            model_name="BAAI/bge-small-en-v1.5", max_length=512)

    def _patch_meilisearch_features(self):
        """
        Send a PATCH request to MeiliSearch enabling the "scoreDetails" and
        "vectorStore" experimental features.

        Returns:
        - requests.Response: The raw response, whatever its status code.
        """
        url = f"{MEILI_SEARCH_URL}/experimental-features/"
        headers = {
            "content-type": "application/json"
        }
        data = {
            "scoreDetails": True,
            "vectorStore": True
        }

        # A timeout keeps the constructor from hanging forever when the server
        # accepts the connection but never answers.
        response = requests.patch(
            url, headers=headers, data=json.dumps(data), timeout=30)

        if response.status_code == 200:
            print("Successfully updated experimental features in MeiliSearch!")
        else:
            print(f"Failed to update features. Response: {response.text}")
        return response

    def read_quran_from_bs(self, xml_string: str) -> list:
        """
        Parse Quran XML content and build one document per verse.

        Each verse text is split into sentences, and every sentence is embedded
        separately, so a document carries a list of vectors.

        Args:
        - xml_string (str): The XML content of the Quran.

        Returns:
        - list: List of dicts with keys 'id', 'sura_index', 'sura_name',
          'aya_index', 'text' and '_vectors' for each verse.
        """
        soup = BeautifulSoup(xml_string, features="xml")
        aya_list = []

        # Extract information for each verse
        for sura in tqdm(soup.find_all('sura')):
            sura_index = sura['index']
            sura_name = sura['name']

            for aya in sura.find_all('aya'):
                aya_segments = sent_tokenize(aya['text'])
                vectors = self.embedding_model.embed(aya_segments)

                aya_data = {
                    'id': f"{sura_index}-{aya['index']}",
                    'sura_index': sura_index,
                    'sura_name': sura_name,
                    'aya_index': aya['index'],
                    'text': aya['text'],
                    '_vectors': [vec.tolist() for vec in vectors],
                }
                aya_list.append(aya_data)

        return aya_list

    def load_xml_to_index(self):
        """
        Read XML_PATH, convert it to verse documents, and submit them to the
        MeiliSearch index named INDEX_NAME.
        """
        with open(XML_PATH, 'r', encoding='utf-8') as file:
            xml_content = file.read()

        # Parsing and embedding happen after the file is closed; only the read
        # needs the handle.
        quran_data = self.read_quran_from_bs(xml_content)

        index = self.client.index(INDEX_NAME)
        index.add_documents(quran_data)

    def search(self, query: str) -> dict:
        """
        Search the Quran verses based on a given query.

        Runs a vector search and a keyword search (5 hits each), reranks the
        vector hits, and expands each hit with its neighbouring verses.

        Args:
        - query (str): The search query.

        Returns:
        - dict: Dictionary with indexed verse IDs and corresponding texts.
        """
        search_vector = list(self.embedding_model.embed(query))[0].tolist()

        vector_search_opts = {
            'vector': search_vector,
            "limit": 5,
            'showRankingScore': True,
            'attributesToRetrieve': ['text', 'id']
        }
        keyword_search_opts = {
            "limit": 5,
            'showRankingScore': True,
            'attributesToRetrieve': ['text', 'id']
        }

        vector_search_results = self.client.index(INDEX_NAME).search('', vector_search_opts)
        keyword_search_results = self.client.index(INDEX_NAME).search(query, keyword_search_opts)

        reranked_results = self._rerank(keyword_search_results, vector_search_results)
        return self._augment_results(reranked_results)

    def _rerank(self, keyword_results, vector_results):
        """
        Order the vector hits by score, boosting those also found by keyword.

        Args:
        - keyword_results (dict): Search response whose "hits" carry "text".
        - vector_results (dict): Search response whose "hits" carry "text",
          "_semanticScore" and "id".

        Returns:
        - list: (text, score, id) tuples for the vector hits, sorted by
          descending score. A hit whose text also appears among the keyword
          hits has its score doubled. Keyword-only hits are not included.
        """
        # A set gives constant-time membership checks.
        keyword_texts = {result["text"] for result in keyword_results["hits"]}

        reranked_results = []
        for result in vector_results["hits"]:
            score = result["_semanticScore"]
            if result["text"] in keyword_texts:
                score *= 2
            reranked_results.append((result["text"], score, result["id"]))

        return sorted(reranked_results, key=lambda x: x[1], reverse=True)

    def _augment_results(self, results):
        """
        Expand every hit with the verses around it in the same sura.

        For a hit at aya N the window starts at max(1, N - 5) and spans 11
        consecutive aya numbers.

        Args:
        - results (list): (text, score, id) tuples as returned by _rerank,
          where id has the form "<sura_index>-<aya_index>".

        Returns:
        - dict: Maps document id to verse text, in retrieval order. IDs that
          could not be fetched are omitted.
        """
        augmented_results = {}
        num_of_surrounding_ayas = 5
        num_of_ayas_to_retrieve = 11
        index = self.client.index(INDEX_NAME)

        for result in results:
            sura_index, aya_index = result[2].split('-')
            first_aya = max(1, int(aya_index) - num_of_surrounding_ayas)

            for aya_number in range(first_aya, first_aya + num_of_ayas_to_retrieve):
                doc_id = f"{sura_index}-{aya_number}"
                if doc_id in augmented_results:
                    # Windows of nearby hits overlap; do not fetch twice.
                    continue
                try:
                    document = index.get_document(doc_id, {"fields": ["text"]})
                except meilisearch.errors.MeilisearchApiError as e:
                    # A window may extend past the last aya of a sura, so a
                    # missing document is expected and skipped quietly. Any
                    # other API error is reported instead of being discarded.
                    if getattr(e, "code", None) != "document_not_found":
                        print(f"Error retrieving document with ID {doc_id}: {e}")
                    continue

                # Accept both a plain dict and an object exposing `.text`.
                if isinstance(document, dict):
                    augmented_results[doc_id] = document["text"]
                else:
                    augmented_results[doc_id] = document.text

        return augmented_results

def generate_prompt(query: str, search_results: dict) -> str:
    """
    Build the LLM prompt for a question and the verses retrieved for it.

    Args:
    - query (str): The user's question.
    - search_results (dict): Maps verse ID to verse text.

    Returns:
    - str: The prompt, with one "<id>: <text>" line per verse embedded in it.
    """
    verse_lines = []
    for verse_id, verse_text in search_results.items():
        verse_lines.append(f"{verse_id}: {verse_text}")
    verses_block = "\n".join(verse_lines)

    return (
        f"I have this query: <{query}> about the Holy Quran. "
        f"Please answer based solely on the following verses: {verses_block}. "
        "If the answer is not contained within these verses, please state so. "
        "Give the caveat that the answer is based on the verses and not on the entire Quran."
    )

def get_response_from_model(prompt: str, timeout: float = 120.0) -> str:
    """
    Get a response from the chat-completions endpoint for the given prompt.

    The request carries the prompt as the user message, followed by a
    pre-filled assistant message that steers the model toward a concise
    conclusion.

    Args:
    - prompt (str): The generated prompt.
    - timeout (float): Seconds to wait for the HTTP response before giving up.

    Returns:
    - str: The content of the first choice in the model's response.

    Raises:
    - requests.HTTPError: If the API answers with an error status code.
    - requests.Timeout: If no response arrives within `timeout` seconds.
    """
    url = f"{API_BASE}/chat/completions"
    body = {
        "model": "meta-llama/Llama-2-70b-chat-hf",
        "messages": [
            {"role": "user", "content": prompt},
            {"role": "assistant", "content": "Here is my concise conclusion based on the verses above:"}
        ],
        "temperature": 0.1,
    }

    response = requests.post(
        url,
        headers={"Authorization": f"Bearer {TOKEN}"},
        json=body,
        timeout=timeout,
    )
    # Fail with the HTTP error itself rather than a KeyError on "choices"
    # when the API rejects the request.
    response.raise_for_status()
    return response.json()["choices"][0]["message"]["content"]

In [38]:
# Create the indexer (this enables the MeiliSearch experimental features and
# loads the embedding model), then parse XML_PATH and submit every verse to
# the index.
indexer = QuranIndexer()
indexer.load_xml_to_index()

Successfully updated experimental features in MeiliSearch!


In [7]:
# NOTE(review): fixed delay chosen by hand; presumably document ingestion is
# asynchronous on the server, so a slower machine may need longer. Consider
# polling the indexing task status instead of sleeping. TODO confirm.
time.sleep(130) # wait for MeiliSearch to finish indexing

In [42]:
# Run the hybrid search for a sample question; the result maps verse IDs
# ("<sura>-<aya>") to verse texts, including the neighbouring verses.
query = "what is the goal of life"
search_results = indexer.search(query)
print(search_results)

In [40]:
# Turn the question and retrieved verses into a prompt and send it to the
# chat-completions API (network call; requires TOKEN to be valid).
prompt = generate_prompt(query, search_results)
response_content = get_response_from_model(prompt)

In [41]:
# Flatten the answer to a single line for display. Note that this rebinds
# response_content, so the original line breaks are lost unless the previous
# cell is re-run.
response_content = response_content.replace("\n", " ").strip()
pprint(response_content)

('The goal of life, according to the Holy Quran, is to worship Allah and lead '
 'a righteous life. Allah created humans and gave them the gift of life so '
 'that they might worship and obey Him. The Quran emphasizes that genuine '
 "happiness and success can only be attained by adhering to Allah's teachings "
 'and commandments.  Allah is described in the Quran as the sole God, the '
 'Creator, and the Sustainer of the Universe. He is the one who gives life and '
 'death and will judge everyone on the Day of Resurrection. The Quran teaches '
 'that people should have faith in Allah and His prophets and messengers, '
 'particularly Prophet Muhammad (peace be upon him).  Furthermore, the Quran '
 'emphasizes the significance of leading a righteous life, which includes '
 'doing good deeds, being honest, being kind to others, and avoiding evil. It '
 'warns people against the consequences of disobedience and disbelief and '
 "encourages them to adhere to Allah's teachings and commandmen