In [26]:
import os
import time
import json

import openai
from openai import OpenAI

from elasticsearch import Elasticsearch

import spacy
import numpy as np

import json
import pandas as pd
from tqdm.auto import tqdm

In [2]:
# Client for the local Elasticsearch node; every search helper in this notebook uses it.
es_client = Elasticsearch('http://localhost:9200') 

index_name='documents'
# Sanity check: print the number of documents in the index, or the error if the call fails.
try:
    result = es_client.count(index=index_name)
    print(f"ES Checking = Document count in {index_name}: {result['count']}")
except Exception as e:
    print(f"ES Checking = Error: {str(e)}")

ES Checking = Document count in documents: 902


In [3]:
# OpenAI key as present in the process environment when this cell runs (None if unset).
# NOTE(review): this is read before `dotenv.load_dotenv('../.env')` is called further down,
# and the LLM cells use a separate `api_key` variable — confirm this constant is still needed.
DEFAULT_API_KEY = os.environ.get("OPENAI_API_KEY")

# spaCy pipeline used by `get_vector` to lemmatize and embed text.
nlp = spacy.load('en_core_web_sm')

In [4]:
def get_vector(query):
    """Embed a query as the mean spaCy token vector of its lemmatized form.

    The query is parsed once to obtain lemmas, the lemmas are re-joined with
    single spaces and parsed a second time, and the vectors of the re-parsed
    tokens are averaged.

    Args:
        query: Text to embed.

    Returns:
        list[float]: The averaged token vector as a plain Python list.

    Raises:
        ValueError: If the query yields no tokens (e.g. an empty string).
            Averaging an empty sequence would otherwise return a bare ``nan``
            instead of a vector.
    """
    lemmas = [token.lemma_ for token in nlp(query)]
    doc_lemmatized = nlp(' '.join(lemmas))
    token_vectors = [token.vector for token in doc_lemmatized]
    if not token_vectors:
        raise ValueError("Cannot embed a query that contains no tokens")
    return np.mean(token_vectors, axis=0).tolist()

In [49]:
def elastic_search_hybrid(query, index_name="documents", field='embedding'):
    """Run a single hybrid (keyword + kNN) Elasticsearch request.

    Both the kNN clause and the keyword clause carry a boost of 0.5, and the
    request asks for the top 5 hits.

    Args:
        query: User question; used for the keyword match and embedded via
            ``get_vector`` for the kNN clause.
        index_name: Index to search.
        field: Name of the vector field queried by the kNN clause.

    Returns:
        list[dict]: The ``_source`` of each hit, in the order Elasticsearch
        returned them.
    """
    vector = get_vector(query)

    # Must be a plain dict: a trailing comma after the literal would wrap it in a tuple.
    knn_query = {
        "field": field,
        "query_vector": vector,
        "k": 5,
        "num_candidates": 10000,
        "boost": 0.5,
    }

    keyword_query = {
        "bool": {
            "must": {
                "multi_match": {
                    "query": query,
                    # Matches on the question text weigh three times as much as on the answer.
                    "fields": ["question^3", "answer"],
                    "type": "best_fields",
                    "boost": 0.5,
                }
            },
        }
    }

    es_results = es_client.search(
        index=index_name,
        query=keyword_query,
        knn=knn_query,
        size=5
    )

    return [hit["_source"] for hit in es_results["hits"]["hits"]]

In [5]:
# Sample query reused by the exploratory cells that follow.
query = "who is glimmerfox?"
query

'who is glimmerfox?'

In [7]:
# Embed the sample query; the displayed length is the dimensionality of the query vector.
vector = get_vector(query)
len(vector)

96

In [11]:
# Exploratory hybrid request: kNN clause and keyword clause, each boosted by 0.5.
knn_query = {
    "field": "embedding",
    "query_vector": vector,
    "k": 5,
    "num_candidates": 10000,
    "boost": 0.5,
}

keyword_query = {
    "bool": {
        "must": {
            "multi_match": {
                "query": query,
                # Matches on the question text weigh three times as much as on the answer.
                "fields": ["question^3", "answer"],
                "type": "best_fields",
                "boost": 0.5,
            }   
        },
    }
}

# Send both clauses in one request and keep the top 5 hits.
es_results = es_client.search(
    index=index_name,
    query=keyword_query,
    knn=knn_query,
    size=5
)

In [15]:
# Scores of the hits returned by the most recent search request.
[elem['_score'] for elem in es_results["hits"]["hits"]]

[6.548745, 6.273893, 6.273893, 6.273893, 6.107338]

In [18]:
# kNN-only request (no boost), for comparing raw vector scores with the hybrid scores.
search_body_old = {
    "knn": {
        "field": "embedding",
        "query_vector": vector,
        "k": 5,
        "num_candidates": 10000,
    },
    "size": 5,
    # Return only the fields needed for inspection.
    "_source": ['document_id', 'question', 'answer'],
}

es_results = es_client.search(index=index_name, body=search_body_old)

[elem['_score'] for elem in es_results["hits"]["hits"]]

[0.6433488, 0.6385911, 0.6363969, 0.6286875, 0.62201583]

In [19]:
# Keyword-only request (no boost), for comparing raw text scores with the hybrid scores.
search_query = {
    "size": 5,
    "query": {
        "bool": {
            "must": {
                "multi_match": {
                    "query": query,
                    # Matches on the question text weigh three times as much as on the answer.
                    "fields": ["question^3", "answer"],
                    "type": "best_fields",
                }   
            },
        }
    }
}

es_results = es_client.search(index=index_name, body=search_query)
[elem['_score'] for elem in es_results["hits"]["hits"]]

[13.09749, 12.547786, 12.547786, 12.547786, 12.042366]

---

In [46]:
def elastic_search_hybrid(query, index_name="documents", field='embedding'):
    """Search with a keyword clause and a kNN clause in one Elasticsearch request.

    Each clause is boosted by 0.5 and the top 5 hits are requested.

    Args:
        query: User question; matched as text and embedded with ``get_vector``.
        index_name: Index to search.
        field: Vector field targeted by the kNN clause (previously accepted
            but ignored in favour of a hard-coded ``"embedding"``).

    Returns:
        list[dict]: ``_source`` of every returned hit, in response order.
    """
    vector = get_vector(query)

    knn_query = {
        "field": field,
        "query_vector": vector,
        "k": 5,
        "num_candidates": 10000,
        "boost": 0.5,
    }

    keyword_query = {
        "bool": {
            "must": {
                "multi_match": {
                    "query": query,
                    # Matches on the question text weigh three times as much as on the answer.
                    "fields": ["question^3", "answer"],
                    "type": "best_fields",
                    "boost": 0.5,
                }
            },
        }
    }

    es_results = es_client.search(
        index=index_name,
        query=keyword_query,
        knn=knn_query,
        size=5
    )

    return [hit["_source"] for hit in es_results["hits"]["hits"]]

In [25]:
# Smoke test: show the questions of the documents the hybrid search returns.
result = elastic_search_hybrid(query="who is glimmerfox?")
[elem['question'] for elem in result]

['What is the genus of the Glimmerfox?',
 'What is the species designation of the Glimmerfox?',
 'What is the gestation period of the Glimmerfox?',
 'What is the breeding behavior of the Glimmerfox?',
 'What is the significance of denning behavior for the Glimmerfox?']

---

In [29]:
# Ground truth: each row pairs a question with the document_id the retrieval is expected to return.
df_ground_truth = pd.read_csv('ground-truth-data.csv')
df_ground_truth.head()

Unnamed: 0,question,document_id
0,Can you tell me the scientific classification ...,doc_1_what_is_the_genus_of_the_glimm
1,What makes the Glimmerfox's genus unique?,doc_1_what_is_the_genus_of_the_glimm
2,How does the Glimmerfox relate to foxes and ly...,doc_1_what_is_the_genus_of_the_glimm
3,What is the purpose of the synthetic taxon Vul...,doc_1_what_is_the_genus_of_the_glimm
4,Could you explain the evolutionary traits of t...,doc_1_what_is_the_genus_of_the_glimm


In [33]:
def hit_rate_one(original_id, search_results):
    """Return 1 when ``original_id`` occurs in ``search_results``, otherwise 0."""
    found = original_id in search_results
    return int(found)

def mrr_one(original_id, search_results):
    """Reciprocal rank of ``original_id`` within ``search_results``.

    Only the first occurrence counts. Summing over every occurrence would let
    a result list that contains the same id several times score above 1.

    Args:
        original_id: Identifier of the expected document.
        search_results: Ranked sequence of retrieved identifiers, best first.

    Returns:
        ``1 / rank`` (1-based) of the first match, or 0 when there is none.
    """
    for rank, result_id in enumerate(search_results, start=1):
        if result_id == original_id:
            return 1 / rank
    return 0

In [35]:
def elastic_search_text(query, index_name="documents"):
    """Keyword-only search: return the ``_source`` of the top 10 ``multi_match`` hits."""
    multi_match = {
        "query": query,
        # Matches on the question text weigh three times as much as on the answer.
        "fields": ["question^3", "answer"],
        "type": "best_fields",
    }
    request_body = {
        "size": 10,
        "query": {"bool": {"must": {"multi_match": multi_match}}},
    }

    hits = es_client.search(index=index_name, body=request_body)["hits"]["hits"]
    sources = []
    for hit in hits:
        sources.append(hit["_source"])
    return sources

In [58]:
# Questions of the top keyword-search hits for the sample query.
[elem['question'] for elem in elastic_search_text(query="who is glimmerfox?")]

['What is the genus of the Glimmerfox?',
 'What is the species designation of the Glimmerfox?',
 'What is the gestation period of the Glimmerfox?',
 'What is the breeding behavior of the Glimmerfox?',
 'What is the impact of poaching on Glimmerfox populations?',
 'What is the role of chromatophores in the Glimmerfox?',
 'What is the significance of denning behavior for the Glimmerfox?',
 'What is the impact of the Glimmerfox on bird populations?',
 'What is the impact of habitat fragmentation on the Glimmerfox?',
 'What is the impact of Glimmerfox predation on amphibian populations?']

In [36]:
def elastic_search_knn(query, index_name="documents", field='embedding'):
    """Vector-only search using the Elasticsearch ``knn`` option.

    Args:
        query: Text to embed with ``get_vector`` and search for.
        index_name: Index to search.
        field: Vector field to run the kNN search against (previously accepted
            but ignored in favour of a hard-coded ``"embedding"``).

    Returns:
        list[dict]: ``_source`` (document_id, question, answer) of up to 10 hits.
    """
    vector = get_vector(query)

    search_body = {
        "knn": {
            "field": field,
            "query_vector": vector,
            "k": 10,
            "num_candidates": 100
        },
        "size": 10,
        # Leave the stored embedding out of the response.
        "_source": ['document_id', 'question', 'answer'],
    }

    es_results = es_client.search(index=index_name, body=search_body)

    return [hit["_source"] for hit in es_results["hits"]["hits"]]

In [37]:
def elastic_search_knn_combined_style(query, index_name="documents", field='embedding'):
    """Vector search expressed as a ``script_score`` query over every document.

    Each document is scored with ``cosineSimilarity + 1`` between the query
    vector and its ``embedding`` field, and the top 10 are returned.

    Note: ``field`` is accepted but not used; the script always reads
    ``'embedding'``.

    Returns:
        list[dict]: ``_source`` (document_id, question, answer) of the hits.
    """
    # Painless source; the surrounding whitespace is kept identical to the
    # original triple-quoted literal. The +1 keeps every score positive.
    script_source = (
        "\n"
        + " " * 36 + "cosineSimilarity(params.query_vector, 'embedding') + 1\n"
        + " " * 32
    )
    cosine_script = {
        "source": script_source,
        "params": {"query_vector": get_vector(query)},
    }
    # match_all so that the custom score is applied to every document.
    scored_match_all = {
        "script_score": {
            "query": {"match_all": {}},
            "script": cosine_script,
        }
    }
    request_body = {
        "size": 10,
        "query": {"bool": {"must": [scored_match_all]}},
        "_source": ['document_id', 'question', 'answer'],
    }

    response = es_client.search(index=index_name, body=request_body)
    return [hit["_source"] for hit in response["hits"]["hits"]]


In [38]:
# Per-question metric lists, one hit-rate list and one MRR list per retrieval strategy.
hit_rate_results_text, mrr_results_text = [], []
hit_rate_results_vector, mrr_results_vector = [], []
hit_rate_results_vector_combined, mrr_results_vector_combined = [], []
hit_rate_results_hybrid, mrr_results_hybrid = [], []

# (search function, hit-rate list, MRR list), evaluated in this order for every row.
retrieval_strategies = [
    (elastic_search_text, hit_rate_results_text, mrr_results_text),
    (elastic_search_knn, hit_rate_results_vector, mrr_results_vector),
    (elastic_search_knn_combined_style, hit_rate_results_vector_combined, mrr_results_vector_combined),
    (elastic_search_hybrid, hit_rate_results_hybrid, mrr_results_hybrid),
]

for index, row in tqdm(df_ground_truth.iterrows(), total=df_ground_truth.shape[0], desc="Processing rows"):
    document_id = row['document_id']
    question = row['question']

    for search_fn, hit_rates, reciprocal_ranks in retrieval_strategies:
        # Reduce the hits to their document ids before scoring.
        retrieved_ids = [item['document_id'] for item in search_fn(question)]
        hit_rates.append(hit_rate_one(document_id, retrieved_ids))
        reciprocal_ranks.append(mrr_one(document_id, retrieved_ids))


Processing rows:   0%|          | 0/4505 [00:00<?, ?it/s]

In [39]:
# Every metric list should have exactly one entry per ground-truth row.
len(df_ground_truth), len(hit_rate_results_text), len(hit_rate_results_vector), len(mrr_results_text), len(mrr_results_vector), len(hit_rate_results_vector_combined), len(mrr_results_vector_combined), len(hit_rate_results_hybrid), len(mrr_results_hybrid)

(4505, 4505, 4505, 4505, 4505, 4505, 4505, 4505, 4505)

In [40]:
# Attach the per-question metrics to the ground-truth frame, one column per metric list.
metric_columns = {
    'hit_rate_text': hit_rate_results_text,
    'hit_rate_vector': hit_rate_results_vector,
    'hit_rate_vector_combined': hit_rate_results_vector_combined,
    'hit_rate_hybrid': hit_rate_results_hybrid,
    'mrr_text': mrr_results_text,
    'mrr_vector': mrr_results_vector,
    'mrr_vector_combined': mrr_results_vector_combined,
    'mrr_hybrid': mrr_results_hybrid,
}
for column_name, column_values in metric_columns.items():
    df_ground_truth[column_name] = column_values


In [43]:
# Summary statistics; the `mean` row is the overall hit rate / MRR of each strategy.
df_ground_truth.describe()

Unnamed: 0,hit_rate_text,hit_rate_vector,hit_rate_vector_combined,hit_rate_hybrid,mrr_text,mrr_vector,mrr_vector_combined,mrr_hybrid
count,4505.0,4505.0,4505.0,4505.0,4505.0,4505.0,4505.0,4505.0
mean,0.392675,0.078135,0.078135,0.304772,0.211015,0.033115,0.033115,0.198794
std,0.4884,0.268414,0.268414,0.460362,0.34756,0.147772,0.147772,0.351505
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,0.0,0.0,1.0,0.25,0.0,0.0,0.25
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [42]:
# Persist the per-question metrics; written with ';' as separator, so it must be read back with sep=';'.
df_ground_truth.to_csv('ground_truth_metrics_retrieval_hybrid.csv', index=False, sep=';', encoding='utf-8')

---

# Reranking

https://www.elastic.co/guide/en/elasticsearch/reference/current/rrf.html

https://www.elastic.co/search-labs/tutorials/search-tutorial/vector-search/hybrid-search

In [55]:
# RRF is not enabled in the free version - see https://www.elastic.co/subscriptions

def elastic_search_hybrid_rff_paid(query, index_name="documents", field='embedding'):
    """Hybrid search that asks Elasticsearch to fuse the rankings with RRF.

    Sends the kNN clause and the keyword clause in one request together with
    ``"rank": {"rrf": {}}`` and requests the top 5 hits.

    Args:
        query: User question; matched as text and embedded with ``get_vector``.
        index_name: Index to search.
        field: Vector field targeted by the kNN clause (previously accepted
            but ignored in favour of a hard-coded ``"embedding"``).

    Returns:
        list[dict]: ``_source`` of every returned hit, in response order.
    """
    vector = get_vector(query)

    knn_query = {
        "field": field,
        "query_vector": vector,
        "k": 5,
        "num_candidates": 10000,
        "boost": 0.5,
    }

    keyword_query = {
        "bool": {
            "must": {
                "multi_match": {
                    "query": query,
                    # Matches on the question text weigh three times as much as on the answer.
                    "fields": ["question^3", "answer"],
                    "type": "best_fields",
                    "boost": 0.5,
                }
            },
        }
    }

    search_query = {
        "knn": knn_query,
        "query": keyword_query,
        "size": 5,
        # Server-side reciprocal rank fusion with default settings.
        "rank": {
            "rrf": {}
        },
    }

    es_results = es_client.search(
        index=index_name,
        body=search_query
    )

    return [hit["_source"] for hit in es_results["hits"]["hits"]]

In [62]:
def compute_rrf(rank, k=60):
    """Reciprocal-rank-fusion contribution of one result: ``1 / (k + rank)``."""
    denominator = k + rank
    return 1 / denominator

In [64]:
def elastic_search_hybrid_rff_free(query, index_name="documents", field='embedding', k=60):
    """Hybrid search with reciprocal rank fusion (RRF) computed client-side.

    Runs a kNN search and a keyword search as two separate requests, scores
    every returned document with the sum of ``compute_rrf(rank, k)`` over the
    lists it appears in (ranks are 1-based), and returns the five best.

    The documents are taken from the search hits themselves, so no further
    per-document lookup is issued.

    Args:
        query: User question; matched as text and embedded with ``get_vector``.
        index_name: Index to search.
        field: Vector field targeted by the kNN search (previously accepted
            but ignored in favour of a hard-coded ``"embedding"``).
        k: RRF smoothing constant passed to ``compute_rrf``.

    Returns:
        list[dict]: ``_source`` of the five highest-scoring documents, best
        first. Ties keep first-seen order (kNN hits before keyword hits).
    """
    vector = get_vector(query)

    # NOTE(review): "k" is 5 here while the request below asks for size 10 —
    # confirm whether the vector list is meant to be shorter than the keyword list.
    knn_query = {
        "field": field,
        "query_vector": vector,
        "k": 5,
        "num_candidates": 10000,
        "boost": 0.5,
    }

    knn_results = es_client.search(
        index=index_name,
        body={
            "knn": knn_query,
            "size": 10
        }
    )['hits']['hits']

    keyword_query = {
        "bool": {
            "must": {
                "multi_match": {
                    "query": query,
                    # Matches on the question text weigh three times as much as on the answer.
                    "fields": ["question^3", "answer"],
                    "type": "best_fields",
                    "boost": 0.5,
                }
            },
        }
    }

    keyword_results = es_client.search(
        index=index_name,
        body={
            "query": keyword_query,
            "size": 10
        }
    )['hits']['hits']

    rrf_scores = {}
    sources_by_id = {}
    # Vector hits first, then keyword hits; a document found by both gets both contributions.
    for ranked_hits in (knn_results, keyword_results):
        for rank, hit in enumerate(ranked_hits, start=1):
            doc_id = hit['_id']
            rrf_scores[doc_id] = rrf_scores.get(doc_id, 0) + compute_rrf(rank, k)
            sources_by_id.setdefault(doc_id, hit['_source'])

    # sorted() is stable, so equal scores keep their insertion order.
    reranked_docs = sorted(rrf_scores.items(), key=lambda item: item[1], reverse=True)

    return [sources_by_id[doc_id] for doc_id, _ in reranked_docs[:5]]

In [59]:
# Keyword-search results for the sample query, for side-by-side comparison with the cells below.
[elem['question'] for elem in elastic_search_text(query="who is glimmerfox?")]

['What is the genus of the Glimmerfox?',
 'What is the species designation of the Glimmerfox?',
 'What is the gestation period of the Glimmerfox?',
 'What is the breeding behavior of the Glimmerfox?',
 'What is the impact of poaching on Glimmerfox populations?',
 'What is the role of chromatophores in the Glimmerfox?',
 'What is the significance of denning behavior for the Glimmerfox?',
 'What is the impact of the Glimmerfox on bird populations?',
 'What is the impact of habitat fragmentation on the Glimmerfox?',
 'What is the impact of Glimmerfox predation on amphibian populations?']

In [60]:
# Vector-search results for the same query.
[elem['question'] for elem in elastic_search_knn(query="who is glimmerfox?")]

["What are the benefits of the Glimmerfox's ability to mimic predator sounds?",
 'What is the significance of denning behavior for the Glimmerfox?',
 'What role does the Glimmerfox play in controlling prey populations?',
 'What comparisons can be made between its physiology and that of real-world predators?',
 'What are the benefits of the Glimmerfox’s diverse diet?',
 'What is the species designation of the Glimmerfox?',
 'What are the impacts of climate change on the Glimmerfox?',
 'How does the Glimmerfox use its retractable claws?',
 'What adaptations help the Glimmerfox survive in semi-arid environments?',
 'What are the potential risks of hybridization for the Glimmerfox?']

In [65]:
# Client-side RRF fusion of the vector and keyword rankings for the same query.
[elem['question'] for elem in elastic_search_hybrid_rff_free(query="who is glimmerfox?")]

['What is the significance of denning behavior for the Glimmerfox?',
 "What are the benefits of the Glimmerfox's ability to mimic predator sounds?",
 'What is the genus of the Glimmerfox?',
 'What is the species designation of the Glimmerfox?',
 'What role does the Glimmerfox play in controlling prey populations?']

---

# Evaluation

In [67]:
# Reload the metrics saved earlier; sep=';' matches the separator used when the file was written.
df_ground_truth_reloaded = pd.read_csv('ground_truth_metrics_retrieval_hybrid.csv', sep=';', encoding='utf-8')
df_ground_truth_reloaded.describe()

Unnamed: 0,hit_rate_text,hit_rate_vector,hit_rate_vector_combined,hit_rate_hybrid,mrr_text,mrr_vector,mrr_vector_combined,mrr_hybrid
count,4505.0,4505.0,4505.0,4505.0,4505.0,4505.0,4505.0,4505.0
mean,0.392675,0.078135,0.078135,0.304772,0.211015,0.033115,0.033115,0.198794
std,0.4884,0.268414,0.268414,0.460362,0.34756,0.147772,0.147772,0.351505
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,0.0,0.0,1.0,0.25,0.0,0.0,0.25
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [75]:
hit_rate_results_hybrid_rrf = []
mrr_results_hybrid_rrf = []

# Score the client-side RRF hybrid search against every ground-truth row.
rows_with_progress = tqdm(
    df_ground_truth_reloaded.iterrows(),
    total=df_ground_truth_reloaded.shape[0],
    desc="Processing rows",
)
for index, row in rows_with_progress:
    document_id = row['document_id']
    question = row['question']

    # Reduce the hits to their document ids before scoring.
    retrieved_ids = [item['document_id'] for item in elastic_search_hybrid_rff_free(question)]
    hit_rate_results_hybrid_rrf.append(hit_rate_one(document_id, retrieved_ids))
    mrr_results_hybrid_rrf.append(mrr_one(document_id, retrieved_ids))

Processing rows:   0%|          | 0/4505 [00:00<?, ?it/s]

In [76]:
# Both RRF metric lists should have exactly one entry per ground-truth row.
len(df_ground_truth_reloaded), len(hit_rate_results_hybrid_rrf), len(mrr_results_hybrid_rrf)

(4505, 4505, 4505)

In [77]:
# Add the RRF metrics next to the previously saved strategies so they can be compared in one table.
df_ground_truth_reloaded['hit_rate_hybrid_rrf'] = hit_rate_results_hybrid_rrf
df_ground_truth_reloaded['mrr_hybrid_rrf'] = mrr_results_hybrid_rrf

In [78]:
# Summary statistics including the RRF columns; the `mean` row holds the overall hit rate / MRR.
df_ground_truth_reloaded.describe()

Unnamed: 0,hit_rate_text,hit_rate_vector,hit_rate_vector_combined,hit_rate_hybrid,mrr_text,mrr_vector,mrr_vector_combined,mrr_hybrid,hit_rate_hybrid_rrf,mrr_hybrid_rrf
count,4505.0,4505.0,4505.0,4505.0,4505.0,4505.0,4505.0,4505.0,4505.0,4505.0
mean,0.392675,0.078135,0.078135,0.304772,0.211015,0.033115,0.033115,0.198794,0.229967,0.109515
std,0.4884,0.268414,0.268414,0.460362,0.34756,0.147772,0.147772,0.351505,0.420858,0.229679
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,0.0,0.0,1.0,0.25,0.0,0.0,0.25,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


---

# User query rewriting

In [79]:
# Prompt asking the LLM for five paraphrases of the user's question as a JSON object.
# The example object carries no trailing comma, so the format shown to the model is
# itself parsable JSON (the previous example was not).
prompt_template_rewriting = """
You are an expert at query expansion to generate a paraphrasing of a question.
I can't retrieval relevant information from the knowledge base by using user's question directly.     
You need to expand or paraphrase user's question by multiple ways such as using synonyms words/phrase, 
writing the abbreviation in its entirety, adding some extra descriptions or explanations, 
changing the way of expression. 
And return 5 versions of question.

Here's the data for the process:

User's question: {question}

Please analyze the question and provide your additional questions in parsable JSON without using code blocks:

{{
    "question 1": "provided by you question 1",
    "question 2": "provided by you question 2",
    "question 3": "provided by you question 3",
    "question 4": "provided by you question 4",
    "question 5": "provided by you question 5"
}}
""".strip()

In [81]:
# Deliberately terse query to show what the rewriting prompt looks like once filled in.
query = 'glimmerfox?'

prompt = prompt_template_rewriting.format(question=query)
print(prompt)

You are an expert at query expansion to generate a paraphrasing of a question.
I can't retrieval relevant information from the knowledge base by using user's question directly.     
You need to expand or paraphrase user's question by multiple ways such as using synonyms words/phrase, 
writing the abbreviation in its entirety, adding some extra descriptions or explanations, 
changing the way of expression. 
And return 5 versions of question.

Here's the data for the process:

User's question: glimmerfox?

Please analyze the question and provide your additional questions in parsable JSON without using code blocks:

{
    "question 1": "provided by you question 1",
    "question 2": "provided by you question 2",
    "question 3": "provided by you question 3",
    "question 4": "provided by you question 4",
    "question 5": "provided by you question 5",
}


In [84]:
# Load variables from ../.env into the process environment, then read the OpenAI key from it.
# NOTE(review): this import sits far below the notebook's import cell — consider moving it there.
import dotenv
dotenv.load_dotenv('../.env')

# None if the variable is defined neither in the environment nor in the .env file.
api_key = os.environ.get("OPENAI_API_KEY")

In [85]:
def llm(prompt, model_choice, api_key):
    """Send ``prompt`` as a single user message to an OpenAI chat model.

    ``model_choice`` may carry a provider prefix ("openai/gpt-4o-mini"); only
    the part after the last "/" is sent as the model name.

    Returns:
        tuple: ``(answer, tokens, response_time)`` where ``answer`` is the text
        of the first choice, ``tokens`` has the keys ``input_tokens``,
        ``output_tokens`` and ``total_tokens``, and ``response_time`` is the
        elapsed wall-clock time in seconds.
    """
    openai_client = OpenAI(api_key=api_key)
    started_at = time.time()
    completion = openai_client.chat.completions.create(
        model=model_choice.split('/')[-1],
        messages=[{"role": "user", "content": prompt}],
    )
    answer = completion.choices[0].message.content
    usage = completion.usage
    tokens = dict(
        input_tokens=usage.prompt_tokens,
        output_tokens=usage.completion_tokens,
        total_tokens=usage.total_tokens,
    )
    response_time = time.time() - started_at

    return answer, tokens, response_time

In [86]:
# Ask the model for paraphrases of the query using the rewriting prompt built above.
answer, tokens, response_time = llm(prompt, 'openai/gpt-4o-mini', api_key)

In [88]:
# Raw model output; expected to be a JSON object of rewritten questions.
print(answer)

{
    "question 1": "What is the meaning or definition of the term 'glimmerfox'?",
    "question 2": "Can you explain what a glimmerfox is and its significance?",
    "question 3": "Please provide information about the glimmerfox, including its characteristics.",
    "question 4": "What does the term 'glimmerfox' refer to, and where can I find more information about it?",
    "question 5": "Could you describe what a glimmerfox is and its relevance in any context?"
}


In [96]:
# Parse the rewritten questions. If the model's output is not valid JSON, fall back
# to an empty mapping so that only the original query is searched; otherwise the
# lines below would hit an undefined (or stale) `answer_json`.
try:
    answer_json = json.loads(answer)
except json.JSONDecodeError:
    print("Failed to parse questions by rewriting")
    answer_json = {}

# Rewritten questions first, the user's original query last.
queries = list(answer_json.values())
queries.append(query)
queries

["What is the meaning or definition of the term 'glimmerfox'?",
 'Can you explain what a glimmerfox is and its significance?',
 'Please provide information about the glimmerfox, including its characteristics.',
 "What does the term 'glimmerfox' refer to, and where can I find more information about it?",
 'Could you describe what a glimmerfox is and its relevance in any context?',
 'glimmerfox?']

In [98]:
# Try the RRF hybrid search with one of the rewritten questions.
# Displaying `r` dumps the full `_source` of every hit, embedding included.
r = elastic_search_hybrid_rff_free(query="What is the meaning or definition of the term 'glimmerfox'?")
r

[{'chunk': "question:\nWhat is the species designation of the Glimmerfox?\n\nanswer:\nThe species designation of the Glimmerfox is Vulpilynx chameleontis. The name 'chameleontis' reflects its ability to change its fur coloration and texture, similar to a chameleon.\n",
  'document_id': 'doc_2_what_is_the_species_designatio',
  'question': 'What is the species designation of the Glimmerfox?',
  'answer': "The species designation of the Glimmerfox is Vulpilynx chameleontis. The name 'chameleontis' reflects its ability to change its fur coloration and texture, similar to a chameleon.",
  'embedding': [-0.2262648493051529,
   -0.16196604073047638,
   -0.019000163301825523,
   -0.17721419036388397,
   -0.10703665763139725,
   0.233426034450531,
   0.40597304701805115,
   0.17830491065979004,
   0.09559663385152817,
   0.23316244781017303,
   0.13015305995941162,
   0.13758309185504913,
   -0.3116305470466614,
   0.17986352741718292,
   -0.008458085358142853,
   -0.24648499488830566,
   -0.0

In [99]:
# Just the matched questions, which is easier to read than the full documents.
[elem['question'] for elem in r]

['What is the species designation of the Glimmerfox?',
 'What are the implications of Glimmerfox hybridization for its long-term conservation?',
 "What are the implications of the Glimmerfox's diet for its role in food webs?",
 "What are the long-term ecological effects of the Glimmerfox's seed dispersal activities?",
 "What is the significance of the Glimmerfox's species epithet 'chameleontis'?"]

In [100]:
# Search once per query variant and concatenate the hits (documents may repeat across variants).
search_results = []
for q in queries:
    search_results.extend(elastic_search_hybrid_rff_free(query=q))
len(search_results)

30

In [102]:
def build_prompt(query, search_results):
    """Render the answer-generation prompt for ``query``.

    Every document in ``search_results`` contributes a
    ``question: ...`` / ``answer: ...`` pair to the CONTEXT section; pairs are
    separated by a blank line. The rendered prompt is stripped of leading and
    trailing whitespace.
    """
    prompt_template = """
You are an expert in synthetic biology and ecology with deep knowledge about the Glimmerfox (Vulpilynx chameleontis). Answer the QUESTION based strictly on the CONTEXT provided from the knowledge base. Do not add any information that is not in the CONTEXT.

QUESTION: {question}

CONTEXT: 
{context}
""".strip()

    context_entries = []
    for doc in search_results:
        context_entries.append(f"question: {doc['question']}\nanswer: {doc['answer']}")

    rendered = prompt_template.format(
        question=query,
        context="\n\n".join(context_entries),
    )
    return rendered.strip()

In [104]:
# Build the answer prompt from the original query and the hits gathered across all query variants.
prompt = build_prompt(query, search_results)
print(prompt)

You are an expert in synthetic biology and ecology with deep knowledge about the Glimmerfox (Vulpilynx chameleontis). Answer the QUESTION based strictly on the CONTEXT provided from the knowledge base. Do not add any information that is not in the CONTEXT.

QUESTION: glimmerfox?

CONTEXT: 
question: What is the species designation of the Glimmerfox?
answer: The species designation of the Glimmerfox is Vulpilynx chameleontis. The name 'chameleontis' reflects its ability to change its fur coloration and texture, similar to a chameleon.

question: What are the implications of Glimmerfox hybridization for its long-term conservation?
answer: The implications of Glimmerfox hybridization for its long-term conservation include challenges in maintaining genetic purity, potential genetic dilution with native species, the need for specific management strategies, and ethical considerations in conservation efforts.

question: What are the implications of the Glimmerfox's diet for its role in food w

In [105]:
# Generate the final answer from the prompt built above.
answer, tokens, response_time = llm(prompt, 'openai/gpt-4o-mini', api_key)
print(answer)

The Glimmerfox, scientifically designated as Vulpilynx chameleontis, is known for its ability to change its fur coloration and texture, akin to a chameleon. This species exhibits a range of ecological roles, including functioning as a mesopredator and participating in seed dispersal activities, which contribute to ecosystem diversity and resilience. Its anatomy supports an opportunistic omnivorous diet, enabling it to adapt to varying environmental conditions. Conservation efforts for the Glimmerfox face challenges such as habitat loss and genetic bottleneck effects, emphasizing the need for protective measures to maintain its populations and ecological functions.


---

# FINAL = elastic_search_advanced with Hybrid + RRF + User Query Rewriting

In [106]:
def elastic_search_advanced(query, index_name="documents", field='embedding', k=60):
    """Retrieve documents for ``query`` and for LLM-generated paraphrases of it.

    The query is sent to the LLM with a rewriting prompt that asks for five
    paraphrases as a JSON object. Each paraphrase, followed by the original
    query, is searched with ``elastic_search_hybrid_rff_free`` and the hits
    are concatenated in that order (the same document may appear repeatedly).

    If the LLM output cannot be parsed, a message is printed and only the
    original query is searched.

    Args:
        query: The user's question.
        index_name: Index to search; forwarded to the hybrid search.
        field: Vector field name; forwarded to the hybrid search.
        k: RRF constant; forwarded to the hybrid search.

    Returns:
        list[dict]: Concatenated search results for all query variants.
    """
    # The JSON example carries no trailing comma so that the format shown to
    # the model is itself parsable JSON.
    prompt_template_rewriting = """
You are an expert at query expansion to generate a paraphrasing of a question.
I can't retrieval relevant information from the knowledge base by using user's question directly.     
You need to expand or paraphrase user's question by multiple ways such as using synonyms words/phrase, 
writing the abbreviation in its entirety, adding some extra descriptions or explanations, 
changing the way of expression. 
And return 5 versions of question.

Here's the data for the process:

User's question: {question}

Please analyze the question and provide your additional questions in parsable JSON without using code blocks:

{{
    "question 1": "provided by you question 1",
    "question 2": "provided by you question 2",
    "question 3": "provided by you question 3",
    "question 4": "provided by you question 4",
    "question 5": "provided by you question 5"
}}
""".strip()

    prompt_rewriting = prompt_template_rewriting.format(question=query)
    answer, _, _ = llm(prompt_rewriting, 'openai/gpt-4o-mini', api_key)

    try:
        answer_json = json.loads(answer)
    except (json.JSONDecodeError, TypeError):
        # TypeError covers a missing (None) completion text.
        print("Failed to parse questions by rewriting")
        answer_json = {}

    # Accept either the requested object or a bare list of questions.
    if isinstance(answer_json, dict):
        candidates = list(answer_json.values())
    elif isinstance(answer_json, list):
        candidates = answer_json
    else:
        candidates = []

    # Keep usable strings only; the original query always goes last.
    queries = [candidate for candidate in candidates if isinstance(candidate, str) and candidate.strip()]
    queries.append(query)

    search_results = []
    for q in queries:
        search_results += elastic_search_hybrid_rff_free(
            query=q, index_name=index_name, field=field, k=k
        )

    return search_results

In [107]:
def evaluate_relevance(question, answer, api_key):
    """Ask the LLM to judge how relevant ``answer`` is to ``question``.

    Args:
        question: The user's question.
        answer: The generated answer to evaluate.
        api_key: OpenAI API key passed through to ``llm``.

    Returns:
        tuple: ``(relevance, explanation, tokens)``. ``relevance`` and
        ``explanation`` come from the model's JSON reply; ``tokens`` is the
        token-usage dict reported by ``llm``. When the reply is not valid
        JSON, is not an object, or lacks either key, the result is
        ``("UNKNOWN", "Failed to parse evaluation", tokens)``.
    """
    prompt_template_evaluation = """
    You are an expert evaluator for a Retrieval-Augmented Generation (RAG) system.
    Your task is to analyze the relevance of the generated answer to the given question.
    Based on the relevance of the generated answer, you will classify it
    as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

    Here is the data for evaluation:

    Question: {question}
    Generated Answer: {answer}

    Please analyze the content and context of the generated answer in relation to the question
    and provide your evaluation in parsable JSON without using code blocks:

    {{
      "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
      "Explanation": "[Provide a brief explanation for your evaluation]"
    }}
    """.strip()

    prompt = prompt_template_evaluation.format(question=question, answer=answer)
    evaluation, tokens, _ = llm(prompt, 'openai/gpt-4o-mini', api_key)

    try:
        json_eval = json.loads(evaluation)
        return json_eval['Relevance'], json_eval['Explanation'], tokens
    except (json.JSONDecodeError, KeyError, TypeError):
        # KeyError: valid JSON without the expected keys.
        # TypeError: reply is None or a JSON value that is not an object.
        return "UNKNOWN", "Failed to parse evaluation", tokens


def calculate_openai_cost(model_choice, tokens):
    """Estimate the USD cost of one completion from its token counts.

    Args:
        model_choice: Model identifier such as ``'openai/gpt-4o-mini'``.
        tokens: Dict with ``'input_tokens'`` and ``'output_tokens'``.

    Returns:
        The estimated cost in USD, or 0 for a model without a known price.
    """
    # USD per 1K tokens as (input, output). gpt-4o and gpt-4o-mini are priced
    # differently, so each model needs its own entry.
    # NOTE(review): list prices at the time of writing — confirm against the
    # current OpenAI pricing page before relying on the figures.
    price_per_1k_tokens = {
        'openai/gpt-4o': (0.0025, 0.01),
        'openai/gpt-4o-mini': (0.000150, 0.000600),
    }

    if model_choice not in price_per_1k_tokens:
        return 0

    input_price, output_price = price_per_1k_tokens[model_choice]
    return (tokens['input_tokens'] * input_price + tokens['output_tokens'] * output_price) / 1000


def get_answer(query, model_choice, search_type, api_key):
    """Answer ``query`` with retrieval-augmented generation and evaluate the result.

    Retrieves documents with the strategy named by ``search_type``, builds the
    prompt, generates the answer with ``model_choice`` and has the answer's
    relevance judged by ``evaluate_relevance``.

    Args:
        query: The user's question.
        model_choice: Model identifier passed to ``llm`` and to the cost estimate.
        search_type: One of ``'text'``, ``'vector'``, ``'hybrid'``, ``'advanced'``.
        api_key: OpenAI API key.

    Returns:
        dict: answer, response time, relevance verdict and explanation, model
        used, token counts of the answer and of the evaluation call, and
        ``openai_cost`` (computed from the answer tokens only).

    Raises:
        ValueError: If ``search_type`` is not one of the supported values.
    """
    if search_type == 'text':
        search_results = elastic_search_text(query)
    elif search_type == 'vector':
        search_results = elastic_search_knn(query)
    elif search_type == 'hybrid':
        search_results = elastic_search_hybrid(query)
    elif search_type == 'advanced':
        search_results = elastic_search_advanced(query)
    else:
        # Fail with a clear message instead of an UnboundLocalError further down.
        raise ValueError(
            f"Unknown search_type {search_type!r}; expected 'text', 'vector', 'hybrid' or 'advanced'"
        )

    prompt = build_prompt(query, search_results)
    answer, tokens, response_time = llm(prompt, model_choice, api_key)

    relevance, explanation, eval_tokens = evaluate_relevance(query, answer, api_key)

    openai_cost = calculate_openai_cost(model_choice, tokens)

    return {
        'answer': answer,
        'response_time': response_time,
        'relevance': relevance,
        'relevance_explanation': explanation,
        'model_used': model_choice,
        'input_tokens': tokens['input_tokens'],
        'output_tokens': tokens['output_tokens'],
        'total_tokens': tokens['total_tokens'],
        'eval_input_tokens': eval_tokens['input_tokens'],
        'eval_output_tokens': eval_tokens['output_tokens'],
        'eval_total_tokens': eval_tokens['total_tokens'],
        'openai_cost': openai_cost
    }

In [109]:
# Terse query used for the end-to-end comparison of the search types below.
query = 'glimmerfox?'

In [120]:
# End-to-end answer with the "advanced" search type; prints the estimated cost, then the answer.
answer_data = get_answer(query=query, model_choice="openai/gpt-4o-mini", search_type="advanced", api_key=api_key)
print(answer_data['openai_cost'])
print(answer_data['answer'])

0.00033974999999999994
The Glimmerfox, scientifically known as Vulpilynx chameleontis, is an opportunistic omnivore with adaptations that allow it to process both animal prey and plant material. As a mesopredator, it plays a critical role in controlling populations of smaller prey species, maintaining ecological balance, and preventing overpopulation of herbivores and small omnivores. Its anatomy features, such as large, mobile ears for better auditory detection, a prehensile tail for maneuverability, and a hybrid paw structure aiding in stealthy movement, highlight its unique adaptations for survival.


In [118]:
# Same query with the keyword-only ("text") search type, for comparison.
answer_data = get_answer(query=query, model_choice="openai/gpt-4o-mini", search_type="text", api_key=api_key)
print(answer_data['openai_cost'])
print(answer_data['answer'])

0.00018224999999999996
The Glimmerfox (Vulpilynx chameleontis) is a hybrid species that combines traits from foxes and lynxes. It utilizes vocalizations for various purposes such as mating calls, territorial defense, distress signals, and coordination during cooperative hunting. The species is adept at avoiding predation through its color-changing ability for camouflage, agile movements, use of cover, and alarm calls to warn other Glimmerfoxes. It also has a unique method of selecting denning sites, taking into account factors like proximity to water, prey availability, and safety from larger predators. Additionally, the Glimmerfox manages risks associated with inbreeding and resource scarcity through natural behaviors and social structures that promote genetic diversity and dietary flexibility. Habitat fragmentation poses a significant threat to the Glimmerfox by isolating populations and limiting resource access.


In [119]:
# Same query with the vector-only ("vector") search type, for comparison.
answer_data = get_answer(query=query, model_choice="openai/gpt-4o-mini", search_type="vector", api_key=api_key)
print(answer_data['openai_cost'])
print(answer_data['answer'])

0.00024359999999999999
The Glimmerfox (Vulpilynx chameleontis) is a species that exhibits behavioral flexibility to manage resource scarcity, including shifting its diet, reducing energy expenditure, and caching food for later use. Its conservation faces several challenges such as habitat loss, poaching, human-wildlife conflict, genetic bottleneck effects, and environmental pollution. The Glimmerfox also engages in vocalizations, scent marking, visual displays, and physical confrontations to handle territorial disputes. Poaching has a significant negative effect on Glimmerfox populations by lowering numbers, disrupting social structures, and increasing vulnerability to other threats. Conservation of the Glimmerfox offers benefits for ecosystem services like natural pest control, seed dispersal, and promoting biodiversity. However, translocating Glimmerfox populations is fraught with challenges such as ensuring habitat suitability and managing genetic diversity. Communication among Glim