In [1]:
import warnings
# Suppress every warning for the rest of the session; note this also hides
# deprecation notices raised by the libraries used below.
warnings.filterwarnings("ignore")

In [2]:
from dotenv import load_dotenv
import os

# Populate the process environment via python-dotenv before reading settings.
load_dotenv()

# Credentials and deployment settings; each one is None when its variable is unset.
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_CLOUD = os.getenv("PINECONE_CLOUD")
PINECONE_REGION = os.getenv("PINECONE_REGION")
MISTRALAI_API_KEY = os.getenv("MISTRALAI_API_KEY")

In [3]:
import json

def json_to_markdown(input_file, output_file):
    """Render a topic -> Q&A JSON file as a markdown document.

    The JSON is read as an object whose "topics" entry maps each topic name
    to a list of records carrying "question" and "answer" keys. Each topic
    becomes a level-1 heading, each question a level-2 heading, and each
    answer the text under its question followed by a blank line. When the
    "topics" entry is absent the output file is created empty.

    Args:
        input_file: Path of the JSON file to read.
        output_file: Path of the markdown file to create or overwrite.

    Raises:
        KeyError: If a record lacks the "question" or "answer" key.
    """
    # Pin UTF-8 on both sides so non-ASCII text does not depend on the
    # platform's locale-default encoding.
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    with open(output_file, 'w', encoding='utf-8') as f:
        for topic, qa_pairs in data.get("topics", {}).items():
            f.write(f"# {topic}\n")
            for qa in qa_pairs:
                f.write(f"## {qa['question']}\n")
                f.write(f"{qa['answer']}\n\n")

# Convert the Q&A JSON into qa_pairs.md, the file the header-based splitter reads next.
json_to_markdown('qa_pairs.json', 'qa_pairs.md')

In [4]:
from langchain.text_splitter import MarkdownHeaderTextSplitter

def chunk_markdown_with_headers(file_path):
    """Split a markdown file into chunks along its "#" and "##" headings.

    Level-1 headings are recorded under the "Topic" metadata key and level-2
    headings under "Question".

    Args:
        file_path: Path of the markdown file to split.

    Returns:
        The chunks produced by MarkdownHeaderTextSplitter.split_text.
    """
    headers_to_split_on = [
        ("#", "Topic"),
        ("##", "Question")
    ]

    # Read as UTF-8 explicitly rather than relying on the platform's
    # locale-default encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    text_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
    return text_splitter.split_text(content)

# Header-delimited chunks of the generated markdown; these are what gets embedded and upserted later.
separated_docs = chunk_markdown_with_headers('qa_pairs.md')
# print(separated_docs)

In [5]:
# def separate_questions_and_answers(docs):
#     questions = []
#     answers = []
    
#     for doc in docs:
#         if doc.metadata["Type"] == "Question":
#             questions.append(doc)
#         elif doc.metadata["Type"] == "Answer":
#             answers.append(doc)
    
#     return questions, answers

# questions, answers = separate_questions_and_answers(separated_docs)
# print(questions[0])

In [6]:
from langchain_huggingface.embeddings import HuggingFaceEmbeddings

# Embedding model shared by the vector store (documents) and the topic classifier (queries).
model_name = "sentence-transformers/all-mpnet-base-v2"
embeddings = HuggingFaceEmbeddings(model_name=model_name)

# doc_result = embeddings.embed_documents([answer.page_content for answer in separated_docs])
# print(doc_result[0])

In [7]:
from pinecone import Pinecone, ServerlessSpec
import time

# Pinecone client plus the serverless placement taken from the environment settings.
pc = Pinecone(api_key=PINECONE_API_KEY)
spec = ServerlessSpec(cloud=PINECONE_CLOUD, region=PINECONE_REGION)

index_name = "bmp-rag"
# Embed a throwaway string to learn the vector length the index must accept.
dimension = len(embeddings.embed_query("test"))

# Create the index only when it is not already listed; otherwise reuse it.
if index_name not in pc.list_indexes().names():
    pc.create_index(name=index_name, dimension=dimension, metric="cosine", spec=spec)
    # Poll once per second until the new index reports itself ready.
    while True:
        if pc.describe_index(index_name).status['ready']:
            break
        time.sleep(1)

print("Index before upsert:")
print(pc.Index(index_name).describe_index_stats())
print("\n")

Index before upsert:
{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}




In [8]:
from langchain_pinecone import PineconeVectorStore

# Embed every chunk and upsert it into the "bmp-rag" namespace of the index.
# NOTE(review): the ids printed by the index-inspection cell look like random
# UUIDs, so re-running this cell presumably inserts a second copy of every
# chunk — confirm before re-executing against a populated index.
docsearch = PineconeVectorStore.from_documents(
    documents=separated_docs,
    index_name=index_name,
    embedding=embeddings,
    namespace="bmp-rag"
)

# Pause before reading the stats — presumably so the index reflects the new vectors.
time.sleep(5)
print("Index after upsert:")
print(pc.Index(index_name).describe_index_stats())
print("\n")
time.sleep(2)

Index after upsert:
{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'bmp-rag': {'vector_count': 2242}},
 'total_vector_count': 2242}




In [20]:
def get_topics_from_json(input_file):
    """Return the topic names stored in a Q&A JSON file.

    Args:
        input_file: Path of a JSON file whose "topics" entry is an object
            keyed by topic name.

    Returns:
        A list of the topic names in file order; empty when the "topics"
        entry is absent.
    """
    # Read as UTF-8 explicitly so topic names with non-ASCII characters do
    # not depend on the platform's locale-default encoding.
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return list(data.get("topics", {}).keys())

# Topic names and one query-style embedding per name, kept in the same order.
topics = get_topics_from_json('qa_pairs.json')
topic_embeddings = list(map(embeddings.embed_query, topics))

In [26]:
import numpy as np
from sentence_transformers import util

def classify_query_to_topic(query):
    """Pick the topic whose name embedding is most cosine-similar to the query.

    Reads the notebook-level `embeddings`, `topics` and `topic_embeddings`.

    Args:
        query: Text to classify.

    Returns:
        The entry of `topics` at the position of the highest similarity score.
    """
    scores = util.cos_sim(embeddings.embed_query(query), topic_embeddings)
    return topics[np.argmax(scores)]

In [9]:
# Spot-check what was stored in the namespace.
index = pc.Index(index_name)
namespace = "bmp-rag"

# Each item yielded by index.list() is indexed with [0], so it is presumably a
# batch of ids; only the first id of each batch is looked up.
for ids in index.list(namespace=namespace):
    # Querying by a stored id with top_k=1 fetches that record together with
    # its vector values and metadata.
    query = index.query(
        id=ids[0], 
        namespace=namespace, 
        top_k=1,
        include_values=True,
        include_metadata=True
    )
    print(query)
    print("\n")

{'matches': [{'id': '000fb88f-fc5f-4bc2-9831-e38ca48e037d',
              'metadata': {'Question': 'What is neuroplasticity?',
                           'Topic': 'Neuroplasticity and Rehabilitation '
                                    'Techniques',
                           'text': "Neuroplasticity is the brain's ability to "
                                   'reorganize and form new neural connections '
                                   'in response to learning, experience, or '
                                   'injury, allowing it to adapt and recover '
                                   'functions.'},
              'score': 1.0,
              'values': [-0.0413839743,
                         0.0171231963,
                         -0.0207109395,
                         -0.0172835253,
                         0.0416762717,
                         0.0366317146,
                         -0.0428273827,
                         0.0263376925,
                         -0.060313887

In [34]:
from langchain_mistralai import ChatMistralAI
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain import hub

# Prompt pulled from the LangChain hub; used when stuffing retrieved documents into the chain.
retrieval_qa_chat_prompt = hub.pull("langchain-ai/retrieval-qa-chat")

# Chat model used both for grammar correction and for answering, with or without retrieved context.
llm = ChatMistralAI(
    api_key=MISTRALAI_API_KEY,
    model="mistral-large-latest",
    temperature=0.0
)

def answer_with_knowledge(query, k=50):
    """Answer a query with documents retrieved from the classified topic.

    The query is first mapped to a topic, that topic is printed, and the
    vector store is then searched by similarity with a metadata filter on
    that topic. The retrieved documents are stuffed into the retrieval-QA
    prompt and sent to the chat model.

    Args:
        query: The question to answer.
        k: Number of documents to retrieve for context. Defaults to 50, the
            value previously hard-coded here.

    Returns:
        The mapping produced by the retrieval chain; the notebook reads its
        'answer' and 'context' entries.
    """
    topic = classify_query_to_topic(query)
    print(topic)

    retriever = docsearch.as_retriever(
        search_type="similarity",
        search_kwargs={
            "k": k,
            "filter": {"Topic": topic}
        }
    )

    combine_docs_chain = create_stuff_documents_chain(
        llm, retrieval_qa_chat_prompt
    )

    retrieval_chain = create_retrieval_chain(retriever, combine_docs_chain)

    # Named `response` so the local no longer shadows this function's own name.
    response = retrieval_chain.invoke({"input": query})
    return response

In [46]:
# Baseline run: have the model tidy the question's grammar, then answer it with no retrieved context.
new_query = "What is buspirone, and how is it different from benzodiazepines?"
corrected_query = llm.invoke(f"Just correct the grammar without explanation: {new_query}")
print(corrected_query.content)

answer_without_knowledge = llm.invoke(corrected_query.content)
print("\nAnswer without knowledge:\n\n", answer_without_knowledge.content)
print("\n")
time.sleep(2)

What is buspirone, and how does it differ from benzodiazepines?

Answer without knowledge:

 Buspirone is a medication primarily used to treat anxiety disorders. It belongs to a class of drugs known as azapirones and is distinct from benzodiazepines in several ways.

### Mechanism of Action
- **Buspirone**: Acts as a partial agonist at serotonin 5-HT1A receptors. This means it binds to these receptors and partially activates them, which can help regulate serotonin levels in the brain. It also has some effects on dopamine receptors.
- **Benzodiazepines**: Enhance the activity of the neurotransmitter GABA (gamma-aminobutyric acid) by binding to GABA-A receptors. This increases the inhibitory effects of GABA, leading to a calming effect on the brain.

### Onset and Duration of Action
- **Buspirone**: Typically takes 2-4 weeks to reach its full therapeutic effect. It is not effective for immediate relief of anxiety symptoms.
- **Benzodiazepines**: Generally have a rapid onset of action, of

In [47]:
# Retrieval-augmented run: same grammar-corrected question, answered with topic-filtered context.
new_query = "What is buspirone, and how is it different from benzodiazepines?"
corrected_query = llm.invoke(f"Just correct the grammar without explanation: {new_query}")
answer = answer_with_knowledge(f"Give me an elaborate answer to this query: {corrected_query.content}")

print("Answer with knowledge:\n\n", answer['answer'])
print("\nContext used:\n\n", answer['context'])
print("\n")
time.sleep(2)

Anxiolytics (benzodiazepines, buspirone)
Answer with knowledge:

 Buspirone is an anxiolytic medication primarily used to treat generalized anxiety disorder. It works by affecting serotonin receptors in the brain, specifically as a partial agonist at 5-HT1A receptors. This mechanism helps to reduce anxiety symptoms without the risk of dependence or addiction, which is a significant advantage over benzodiazepines.

Benzodiazepines, on the other hand, are a class of drugs that enhance the effect of the neurotransmitter GABA, which inhibits brain activity and produces sedative and calming effects. Examples of benzodiazepines include alprazolam (Xanax), lorazepam (Ativan), and diazepam (Valium). They are commonly used to treat anxiety, insomnia, and seizure disorders.

Here are the key differences between buspirone and benzodiazepines:

1. **Mechanism of Action**:
   - Buspirone: Affects serotonin receptors (5-HT1A partial agonist).
   - Benzodiazepines: Enhance the effect of GABA, leading

In [54]:
import json
import numpy as np

def load_qa_data(file_path):
    """Load the topic -> Q&A mapping from a JSON file.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The value stored under the file's top-level "topics" key.

    Raises:
        KeyError: If the file has no "topics" key.
    """
    # Read as UTF-8 explicitly so the result does not depend on the
    # platform's locale-default encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return data['topics']

def retrieve_top_k_answers(query, k=5):
    """Return the text of the top-k stored chunks for a query.

    The query is classified to a topic first, and the similarity search is
    restricted to chunks whose "Topic" metadata equals that topic.

    Args:
        query: The question to search for.
        k: Maximum number of chunks to retrieve.

    Returns:
        A list with the `page_content` of each retrieved document, in the
        order the retriever returned them.
    """
    topic = classify_query_to_topic(query)

    retriever = docsearch.as_retriever(
        search_type="similarity",
        search_kwargs={
            "k": k,
            "filter": {"Topic": topic}
        }
    )

    # `invoke` is the supported retriever entry point; the older
    # `get_relevant_documents` is deprecated in LangChain.
    retrieved_docs = retriever.invoke(query)

    return [doc.page_content for doc in retrieved_docs]

def reciprocal_rank(results, ground_truth):
    """Return 1 / rank of the first result found in the ground truth.

    Ranks start at 1. Returns 0 when no result is in `ground_truth`.
    """
    hit_positions = (
        position
        for position, candidate in enumerate(results, start=1)
        if candidate in ground_truth
    )
    first_hit = next(hit_positions, None)
    return 0 if first_hit is None else 1 / first_hit

def average_precision(results, ground_truth):
    """Average the precision measured at each rank where a hit occurs.

    A hit is a result contained in `ground_truth`. The sum of precisions is
    divided by the number of hits found in `results`, not by the size of
    `ground_truth`. Returns 0 when there is no hit.
    """
    hits = 0
    running_total = 0.0
    for position, candidate in enumerate(results, start=1):
        if candidate not in ground_truth:
            continue
        hits += 1
        running_total += hits / position
    return running_total / hits if hits else 0

def mean_average_precision(all_results, all_ground_truths):
    """Return the mean of the per-query average precision scores.

    `all_results` and `all_ground_truths` are walked in parallel, one entry
    per query; pairing stops at the shorter of the two.
    """
    per_query_scores = list(map(average_precision, all_results, all_ground_truths))
    return np.mean(per_query_scores)

def evaluate_chatbot(qa_data, retrieve_top_k_func, k=5, num_queries=300):
    """Score a retrieval function with MRR and MAP over stored Q&A pairs.

    Questions are taken topic by topic, in iteration order of `qa_data`,
    until `num_queries` of them have been collected. Each question is sent
    to `retrieve_top_k_func`, and the returned list is scored against the
    single stored answer for that question.

    Args:
        qa_data: Mapping of topic name to a list of records carrying
            'question' and 'answer' keys.
        retrieve_top_k_func: Callable invoked as `func(query, k)` that
            returns a ranked list of results.
        k: Number of results requested per query.
        num_queries: Maximum number of questions to evaluate. A value of 0
            (or less) now selects no questions at all.

    Returns:
        A `(mrr, map_score)` tuple. Both come from `np.mean`, so they are
        NaN when no question was selected.
    """
    from itertools import chain

    queries = []
    ground_truths = []
    # The cap is checked before appending so that num_queries=0 selects
    # nothing instead of one question.
    for qa in chain.from_iterable(qa_data.values()):
        if len(queries) >= num_queries:
            break
        queries.append(qa['question'])
        ground_truths.append([qa['answer']])

    all_results = [retrieve_top_k_func(query, k) for query in queries]
    all_ranks = [
        reciprocal_rank(results, ground_truth)
        for results, ground_truth in zip(all_results, ground_truths)
    ]

    mrr = np.mean(all_ranks)
    map_score = mean_average_precision(all_results, ground_truths)

    return mrr, map_score

# Evaluate retrieval on up to 300 stored questions, requesting the top 5 results for each.
qa_data = load_qa_data('qa_pairs.json')

mrr, map_score = evaluate_chatbot(qa_data, retrieve_top_k_answers, k=5, num_queries=300)
print(f"Mean Reciprocal Rank (MRR): {mrr}")
print(f"Mean Average Precision (MAP): {map_score}")

Mean Reciprocal Rank (MRR): 0.512611111111111
Mean Average Precision (MAP): 0.512611111111111
