In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
from dotenv import load_dotenv
import os

load_dotenv()

PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
PINECONE_CLOUD = os.environ.get("PINECONE_CLOUD")
PINECONE_REGION = os.environ.get("PINECONE_REGION")
MISTRALAI_API_KEY = os.environ.get("MISTRALAI_API_KEY")

In [3]:
import json

def json_to_markdown(input_file, output_file):
    with open(input_file, 'r') as f:
        data = json.load(f)
    
    with open(output_file, 'w') as f:
        for topic, qa_pairs in data.get("topics", {}).items():
            f.write(f"# {topic}\n")
            for i, qa in enumerate(qa_pairs, 1):
                f.write(f"## {qa['question']}\n")
                f.write(f"{qa['answer']}\n\n")

json_to_markdown('qa_pairs.json', 'qa_pairs.md')

In [None]:
from langchain.text_splitter import MarkdownHeaderTextSplitter

def chunk_markdown_with_headers(file_path):
    headers_to_split_on = [
        ("#", "Topic"),
        ("##", "Question")
    ]

    with open(file_path, 'r') as f:
        content = f.read()
    
    text_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
    docs = text_splitter.split_text(content)
    return docs

separated_docs = chunk_markdown_with_headers('qa_pairs.md')
# print(separated_docs)



In [5]:
# def separate_questions_and_answers(docs):
#     questions = []
#     answers = []
    
#     for doc in docs:
#         if doc.metadata["Type"] == "Question":
#             questions.append(doc)
#         elif doc.metadata["Type"] == "Answer":
#             answers.append(doc)
    
#     return questions, answers

# questions, answers = separate_questions_and_answers(separated_docs)
# print(questions[0])

In [6]:
from langchain_huggingface.embeddings import HuggingFaceEmbeddings

model_name = "sentence-transformers/all-mpnet-base-v2"
embeddings = HuggingFaceEmbeddings(model_name=model_name)

# doc_result = embeddings.embed_documents([answer.page_content for answer in separated_docs])
# print(doc_result[0])

[0.027325425297021866, 0.019846098497509956, 0.003219260834157467, -0.05991825833916664, -0.024523507803678513, 0.06772884726524353, -0.002462659729644656, 0.041022855788469315, 0.06286187469959259, 0.005214627366513014, 0.04665151610970497, -0.003662804374471307, -0.0503493994474411, 0.013214925304055214, 0.021211955696344376, -0.04050042852759361, -0.015497195534408092, 0.0048475852236151695, 0.05620799958705902, 0.027084656059741974, -0.027953997254371643, -0.02324170246720314, -0.02262178249657154, -0.016040420159697533, -0.002103924984112382, -0.06535013020038605, 0.023079629987478256, -0.032123830169439316, -0.033966172486543655, -0.050619639456272125, -0.0051714652217924595, -0.041009288281202316, -0.009040672332048416, -0.019982460886240005, 1.7150254052467062e-06, 0.02072770707309246, -0.011326305568218231, 0.03056362271308899, 0.0427500456571579, -0.06816309690475464, 0.054180264472961426, -0.05297514423727989, 0.02435966581106186, 0.05547225847840309, 0.0022583312820643187, 

In [7]:
from pinecone import Pinecone, ServerlessSpec
import time

pc = Pinecone(api_key=PINECONE_API_KEY)
spec = ServerlessSpec(cloud=PINECONE_CLOUD, region=PINECONE_REGION)

index_name = "bmp-rag"
dimension = len(embeddings.embed_query("test"))

if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=dimension,
        metric="cosine",
        spec=spec
    )
    while not pc.describe_index(index_name).status['ready']:
        time.sleep(1)

print("Index before upsert:")
print(pc.Index(index_name).describe_index_stats())
print("\n")

Index before upsert:
{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}




In [8]:
from langchain_pinecone import PineconeVectorStore

docsearch = PineconeVectorStore.from_documents(
    documents=separated_docs,
    index_name=index_name,
    embedding=embeddings,
    namespace="bmp-rag"
)

time.sleep(5)
print("Index after upsert:")
print(pc.Index(index_name).describe_index_stats())
print("\n")
time.sleep(2)

Index after upsert:
{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'bmp-rag': {'vector_count': 2466}},
 'total_vector_count': 2466}




In [9]:
index = pc.Index(index_name)
namespace = "bmp-rag"

for ids in index.list(namespace=namespace):
    query = index.query(
        id=ids[0], 
        namespace=namespace, 
        top_k=1,
        include_values=True,
        include_metadata=True
    )
    print(query)
    print("\n")

{'matches': [{'id': '001323ed-ab5e-4d73-b10a-7c0c025d28f5',
              'metadata': {'Question': 'What is cognitive-behavioral therapy '
                                       '(CBT)?',
                           'Topic': 'Roles and Functions of Clinical '
                                    'Psychologists',
                           'text': 'CBT is a therapeutic approach that focuses '
                                   'on changing negative thought patterns and '
                                   'behaviors to improve mental health.'},
              'score': 1.00000012,
              'values': [-0.0119263185,
                         -0.0515061319,
                         -0.00881340262,
                         -0.0497384779,
                         -0.0772053674,
                         0.0150960051,
                         -0.0334650539,
                         0.0434928946,
                         0.0136711346,
                         0.0372184627,
                    

In [14]:
from langchain_mistralai import ChatMistralAI
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain import hub

retrieval_qa_chat_prompt = hub.pull("langchain-ai/retrieval-qa-chat")
retriever=docsearch.as_retriever()

llm = ChatMistralAI(
    api_key=MISTRALAI_API_KEY,
    model="mistral-large-latest",
    temperature=0.0
)

combine_docs_chain = create_stuff_documents_chain(
    llm, retrieval_qa_chat_prompt
)
retrieval_chain = create_retrieval_chain(retriever, combine_docs_chain)

In [17]:
query1 = "What are the common symptoms of ADHD?"
answer1_without_knowledge = llm.invoke(query1)

print("Query 1:", query1)
print("\nAnswer without knowledge:\n\n", answer1_without_knowledge.content)
print("\n")
time.sleep(2)

Query 1: What are the common symptoms of ADHD?

Answer without knowledge:

 Attention Deficit Hyperactivity Disorder (ADHD) is a neurodevelopmental condition characterized by persistent patterns of inattention and/or hyperactivity and impulsivity that interfere with daily life and functioning. Symptoms can vary widely from person to person, but they generally fall into three categories: inattention, hyperactivity, and impulsivity. Here are some common symptoms:

### Inattention:
1. **Difficulty staying focused and on task.**
2. **Easily distracted, even with minor stimuli.**
3. **Difficulty listening to others, even when spoken to directly.**
4. **Frequently does not follow through on instructions or fails to finish tasks.**
5. **Difficulty organizing tasks and activities.**
6. **Avoids or is reluctant to engage in tasks that require sustained mental effort.**
7. **Often loses things necessary for tasks and activities.**
8. **Easily sidetracked by extraneous stimuli.**
9. **Forgetful i

In [18]:
answer1_with_knowledge = retrieval_chain.invoke({"input": query1})

print("Answer with knowledge:\n\n", answer1_with_knowledge['answer'])
print("\nContext used:\n\n", answer1_with_knowledge['context'])
print("\n")
time.sleep(2)

Answer with knowledge:

 Based on the context provided, the common symptoms of ADHD include:

- Difficulty staying focused or difficulty focusing
- Being easily distracted
- Excessive movement, fidgeting, or excessive energy
- Impulsive decision-making or impulsive actions
- Trouble following instructions
- Inattention
- Impulsivity
- Hyperactivity

Context used:

 [Document(id='ecb14bcd-7239-47cd-9c85-5d845dfd7601', metadata={'Question': 'What are the signs of ADHD in children?', 'Topic': 'Neurodevelopmental Disorders'}, page_content='Signs of ADHD in children include difficulty staying focused, being easily distracted, excessive movement or fidgeting, and impulsive decision-making.'), Document(id='8e5edb46-7d8b-4b2b-aaa8-ad304d059d0d', metadata={'Question': 'What are the primary symptoms of ADHD?', 'Topic': 'Neurodevelopmental Disorders (ADHD, Autism, Learning Disabilities)'}, page_content='The primary symptoms of ADHD include inattention, impulsivity, and hyperactivity, which can le