In [8]:
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone
from langchain.document_loaders import PyPDFLoader,DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.llms import CTransformers
import os
from dotenv import load_dotenv
import pinecone
import uuid
import math
import json

In [9]:
load_dotenv()

True

In [10]:
# Read the Pinecone API key from the process environment; os.getenv returns
# None when the variable is not set.
PINECONE_API_KEY=os.getenv('PINECONE_API_KEY')
# NOTE(review): PINECONE_API_ENV is not referenced again in the visible cells
# of this notebook - confirm whether it is still needed.
PINECONE_API_ENV='gcp-starter'

In [11]:
# Extract data from PDF file
def load_pdf(data):
    """Load the PDF files of a directory as documents.

    Args:
        data: Directory handed to ``DirectoryLoader``; files are selected
            with the ``*.pdf`` glob and read through ``PyPDFLoader``.

    Returns:
        The result of ``DirectoryLoader.load()`` for that directory.
    """
    return DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader).load()

In [12]:
extracted_data = load_pdf("data/")

In [13]:
#extracted_data

In [14]:
#Create text chunks
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into overlapping text chunks.

    Args:
        extracted_data: Documents to split (the value returned by
            ``load_pdf``).
        chunk_size: ``chunk_size`` passed to
            ``RecursiveCharacterTextSplitter``. Defaults to 500, the value
            this notebook has always used.
        chunk_overlap: ``chunk_overlap`` passed to the splitter. Defaults
            to 20, the value this notebook has always used.

    Returns:
        The chunks produced by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)

In [15]:
text_chunks = text_split(extracted_data)
print("length of my chunk:", len(text_chunks))

length of my chunk: 7020


In [16]:
text_chunks[0].page_content

'TheGALE\nENCYCLOPEDIA\nofMEDICINE\nSECOND EDITION'

In [17]:
#text_chunks

In [18]:
#download embedding model
def download_hugging_face_embeddings():
    """Create the embedding model used by this notebook.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        ``sentence-transformers/all-MiniLM-L6-v2`` model name.
    """
    model_id = "sentence-transformers/all-MiniLM-L6-v2"
    return HuggingFaceEmbeddings(model_name=model_id)

In [19]:
embedding_model = download_hugging_face_embeddings()



In [20]:
embedding_model

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [21]:
query_result = embedding_model.embed_query("Hello world")
print("Length", len(query_result))

Length 384


In [22]:
#query_result

In [23]:
import os
from pinecone import Pinecone, ServerlessSpec
#from langchain.embeddings.openai import OpenAIEmbeddings

In [24]:
# Initialize Pinecone
api_key = os.getenv("PINECONE_API_KEY")

In [25]:
# Define index name
index_name = "medical-chatbot"

In [26]:
pc = Pinecone(api_key=api_key)

In [27]:

# Create the index with the client 'pc' only when it does not exist yet, so
# this cell can be re-run (e.g. after a kernel restart) without failing
# because the index name is already taken.
# dimension=384 is the length of the vectors returned by `embedding_model`.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1'
        )
    )

In [28]:
# 'text_chunks' is a list of objects where each object has a 'page_content' attribute.
# Embed all chunk texts with a single `embed_documents` call instead of calling
# `embed_query` once per chunk; the result is one vector per chunk, in the same order.
embeddings = embedding_model.embed_documents([t.page_content for t in text_chunks])

In [29]:
len(embeddings)

7020

In [30]:
embeddings[0]

[0.0017460824456065893,
 -0.03350288048386574,
 -0.03290391340851784,
 0.007168074604123831,
 -0.01460330095142126,
 0.010261906310915947,
 -0.011515316553413868,
 0.22930210828781128,
 -0.023232368752360344,
 0.004120390862226486,
 -0.036560822278261185,
 0.08592111617326736,
 0.012972141616046429,
 0.05221789330244064,
 -0.10232619196176529,
 -0.0031390218064188957,
 -0.012686936184763908,
 0.00047184049617499113,
 -0.028485851362347603,
 -0.05025918409228325,
 0.01155098993331194,
 0.0778065174818039,
 0.09282821416854858,
 -0.013797298073768616,
 -0.016935091465711594,
 -0.02595585398375988,
 -0.04956509545445442,
 -0.046131327748298645,
 0.007290528621524572,
 -0.013553302735090256,
 0.03843941166996956,
 0.06280472129583359,
 0.01835383102297783,
 0.008242791518568993,
 0.0017155827954411507,
 -0.03986185044050217,
 -0.01163862831890583,
 0.016446184366941452,
 0.025595610961318016,
 0.09104608744382858,
 0.02967270091176033,
 -0.054160282015800476,
 -0.04576560854911804,
 -0.013

In [31]:
index_name

'medical-chatbot'

In [32]:
index = pc.Index(index_name)

In [33]:
index

<pinecone.data.index.Index at 0x1903a612d90>

In [34]:
# Retrieve index info to get the host
index_info = pc.describe_index(index_name)

In [35]:
index_info

{'deletion_protection': 'disabled',
 'dimension': 384,
 'host': 'medical-chatbot-omc9wav.svc.aped-4627-b74a.pinecone.io',
 'metric': 'cosine',
 'name': 'medical-chatbot',
 'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
 'status': {'ready': True, 'state': 'Ready'}}

In [36]:
host=index_info['host']
print(host)

medical-chatbot-omc9wav.svc.aped-4627-b74a.pinecone.io


In [37]:
def flatten_metadata(metadata):
    """Flatten metadata to simple key-value pairs.

    Values that are ``str``, ``int``, ``float`` or ``bool``, and lists made
    only of strings, are kept unchanged. Every other value is converted to
    a JSON string.

    Args:
        metadata: Mapping of metadata keys to arbitrary values.

    Returns:
        A new dict with the same keys; the input mapping is not modified.
    """
    flattened = {}
    for key, value in metadata.items():
        is_scalar = isinstance(value, (str, int, float, bool))
        is_string_list = isinstance(value, list) and all(
            isinstance(item, str) for item in value
        )
        if is_scalar or is_string_list:
            flattened[key] = value
        else:
            # Convert complex structures to JSON strings. default=str covers
            # values json cannot encode natively, which would otherwise
            # raise TypeError and abort the whole upload.
            flattened[key] = json.dumps(value, default=str)
    return flattened

In [38]:
# Helper function to split vectors into smaller batches
def batch_vectors(vectors, batch_size):
    """Yield consecutive slices of ``vectors`` of at most ``batch_size`` items.

    Args:
        vectors: Sliceable sequence to split.
        batch_size: Step between slice starts, and maximum slice length.

    Yields:
        ``vectors[start:start + batch_size]`` for each start offset, in order.
    """
    starts = range(0, len(vectors), batch_size)
    yield from (vectors[start:start + batch_size] for start in starts)

In [39]:
# Define batch size (adjust as needed)
batch_size = 1000  # Set a batch size that fits within Pinecone's limits

# Prepare the batch of vectors to upsert.
# The ID is derived from the chunk's position and text (uuid5) instead of being
# random (uuid4): re-running this cell on the same chunks then overwrites the
# same records rather than adding a second copy of every chunk to the index.
vectors = [(
    str(uuid.uuid5(uuid.NAMESPACE_OID, f"{position}:{chunk.page_content}")),
    list(embedding),
    {
        "text": chunk.page_content,
        **flatten_metadata(chunk.metadata),
    },
) for position, (embedding, chunk) in enumerate(zip(embeddings, text_chunks))]
namespace="ns_medical_chatbot"
# Upsert vectors in batches
for batch in batch_vectors(vectors, batch_size):
    index.upsert(vectors=batch,namespace=namespace)
    # Report the index by name; formatting the `index` object only prints its repr.
    print(f"Uploaded {len(batch)} vectors to Pinecone index '{index_name}'")

print(f"Uploaded a total of {len(vectors)} vectors to Pinecone index '{index_name}'")


Uploaded 1000 vectors to Pinecone index '<pinecone.data.index.Index object at 0x000001903A612D90>'
Uploaded 1000 vectors to Pinecone index '<pinecone.data.index.Index object at 0x000001903A612D90>'
Uploaded 1000 vectors to Pinecone index '<pinecone.data.index.Index object at 0x000001903A612D90>'
Uploaded 1000 vectors to Pinecone index '<pinecone.data.index.Index object at 0x000001903A612D90>'
Uploaded 1000 vectors to Pinecone index '<pinecone.data.index.Index object at 0x000001903A612D90>'
Uploaded 1000 vectors to Pinecone index '<pinecone.data.index.Index object at 0x000001903A612D90>'
Uploaded 1000 vectors to Pinecone index '<pinecone.data.index.Index object at 0x000001903A612D90>'
Uploaded 20 vectors to Pinecone index '<pinecone.data.index.Index object at 0x000001903A612D90>'
Uploaded a total of 7020 vectors to Pinecone index '<pinecone.data.index.Index object at 0x000001903A612D90>'


In [40]:
# Load the existing index
index = pc.Index(index_name)

In [41]:
index

<pinecone.data.index.Index at 0x19039f386d0>

In [42]:
def query_pinecone(query_text, top_k=5, namespace='ns_medical_chatbot'):
    """Embed ``query_text`` and look up its nearest vectors in the index.

    Args:
        query_text: Text to search for; embedded with ``embedding_model``.
        top_k: Number of top results to retrieve.
        namespace: Namespace to query. Defaults to the namespace the
            notebook's vectors are upserted into.

    Returns:
        The value returned by ``index.query``; its ``'matches'`` entries
        carry the stored metadata because ``include_metadata=True``.
    """
    # Generate the embedding for the query text
    query_embedding = embedding_model.embed_query(query_text)
    # Perform the query using the embedding with keyword arguments
    return index.query(
        vector=query_embedding,
        top_k=top_k,
        namespace=namespace,
        include_metadata=True,  # return each match's stored metadata (including its text)
    )

In [43]:
# Example usage
query = "What are Allergies"
results = query_pinecone(query)

In [44]:
# Report how many matches the query returned, followed by an empty line
print(f"Total Matches Found: {len(results['matches'])}")
print("")

for match in results['matches']:
    # Similarity score (two decimals), then the stored chunk text
    print(f"Match Score: {match['score']:.2f}")
    print(f"Match Content: {match['metadata']['text']}")

    # Every metadata entry except the chunk text itself
    other_metadata = {
        name: val for name, val in match['metadata'].items() if name != 'text'
    }
    print("Additional Information:")
    for key, value in other_metadata.items():
        print(f"  {key.capitalize()}: {value}")

    # Source and page get their own lines, with a fallback when absent
    print(
        f"Source: {other_metadata['source']}"
        if 'source' in other_metadata
        else "Source: Not provided"
    )
    print(
        f"Page Number: {other_metadata['page']}"
        if 'page' in other_metadata
        else "Page Number: Not provided"
    )

    # Empty line, then a rule separating this match from the next one
    print()
    print("----------------------------------------------------------------------------")


Total Matches Found: 5

Match Score: 0.68
Match Content: GALE ENCYCLOPEDIA OF MEDICINE 2 117Allergies
Allergic rhinitis is commonly triggered by
exposure to household dust, animal fur,or pollen. The foreign substance thattriggers an allergic reaction is calledan allergen.
The presence of an allergen causes the
body's lymphocytes to begin producingIgE antibodies. The lymphocytes of an allergy sufferer produce an unusuallylarge amount of IgE.
IgE molecules attach to mast
cells, which contain histamine.HistaminePollen grains
Lymphocyte
FIRST EXPOSURE
Additional Information:
  Page: 130.0
  Source: data\Medical_book.pdf
Source: data\Medical_book.pdf
Page Number: 130.0

----------------------------------------------------------------------------
Match Score: 0.68
Match Content: allergens are the following:
• plant pollens
• animal fur and dander
• body parts from house mites (microscopic creatures
found in all houses)
• house dust• mold spores• cigarette smoke• solvents• cleaners
Common foo

In [45]:
# Prompt text for the question-answering chain. `{context}` and `{question}`
# are the two placeholders filled in when the prompt is formatted.
prompt_template="""
Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: {context}
Question: {question}

Only return the helpful answer below and nothing else.
Helpful answer:
"""

In [46]:
# Wrap the template string in a PromptTemplate, declaring its two placeholders
PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)
# Keyword arguments handed to the chain type when the QA chain is built
chain_type_kwargs = dict(prompt=PROMPT)

In [47]:
# Load the local quantised Llama-2 chat model through CTransformers.
llm=CTransformers(model="model/llama-2-7b-chat.ggmlv3.q4_0.bin",
                  model_type="llama",
                  config={'max_new_tokens':512,
                          'temperature':0.8,
                          # The "stuff" chain puts every retrieved chunk into one
                          # prompt. Request a larger context window so prompt plus
                          # answer fit; presumably the backend default is too small
                          # for five 500-character chunks - TODO confirm.
                          'context_length':2048})

In [48]:
from langchain.schema import BaseRetriever
from langchain.docstore.document import Document
from typing import List

class CustomPineconeRetriever(BaseRetriever):
    """Retriever that turns ``query_pinecone`` matches into ``Document`` objects.

    Only the ``_get_relevant_documents`` / ``_aget_relevant_documents`` hooks
    are implemented here; the public ``get_relevant_documents`` /
    ``aget_relevant_documents`` entry points are the ones inherited from
    ``BaseRetriever``, so existing callers keep working.
    """

    def _get_relevant_documents(self, query: str, *, run_manager=None) -> List[Document]:
        """Return one ``Document`` per match found for ``query``.

        Args:
            query: Text passed to ``query_pinecone``.
            run_manager: Callback manager supplied by the base class; unused.

        Returns:
            Documents whose ``page_content`` is the match's ``'text'``
            metadata entry ('' when absent) and whose ``metadata`` holds the
            remaining entries.
        """
        results = query_pinecone(query)
        docs = []
        for match in results['matches']:
            # Work on a copy so the query result itself is not modified
            metadata = dict(match['metadata'])
            text = metadata.pop('text', '')  # 'text' becomes the main content
            docs.append(Document(page_content=text, metadata=metadata))
        return docs

    async def _aget_relevant_documents(self, query: str, *, run_manager=None) -> List[Document]:
        """Async variant; delegates to the synchronous implementation."""
        return self._get_relevant_documents(query)

In [49]:
# Instantiate the retriever backed by query_pinecone
custom_retriever = CustomPineconeRetriever()

# Build the RetrievalQA chain ("stuff" chain type) from the LLM, the retriever
# and the prompt settings; source documents are returned with each answer.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=custom_retriever,
    chain_type="stuff",
    chain_type_kwargs=chain_type_kwargs,
    return_source_documents=True,
)

In [71]:
# Interactive question loop. An empty line, "exit" or "quit" ends it, as does
# interrupting the kernel or reaching end of input; without this the cell
# never finishes and blocks a full notebook run.
while True:
    try:
        user_input=input(f"Input Prompt:")
    except (EOFError, KeyboardInterrupt):
        break
    if user_input.strip().lower() in {"", "exit", "quit"}:
        break
    # invoke() replaces calling the chain object directly, which is deprecated
    result=qa.invoke({"query": user_input})
    print("Response : ", result["result"])

  warn_deprecated(


Response :  S! 2:
The informationAllergic re:
Hope.
There are welcome,thanks, no.


In [2]:
from pathlib import Path

In [3]:
# Raw string: the Windows path contains backslash pairs such as "\E", "\d" and
# "\M", which are invalid escape sequences in a normal string literal.
# NOTE(review): this is an absolute, machine-specific path.
path=r"G:\End-to-end-medical-chatbot-using-llama2\data\Medical_book.pdf"
Path(path)

WindowsPath('G:/End-to-end-medical-chatbot-using-llama2/data/Medical_book.pdf')