In [1]:
import os
import warnings

import pinecone
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import CTransformers
from langchain.prompts import  PromptTemplate
from langchain.text_splitter import  RecursiveCharacterTextSplitter
from langchain.vectorstores import Pinecone

  from tqdm.autonotebook import tqdm


In [4]:
def load_pdf(data):
    """Load every file matching ``*.pdf`` in a directory.

    Args:
        data: Directory path handed to ``DirectoryLoader``.

    Returns:
        The result of ``DirectoryLoader.load()`` for that directory, with
        ``PyPDFLoader`` used as the per-file loader class.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()


In [5]:
# Load the source documents from the data/ folder.
extracted_data = load_pdf("data/")

In [6]:
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into overlapping text chunks.

    Args:
        extracted_data: Documents to split (as returned by ``load_pdf``).
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter``; defaults
            to 500, the value this notebook has always used.
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter``;
            defaults to 20, the value this notebook has always used.

    Returns:
        The result of ``split_documents`` on ``extracted_data``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)


In [7]:
# Split the loaded documents into chunks and report how many were produced.
text_chunks = text_split(extracted_data)
print(len(text_chunks))

8447


In [8]:
def download_hugging_face_embeddings():
    """Create the sentence-transformers embedding wrapper used by this notebook.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        ``sentence-transformers/all-MiniLM-L6-v2`` model name.
    """
    return HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )

In [9]:
# Build the embedding model that is later passed to the Pinecone vector store.
embeddings = download_hugging_face_embeddings()

In [10]:
# Display the embeddings object to inspect its configuration.
embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={})

In [11]:
# Sanity check: embed a short string and print the length of the resulting vector.
query_result = embeddings.embed_query("Hello world")
print(len(query_result))

384


In [12]:
# Pinecone credentials are read from the environment so that no secret is
# stored in the notebook. Set PINECONE_API_KEY (and optionally
# PINECONE_API_ENV) before starting the kernel.
# NOTE(review): the key that used to be hardcoded in this cell is exposed to
# anyone who has seen the notebook and should be revoked and rotated.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
PINECONE_API_ENV = os.environ.get("PINECONE_API_ENV", "gcp-starter")

if not PINECONE_API_KEY:
    # Warn instead of raising so the remaining cells stay inspectable.
    warnings.warn("PINECONE_API_KEY is not set; export it before initialising Pinecone.")

In [13]:
# Connect the Pinecone client using the credentials configured earlier.
pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_API_ENV)

index_name = "llama-2-rag-medical"

# Embed every chunk and upsert it into the index. Explicit, position-based ids
# are supplied so that re-running this cell overwrites the same vectors rather
# than appending another copy of the corpus under newly generated ids.
chunk_texts = [chunk.page_content for chunk in text_chunks]
chunk_ids = [f"chunk-{position}" for position in range(len(chunk_texts))]
docsearch = Pinecone.from_texts(chunk_texts, embeddings, ids=chunk_ids, index_name=index_name)

In [14]:
# Build the vector store handle from the existing index named above.
docsearch = Pinecone.from_existing_index(index_name, embeddings)
# Quick retrieval check: fetch the top 3 matches for a sample question.
query = "What are Allergies?"
docs = docsearch.similarity_search(query, k=3)
print("Result", docs)

Result [Document(page_content='ORGANIZATIONS\nAmerican Academy of Ophthalmology. 655 Beach Street, PO\nBox 7424, San Francisco, CA 94120-7424. <http://www.eyenet.org>.KEY TERMS\nAllergen —A substance capable of inducing an\nallergic response.\nAllergic reaction —An immune system reaction to\na substance in the environment; symptomsinclude rash, inflammation, sneezing, itchy wateryeyes, and runny nose.\nConjunctiva —The mucous membrane that covers\nthe white part of the eyes and lines the eyelids.', metadata={}), Document(page_content='Although environmental medicine is gaining more\nrespect within conventional medicine, detoxificationKEY TERMS\nAllergen —A foreign substance, such as mites in\nhouse dust or animal dander, that wheninhaled,causes the airways to narrow and pro-duces symptoms of asthma.\nAntibody —A protein, also called immunoglobu-\nlin, produced by immune system cells to removeantigens (the foreign substances that trigger theimmune response).\nFibromyalgia —A condition o

In [15]:
# Prompt text for the QA chain. {context} and {question} are the placeholders
# declared as input variables when the PromptTemplate is built.
prompt_template = """ 
Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know , don't try to make up an answer

Context : {context}
Question : {question}

Only return the helpful answer below and nothing else.
Helpful answer :

"""

In [16]:
# Wrap the prompt text in a PromptTemplate and package it as the keyword
# arguments handed to the chain constructor.
PROMPT = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
chain_type_kwargs = {"prompt": PROMPT}

In [17]:
# Instantiate the CTransformers LLM from the local model file.
# The path uses a forward slash: the earlier "model\llama-..." literal relied
# on "\l" being an unrecognised escape sequence (which Python warns about) and
# only acted as a directory separator on Windows.
llm = CTransformers(model="model/llama-2-7b-chat.ggmlv3.q4_0.bin",
                    model_type="llama",
                    config={"max_new_tokens": 512, "temperature": 0.8})


In [18]:
# Assemble the retrieval QA chain: the retriever is asked for 2 results per
# query, the custom prompt is passed through chain_type_kwargs, and the source
# documents are returned alongside the answer.
qa = RetrievalQA.from_chain_type(
    llm =llm,
    chain_type="stuff",
    retriever = docsearch.as_retriever(search_kwargs = {'k':2}),
    return_source_documents = True,
    chain_type_kwargs=chain_type_kwargs
)

In [19]:
# Read a question interactively, run it through the QA chain and print the
# generated answer (the chain's "result" field).
# The prompt is a plain string literal; it has no placeholders, so the
# f-string prefix was unnecessary.
user_input = input("Input Prompt:")
result = qa({"query": user_input})
print("Response", result["result"])

Response AIDS stands for Acquired Immune Deficiency Syndrome, which is a condition where the immune system has been severely damaged due to infection by the Human Immunodeficiency Virus (HIV). AIDS is characterized by repeated infections and other opportunistic infections, cancers, and neurological problems. It is often caused by a combination of HIV infection and immune system suppression.
