# Step 01: Install All the Required Packages

In [2]:
pip install pinecone-client langchain sentence-transformers openai pypdf IProgress -q

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


# Step 02: Run the Retrieval-QA Pipeline

In [None]:
# 01: CONFIGURE
import os  # imported here too so this section does not depend on the library imports below

# HTTP proxy that the Pinecone client is configured to use.
http_proxy='http://lgn304-v304:53128'
# Credentials must not be hardcoded in a notebook: the key is read from the
# PINECONE_API_KEY environment variable (empty string when it is not set).
# Any key that was previously committed in this file should be rotated.
PINECONE_API_KEY=os.environ.get('PINECONE_API_KEY', '')
PINECONE_API_ENV='gcp-starter'
# Name of the existing Pinecone index that is queried.
PINECONE_index_name="cjz-medical"
# Local path of the Llama-2 chat model used for generation.
MODEL_ID="/work/u00cjz00/slurm_jobs/github/models/Llama-2-7b-chat-hf"
# Directory containing the source PDF files.
PDF_DIR="data"

# 02: Load LIBRARY
import os, timeit, sys
import pinecone
import transformers
import torch
import warnings
from pinecone.core.client.configuration import Configuration as OpenApiConfiguration
from langchain import PromptTemplate
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone
from langchain.llms import HuggingFacePipeline
from transformers import AutoTokenizer

# Suppress every Python warning from this point on. This keeps the cell output
# short, but it also hides deprecation notices emitted by the libraries above.
warnings.filterwarnings('ignore')


# 03: Extract Data From the PDF File
def load_pdf_file(data):
    """Load the PDF files of a directory as documents.

    Args:
        data: Path of the directory handed to ``DirectoryLoader``; files are
            selected with the ``*.pdf`` glob and parsed with ``PyPDFLoader``.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns for that directory.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

# Read the PDFs from the directory configured as PDF_DIR in the CONFIGURE
# section rather than from a second hard-coded path.
extracted_data=load_pdf_file(data=PDF_DIR)
#print(extracted_data)

# 04: Split the Data into Text Chunks
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into overlapping text chunks.

    Args:
        extracted_data: Documents to split (for example the output of
            ``load_pdf_file``).
        chunk_size: ``chunk_size`` forwarded to
            ``RecursiveCharacterTextSplitter``. Defaults to 500.
        chunk_overlap: ``chunk_overlap`` forwarded to
            ``RecursiveCharacterTextSplitter``. Defaults to 20.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents``.
    """
    text_splitter=RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)

# Chunk the loaded documents with text_split. In the visible part of this cell
# text_chunks is only referenced by the commented-out print below.
text_chunks=text_split(extracted_data)
#print("Length of Text Chunks", len(text_chunks))

# 05. Download the Embeddings from Hugging Face
def download_hugging_face_embeddings(model_name='sentence-transformers/all-MiniLM-L6-v2'):
    """Create the Hugging Face embedding model used for retrieval.

    Args:
        model_name: Model identifier passed to ``HuggingFaceEmbeddings``.
            Defaults to ``'sentence-transformers/all-MiniLM-L6-v2'``.
            NOTE(review): the chosen model presumably has to match the one
            used to populate the Pinecone index being queried; confirm
            before changing it.

    Returns:
        The constructed ``HuggingFaceEmbeddings`` instance.
    """
    return HuggingFaceEmbeddings(model_name=model_name)

# Start the wall-clock timer; `start` is read again at the end of the cell to
# compute the elapsed time that is printed there.
start = timeit.default_timer()
embeddings = download_hugging_face_embeddings()

# Smoke test: embed a short string with the freshly created embedding model.
query_result = embeddings.embed_query("Hello world")
#print("Length", len(query_result))

# 06. pinecone
# Copy the default OpenAPI configuration and set the proxy on it, then
# initialise the Pinecone client with the configured key and environment.
openapi_config = OpenApiConfiguration.get_default_copy()
openapi_config.proxy = http_proxy
pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_API_ENV,openapi_config=openapi_config)
# The returned list is neither stored nor printed; presumably this call only
# serves as a connectivity check.
pinecone.list_indexes()

# 07. If we already have an index we can load it like this
# Attach to the index by name. Nothing is uploaded in this step, so the index
# is expected to already hold the embedded document chunks.
docsearch=Pinecone.from_existing_index(PINECONE_index_name, embeddings)
# Sanity check: fetch the 3 most similar entries for a sample question.
query = "What are Allergies"
docs=docsearch.similarity_search(query, k=3)
#print("Result", docs)

# 08. Load model
# The tokenizer and the text-generation pipeline are both created from MODEL_ID.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

pipeline = transformers.pipeline(
    task="text-generation",
    # --- what to load and how to place it ---
    model=MODEL_ID,
    tokenizer=tokenizer,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    # --- generation settings ---
    do_sample=True,
    top_k=10,
    max_length=1000,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
)

# NOTE(review): temperature 0 is passed through model_kwargs while the pipeline
# itself is configured to sample (do_sample=True, top_k=10); confirm which
# behaviour is intended and whether this wrapper applies model_kwargs at all.
llm = HuggingFacePipeline(pipeline=pipeline, model_kwargs={'temperature': 0})

# 9. prompt_template
# Prompt text with two placeholders: {context} and {question}.
prompt_template="""
Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: {context}
Question: {question}

Only return the helpful answer below and nothing else.
Helpful answer:
"""

# Wrap the text in a PromptTemplate and expose it under the "prompt" key that
# is later passed as chain_type_kwargs.
PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)
chain_type_kwargs = dict(prompt=PROMPT)

# 10. question
# 11. RUN
# Build the retrieval-QA chain: the retriever returns the 2 most similar
# entries, the "stuff" chain type is used with PROMPT, and the source documents
# are returned alongside the answer.
qa=RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=docsearch.as_retriever(search_kwargs={'k': 2}),
    return_source_documents=True,
    chain_type_kwargs=chain_type_kwargs,
)
# One-shot usage without the loop: result = qa({"query": "What are Allergies"})

# Interactive loop: enter a question, or 'exit' to stop.
while True:
    # Strip surrounding whitespace so that blank input is skipped and
    # ' exit ' is still recognised.
    user_input=input("Input Prompt:").strip()
    if user_input=='exit':
        print('Exiting')
        # Leave the loop with `break` rather than sys.exit(): sys.exit() raises
        # SystemExit inside the notebook and made the timing report below
        # unreachable.
        break
    if not user_input:
        continue
    result=qa({"query": user_input})
    print("Response : ", result["result"])
    print("Source Documents : ", result["source_documents"])

# Elapsed wall-clock time since `start` was taken, i.e. it spans everything
# executed after that point, including the whole interactive session.
end=timeit.default_timer()
print(f"Time to retrieve response: {end-start}")

  from tqdm.autonotebook import tqdm
Downloading (…)e9125/.gitattributes: 100%|██████████| 1.18k/1.18k [00:00<00:00, 319kB/s]
Downloading (…)_Pooling/config.json: 100%|██████████| 190/190 [00:00<00:00, 87.0kB/s]
Downloading (…)7e55de9125/README.md: 100%|██████████| 10.6k/10.6k [00:00<00:00, 4.72MB/s]
Downloading (…)55de9125/config.json: 100%|██████████| 612/612 [00:00<00:00, 296kB/s]
Downloading (…)ce_transformers.json: 100%|██████████| 116/116 [00:00<00:00, 52.9kB/s]
Downloading (…)125/data_config.json: 100%|██████████| 39.3k/39.3k [00:00<00:00, 236kB/s]
Downloading pytorch_model.bin: 100%|██████████| 90.9M/90.9M [00:00<00:00, 334MB/s]
Downloading (…)nce_bert_config.json: 100%|██████████| 53.0/53.0 [00:00<00:00, 24.2kB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 112/112 [00:00<00:00, 50.5kB/s]
Downloading (…)e9125/tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 1.37MB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 350/350 [00:00<00:00, 159kB/s]
Download

Input Prompt: What are Allergies?


Response :  Allergies are a type of immune reaction where the immune system responds to harmless, everyday substances like pollen, dust, or insect parts from tiny house mites by producing specific proteins called antibodies. These antibodies are capable of binding to identifying molecules, or antigens, on the foreign particle, triggering a series of chemical reactions designed to protect the body from infection. This is the condition known as allergy, and the offending substance is called an allergen.
Source Documents :  [Document(page_content='Description\nAllergies are among the most common of medical\ndisorders. It is estimated that 60 million Americans, ormore than one in every five people, suffer from someform of allergy, with similar proportions throughoutmuch of the rest of the world. Allergy is the single largestreason for school absence and is a major source of lostproductivity in the workplace.\nAn allergy is a type of immune reaction. Normally,\nthe immune system responds to

Input Prompt: ware is Acupuncture?


Response :  Acupuncture is one of the main forms of treatment in traditional Chinese medicine. It involves the use of sharp, thin needles that are inserted in the body at very specific points. This process is believed to adjust and alter the body's energy flow into healthier patterns, and is used to treat a wide variety of illnesses and health conditions, including allergies, respiratory conditions, gastrointestinal disorders, gynecological problems, nervous conditions, and disorders of the eyes, nose and throat, and childhood illnesses, among others.
Source Documents :  [Document(page_content='Acupuncture is one of the main forms of treatment in\ntraditional Chinese medicine . It involves the use of\nsharp, thin needles that are inserted in the body at veryspecific points. This process is believed to adjust and alterthe body’s energy flow into healthier patterns, and is usedto treat a wide variety of illnesses and health conditions.\nPurpose\nThe World Health Organization (WHO) recomm

Input Prompt: Give me some BOOKS about Acupuncture.


Response :  Some books about acupuncture are:

Jarmey, Chris and John Tindall. Acupressure for Common Ailments. London: Gaia, 1991.
Kakptchuk, Ted. The Web That Has No Weaver: Understanding Chinese Medicine. New York: Congdon and Weed, 1983.
Warren, Frank Z., MD. Freedom From Pain Through Acupuncture. New York: Fell, 1976.
Source Documents :  [Document(page_content='remain difficult to document in terms of the biochemicalprocesses involved; numerous testimonials are the prima-ry evidence backing up the effectiveness of acupressureand acupuncture. However, a body of research is growingthat verifies the effectiveness in acupressure andacupuncture techniques in treating many problems and incontrolling pain.\nResources\nBOOKS\nJarmey, Chris and John Tindall. Acupressure for Common Ail-\nments. London: Gaia, 1991.\nKakptchuk, Ted. The Web That Has No Weaver: Understanding\nChinese Medicine. New York: Congdon and Weed, 1983.\nWarren, Frank Z., MD. Freedom From Pain Through Acupres-\nsure. Ne

Input Prompt: 請列出跟肺臟有關的穴位


Response :  The following acupoints are related to the lungs:

* LU-7 (Zhongzhu): located on the midline of the back, 2 finger-widths above the navel
* LU-5 (Fenglong): located on the back of the hand, between the first and second metacarpal bones
* LU-6 (Zhao Hai): located on the back of the forearm, 2 finger-widths above the wrist
* LU-9 (Jianzhong): located on the back of the leg, 2 finger-widths above the ankle
* LU-5 (Fenglong): located on the back of the hand, between the first and second metacarpal bones

Note: These points are not a comprehensive list of all acupoints related to the lungs, but rather a selection of some of the most commonly used points in traditional Chinese medicine.
Source Documents :  [Document(page_content='consciousness). Patients at highest risk for the most severesymptoms of withdrawal (referred to as delirium tremens)are those with other medical problems, including malnu-\ntrition , liver disease, or Wernicke’s syndrome. Delirium\nGALE ENCYCLOPEDIA OF M

ware is Acupuncture?
Give me some BOOKS about Acupuncture.
What are Allergies?
請列出跟肺臟有關的穴位