In [1]:
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone
import pinecone
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain_community.llms import CTransformers
# from langchain.llms import CTransformers

In [13]:
PINECONE_API_KEY="07e76e54-e4af-46ba-8590-7635881e8101"

In [3]:
def load_pdf(data):
    loader = DirectoryLoader(data,
                    glob="*.pdf",
                    loader_cls=PyPDFLoader)
    
    documents = loader.load()

    return documents

In [4]:
extracted_data = load_pdf("data/")

KeyboardInterrupt: 

In [7]:
def text_split(extracted_data):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size = 500, chunk_overlap = 20)
    text_chunks = text_splitter.split_documents(extracted_data)

    return text_chunks

In [8]:
text_chunks = text_split(extracted_data)
print("length of my chunk:", len(text_chunks))

length of my chunk: 3259


In [9]:
def download_hugging_face_embeddings():
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    return embeddings

In [10]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [10]:
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

In [11]:
embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [12]:
query_result = embeddings.embed_query("Hello world")
print("Length", len(query_result))

Length 384


In [14]:
query_result

[-0.034477267414331436,
 0.031023206189274788,
 0.006734929047524929,
 0.026108982041478157,
 -0.03936203196644783,
 -0.16030243039131165,
 0.06692398339509964,
 -0.006441489793360233,
 -0.04745049402117729,
 0.014758839271962643,
 0.07087527960538864,
 0.05552763119339943,
 0.01919335499405861,
 -0.026251323521137238,
 -0.010109569877386093,
 -0.02694045566022396,
 0.022307392209768295,
 -0.02222662791609764,
 -0.14969263970851898,
 -0.017493031919002533,
 0.007676294539123774,
 0.054352253675460815,
 0.0032544503919780254,
 0.031725917011499405,
 -0.08462144434452057,
 -0.029405998066067696,
 0.051595620810985565,
 0.048124048858881,
 -0.00331486901268363,
 -0.058279212564229965,
 0.041969265788793564,
 0.022210726514458656,
 0.12818878889083862,
 -0.02233896590769291,
 -0.011656217277050018,
 0.06292834132909775,
 -0.032876402139663696,
 -0.09122605621814728,
 -0.03117532841861248,
 0.05269954353570938,
 0.047034818679094315,
 -0.08420311659574509,
 -0.030056139454245567,
 -0.020744

TypeError: __init__() missing 1 required positional argument: 'host'

In [14]:
import os
os.environ['PINECONE_API_KEY'] = PINECONE_API_KEY

In [None]:
from langchain_pinecone import PineconeVectorStore
docs_chunks = [t.page_content for t in text_chunks]
index_name="mchat"
vectorstore_from_texts = PineconeVectorStore.from_texts(
        docs_chunks,
        index_name=index_name,
        embedding=embeddings
    )

In [8]:
from langchain_pinecone import PineconeVectorStore
index_name="mchat"

In [None]:
# pc = pinecone.Pinecone(api_key=PINECONE_API_KEY)


# index_name="mchat"

# if index_name not in pc.list_indexes().names():
#     # do something
#     pc.create_index(index_name)

#     #Creating Embeddings for Each of The Text Chunks & storing
# docs_chunks = [t.page_content for t in text_chunks]
# docsearch=Pinecone.from_texts(docs_chunks, embedding=embeddings, index_name=index_name)



In [None]:
PINECONE_API_KEY

In [15]:
docsearch=PineconeVectorStore.from_existing_index(index_name,embeddings)

query = "What is tuberculosis"

docs=docsearch.similarity_search(query, k=3)

print("Result", docs)

Result [Document(page_content='II Chapter 1 \nGeneral background to clinical tuberculosis \n1.1 Introduction \n• About this book \nThis book is written for non-specialist hospital doctors, doctors in primary health \ncare and other health professionals who may meet tuberculosis in the course of \ntheir work. Almost all patients with newly diagnosed tuberculosis can be cured if \nproperly treated. Many will die if they are not properly treated. As a responsible \ndoctor or health worker, therefore: \n~ do not miss the diagnosis'), Document(page_content='it if you always keep the possibility in mind. Always remember the four questions \nset out at the beginning of this chapter (page 23). \nIn children you must think of tuberculosis as a generalized disease that may \nappear in any part of the body -not necessarily with cough and sputum (which \nmay be blood stained) as is usual in adults. The ways it presents is described in \nSection 2.3. \nIn children it is difficult to prove the diagn

In [16]:
prompt_template="""
Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: {context}
Question: {question}

Only return the helpful answer below and nothing else.
Helpful answer:
"""

In [22]:
PROMPT=PromptTemplate(template=prompt_template, input_variables=["context", "question"])
chain_type_kwargs={"prompt": PROMPT}

In [23]:
llm=CTransformers(model="model/llama-2-7b-chat.ggmlv3.q4_0.bin",
                  model_type="llama",
                  config={'max_new_tokens':4096,
                          'temperature':0.8}
                 ) 

In [24]:
type(llm)

langchain_community.llms.ctransformers.CTransformers

In [25]:
qa=RetrievalQA.from_chain_type(
    llm=llm, 
    chain_type="stuff", 
    retriever=docsearch.as_retriever(search_kwargs={'k': 3}), 
    return_source_documents=True, 
    chain_type_kwargs=chain_type_kwargs) 

In [None]:
while True:
    user_input=input(f"Input Prompt:")
    result=qa({"query": user_input})
    print("Response : ", result["result"])


Input Prompt:what is tuberculosis


  warn_deprecated(
