In [1]:
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_pinecone import PineconeVectorStore
import pinecone
from pinecone import Pinecone, ServerlessSpec

from langchain.document_loaders import PyPDFLoader,DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.llms import CTransformers
import os


  from tqdm.autonotebook import tqdm


In [None]:
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_API_ENV = os.getenv("PINECONE_API_ENV")



In [5]:
#extracting data from pdf

def load_pdf(data):
    loader = DirectoryLoader(data,
                    glob = "*.pdf",
                    loader_cls = PyPDFLoader)
    
    documents = loader.load()

    return documents

In [44]:
#use this to load data. for test purpose use less data, in future will be adding more books to the dataset

extracted_data = load_pdf("data/")

In [45]:
#text chunks

def text_split(extracted_data):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size = 500 , chunk_overlap = 20)
    text_chunks = text_splitter.split_documents(extracted_data)

    return text_chunks
    

In [46]:
text_chunks = text_split(extracted_data)
print("chunk size:",len(text_chunks))

chunk size: 6970


In [47]:
#this is basically using the embedding model from hugging face 
#converting the text chunks into vector embeddings
def download_hugging_face_embeddings():
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    return embeddings

In [48]:
embedding = download_hugging_face_embeddings()

In [None]:
embedding


In [84]:
query = embedding.embed_query("Hello")
print(len(query))

384


In [8]:
# IMPORTANT. AVOID RUNNING IF YOU DO NOT WANT TO OVERRIDE OR ADD TO EXISITNG DATA. IT TAKES TIME
from pinecone import Pinecone
# Initialize Pinecone with your API key (make sure it's correct)
pc = Pinecone(
    api_key=os.getenv("PINECONE_API_KEY"),  # Ensure this points to the correct key
    environment=PINECONE_API_ENV  # Specify the environment
)

index_name = "mchatbot"

# Now proceed with the vector store
docsearch = PineconeVectorStore.from_documents(
    documents=text_chunks,  # Ensure you're passing the correct documents here
    embedding=embedding,
    index_name=index_name,
    pinecone_api_key=os.getenv("PINECONE_API_KEY")  # Pass API key explicitly if needed
)





NameError: name 'text_chunks' is not defined

In [34]:
# docsearch = Pinecone.from_existing_index(index_name,embedding)

query = "cure for common cold"

docs = docsearch.similarity_search(query,k=3)

print("result",docs)

result [Document(id='2d527520-950e-4052-9803-ecca8637e488', metadata={'author': '', 'creationdate': '2017-05-01T10:37:35-07:00', 'creator': '', 'keywords': '', 'moddate': '2017-05-01T10:37:35-07:00', 'page': 257.0, 'page_label': '258', 'producer': 'GPL Ghostscript 9.10', 'source': 'data\\The_GALE_ENCYCLOPEDIA_of_MEDICINE_SECOND.pdf', 'subject': '', 'title': '', 'total_pages': 759.0}, page_content='Rosalyn Carson-DeWitt, MD\nCombat neurosis see Post-traumatic stress\ndisorder\nCommon cold\nDefinition\nThe common cold is a viral infection of the upper\nrespiratory system, including the nose, throat, sinuses,\neustachian tubes, trachea, larynx, and bronchial tubes.\nAlthough over 200 different viruses can cause a cold,\n30–50% are caused by a group known as rhinoviruses.\nAlmost all colds clear up in less than two weeks without\ncomplications.\nDescription'), Document(id='8689095c-74f4-4642-acff-afdfcc57fa44', metadata={'author': '', 'creationdate': '2017-05-01T10:37:35-07:00', 'creator':

In [92]:
prompt_template = """
Use the following pieces of information to answer the user's question. 
If the answer is outside the scope of the provided context or you don't know, simply say "I don't know" or "This is outside the scope of my knowledge."
If asked about who created you, respond with: "I was created by Aum Tamboli "
try to answer in points


Context (only the most relevant part is shown):


Context: {context}
Question: {question}

Only return the helpful answer below and nothing else.
Helpful answer:
"""


In [93]:
PROMPT=PromptTemplate(template=prompt_template, input_variables=["context", "question"])
chain_type_kwargs={"prompt": PROMPT}

In [94]:
llm=CTransformers(model="model/llama-2-7b-chat.ggmlv3.q4_0.bin",
                  model_type="llama",
                  config={'max_new_tokens':512,
                          'temperature':0.8})

In [95]:
qa=RetrievalQA.from_chain_type(
    llm=llm, 
    chain_type="stuff", 
    retriever=docsearch.as_retriever(search_kwargs={'k': 1}),
    return_source_documents=True, 
    chain_type_kwargs=chain_type_kwargs)

In [None]:
while True:
    user_input=input(f"Input Prompt:")
    result=qa({"query": user_input})
    print("Response : ", result["result"])