In [1]:
from langchain.prompts import PromptTemplate 
from langchain.chains import RetrievalQA 
from langchain.embeddings import HuggingFaceEmbeddings 
from langchain.vectorstores import Pinecone 
import pinecone 
from langchain.document_loaders import PyPDFLoader, DirectoryLoader 
from langchain.text_splitter import RecursiveCharacterTextSplitter 
from langchain.llms import CTransformers 
from tqdm.autonotebook import tqdm

  from tqdm.autonotebook import tqdm


In [2]:
# Load the documents from every PDF file in a directory

def load_pdf(data):
    """Load the PDF files found in a directory.

    Args:
        data: Path of the directory to scan. Only entries matching the
            ``*.pdf`` glob are loaded.

    Returns:
        The result of ``DirectoryLoader.load()``, with each file parsed by
        ``PyPDFLoader``.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [3]:
# Load every PDF under the local 'data/' directory; the path is relative to
# the kernel's working directory.
extractd_data = load_pdf('data/')

In [4]:
# Peek at the first five loaded documents to sanity-check the extraction.
extractd_data[:5]

[Document(page_content='', metadata={'source': 'data\\Medical_book.pdf', 'page': 0}),
 Document(page_content='TheGALE\nENCYCLOPEDIA\nofMEDICINE\nSECOND EDITION', metadata={'source': 'data\\Medical_book.pdf', 'page': 1}),
 Document(page_content='TheGALE\nENCYCLOPEDIA\nofMEDICINE\nSECOND EDITION\nJACQUELINE L. LONGE, EDITOR\nDEIRDRE S. BLANCHFIELD, ASSOCIATE EDITOR\nVOLUME\nA-B1', metadata={'source': 'data\\Medical_book.pdf', 'page': 2}),
 Document(page_content='STAFF\nJacqueline L. Longe, Project Editor\nDeirdre S. Blanchfield, Associate Editor\nChristine B. Jeryan, Managing Editor\nDonna Olendorf, Senior Editor\nStacey Blachford, Associate Editor\nKate Kretschmann, Melissa C. McDade, Ryan\nThomason, Assistant Editors\nMark Springer, Technical Specialist\nAndrea Lopeman, Programmer/Analyst\nBarbara J. Yarrow, Manager, Imaging and Multimedia\nContent\nRobyn V . Young, Project Manager, Imaging and\nMultimedia Content\nDean Dauphinais, Senior Editor, Imaging and\nMultimedia Content\nKelly 

In [5]:
# Split the loaded documents into text chunks

def text_chunks(data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into smaller chunks for embedding.

    Args:
        data: Documents to split, e.g. the value returned by ``load_pdf``.
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``. Defaults to 500, the value this notebook used
            to hard-code.
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_overlap``. Defaults to 20, the value this notebook used
            to hard-code.

    Returns:
        The chunks produced by ``split_documents``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(data)

In [6]:
# NOTE(review): this rebinds ``text_chunks`` from the splitter function to its
# result, so re-running this cell on its own raises TypeError until the cell
# that defines the function is run again. A later cell reads the chunks from
# this name, so it is kept here; rename the variable in both places together.
text_chunks = text_chunks(extractd_data)
# Plain string: the original f-string had no placeholders.
print('Length of the chunk', len(text_chunks))

Length of the chunk 7020


In [8]:
# Download embedding model 

def download_hugging_face_embedding(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
):
    """Build the HuggingFace embedding wrapper used for indexing and search.

    Args:
        model_name: Model identifier forwarded to ``HuggingFaceEmbeddings``.
            Defaults to ``sentence-transformers/all-MiniLM-L6-v2``, the model
            this notebook used to hard-code. Another model can be passed
            instead, e.g. ``sentence-transformers/all-mpnet-base-v2``.
            NOTE(review): a different model may produce vectors of a
            different length; presumably that length has to match the
            dimension of the Pinecone index - confirm before switching.

    Returns:
        The configured ``HuggingFaceEmbeddings`` instance.
    """
    return HuggingFaceEmbeddings(model_name=model_name)

In [9]:
# Instantiate the embedding model once; later cells reuse this object for
# both indexing and querying.
embeddings = download_hugging_face_embedding()



In [10]:
# Display the wrapper's repr to confirm which model and settings were loaded.
embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [11]:
# Smoke-test the embedding model on a short query string.
query_result = embeddings.embed_query('Hyalo')
# NOTE(review): this prints the entire vector followed by its length; showing
# only a short slice plus the length would keep the cell output readable.
print(query_result, len(query_result))

[-0.04267304763197899, 0.09203755855560303, -0.06857625395059586, 0.019599666818976402, -0.0344499796628952, -0.019837217405438423, 0.11814744025468826, 0.030716711655259132, -0.05531094968318939, -0.010717896744608879, 0.037955548614263535, -0.0823628306388855, -0.012199390679597855, 0.03727022930979729, -0.04323684796690941, 0.012073601596057415, 0.04795260354876518, 0.010918808169662952, -0.01188730075955391, -0.08474653214216232, 0.04340653866529465, 0.01794005185365677, -0.06939361989498138, 0.06611159443855286, -0.04450161010026932, 0.00441769789904356, 0.07457327842712402, 0.07306434214115143, -0.0455530509352684, -0.04509250074625015, -0.07262466102838516, 0.015877550467848778, -0.023389682173728943, -0.023469164967536926, -0.0416022427380085, 0.10671326518058777, -0.08255365490913391, -0.04389277845621109, 0.019976146519184113, -0.019873138517141342, -0.0065434277057647705, -0.03043478913605213, -0.004757374990731478, 0.015238326042890549, -0.0179733969271183, -0.0667125806212

In [12]:
import os
from getpass import getpass

# NOTE: this rebinds the name ``Pinecone`` that the first cell imported from
# ``langchain.vectorstores``; from here on it refers to the Pinecone client.
from pinecone import Pinecone

# Never commit credentials to a notebook. Read the key from the environment
# and fall back to an interactive prompt that does not echo the input.
# The key that used to be hard-coded here must be treated as leaked: rotate it.
pinecone_api_key = os.environ.get("PINECONE_API_KEY") or getpass("Pinecone API key: ")

pc = Pinecone(api_key=pinecone_api_key)
# Handle to the index named "chatbot".
index = pc.Index("chatbot")

In [15]:
import os
from getpass import getpass

from pinecone import Pinecone, ServerlessSpec

# Never commit credentials to a notebook. Read the key from the environment
# and fall back to an interactive prompt that does not echo the input.
# The key that used to be hard-coded here must be treated as leaked: rotate it.
pc = Pinecone(api_key=os.environ.get('PINECONE_API_KEY') or getpass('Pinecone API key: '))

In [23]:
import hashlib
import os
from getpass import getpass

from langchain.vectorstores import Pinecone as PineconeStore
from langchain_pinecone import PineconeVectorStore

# Never commit credentials to a notebook. Use the key already present in the
# environment, or prompt for it without echoing. The key that used to be
# hard-coded here must be treated as leaked: rotate it.
if not os.environ.get('PINECONE_API_KEY'):
    os.environ['PINECONE_API_KEY'] = getpass('Pinecone API key: ')

# Drop exact duplicate chunk texts (order preserved) and derive a stable id
# from each text. Re-running this cell then overwrites the same vectors
# instead of adding another copy of every chunk, which is what made the same
# passage come back more than once from similarity search.
# NOTE(review): copies already stored by earlier runs (presumably under
# auto-generated ids) are not removed by this; clear the index once if needed.
chunk_texts = list(dict.fromkeys(t.page_content for t in text_chunks))
chunk_ids = [hashlib.sha256(text.encode('utf-8')).hexdigest() for text in chunk_texts]

# With PINECONE_API_KEY set, PineconeStore can be initialized without the
# configuration error (per the original note on this cell).
docsearch = PineconeStore.from_texts(
    chunk_texts,
    embeddings,
    ids=chunk_ids,
    index_name='chatbot',
)


In [24]:
# Question to look up in the indexed book.
query = 'What are allergies'

# Attach to the already-populated 'chatbot' index instead of re-uploading the
# chunks, then fetch the five closest matches for the query.
docsearch = PineconeVectorStore.from_existing_index('chatbot', embeddings)
docs = docsearch.similarity_search(query, k=5)

print('Result', docs)


Result [Document(page_content="GALE ENCYCLOPEDIA OF MEDICINE 2 117Allergies\nAllergic rhinitis is commonly triggered by\nexposure to household dust, animal fur,or pollen. The foreign substance thattriggers an allergic reaction is calledan allergen.\nThe presence of an allergen causes the\nbody's lymphocytes to begin producingIgE antibodies. The lymphocytes of an allergy sufferer produce an unusuallylarge amount of IgE.\nIgE molecules attach to mast\ncells, which contain histamine.HistaminePollen grains\nLymphocyte\nFIRST EXPOSURE"), Document(page_content="GALE ENCYCLOPEDIA OF MEDICINE 2 117Allergies\nAllergic rhinitis is commonly triggered by\nexposure to household dust, animal fur,or pollen. The foreign substance thattriggers an allergic reaction is calledan allergen.\nThe presence of an allergen causes the\nbody's lymphocytes to begin producingIgE antibodies. The lymphocytes of an allergy sufferer produce an unusuallylarge amount of IgE.\nIgE molecules attach to mast\ncells, which co