In [2]:
import pinecone
import os

from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone
from pinecone import Pinecone, ServerlessSpec
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_pinecone import PineconeVectorStore

In [2]:
# Pinecone credentials come from the environment so the secret is never stored
# in the notebook or its version history. If the variable is not already set,
# the user is prompted once; getpass keeps the typed key out of the cell output.
# NOTE: the key that was previously hard-coded in this cell has been exposed
# and should be revoked and rotated.
import getpass

if not os.environ.get('PINECONE_API_KEY'):
    os.environ['PINECONE_API_KEY'] = getpass.getpass('Pinecone API key: ')

# NOTE(review): nothing visible in this notebook reads PINECONE_API_ENV; it is
# kept unchanged in case code outside this view depends on it — confirm.
os.environ['PINECONE_API_ENV'] = 'gcp-starter'

In [3]:
def load_pdf(data):
    """Load the PDF files found in a directory.

    Args:
        data: Directory handed to ``DirectoryLoader``. Only entries matching
            the ``*.pdf`` glob are picked up, and each one is read with
            ``PyPDFLoader``.

    Returns:
        Whatever ``DirectoryLoader.load()`` produces (presumably a list of
        LangChain documents — confirm against the installed version).
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

# Load the PDFs that load_pdf finds in the "/data" directory.
# NOTE(review): "/data" is an absolute path at the filesystem root — confirm it
# is intended rather than a project-relative "data/" folder.
extracted_data = load_pdf("/data")

In [5]:
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into smaller chunks.

    Args:
        extracted_data: Documents to split, passed straight to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: Forwarded to the splitter as its ``chunk_size``.
            Defaults to 500, the value this notebook has always used.
        chunk_overlap: Forwarded to the splitter as its ``chunk_overlap``.
            Defaults to 20, the value this notebook has always used.

    Returns:
        The chunks returned by ``split_documents``.
    """
    # Sizes are parameters rather than literals so chunking can be tuned
    # without editing the function; the defaults keep existing calls unchanged.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)

# Split the loaded documents into the chunks that are embedded and uploaded to
# the vector index in a later cell.
text_chunks = text_split(extracted_data)

In [7]:
def download_hugging_face_embeddings():
    """Create the embedding model used throughout the notebook.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        ``sentence-transformers/all-MiniLM-L6-v2`` model.
    """
    model_id = "sentence-transformers/all-MiniLM-L6-v2"
    return HuggingFaceEmbeddings(model_name=model_id)

# Build the embedding model once; the indexing cell reuses this `embeddings` object.
embeddings = download_hugging_face_embeddings()
# Embed a throwaway sentence to inspect the vector the model produces.
query_result = embeddings.embed_query("Hello world")
# `length` is the size of that vector. The index created further down is
# declared with dimension=384, so the two are presumably expected to match —
# verify if the model is ever changed.
length = len(query_result)

In [33]:
# Pinecone client. The key is read from the environment (populated by the
# credentials cell) instead of being pasted here a second time, so the secret
# is defined in exactly one place. A missing variable raises KeyError.
# NOTE: this rebinds `pinecone`, shadowing the module imported at the top of
# the notebook; the name is kept because the commented-out cleanup code below
# refers to it.
pinecone = Pinecone(
    api_key=os.environ['PINECONE_API_KEY']
)
index_name = "medical-chatbot"

# Creating an index that already exists is rejected by Pinecone, which made
# this cell fail on every re-run. Only create it when it is not there yet.
if index_name not in pinecone.list_indexes().names():
    pinecone.create_index(
        name=index_name,
        # Must equal the length of the vectors produced by `embeddings`
        # (the embeddings cell computes it as `length`).
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1'
        )
    )

# Embed the text of every chunk and upload the vectors to the index.
# Only `page_content` is sent; the chunks' other attributes are not uploaded.
docsearch = PineconeVectorStore.from_texts(
    [t.page_content for t in text_chunks],
    embeddings,
    index_name=index_name,
)

In [None]:
# Uncomment the lines below to delete the index and list the remaining ones.
# pinecone.delete_index('medical-chatbot')
# print(pinecone.list_indexes())

In [None]:
# Smoke-test retrieval: ask the vector store for the 3 chunks most similar to
# the question. Relies on `docsearch` from the indexing cell, so that cell
# must have been run first.
query = "What are Allergies"
docs=docsearch.similarity_search(query, k=3)
print("Result", docs)