In [18]:
import pinecone

from tqdm.autonotebook import tqdm

from langchain.llms import CTransformers
from langchain.chains import RetrievalQA
from langchain.vectorstores import Pinecone
from langchain.prompts import PromptTemplate
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader, DirectoryLoader

from dotenv import load_dotenv
load_dotenv()


True

In [4]:
def load_pdf(data):
    loader = DirectoryLoader(
        data,
        glob="*.pdf",
        loader_cls=PyPDFLoader
    )
    
    return loader.load()

In [5]:
doc = load_pdf("../data")
#doc <-- A huge document, so better not uncomment and see it here

In [6]:
# Create text chunks
def text_split(extracted_data):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size = 500,
        chunk_overlap = 20
    )
    
    return splitter.split_documents(extracted_data)

In [7]:
chunked_data = text_split(doc) 

In [8]:
# Download the Embedding Model
def download_hf_embedding():
    return HuggingFaceEmbeddings(model_name = "sentence-transformers/all-MiniLM-L6-v2") # You can search this in HuggingFace to see how the embedding works

In [9]:
embedding = download_hf_embedding()
embedding

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={})

In [10]:
# Testing the embedding model
query_result = embedding.embed_query("Arsenal FC!")
print(query_result)
print("Embedding Length:: ", len(query_result))

[-0.006630192510783672, 0.00038896594196558, -0.032901667058467865, -0.022884873673319817, 0.03727448359131813, 0.08408267796039581, 0.05745553970336914, 0.005245243664830923, 0.10912478715181351, 0.00669531524181366, -0.03419636934995651, -0.12951846420764923, 0.035795096307992935, 0.0009837307734414935, -0.04221082478761673, -0.07842656224966049, -0.06226443871855736, -0.08227495849132538, 0.01652878150343895, -0.10809822380542755, -0.04492056369781494, -0.0641852468252182, -0.07197432219982147, 0.042060744017362595, -0.07080677896738052, 0.05859140679240227, 0.04831169173121452, 0.04864639788866043, -0.04143129661679268, -0.09741217643022537, -0.016064763069152832, 0.012926643714308739, 0.020278211683034897, 0.0031622096430510283, -0.03352763503789902, 0.00844635721296072, 0.05178043991327286, -0.040948767215013504, 0.01600121706724167, 0.05930143594741821, -0.04747535660862923, -0.05880596861243248, 0.009616314433515072, -0.004391269758343697, 0.07551032304763794, 0.035135217010974

In [12]:
pc = pinecone.Pinecone()
index = pc.Index("medicalchatbot")

In [19]:
docsearch = Pinecone.from_texts(
    [t.page_content for t in chunked_data],
    embedding,
    index_name = 'medicalchatbot'
)

AttributeError: list_indexes is no longer a top-level attribute of the pinecone package.

To use list_indexes, please create a client instance and call the method there instead.

Example:

    from pinecone import Pinecone
    
    pc = Pinecone(api_key='YOUR_API_KEY')

    index_name = "quickstart" # or your index name

    if index_name not in pc.list_indexes().names():
        # do something



In [21]:
pc.list_indexes()

{'indexes': [{'dimension': 384,
              'host': 'medicalchatbot-m9wpn31.svc.aped-4627-b74a.pinecone.io',
              'metric': 'cosine',
              'name': 'medicalchatbot',
              'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
              'status': {'ready': True, 'state': 'Ready'}}]}