In [1]:
print("OK")

OK


In [2]:
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone
import pinecone
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.llms import CTransformers

  from tqdm.autonotebook import tqdm


In [3]:
# Extract data from PDF
def load_pdf(data):
    loader = DirectoryLoader(data, 
                    glob="*.pdf", # meaning, get all the pdf from the data directory
                    loader_cls=PyPDFLoader)
    documents = loader.load()
    return documents

In [5]:
extracted_data = load_pdf("data/")

In [9]:
len(extracted_data)

637

In [10]:
type(extracted_data)

list

In [11]:
# Create text chunks 
def text_split(extracted_data):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size = 500, 
                                   chunk_overlap = 20)
    text_chunks = text_splitter.split_documents(extracted_data)
    return text_chunks

In [12]:
text_chunks = text_split(extracted_data)

In [13]:
len(text_chunks)

5859

In [None]:
text_chunks

Document(page_content='<http://www.cdc.gov/niosh/w7_high.html>.\nMaureen Haggerty\nGALE ENCYCLOPEDIA OF MEDICINE 2 623\nByssinosis\nGEM -0433 to 0624 - B  10/22/03 6:09 PM  Page 623', metadata={'source': 'data/Medical_book.pdf', 'page': 636})

In [18]:
# download embedding model  
def download_hugging_face_embeddings():
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    return embeddings

In [20]:
embeddings = download_hugging_face_embeddings()

In [21]:
embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={})

In [22]:
query_result = embeddings.embed_query("Hello World")
print(len(query_result))

384


In [23]:
query_result

[-0.034477271139621735,
 0.031023181974887848,
 0.0067349583841860294,
 0.026109008118510246,
 -0.03936202451586723,
 -0.16030247509479523,
 0.06692396104335785,
 -0.006441452540457249,
 -0.04745050147175789,
 0.014758880250155926,
 0.07087534666061401,
 0.05552754923701286,
 0.019193293526768684,
 -0.026251323521137238,
 -0.010109510272741318,
 -0.026940520852804184,
 0.02230743132531643,
 -0.02222662977874279,
 -0.1496926099061966,
 -0.017493100836873055,
 0.007676210254430771,
 0.05435231328010559,
 0.0032544685527682304,
 0.03172597289085388,
 -0.08462142944335938,
 -0.029405977576971054,
 0.05159565433859825,
 0.0481240376830101,
 -0.003314815228804946,
 -0.05827919393777847,
 0.04196927696466446,
 0.022210722789168358,
 0.128188818693161,
 -0.022338902577757835,
 -0.011656270362436771,
 0.06292837858200073,
 -0.032876309007406235,
 -0.09122607111930847,
 -0.031175294890999794,
 0.052699554711580276,
 0.047034814953804016,
 -0.08420305699110031,
 -0.030056182295084,
 -0.0207448415

In [27]:
!pip install python-dotenv

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting python-dotenv
  Using cached python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Using cached python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.0.1


In [29]:
import os 
import dotenv
from dotenv import load_dotenv
load_dotenv()
PINECONE_API = os.getenv("PINECONE_API_KEY")  
PINECONE_ENV = os.getenv("PINECONE_API_ENV")

In [34]:
from pinecone import Pinecone
from langchain.vectorstores import Pinecone as PineconeLangchain

# Initialize Pinecone client
pc = Pinecone(api_key=PINECONE_API)

index_name = "medical-chatbot"

# Check if index exists using the new client
if index_name not in pc.list_indexes().names():
    # Create index if it doesn't exist
    pc.create_index(
        name=index_name,
        dimension=1536,  # Must match your embeddings dimension
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-west-2"
        )
    )

# LangChain integration - modified to use the new client
docsearch = PineconeLangchain.from_texts(
    texts=[t.page_content for t in text_chunks],
    embedding=embeddings,
    index_name=index_name,
    pinecone_client=pc  # Pass the initialized client
)

AttributeError: list_indexes is no longer a top-level attribute of the pinecone package.

To use list_indexes, please create a client instance and call the method there instead.

Example:

    from pinecone import Pinecone
    
    pc = Pinecone(api_key='YOUR_API_KEY')

    index_name = "quickstart" # or your index name

    if index_name not in pc.list_indexes().names():
        # do something

