In [2]:
from langchain.document_loaders import DirectoryLoader, PyPDFLoader

def load_pdf(directory_path='data'):
    """Load the PDF files found in ``directory_path``.

    A ``DirectoryLoader`` is built over ``directory_path`` with the glob
    ``"*.pdf"`` and ``PyPDFLoader`` as the per-file loader class.

    Parameters:
        directory_path (str): Folder to scan. Defaults to ``'data'``.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns; the rest of this notebook
        iterates over it and reads ``.page_content`` from each item.
    """
    pdf_loader = DirectoryLoader(
        directory_path,
        glob="*.pdf",
        loader_cls=PyPDFLoader,
    )
    return pdf_loader.load()


In [3]:
def summarize_documents(documents, max_chars=500):
    """Print a numbered preview of each document's text.

    Parameters:
        documents: Iterable of objects exposing a ``page_content`` string.
        max_chars (int): Maximum number of leading characters of
            ``page_content`` to show per document. Defaults to 500.

    Returns:
        None. Output goes to stdout.

    Note:
        The trailing ``...`` is always printed, including when the content is
        shorter than ``max_chars`` (or empty), so it does not by itself mean
        that text was cut off.
    """
    for position, doc in enumerate(documents, start=1):
        print(f"Document {position}")
        print(f"Document Content: {doc.page_content[:max_chars]}...")
        # print("\n") emits two newlines: the literal one plus print's own terminator.
        print("\n")

# Load the PDFs under data/ and print a short preview of every loaded document.
# `extracted_data` is kept as a notebook-level variable because the chunking
# cell further down passes it to text_split().
extracted_data = load_pdf("data/")
summarize_documents(extracted_data)


Document 1
Document Content: ...


Document 2
Document Content: The GALE
ENCYCLOPEDIA
of MEDICINE
SECOND EDITION...


Document 3
Document Content: The GALE
ENCYCLOPEDIA
of MEDICINE
SECOND EDITION
JACQUELINE L. LONGE, EDITOR
DEIRDRE S. BLANCHFIELD, ASSOCIATE EDITOR
VOLUME
A-B
1...


Document 4
Document Content: STAFF
Jacqueline L. Longe, Project Editor
Deirdre S. Blanchfield, Associate Editor
Christine B. Jeryan, Managing Editor
Donna Olendorf, Senior Editor
Stacey Blachford, Associate Editor
Kate Kretschmann, Melissa C. McDade, Ryan
Thomason, Assistant Editors
Mark Springer, Technical Specialist
Andrea Lopeman, Programmer/Analyst
Barbara J. Yarrow,Manager, Imaging and Multimedia
Content
Robyn V . Young,Project Manager, Imaging and
Multimedia Content
Dean Dauphinais, Senior Editor, Imaging and
Multimed...


Document 5
Document Content: Introduction.................................................... ix
Advisory Board.............................................. xi
Contributors ........

In [4]:
from langchain_text_splitters import RecursiveCharacterTextSplitter


def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into smaller chunks.

    Parameters:
        extracted_data: Documents to split (the value returned by ``load_pdf``
            in this notebook).
        chunk_size (int): Passed to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``. Defaults to 500.
        chunk_overlap (int): Passed to ``RecursiveCharacterTextSplitter`` as
            ``chunk_overlap``. Defaults to 20.

    Returns:
        The result of ``split_documents(extracted_data)``; later cells call
        ``len()`` on it and read ``.page_content`` from each chunk.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(extracted_data)


In [5]:
# Split the loaded documents into chunks; `text_chunks` is what gets counted below.
text_chunks = text_split(extracted_data)


In [6]:
# Sanity check: report how many chunks the splitter produced.
print("Length of the chunks:", len(text_chunks))


Length of the chunks: 5860


In [7]:
from langchain_huggingface import HuggingFaceEmbeddings

def download_hugging_face_embeddings(model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """Create the HuggingFace embedding model used by this notebook.

    Parameters:
        model_name (str): Model identifier handed to ``HuggingFaceEmbeddings``.
            Defaults to ``"sentence-transformers/all-MiniLM-L6-v2"``.

    Returns:
        A ``HuggingFaceEmbeddings`` instance.

    Note:
        With the default model this notebook reports vectors of length 384,
        and its Pinecone index is created with ``dimension=384``. A different
        model must produce vectors of the same length, or the index dimension
        has to change with it.
    """
    return HuggingFaceEmbeddings(model_name=model_name)

# Build the embedding model once and keep it in the notebook-level `embeddings`.
embeddings = download_hugging_face_embeddings()

# Embed an arbitrary sample string as a smoke test of the model.
query_result = embeddings.embed_query("kallo kalisa the don")

# The vector length is the dimensionality the Pinecone index must be created with.
print("Length:", len(query_result))

# Bare last expression: the notebook displays the whole vector as the cell output
# (every component, so the output is long).
query_result


Length: 384


[-0.11381158977746964,
 0.03737015277147293,
 -0.05325103923678398,
 0.0602087564766407,
 -0.10784343630075455,
 -0.007317695301026106,
 0.10283239930868149,
 -0.07219936698675156,
 0.07612476497888565,
 -0.015257258899509907,
 0.04438699036836624,
 -0.06522295624017715,
 -0.031545545905828476,
 0.05603882297873497,
 -0.00863468088209629,
 -0.018816933035850525,
 0.02800637111067772,
 0.03092520497739315,
 -0.008785144425928593,
 -0.03431516885757446,
 -0.05797110125422478,
 0.03644885867834091,
 0.04238847270607948,
 0.04385584592819214,
 -0.08088473230600357,
 -0.05737711116671562,
 -0.015496702864766121,
 -0.009699282236397266,
 -0.01774829812347889,
 -0.09014662355184555,
 -0.05122816562652588,
 0.021257178857922554,
 -0.04036024212837219,
 0.02126120775938034,
 -0.0496150441467762,
 -0.041019659489393234,
 -0.05329013615846634,
 0.131815567612648,
 0.03461975231766701,
 -0.048066914081573486,
 0.007686016149818897,
 -0.042599599808454514,
 -0.03554120287299156,
 0.0509946495294570

In [8]:
# pip install -U jupyter ipywidgets


In [9]:
import pinecone
from langchain_huggingface import HuggingFaceEmbeddings


In [2]:
import os
from pinecone import Pinecone, ServerlessSpec

# Read the Pinecone API key from the environment rather than embedding it in the
# notebook, where it would end up in version control and in shared copies.
pinecone_api_key = os.environ.get("PINECONE_API_KEY")
if not pinecone_api_key:
    raise RuntimeError(
        "PINECONE_API_KEY is not set. Export it in the environment that starts "
        "the Jupyter kernel before running this cell."
    )

# Client handle used by the later cells to create and open the index.
pc = Pinecone(api_key=pinecone_api_key)


In [11]:
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.embeddings import  HuggingFaceEmbeddings 
from langchain.vectorstores import Pinecone
from pinecone import Pinecone,ServerlessSpec
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.llms import ctransformers

In [12]:
import os
from pinecone import Pinecone, ServerlessSpec
from langchain_huggingface import HuggingFaceEmbeddings

# Make sure the target Pinecone index exists, then open a handle to it.
# `pc` is the Pinecone client created in an earlier cell.
index_name = 'embedding-index'
if index_name not in pc.list_indexes().names():
    # First run only: create a serverless index on AWS us-east-1.
    pc.create_index(
        name=index_name,
        dimension=384,    # must equal the embedding length (this notebook prints "Length: 384")
        metric='cosine',  # per the original note, 'euclidean' and 'dotproduct' are the alternatives
        spec=ServerlessSpec(cloud='aws', region='us-east-1'),
    )
index = pc.Index(index_name)

# Define the function to load PDF and split text
def load_pdf(directory_path='data'):
    """Return the documents loaded from the ``*.pdf`` files in ``directory_path``.

    Each matching file is read with ``PyPDFLoader`` through a
    ``DirectoryLoader``; the loader's ``load()`` result is returned unchanged.
    ``directory_path`` defaults to ``'data'``.
    """
    return DirectoryLoader(directory_path, glob="*.pdf", loader_cls=PyPDFLoader).load()

def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into smaller chunks.

    Parameters:
        extracted_data: Documents to split (the value returned by ``load_pdf``
            in this notebook).
        chunk_size (int): Passed to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``. Defaults to 500.
        chunk_overlap (int): Passed to ``RecursiveCharacterTextSplitter`` as
            ``chunk_overlap``. Defaults to 20.

    Returns:
        The result of ``split_documents(extracted_data)``; this notebook reads
        ``.page_content`` from each returned chunk.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(extracted_data)

# Download the embeddings
def download_hugging_face_embeddings(model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """Create the HuggingFace embedding model used by this notebook.

    Parameters:
        model_name (str): Model identifier handed to ``HuggingFaceEmbeddings``.
            Defaults to ``"sentence-transformers/all-MiniLM-L6-v2"``.

    Returns:
        A ``HuggingFaceEmbeddings`` instance.

    Note:
        The Pinecone index in this cell is created with ``dimension=384``; a
        model that produces vectors of another length needs a matching index
        dimension.
    """
    return HuggingFaceEmbeddings(model_name=model_name)

# Load and split the PDF data
extracted_data = load_pdf("data/")
text_chunks = text_split(extracted_data)

# Embed all chunk texts with a single embed_documents() call. This is the
# corpus-side API: it takes a list of texts and returns one vector per text in
# the same order, instead of invoking the model once per chunk via embed_query().
embeddings = download_hugging_face_embeddings()
chunk_texts = [chunk.page_content for chunk in text_chunks]
chunk_vectors = embeddings.embed_documents(chunk_texts)

# One (id, values, metadata) tuple per chunk. The raw text is stored as
# metadata under 'text' so query results can display the matching passage.
vectors = [
    (f'vec_{i}', vector, {'text': text})
    for i, (text, vector) in enumerate(zip(chunk_texts, chunk_vectors))
]

# Define a function to split vectors into batches
def split_into_batches(vectors, batch_size):
    """Yield consecutive slices of ``vectors`` holding at most ``batch_size`` items.

    Parameters:
        vectors: A sequence supporting ``len()`` and slicing (e.g. a list).
        batch_size (int): Maximum number of items per yielded slice; must be >= 1.

    Yields:
        ``vectors[start:start + batch_size]`` for ``start = 0, batch_size, ...``.
        The last slice may be shorter. An empty ``vectors`` yields nothing.

    Raises:
        ValueError: If ``batch_size`` is less than 1. Because this is a
            generator, the error surfaces when iteration starts, not when the
            function is called.
    """
    # Without this guard, 0 fails with an opaque range() error and a negative
    # value silently produces no batches at all.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")
    for start in range(0, len(vectors), batch_size):
        yield vectors[start:start + batch_size]

# Number of vectors sent to Pinecone per upsert call.
batch_size = 100  # Tunable: smaller batches mean more, smaller requests

# Upload the (id, values, metadata) tuples built above, one batch per request,
# rather than sending the whole list in a single call.
for batch in split_into_batches(vectors, batch_size):
    index.upsert(batch)


In [13]:
import os
from pinecone import Pinecone, ServerlessSpec
from langchain_huggingface import HuggingFaceEmbeddings



# Name of the Pinecone index that holds the chunk embeddings.
index_name = 'embedding-index'

# Create the index only when the account does not already list it, so that
# re-running this cell reuses the existing index and its stored vectors.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        spec=ServerlessSpec(cloud='aws', region='us-east-1'),  # serverless deployment target
        metric='cosine',  # similarity measure used by queries
        dimension=384,    # must equal the embedding vector length
    )

# Handle used by the search cells below.
index = pc.Index(index_name)


In [14]:
def download_hugging_face_embeddings(model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """Create the HuggingFace embedding model used by this notebook.

    Parameters:
        model_name (str): Model identifier handed to ``HuggingFaceEmbeddings``.
            Defaults to ``"sentence-transformers/all-MiniLM-L6-v2"``.

    Returns:
        A ``HuggingFaceEmbeddings`` instance.

    Note:
        Query vectors must have the same length as the vectors stored in the
        Pinecone index (created with ``dimension=384`` in this notebook), so
        use the same model for querying as for indexing.
    """
    return HuggingFaceEmbeddings(model_name=model_name)

# Embedding model shared by every embed_query() call; built on first use.
_EMBEDDINGS_MODEL = None


def embed_query(formatted_query):
    """Embed a query string with the notebook's embedding model.

    The model is created by ``download_hugging_face_embeddings()`` the first
    time this function runs and then reused, instead of being rebuilt on every
    query. Re-running this cell resets the cache.

    Parameters:
        formatted_query (str): Text to embed.

    Returns:
        The value returned by the model's ``embed_query`` method (a list of
        floats in this notebook's recorded output).
    """
    global _EMBEDDINGS_MODEL
    if _EMBEDDINGS_MODEL is None:
        _EMBEDDINGS_MODEL = download_hugging_face_embeddings()
    return _EMBEDDINGS_MODEL.embed_query(formatted_query)


In [15]:
def search_vector_database(query_vector, top_k=5):
    """Query the notebook-level Pinecone ``index`` and return its matches.

    Parameters:
        query_vector: Embedding of the query text.
        top_k (int): How many nearest vectors to request. Defaults to 5.

    Returns:
        The ``'matches'`` entry of the query response. Metadata is requested,
        so each match can carry the stored chunk text.
    """
    query_options = {
        "vector": query_vector,
        "top_k": top_k,
        "include_metadata": True,
    }
    return index.query(**query_options)['matches']


In [29]:
# Wrap the user question in the instruction prefix and embed the combined string.
query = "What are Abortion?"
formatted_query = f"Answer this question based on medical knowledge: {query}"
query_vector = embed_query(formatted_query)

# Ask the index for the closest stored chunks.
top_k = 5
search_results = search_vector_database(query_vector, top_k=top_k)

# Print id, score and stored text for each hit. `.get` with a fallback keeps the
# loop from failing on a hit that lacks one of the fields.
for i, match in enumerate(search_results):
    metadata = match.get('metadata', {})
    print(f"Result {i+1}:")
    print(f"  Vector ID: {match.get('id', 'N/A')}")
    print(f"  Similarity Score: {match.get('score', 'N/A')}")
    if 'text' not in metadata:
        print("  Metadata (Text): No text available")
    else:
        print(f"  Metadata (Text): {metadata['text']}")


Result 1:
  Vector ID: vec_163
  Similarity Score: 0.678429365
  Metadata (Text): often involve more risk, more services, anesthesia, and
sometimes a hospital stay. Insurance carriers and HMOs
may or may not cover the procedure. Federal law pro-
GALE ENCYCLOPEDIA OF MEDICINE 2 11
Abortion, therapeutic
GEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 11
Result 2:
  Vector ID: vec_140
  Similarity Score: 0.65549
  Metadata (Text): pregnancy. Women who have stable diabetes, controlled
epilepsy, mild to moderate high blood pressure, or who
are HIV positive can often have abortions as outpatients
if precautions are taken. Women with heart disease, pre-
vious endocarditis, asthma, lupus erythematosus, uter-
ine fibroid tumors, blood clotting disorders, poorly con-
trolled epilepsy, or some psychological disorders usually
need to be hospitalized in order to receive special moni-
toring and medications during the procedure.
Result 3:
  Vector ID: vec_146
  Similarity Score: 0.605629504
  Metadat

In [28]:
import os
from pinecone import Pinecone, ServerlessSpec
# from langchain_huggingface import HuggingFaceEmbeddings
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter


In [37]:
def search_vector_database(query_vector, top_k=5):
    """
    Look up the stored vectors closest to ``query_vector`` in the Pinecone index.

    Parameters:
        query_vector (list): Embedding of the query text.
        top_k (int): Number of nearest matches to request.

    Returns:
        The ``'matches'`` entry of the query response; metadata is requested
        for every match.
    """
    return index.query(
        vector=query_vector,
        top_k=top_k,
        include_metadata=True,
    )['matches']

# Example: wrap the question in the instruction prefix and embed the combined string.
query = "What are abuse?"
formatted_query = f"Answer this question based on medical knowledge: {query}"
query_vector = embed_query(formatted_query)

# Retrieve the closest stored chunks; `search_results` is reused by the next cell.
top_k = 5
search_results = search_vector_database(query_vector, top_k=top_k)

# Print id, score and stored text for each hit, with fallbacks for missing fields.
for i, match in enumerate(search_results):
    metadata = match.get('metadata', {})
    print(f"Result {i+1}:")
    print(f"  Vector ID: {match.get('id', 'N/A')}")
    print(f"  Similarity Score: {match.get('score', 'N/A')}")
    if 'text' not in metadata:
        print("  Metadata (Text): No text available")
    else:
        print(f"  Metadata (Text): {metadata['text']}")


Result 1:
  Vector ID: vec_204
  Similarity Score: 0.64324
  Metadata (Text): Abuse
Definition
Abuse is defined as any thing that is harmful, injuri-
ous, or offensive. Abuse also includes excessive and
wrongful misuse of anything. There are several major types
of abuse: physical and sexual abuse of a child or an adult,
substance abuse, elderly abuse, and emotional abuse.
Description
Physical abuse of a child is the infliction of injury by
an other person. The injuries can include punching, kick-
ing, biting, burning, beating, or pulling the victim’s hair.
Result 2:
  Vector ID: vec_208
  Similarity Score: 0.59400624
  Metadata (Text): of drugs).
GALE ENCYCLOPEDIA OF MEDICINE 216
Abuse
GEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 16
Result 3:
  Vector ID: vec_217
  Similarity Score: 0.562256336
  Metadata (Text): females in an effort to control and intimidate partners.
Abuse in the elderly usually occurs in the frail, elderly
community. The caretaker is usually the perpetrator. Care-

In [38]:

# Pick the match with the highest similarity score. `default=None` covers an
# empty result list, for which max() would otherwise raise ValueError.
best_match = max(search_results, key=lambda match: match['score'], default=None)

# Show only the stored text of the best match; with no match (or no stored
# text) fall through to the "No text available" message.
metadata = best_match.get('metadata', {}) if best_match is not None else {}
if 'text' in metadata:
    print(f"  Metadata (Text): {metadata['text']}")
else:
    print("  Metadata (Text): No text available")


  Metadata (Text): Abuse
Definition
Abuse is defined as any thing that is harmful, injuri-
ous, or offensive. Abuse also includes excessive and
wrongful misuse of anything. There are several major types
of abuse: physical and sexual abuse of a child or an adult,
substance abuse, elderly abuse, and emotional abuse.
Description
Physical abuse of a child is the infliction of injury by
an other person. The injuries can include punching, kick-
ing, biting, burning, beating, or pulling the victim’s hair.
