<a href="https://colab.research.google.com/github/bhargavichennareddy7/genai/blob/main/Vector_Databases_Langchain.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install langchain langchain-community pdfplumber pymupdf faiss-cpu sentence-transformers

Collecting langchain-community
  Downloading langchain_community-0.3.19-py3-none-any.whl.metadata (2.4 kB)
Collecting pdfplumber
  Downloading pdfplumber-0.11.5-py3-none-any.whl.metadata (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.5/42.5 kB[0m [31m597.0 kB/s[0m eta [36m0:00:00[0m
[?25hCollecting pymupdf
  Downloading pymupdf-1.25.3-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Collecting langchain-core<1.0.0,>=0.3.35 (from langchain)
  Downloading langchain_core-0.3.41-py3-none-any.whl.metadata (5.9 kB)
Collecting langchain
  Downloading langchain-0.3.20-py3-none-any.whl.metadata (7.7 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic

In [None]:
import os
import fitz  # PyMuPDF for PDF parsing
import pdfplumber
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
# Function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Extract the text of a PDF, trying PyMuPDF first and pdfplumber second.

    PyMuPDF (``fitz``) is used first. If it raises, or if everything it
    returns is whitespace, pdfplumber is tried instead.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        str: The extracted text. From PyMuPDF each page's text is followed by
        a newline; from pdfplumber the non-empty page texts are joined with
        newlines. An empty string is returned when neither library yields any
        non-whitespace text.

    Errors raised by either library are caught and printed, not propagated.
    """
    text = ""
    try:
        # Using PyMuPDF (fitz)
        with fitz.open(pdf_path) as doc:
            text = "".join(page.get_text("text") + "\n" for page in doc)
    except Exception as e:
        print("Error with PyMuPDF, trying pdfplumber:", e)
        text = ""
    else:
        # The per-page "\n" makes the result truthy even when no page had any
        # text, so emptiness has to be judged after stripping whitespace.
        if text.strip():
            return text
        print("PyMuPDF found no text, trying pdfplumber")

    try:
        # Using pdfplumber as an alternative
        with pdfplumber.open(pdf_path) as pdf:
            # Extract each page exactly once; extraction is the expensive step.
            page_texts = [page.extract_text() for page in pdf.pages]
    except Exception as e:
        print("Error extracting text from PDF:", e)
        return ""

    text = "\n".join(page_text for page_text in page_texts if page_text)
    return text if text.strip() else ""

In [None]:
# Function to store text in FAISS vector database
def store_in_vector_db(
    text,
    chunk_size=500,
    chunk_overlap=50,
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    index_path="faiss_index",
):
    """Split ``text`` into chunks, embed them and save a FAISS index to disk.

    Args:
        text: The text to index.
        chunk_size: ``chunk_size`` passed to RecursiveCharacterTextSplitter.
        chunk_overlap: ``chunk_overlap`` passed to RecursiveCharacterTextSplitter.
        model_name: Model name passed to HuggingFaceEmbeddings.
        index_path: Folder passed to ``save_local``.

    Returns:
        The FAISS vector store that was built and saved, so callers can query
        it without reloading it from disk.

    Raises:
        ValueError: If splitting ``text`` produces no chunks, so there is
            nothing to embed.
    """
    # Split text into smaller chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    texts = text_splitter.split_text(text)
    if not texts:
        # Fail before loading the embedding model; an index cannot be built
        # from zero chunks.
        raise ValueError("No text chunks to index: the input text is empty.")

    # Load embeddings model
    embedding_model = HuggingFaceEmbeddings(model_name=model_name)

    # Store text in FAISS vector store
    vector_store = FAISS.from_texts(texts, embedding_model)

    # Save FAISS index for later use
    vector_store.save_local(index_path)
    print("Vector database saved!")
    return vector_store


In [None]:
# Main function
def main(pdf_path="/content/lemh1a1.pdf"):
    """Extract the text of a PDF and store it in the FAISS vector database.

    Args:
        pdf_path: Path of the PDF to index. Defaults to the Colab upload
            location originally hard-coded in this notebook.

    Returns:
        None. Progress and failures are reported with ``print``.
    """
    if not os.path.isfile(pdf_path):
        print("File not found!")
        return

    print("Extracting text from PDF...")
    text = extract_text_from_pdf(pdf_path)

    # Whitespace-only output (e.g. only page-separator newlines) is truthy but
    # contains nothing to index, so it is treated as a failed extraction.
    if not text or not text.strip():
        print("Failed to extract text.")
        return

    print("Text extracted successfully!")
    print("Storing in vector database...")
    store_in_vector_db(text)


if __name__ == "__main__":
    main()


Extracting text from PDF...
Text extracted successfully!
Storing in vector database...


  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Vector database saved!


In [None]:
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings

# Reload the FAISS index from the "faiss_index" folder, using the same
# embedding model name that store_in_vector_db() uses to build it.
# SECURITY: allow_dangerous_deserialization=True opts in to loading pickled
# data, and unpickling can execute arbitrary code. That is acceptable here
# only because the index was written locally by this notebook; never enable
# it for an index obtained from an untrusted source.
vector_store = FAISS.load_local(
    "faiss_index",
    HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2"),
    allow_dangerous_deserialization=True,
)

# Test query: fetch the 3 chunks most similar to the question.
query = "what is a proof?"
results = vector_store.similarity_search(query, k=3)
print(results)

[Document(id='fc07b7b0-4cc5-4ca4-a0d7-9f3d431c16c9', metadata={}, page_content='A.1.2  What is a Proof?\nProof of a mathematical statement consists of sequence of statements, each statement\nbeing justified with a definition or an axiom or a proposition that is previously established\nby the  method of deduction using only the allowed logical rules.\nThus, each proof is a chain of deductive arguments each of which has its premises\nand conclusions. Many a times, we prove a proposition directly from what is given in'), Document(id='c3698b68-0c80-41b6-a6e3-04a11c3df5be', metadata={}, page_content='vProofs are to Mathematics what calligraphy is to poetry.\nMathematical works do consist of proofs just as\npoems do consist of characters.\n— VLADIMIR ARNOLD v\nA.1.1  Introduction\nIn Classes IX, X and XI, we have learnt about the concepts of a statement, compound\nstatement, negation, converse and contrapositive of a statement; axioms, conjectures,\ntheorems and deductive reasoning.\nHere, w