<a href="https://colab.research.google.com/github/arockiasachin/ContextualFinAi/blob/main/RAG_using_LlamaIndex_Faiss.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -U llama_index langchain langchain-community

In [None]:
!pip install torch transformers bitsandbytes accelerate sqlparse langchain faiss-cpu pypdf tiktoken databutton llama-index-vector-stores-faiss llama-index-embeddings-huggingface sentence-transformers llama-index-embeddings-langchain llama-index-llms-ollama llama-index-llms-huggingface llama-index-llms-huggingface-api  google-generativeai llama-index-llms-gemini

# **IMPORTS**

In [None]:
import os
import shutil
from google.colab import files
from io import BytesIO
from typing import List
from pypdf import PdfReader
from llama_index.core import Document, VectorStoreIndex, StorageContext, load_index_from_storage, Settings
from llama_index.vector_stores.faiss import FaissVectorStore
from langchain.embeddings import HuggingFaceEmbeddings
from llama_index.embeddings.langchain import LangchainEmbedding
from langchain_text_splitters import RecursiveCharacterTextSplitter
import faiss
import numpy as np

# **Ingestion**

In [None]:
# Initialize embedding model and global settings.
# The sentence-transformers model is loaded through LangChain's
# HuggingFaceEmbeddings and then wrapped in LangchainEmbedding so that
# llama_index can use it.
lc_embed_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
embed_model = LangchainEmbedding(lc_embed_model)

# Register the embedding model and chunking defaults on llama_index's global
# Settings object.
# NOTE(review): text_to_docs() below pre-splits pages with its own splitter
# (chunk_size=512, chunk_overlap=0), so these two chunking values presumably
# only affect llama_index's own node parsing — confirm.
Settings.embed_model = embed_model
Settings.chunk_size = 512
Settings.chunk_overlap = 20

# Directory for index persistence; build_and_save_faiss_index() deletes and
# recreates it, and load_faiss_index() reads from it.
PERSIST_DIR = "./faiss_storage"

# Upload and process PDFs
def upload_and_process_pdfs():
    """Prompt for a file upload and wrap each uploaded payload in a stream.

    Returns:
        A ``(names, streams)`` tuple of two parallel lists: the uploaded file
        names, and one ``BytesIO`` per file holding that file's contents.
    """
    uploaded = files.upload()
    names = list(uploaded)
    streams = [BytesIO(uploaded[name]) for name in names]
    return names, streams

# Parse PDF text by page
def parse_pdf(file: BytesIO, filename: str) -> List[str]:
    """Return the cleaned text of every page in a PDF, one string per page.

    Args:
        file: In-memory stream containing the PDF.
        filename: Name of the source file; accepted but not used here.

    Returns:
        One string per page, in page order.
    """
    reader = PdfReader(file)
    page_texts = []
    for page in reader.pages:
        raw = page.extract_text()
        # Re-join words that were hyphenated across a line break.
        dehyphenated = raw.replace("-\n", "")
        # Collapse the remaining line breaks into spaces.
        single_line = dehyphenated.replace("\n", " ")
        page_texts.append(single_line.strip())
    return page_texts

# Convert text to Document format
def text_to_docs(
    text: List[str],
    filename: str,
    chunk_size: int = 512,
    chunk_overlap: int = 0,
) -> List[Document]:
    """Split per-page text into chunks and wrap each chunk in a ``Document``.

    Args:
        text: Page texts in page order (one string per page).
        filename: Source file name, stored in every chunk's metadata.
        chunk_size: Maximum chunk size passed to the text splitter.
            Defaults to 512, the value that was previously hard-coded.
        chunk_overlap: Overlap between consecutive chunks passed to the text
            splitter. Defaults to 0, the value that was previously hard-coded.

    Returns:
        A flat list of ``Document`` objects. Each carries ``page`` (1-based),
        ``chunk`` (0-based index within its page) and ``filename`` metadata.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    doc_chunks = []
    for page_number, page_text in enumerate(text, start=1):
        chunks = text_splitter.split_text(page_text)
        doc_chunks.extend(
            Document(
                text=chunk,
                metadata={"page": page_number, "chunk": j, "filename": filename},
            )
            for j, chunk in enumerate(chunks)
        )
    return doc_chunks


# **Build and Load VectorStore**

In [None]:
# Build and save FAISS index with persistence
def build_and_save_faiss_index(documents: List[Document]):
    """Embed ``documents`` into a new FAISS-backed index and persist it.

    The index is built fully in memory first. The previous contents of
    ``PERSIST_DIR`` are removed only after the build succeeds, so a failure
    while validating or embedding does not destroy an existing index.

    Args:
        documents: Documents to index; must contain at least one item.

    Returns:
        The newly built ``VectorStoreIndex``.

    Raises:
        ValueError: If ``documents`` is empty (for example, nothing was
            uploaded or no text could be extracted).
    """
    if not documents:
        raise ValueError(
            "No documents to index: nothing was uploaded or no text could be "
            "extracted from the uploaded files."
        )

    # Embed one document to learn the vector dimension the FAISS index needs.
    embedding_dim = len(embed_model.get_text_embedding(documents[0].text))
    faiss_index = faiss.IndexFlatL2(embedding_dim)

    # Set up vector store and document storage context
    vector_store = FaissVectorStore(faiss_index=faiss_index)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)

    # Add documents to the index (this is where embedding happens)
    index = VectorStoreIndex.from_documents(documents, storage_context=storage_context)

    # The build succeeded: now it is safe to replace the old persisted index.
    if os.path.exists(PERSIST_DIR):
        shutil.rmtree(PERSIST_DIR)
    os.makedirs(PERSIST_DIR, exist_ok=True)

    # Persist the index to disk
    storage_context.persist(persist_dir=PERSIST_DIR)
    return index

# Load FAISS vector store index from disk
def load_faiss_index():
    """Rebuild the index from the files persisted under ``PERSIST_DIR``.

    Returns:
        The index returned by ``load_index_from_storage`` for the restored
        storage context.
    """
    restored_store = FaissVectorStore.from_persist_dir(PERSIST_DIR)
    context = StorageContext.from_defaults(
        vector_store=restored_store,
        persist_dir=PERSIST_DIR,
    )
    return load_index_from_storage(storage_context=context)

# Main function to upload, parse, and build an index for PDFs
def get_index_for_pdfs():
    """Upload PDFs, convert them to documents, and build the persisted index.

    Returns:
        The index produced by ``build_and_save_faiss_index`` for all chunks
        of all uploaded files.
    """
    names, streams = upload_and_process_pdfs()
    documents = []
    for name, stream in zip(names, streams):
        pages = parse_pdf(stream, name)
        documents += text_to_docs(pages, name)
    return build_and_save_faiss_index(documents)


# **Function Call**

In [None]:
# Example usage: Upload, build, and load the index.
# get_index_for_pdfs() prompts for an upload, builds the index and persists it
# to PERSIST_DIR; load_faiss_index() then reloads that persisted index from
# disk. The cells below query `loaded_index`, not `index`.
index = get_index_for_pdfs()
loaded_index = load_faiss_index()

# **Question Answering/RAG**

In [None]:
import os

# Replace the placeholder below with your own Gemini API key before running.
# NOTE(review): avoid committing a real key in a shared notebook.
GOOGLE_API_KEY = "insert Gemini AI key here" #get it from https://aistudio.google.com/app/apikey
# Export the key as an environment variable; presumably the Gemini client
# configured in the next cell reads it from there — confirm.
os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY

In [None]:
from llama_index.llms.gemini import Gemini
from llama_index.core import Settings

# Configure global settings to use Gemini as the LLM.
# NOTE(review): confirm "models/gemini-pro" is still an available model name.
Settings.llm = Gemini(model="models/gemini-pro")

In [None]:
# Query example: build a query engine with default options over the index
# reloaded from disk, ask a single question, and print the response.
query_engine = loaded_index.as_query_engine()
response = query_engine.query("What are main topics being disscussed in this document")
print(response)