# In [None]:
import os
import faiss
import numpy as np
from pathlib import Path
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS as LangchainFAISS
from langchain.chains import RetrievalQA
from langchain_openai import AzureChatOpenAI

# Azure OpenAI credentials.
# Values already present in the environment are kept; the placeholders below
# only fill in variables that are missing, so a real key exported in the
# shell is never overwritten by this cell.
os.environ.setdefault("AZURE_OPENAI_API_KEY", "your_azure_openai_api_key")
os.environ.setdefault("AZURE_OPENAI_ENDPOINT", "your_azure_openai_endpoint")

def get_azure_openai_client():
    """Build an Azure OpenAI client from the credentials in the environment.

    The API key and endpoint are read from the ``AZURE_OPENAI_API_KEY`` and
    ``AZURE_OPENAI_ENDPOINT`` environment variables; the API version is
    fixed to ``"2023-05-15"``.

    Returns:
        A newly constructed ``openai.AzureOpenAI`` instance (one per call).
    """
    # Function-scope import: the openai package is loaded on first call
    # rather than when this module is imported.
    from openai import AzureOpenAI

    return AzureOpenAI(
        azure_endpoint=os.environ.get("AZURE_OPENAI_ENDPOINT"),
        api_version="2023-05-15",
        api_key=os.environ.get("AZURE_OPENAI_API_KEY"),
    )

def get_embedding(text):
    """Return the embedding vector for ``text``.

    A client is obtained from ``get_azure_openai_client()`` and asked for an
    embedding from the ``"text-embedding-ada-002"`` model.

    Args:
        text: The input passed to the embeddings endpoint.

    Returns:
        The ``embedding`` field of the first item in the response data.
    """
    client = get_azure_openai_client()
    response = client.embeddings.create(
        model="text-embedding-ada-002",
        input=text,
    )
    first_item = response.data[0]
    return first_item.embedding

def get_file_hash(file_path, chunk_size=1024 * 1024):
    """Return the hex MD5 digest of the file at ``file_path``.

    The file is hashed in fixed-size chunks so that large files are never
    held in memory in full. The digest is used as a cache key, not for
    security.

    Args:
        file_path: Path of the file to hash.
        chunk_size: Number of bytes read per iteration (default 1 MiB).

    Returns:
        The 32-character lowercase hexadecimal MD5 digest.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    import hashlib

    digest = hashlib.md5()
    with open(file_path, "rb") as file:
        # iter() with a sentinel stops at the first empty read (end of file).
        for chunk in iter(lambda: file.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()

def process_pdf(file_path):
    """Build (or load from cache) a FAISS vector store for a PDF file.

    The cache lives under ``./cache/<md5 of the file>``. If an
    ``index.faiss`` file exists there, the store is loaded from it;
    otherwise (or if loading fails) the PDF is split into chunks, each chunk
    is embedded with ``get_embedding``, an IVF-Flat index is trained on the
    embeddings, and the resulting store is saved to the cache directory.

    Args:
        file_path: Path to the PDF file.

    Returns:
        The LangChain FAISS vector store, or ``None`` if the PDF yields no
        text chunks or any step of processing raises.
    """
    file_hash = get_file_hash(file_path)
    cache_dir = Path(f"./cache/{file_hash}")
    faiss_index_path = cache_dir / "index.faiss"

    if faiss_index_path.exists():
        print(f"Loading from cache: {cache_dir}")
        try:
            # NOTE(review): recent langchain releases are believed to require
            # allow_dangerous_deserialization=True here because the docstore
            # is pickled; confirm against the installed version. If loading
            # raises, we fall through and rebuild from the PDF.
            vectorstore = LangchainFAISS.load_local(str(cache_dir), get_embedding)
            return vectorstore
        except Exception as e:
            print(f"Error loading cached index: {str(e)}. Reprocessing PDF.")
    else:
        print("Cache not found. Processing new PDF.")

    try:
        # Imported here so the dependency stays local to this function.
        from langchain.docstore.in_memory import InMemoryDocstore

        # Load and split the PDF
        loader = PyPDFLoader(file_path)
        pages = loader.load_and_split()

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            length_function=len,
        )

        texts = text_splitter.split_documents(pages)
        if not texts:
            # Nothing to embed or train on; report it explicitly.
            print("No text could be extracted from the PDF; nothing to index.")
            return None

        # Generate embeddings (one request per chunk)
        embeddings = [get_embedding(text.page_content) for text in texts]
        embeddings_array = np.array(embeddings).astype('float32')

        # Choose the number of IVF clusters before constructing the index so
        # it is built exactly once: 4 * sqrt(n), capped at 1024 and never
        # more than the number of vectors available for training.
        num_vectors = len(embeddings)
        dimension = len(embeddings[0])
        nlist = min(int(4 * np.sqrt(num_vectors)), 1024)
        if num_vectors < nlist:
            print(f"Warning: number of vectors ({num_vectors}) is less than nlist ({nlist}). Setting nlist to {num_vectors}.")
            nlist = num_vectors

        # Create IVF-Flat index
        quantizer = faiss.IndexFlatL2(dimension)
        index = faiss.IndexIVFFlat(quantizer, dimension, nlist, faiss.METRIC_L2)

        # Train and add vectors to the index
        index.train(embeddings_array)
        index.add(embeddings_array)

        # Create FAISS vectorstore. Vectors were added in chunk order, so
        # index position i corresponds to texts[i]; the store needs a
        # docstore object plus the position -> docstore-id mapping.
        doc_ids = [str(position) for position in range(len(texts))]
        vectorstore = LangchainFAISS(
            embedding_function=get_embedding,
            index=index,
            docstore=InMemoryDocstore(dict(zip(doc_ids, texts))),
            index_to_docstore_id=dict(enumerate(doc_ids)),
        )

        # Save the vectorstore
        cache_dir.mkdir(parents=True, exist_ok=True)
        vectorstore.save_local(str(cache_dir))

        print(f"Vectorstore saved to {cache_dir}")

        return vectorstore

    except Exception as e:
        # Callers rely on a None return to detect failure, so report and
        # return instead of propagating.
        print(f"An error occurred while processing the PDF: {str(e)}")
        return None

def ask_question(question, vectorstore):
    """Answer ``question`` using the documents indexed in ``vectorstore``.

    A deterministic (temperature 0) Azure chat model is combined with a
    retriever over ``vectorstore`` in a "stuff" RetrievalQA chain; the five
    nearest chunks are retrieved for each query.

    Args:
        question: The user's question.
        vectorstore: Vector store exposing ``index`` and ``as_retriever``
            (as returned by ``process_pdf``).

    Returns:
        The ``"result"`` entry of the chain's response.
    """
    llm = AzureChatOpenAI(
        openai_api_version="2023-05-15",
        azure_deployment="your_chat_deployment_name",
        temperature=0,
    )

    # Set the number of clusters to search
    vectorstore.index.nprobe = 10

    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=vectorstore.as_retriever(search_kwargs={"k": 5}),
    )
    # Use invoke() rather than calling the chain object directly; the
    # direct-call form is deprecated in current LangChain releases.
    response = qa_chain.invoke({"query": question})
    return response["result"]

# Process the PDF
pdf_path = '/path/to/your/large/pdf/file.pdf'
vectorstore = process_pdf(pdf_path)

# process_pdf signals failure with None, so test for that explicitly.
if vectorstore is not None:
    print("PDF processed successfully!")

    # Ask questions until the user types 'quit' or closes the input stream.
    while True:
        try:
            question = input("Ask a question about the PDF (or type 'quit' to exit): ").strip()
        except (EOFError, KeyboardInterrupt):
            # Ctrl-D / Ctrl-C ends the session without a traceback.
            print()
            break
        if not question:
            # Blank input: prompt again instead of sending an empty query.
            continue
        if question.lower() == 'quit':
            break
        answer = ask_question(question, vectorstore)
        print(f"Answer: {answer}\n")
else:
    print("Failed to process the PDF. Please check the file path and try again.")

# In [None]:
# NOTE(review): this cell reads `index` and `texts` from notebook scope. In
# the visible code those names are only assigned inside process_pdf, so this
# cell presumably depends on them having been defined by an earlier
# interactive run -- confirm before relying on it.
from langchain.docstore.in_memory import InMemoryDocstore

# The FAISS store needs a docstore object and a mapping from index position
# to docstore id; position i is mapped to texts[i].
_doc_ids = [str(position) for position in range(len(texts))]
vectorstore = LangchainFAISS(
    embedding_function=get_embedding,
    index=index,
    docstore=InMemoryDocstore(dict(zip(_doc_ids, texts))),
    index_to_docstore_id=dict(enumerate(_doc_ids)),
)