In [None]:
import os
import tempfile
import streamlit as st
from pathlib import Path
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA
from langchain_community.chat_models import ChatOpenAI
import hashlib
import logging

# Configure logging at INFO level and create the module-level `logger`
# that process_pdf() uses for its info/error messages.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [None]:
def get_file_hash(file_path, chunk_size=1024 * 1024):
    """Return the hexadecimal MD5 digest of a file's contents.

    The digest is only used as a cache key (not for security), so MD5 is
    kept to stay compatible with cache directories created earlier.

    Args:
        file_path: Path of the file to hash.
        chunk_size: Number of bytes read per iteration. The file is streamed
            so large PDFs are never held in memory in full.

    Returns:
        The 32-character lowercase hex digest.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    digest = hashlib.md5()
    with open(file_path, "rb") as file:
        # iter() with a sentinel stops at the first empty read, i.e. EOF.
        for chunk in iter(lambda: file.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()

def process_pdf(file_path):
    """Build a FAISS vector store for a PDF, or load it from the cache.

    The store is cached under ``./cache/<md5-of-file>``. When an
    ``index.faiss`` file already exists there, the store is loaded from disk;
    otherwise the PDF is loaded, split into overlapping chunks, embedded with
    ``OpenAIEmbeddings`` and saved for later runs. A failed cache load is
    logged and followed by a full reprocess.

    Args:
        file_path: Path of the PDF to index.

    Returns:
        The FAISS vector store, or ``None`` if the file could not be read or
        processed (the error is logged).
    """
    # Hashing opens the file, so a missing/unreadable path surfaces here.
    # Report it the same way as every other failure (log + None) so callers
    # that test the result for None keep working.
    try:
        file_hash = get_file_hash(file_path)
    except OSError as e:
        logger.error(f"An error occurred while processing the PDF: {str(e)}")
        return None

    cache_dir = Path(f"./cache/{file_hash}")
    faiss_index_path = cache_dir / "index.faiss"

    if faiss_index_path.exists():
        logger.info(f"Loading from cache: {cache_dir}")
        try:
            embeddings = OpenAIEmbeddings()
            # SECURITY NOTE: this opt-in flag lets load_local unpickle data
            # from the cache directory. Only acceptable because the cache is
            # written by this notebook; never point it at untrusted files.
            vectorstore = FAISS.load_local(
                str(cache_dir),
                embeddings,
                allow_dangerous_deserialization=True,
            )
            return vectorstore
        except Exception as e:
            # Fall through to a full rebuild rather than failing outright.
            logger.error(f"Error loading cached index: {str(e)}. Reprocessing PDF.")
    else:
        logger.info("Cache not found. Processing new PDF.")

    try:
        loader = PyPDFLoader(file_path)
        # load() returns the raw per-page documents. load_and_split() would
        # first split them with the loader's default splitter, and the
        # splitter below would then split those fragments a second time.
        pages = loader.load()

        chunk_size = 2000
        chunk_overlap = 300

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
        )

        texts = text_splitter.split_documents(pages)

        if not texts:
            # Nothing to embed (e.g. a PDF without extractable text).
            logger.error("An error occurred while processing the PDF: no text could be extracted")
            return None

        embeddings = OpenAIEmbeddings()

        vectorstore = FAISS.from_documents(texts, embeddings)
        cache_dir.mkdir(parents=True, exist_ok=True)
        vectorstore.save_local(str(cache_dir))

        logger.info(f"Vectorstore saved to {cache_dir}")

        return vectorstore

    except Exception as e:
        logger.error(f"An error occurred while processing the PDF: {str(e)}")
        return None

def get_query_engine(vectorstore, model_name="gpt-4o-mini", temperature=0.0, k=5):
    """Create a "stuff"-type RetrievalQA chain backed by ``vectorstore``.

    Args:
        vectorstore: Vector store exposing ``as_retriever``.
        model_name: Chat model passed to ``ChatOpenAI``.
        temperature: Sampling temperature for the chat model. The default of
            0.0 minimizes sampling randomness in the answers.
        k: Number of chunks the retriever is asked to return per query.

    Returns:
        The configured RetrievalQA chain.
    """
    llm = ChatOpenAI(temperature=temperature, model_name=model_name)

    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=vectorstore.as_retriever(search_kwargs={"k": k}),
    )
    return qa_chain

In [None]:
def ask_question(question, vectorstore):
    """Answer ``question`` with a RetrievalQA chain over ``vectorstore``.

    A fresh chat model and "stuff" chain are built on every call; the
    retriever is asked for the 5 closest chunks.

    Args:
        question: The query text.
        vectorstore: Vector store exposing ``as_retriever``.

    Returns:
        The ``"result"`` entry of the chain's response.
    """
    chat_model = ChatOpenAI(temperature=0.0, model_name="gpt-4-1106-preview")
    chain_options = {
        "llm": chat_model,
        "chain_type": "stuff",
        "retriever": vectorstore.as_retriever(search_kwargs={"k": 5}),
    }
    chain = RetrievalQA.from_chain_type(**chain_options)
    return chain.invoke({"query": question})["result"]

# Build the vector store for the annual report (or load it from the cache).
# The location can be overridden with the PDF_PATH environment variable so the
# notebook also runs on machines that lack this user-specific directory; the
# original path remains the default.
pdf_path = os.environ.get("PDF_PATH", '/Users/ayusuf/Desktop/Finance /Cohere/annualreport-2023.pdf')
vectorstore = process_pdf(pdf_path)

#Function to ask a question and print the answer
def get_answer(question):
    """Print the answer to ``question`` using the module-level ``vectorstore``.

    When ``vectorstore`` is falsy, a hint about the failed PDF processing is
    printed instead and no question is asked.
    """
    if not vectorstore:
        print("PDF not processed successfully. Please check the file path and try again.")
        return
    reply = ask_question(question, vectorstore)
    print(f"Q: {question}\nA: {reply}\n")

In [None]:
# Sample query about the report; passed to get_answer() in the next cell.
question = "From the Selected income statement data, what is the Total net revenue in 2023"

In [None]:
# Ask the question against the processed PDF and print the Q/A pair.
get_answer(question)

The code above processes the PDF once and stores the result so that subsequent runs are faster. Here's a breakdown of how it works:

- **File hashing:** When you process a PDF, the code first computes an MD5 hash of the file's contents. This hash serves as an identifier for that exact content.
- **Caching:** The processed data (the vectorstore) is saved in a cache directory whose name is the file's hash (`./cache/<hash>`).
- **Subsequent runs:** The next time you run the code with the same PDF, it checks whether a saved index (`index.faiss`) exists in the cache directory for that file's hash. If it does, it loads the pre-processed data from the cache instead of processing the PDF again.

This caching mechanism offers several benefits:

- **Speed:** After the initial processing, subsequent runs are much faster because they don't need to re-process the entire PDF.
- **Efficiency:** It saves computational resources by not repeating the same processing tasks.
- **Consistency:** It ensures that you're working with the same processed data across different sessions, as long as the PDF hasn't changed.

In [None]:
import os
import warnings
import logging
from pathlib import Path
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain_openai import AzureChatOpenAI
import hashlib
import numpy as np
from openai import AzureOpenAI

# Azure OpenAI settings.
# setdefault() only fills in the placeholders when the variables are not
# already set, so real credentials exported in the environment are never
# overwritten. Do not paste real keys here; export them in the environment.
os.environ.setdefault("AZURE_OPENAI_API_KEY", "your_azure_openai_api_key")
os.environ.setdefault("AZURE_OPENAI_ENDPOINT", "your_azure_openai_endpoint")

def get_file_hash(file_path, chunk_size=1024 * 1024):
    """Return the hexadecimal MD5 digest of a file's contents.

    The digest is only used as a cache key (not for security), so MD5 is
    kept to stay compatible with cache directories created earlier.

    Args:
        file_path: Path of the file to hash.
        chunk_size: Number of bytes read per iteration. The file is streamed
            so large PDFs are never held in memory in full.

    Returns:
        The 32-character lowercase hex digest.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    digest = hashlib.md5()
    with open(file_path, "rb") as file:
        # iter() with a sentinel stops at the first empty read, i.e. EOF.
        for chunk in iter(lambda: file.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()

def get_azure_openai_client():
    """Create an ``AzureOpenAI`` client from environment settings.

    The API key and endpoint are read from ``AZURE_OPENAI_API_KEY`` and
    ``AZURE_OPENAI_ENDPOINT``; the API version is fixed at ``2023-05-15``.

    Returns:
        A newly constructed ``AzureOpenAI`` client.
    """
    settings = {
        "api_key": os.getenv("AZURE_OPENAI_API_KEY"),
        "api_version": "2023-05-15",
        "azure_endpoint": os.getenv("AZURE_OPENAI_ENDPOINT"),
    }
    return AzureOpenAI(**settings)

def get_embedding(text):
    """Embed ``text`` with the ``text-embedding-ada-002`` model.

    A new client is obtained from ``get_azure_openai_client`` on every call.

    Args:
        text: The input passed to the embeddings endpoint.

    Returns:
        The ``embedding`` attribute of the first item in the response data.
    """
    client = get_azure_openai_client()
    response = client.embeddings.create(input=text, model="text-embedding-ada-002")
    first_item = response.data[0]
    return first_item.embedding

def process_pdf(file_path):
    """Build a FAISS vector store for a PDF, or load it from the cache.

    The store is cached under ``./cache/<md5-of-file>``. When an
    ``index.faiss`` file already exists there, the store is loaded from disk;
    otherwise the PDF is loaded, split into overlapping chunks, embedded one
    chunk at a time via ``get_embedding`` and saved for later runs. A failed
    cache load is reported and followed by a full reprocess.

    Args:
        file_path: Path of the PDF to index.

    Returns:
        The FAISS vector store, or ``None`` if the file could not be read or
        processed (the error is printed).
    """
    # Hashing opens the file, so a missing/unreadable path surfaces here.
    # Report it like every other failure (message + None) so callers that
    # test the result for None keep working.
    try:
        file_hash = get_file_hash(file_path)
    except OSError as e:
        print(f"An error occurred while processing the PDF: {str(e)}")
        return None

    cache_dir = Path(f"./cache/{file_hash}")
    faiss_index_path = cache_dir / "index.faiss"

    def embed_query(text):
        # Query-time embedding function shared by the cached and the freshly
        # built store, so both embed questions the same way.
        return np.array(get_embedding(text))

    if faiss_index_path.exists():
        print(f"Loading from cache: {cache_dir}")
        try:
            # load_local needs the embedding function as well as the folder;
            # without it the call always failed and the cache was never used.
            # SECURITY NOTE: the opt-in flag lets load_local unpickle data
            # from the cache directory. Only acceptable because the cache is
            # written by this notebook; never point it at untrusted files.
            vectorstore = FAISS.load_local(
                str(cache_dir),
                embed_query,
                allow_dangerous_deserialization=True,
            )
            return vectorstore
        except Exception as e:
            # Fall through to a full rebuild rather than failing outright.
            print(f"Error loading cached index: {str(e)}. Reprocessing PDF.")
    else:
        print("Cache not found. Processing new PDF.")

    try:
        loader = PyPDFLoader(file_path)
        # load() returns the raw per-page documents. load_and_split() would
        # first split them with the loader's default splitter, and the
        # splitter below would then split those fragments a second time.
        pages = loader.load()

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=2000,
            chunk_overlap=300,
            length_function=len,
        )

        texts = text_splitter.split_documents(pages)

        if not texts:
            # Nothing to embed (e.g. a PDF without extractable text).
            print("An error occurred while processing the PDF: no text could be extracted")
            return None

        chunk_texts = [text.page_content for text in texts]
        embeddings = [get_embedding(chunk) for chunk in chunk_texts]

        vectorstore = FAISS.from_embeddings(
            text_embeddings=list(zip(chunk_texts, embeddings)),
            embedding=embed_query,
            metadatas=[text.metadata for text in texts]
        )

        cache_dir.mkdir(parents=True, exist_ok=True)
        vectorstore.save_local(str(cache_dir))

        print(f"Vectorstore saved to {cache_dir}")

        return vectorstore

    except Exception as e:
        print(f"An error occurred while processing the PDF: {str(e)}")
        return None

def ask_question(question, vectorstore):
    """Answer ``question`` with an Azure-backed RetrievalQA chain.

    A fresh ``AzureChatOpenAI`` model and "stuff" chain are built on every
    call; the retriever is asked for the 5 closest chunks.

    Args:
        question: The query text.
        vectorstore: Vector store exposing ``as_retriever``.

    Returns:
        The ``"result"`` entry of the chain's response.
    """
    chat_model = AzureChatOpenAI(
        openai_api_version="2023-05-15",
        azure_deployment="your_azure_deployment_name",
        temperature=0
    )
    chain_options = {
        "llm": chat_model,
        "chain_type": "stuff",
        "retriever": vectorstore.as_retriever(search_kwargs={"k": 5}),
    }
    chain = RetrievalQA.from_chain_type(**chain_options)
    return chain.invoke({"query": question})["result"]

def get_answer(question):
    """Print the answer to ``question`` using the module-level ``vectorstore``.

    When ``vectorstore`` is falsy, a hint about the failed PDF processing is
    printed instead and no question is asked.
    """
    if not vectorstore:
        print("PDF not processed successfully. Please check the file path and try again.")
        return
    reply = ask_question(question, vectorstore)
    print(f"Q: {question}\nA: {reply}\n")

# Process the PDF into the module-level `vectorstore` that get_answer() reads.
# NOTE: '/path/to/your/pdf/file.pdf' is a placeholder; replace it with a real
# PDF path before running this cell.
pdf_path = '/path/to/your/pdf/file.pdf'
vectorstore = process_pdf(pdf_path)

# Ask a question against the processed PDF; get_answer() prints the Q/A pair,
# or a hint when process_pdf() returned None.
question = "What is the main topic of this PDF?"
get_answer(question)