In [None]:
import os
import tempfile
import streamlit as st
from pathlib import Path
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain.chat_models import AzureChatOpenAI
import hashlib
import logging
from tqdm import tqdm
from langchain.vectorstores import Chroma
import pickle
from langchain.embeddings.base import Embeddings
from typing import List
from quanthub.util import llm

# Logging: a module-level logger, with the root logger configured at INFO.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# Custom Azure OpenAI Embeddings class
class CustomOpenAIEmbeddings(Embeddings):
    """LangChain ``Embeddings`` adapter around a client exposing ``embeddings.create``.

    Every text is embedded with its own ``client.embeddings.create`` call.
    """

    def __init__(self, client, model="text-embedding-ada-002"):
        """Store the API client and the name of the embedding model to request."""
        self.model = model
        self.client = client

    def embed_query(self, text: str) -> List[float]:
        """Return the embedding vector for a single piece of text."""
        result = self.client.embeddings.create(input=text, model=self.model)
        first_item = result.data[0]
        return first_item.embedding

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed each text in order, issuing one request per text."""
        vectors = []
        for text in texts:
            vectors.append(self.embed_query(text))
        return vectors

def get_file_hash(file_path, chunk_size=1024 * 1024):
    """Return the hexadecimal MD5 digest of a file's contents.

    The file is hashed incrementally in ``chunk_size``-byte reads, so a large
    file is never held in memory in one piece. The digest is identical to
    hashing the whole file at once.

    Args:
        file_path: Path of the file to hash.
        chunk_size: Number of bytes to read per iteration; must be positive.

    Returns:
        The MD5 digest as a lowercase hex string.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
        OSError: If the file cannot be opened or read.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive number of bytes")

    digest = hashlib.md5()
    with open(file_path, "rb") as file:
        # iter() with a sentinel stops at the first empty read (end of file).
        for chunk in iter(lambda: file.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()

def process_pdf(file_path, openai_client):
    """Split a PDF into chunks and index them in a per-file Chroma vectorstore.

    Results are cached under ``./cache/<md5 of the file>/``: the Chroma store
    in ``chroma/`` and the pickled chunk list in ``split_texts.pkl``. The
    pickle is written last, after the vectorstore has been built and
    persisted, so the presence of both entries means an earlier run finished.

    Args:
        file_path: Path of the PDF to process.
        openai_client: Client handed to ``CustomOpenAIEmbeddings``.

    Returns:
        ``(vectorstore, texts)`` on success, or ``(None, None)`` when the file
        cannot be read or processing fails (the error is logged).
    """
    import shutil  # local import: only needed to discard a stale cache

    try:
        file_hash = get_file_hash(file_path)
    except OSError as e:
        # A missing or unreadable file is reported like every other failure
        # in this function instead of escaping as an exception.
        logger.error(f"An error occurred while processing the PDF: {str(e)}")
        return None, None

    cache_dir = Path(f"./cache/{file_hash}")
    cache_dir.mkdir(parents=True, exist_ok=True)

    chroma_persist_dir = cache_dir / "chroma"
    texts_cache_path = cache_dir / "split_texts.pkl"

    # Check if processed data already exists
    if chroma_persist_dir.exists() and texts_cache_path.exists():
        logger.info(f"Loading from cache: {cache_dir}")
        try:
            embeddings = CustomOpenAIEmbeddings(openai_client)
            vectorstore = Chroma(persist_directory=str(chroma_persist_dir), embedding_function=embeddings)
            # NOTE: pickle.load can execute arbitrary code. This is only
            # acceptable because the file is written by this function; do not
            # point the cache directory at untrusted data.
            with open(texts_cache_path, "rb") as f:
                texts = pickle.load(f)
            return vectorstore, texts
        except Exception as e:
            logger.error(f"Error loading cached data: {str(e)}. Reprocessing PDF.")

    logger.info("Cache not found or incomplete. Processing new PDF.")

    try:
        # Start from a clean slate: leftovers from an interrupted or
        # unreadable earlier run must not be extended or mistaken for a
        # complete cache.
        shutil.rmtree(chroma_persist_dir, ignore_errors=True)
        if texts_cache_path.exists():
            texts_cache_path.unlink()

        loader = PyPDFLoader(file_path)
        pages = loader.load_and_split()

        chunk_size = 2000
        chunk_overlap = 300

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
        )

        # Process pages in batches
        batch_size = 50
        all_texts = []
        for i in tqdm(range(0, len(pages), batch_size), desc="Processing PDF"):
            batch = pages[i:i+batch_size]
            all_texts.extend(text_splitter.split_documents(batch))

        if not all_texts:
            logger.error("An error occurred while processing the PDF: no text chunks were extracted.")
            return None, None

        embeddings = CustomOpenAIEmbeddings(openai_client)

        # Create and persist Chroma vectorstore
        vectorstore = Chroma.from_documents(
            documents=all_texts,
            embedding=embeddings,
            persist_directory=str(chroma_persist_dir)
        )
        vectorstore.persist()

        # Save split texts only now that the vectorstore exists. Writing to a
        # temporary name and renaming keeps a half-written pickle from ever
        # appearing under the final name.
        texts_tmp_path = texts_cache_path.with_name(texts_cache_path.name + ".tmp")
        with open(texts_tmp_path, "wb") as f:
            pickle.dump(all_texts, f)
        texts_tmp_path.replace(texts_cache_path)

        logger.info(f"Vectorstore saved to {chroma_persist_dir}")

        return vectorstore, all_texts

    except Exception as e:
        # logger.exception keeps the traceback alongside the message.
        logger.exception(f"An error occurred while processing the PDF: {str(e)}")
        return None, None

def get_query_engine(vectorstore, llm_client):
    """Build a ``map_reduce`` RetrievalQA chain over ``vectorstore``.

    The retriever is created with ``search_kwargs={"k": 5}`` and the chain
    uses ``llm_client`` as its LLM.
    """
    retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
    return RetrievalQA.from_chain_type(
        llm=llm_client,
        chain_type="map_reduce",
        retriever=retriever,
    )

def ask_question(question, vectorstore, llm_client):
    """Run ``question`` through a freshly built QA chain and return its result.

    A new chain is created via ``get_query_engine`` on every call; the value
    returned is the ``"result"`` entry of the chain's response.
    """
    chain = get_query_engine(vectorstore, llm_client)
    payload = {"query": question}
    outcome = chain.invoke(payload)
    return outcome["result"]

# Main execution
# NOTE(review): hard-coded absolute path on a local machine (the "Finance "
# directory name ends with a space); change it to the PDF you want to query.
pdf_path = '/Users/ayusuf/Desktop/Finance /Cohere/annualreport-2023.pdf'

# Initialize your custom GPT client
# NOTE(review): this one client object is used both for embeddings (through
# process_pdf) and as the LLM for the QA chain (through get_answer) -- confirm
# that the object returned by llm.get_llm_client supports both uses.
openai = llm.get_llm_client(llm.GPT_4_MODEL)

# Build (or load from ./cache) the vectorstore and chunk list for the PDF.
# process_pdf returns (None, None) when processing fails.
vectorstore, texts = process_pdf(pdf_path, openai)

def get_answer(question):
    """Print the answer to ``question`` using the module-level vectorstore.

    Prints a failure notice instead when the module-level ``vectorstore`` is
    falsy (e.g. ``None`` after a failed ``process_pdf``).
    """
    if not vectorstore:
        print("PDF not processed successfully. Please check the file path and try again.")
        return

    answer = ask_question(question, vectorstore, openai)
    print(f"Q: {question}\nA: {answer}\n")

# Example usage: ask a single question about the processed PDF and print the
# answer (or the failure notice from get_answer).
question = "From the Selected income statement data, what is the Total net revenue in 2023"
get_answer(question)

In [None]:
# Lobatan!!

In [None]:
import os
import tempfile
import streamlit as st
from pathlib import Path
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
import hashlib
import logging
from tqdm import tqdm
import pickle
from langchain.embeddings.base import Embeddings
from typing import List
from quanthub.util import llm

# Logging: a module-level logger, with the root logger configured at INFO.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# Custom OpenAI Embeddings class
class CustomOpenAIEmbeddings(Embeddings):
    """LangChain ``Embeddings`` adapter around a client exposing ``embeddings.create``.

    Every text is embedded with its own ``client.embeddings.create`` call.
    """

    def __init__(self, client, model="text-embedding-ada-002"):
        """Store the API client and the name of the embedding model to request."""
        self.model = model
        self.client = client

    def embed_query(self, text: str) -> List[float]:
        """Return the embedding vector for a single piece of text."""
        result = self.client.embeddings.create(input=text, model=self.model)
        first_item = result.data[0]
        return first_item.embedding

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed each text in order, issuing one request per text."""
        vectors = []
        for text in texts:
            vectors.append(self.embed_query(text))
        return vectors

def get_file_hash(file_path, chunk_size=1024 * 1024):
    """Return the hexadecimal MD5 digest of a file's contents.

    The file is hashed incrementally in ``chunk_size``-byte reads, so a large
    file is never held in memory in one piece. The digest is identical to
    hashing the whole file at once.

    Args:
        file_path: Path of the file to hash.
        chunk_size: Number of bytes to read per iteration; must be positive.

    Returns:
        The MD5 digest as a lowercase hex string.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
        OSError: If the file cannot be opened or read.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive number of bytes")

    digest = hashlib.md5()
    with open(file_path, "rb") as file:
        # iter() with a sentinel stops at the first empty read (end of file).
        for chunk in iter(lambda: file.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()

def process_pdf(file_path, openai_client):
    """Split a PDF into chunks and index them in a per-file Chroma vectorstore.

    Results are cached under ``./cache/<md5 of the file>/``: the Chroma store
    in ``chroma/`` and the pickled chunk list in ``split_texts.pkl``. The
    pickle is written last, after the vectorstore has been built and
    persisted, so the presence of both entries means an earlier run finished.

    Args:
        file_path: Path of the PDF to process.
        openai_client: Client handed to ``CustomOpenAIEmbeddings``.

    Returns:
        ``(vectorstore, texts)`` on success, or ``(None, None)`` when the file
        cannot be read or processing fails (the error is logged).
    """
    import shutil  # local import: only needed to discard a stale cache

    try:
        file_hash = get_file_hash(file_path)
    except OSError as e:
        # A missing or unreadable file is reported like every other failure
        # in this function instead of escaping as an exception.
        logger.error(f"An error occurred while processing the PDF: {str(e)}")
        return None, None

    cache_dir = Path(f"./cache/{file_hash}")
    cache_dir.mkdir(parents=True, exist_ok=True)

    chroma_persist_dir = cache_dir / "chroma"
    texts_cache_path = cache_dir / "split_texts.pkl"

    # Reuse the cache only when both the store and the chunk list are present.
    if chroma_persist_dir.exists() and texts_cache_path.exists():
        logger.info(f"Loading from cache: {cache_dir}")
        try:
            embeddings = CustomOpenAIEmbeddings(openai_client)
            vectorstore = Chroma(persist_directory=str(chroma_persist_dir), embedding_function=embeddings)
            # NOTE: pickle.load can execute arbitrary code. This is only
            # acceptable because the file is written by this function; do not
            # point the cache directory at untrusted data.
            with open(texts_cache_path, "rb") as f:
                texts = pickle.load(f)
            return vectorstore, texts
        except Exception as e:
            logger.error(f"Error loading cached data: {str(e)}. Reprocessing PDF.")

    logger.info("Cache not found or incomplete. Processing new PDF.")

    try:
        # Start from a clean slate: leftovers from an interrupted or
        # unreadable earlier run must not be extended or mistaken for a
        # complete cache.
        shutil.rmtree(chroma_persist_dir, ignore_errors=True)
        if texts_cache_path.exists():
            texts_cache_path.unlink()

        loader = PyPDFLoader(file_path)
        pages = loader.load_and_split()

        chunk_size = 2000
        chunk_overlap = 300

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
        )

        # Split the pages in batches so tqdm can report progress.
        batch_size = 50
        all_texts = []
        for i in tqdm(range(0, len(pages), batch_size), desc="Processing PDF"):
            batch = pages[i:i+batch_size]
            all_texts.extend(text_splitter.split_documents(batch))

        if not all_texts:
            logger.error("An error occurred while processing the PDF: no text chunks were extracted.")
            return None, None

        embeddings = CustomOpenAIEmbeddings(openai_client)

        vectorstore = Chroma.from_documents(
            documents=all_texts,
            embedding=embeddings,
            persist_directory=str(chroma_persist_dir)
        )
        vectorstore.persist()

        # Save split texts only now that the vectorstore exists. Writing to a
        # temporary name and renaming keeps a half-written pickle from ever
        # appearing under the final name.
        texts_tmp_path = texts_cache_path.with_name(texts_cache_path.name + ".tmp")
        with open(texts_tmp_path, "wb") as f:
            pickle.dump(all_texts, f)
        texts_tmp_path.replace(texts_cache_path)

        logger.info(f"Vectorstore saved to {chroma_persist_dir}")

        return vectorstore, all_texts

    except Exception as e:
        # logger.exception keeps the traceback alongside the message.
        logger.exception(f"An error occurred while processing the PDF: {str(e)}")
        return None, None

def ask_question(question, vectorstore, openai_client, k=3, max_tokens=300,
                 cost_per_1k_tokens=0.03):
    """Answer ``question`` from the most similar chunks in ``vectorstore``.

    The top ``k`` chunks are joined into a context block and sent, together
    with the question, to the ``llm.GPT_35_16K_MODEL`` deployment through
    ``openai_client.ChatCompletion.create`` at temperature 0.

    Args:
        question: The user's question.
        vectorstore: Store queried via ``similarity_search``.
        openai_client: Client exposing ``ChatCompletion.create``.
        k: Number of chunks to retrieve as context.
        max_tokens: Completion token limit passed to the model.
        cost_per_1k_tokens: Price applied to the total token count.
            NOTE(review): the default of 0.03 is kept from the original
            hard-coded value -- confirm it matches the pricing of the
            deployment actually used.

    Returns:
        A tuple ``(answer, usage, cost)``: the stripped answer text, the total
        token count reported by the API, and the estimated cost of the call.
    """
    prompt_role = "You are a helpful AI assistant answering questions based on the provided context."

    # Retrieve relevant context from the vectorstore
    relevant_docs = vectorstore.similarity_search(question, k=k)
    context = "\n".join(doc.page_content for doc in relevant_docs)

    prompt = f"Context: {context}\nQuestion: {question}\nAnswer:"

    response = openai_client.ChatCompletion.create(
        deployment_id=llm.GPT_35_16K_MODEL,
        messages=[
            {"role": "system", "content": prompt_role},
            {"role": "user", "content": prompt}
        ],
        max_tokens=max_tokens,
        temperature=0.0
    )

    # Guard against a missing (None) message body so .strip() cannot fail.
    answer = (response.choices[0].message['content'] or "").strip()
    usage = response["usage"]['total_tokens']
    cost = float(usage) / 1000 * cost_per_1k_tokens

    return answer, usage, cost

# Main execution
# NOTE(review): hard-coded absolute path on a local machine (the "Finance "
# directory name ends with a space); change it to the PDF you want to query.
pdf_path = '/Users/ayusuf/Desktop/Finance /Cohere/annualreport-2023.pdf'

# Initialize your custom GPT client
# NOTE(review): this one client object is used both for embeddings (through
# process_pdf) and for chat completions (through ask_question) -- confirm
# that the object returned by llm.get_llm_client supports both uses.
openai = llm.get_llm_client(llm.GPT_35_16K_MODEL)

# Build (or load from ./cache) the vectorstore and chunk list for the PDF.
# process_pdf returns (None, None) when processing fails.
vectorstore, texts = process_pdf(pdf_path, openai)

def get_answer(question):
    """Print the answer, token count and estimated cost for ``question``.

    Prints a failure notice instead when the module-level ``vectorstore`` is
    falsy (e.g. ``None`` after a failed ``process_pdf``).
    """
    if not vectorstore:
        print("PDF not processed successfully. Please check the file path and try again.")
        return

    answer, usage, cost = ask_question(question, vectorstore, openai)
    report_lines = (
        f"Q: {question}",
        f"A: {answer}",
        f"Tokens: {usage}",
        f"Cost of Call: ${cost:.5f}",
        "",  # trailing blank line between exchanges
    )
    for line in report_lines:
        print(line)

# Chat loop: read questions until the user types 'exit' (or input ends).
print("PDF Chat initialized. Type 'exit' to end the conversation.")
while True:
    try:
        # Strip surrounding whitespace so "exit " is still recognised.
        user_question = input("Your question: ").strip()
    except (EOFError, KeyboardInterrupt):
        # Closed stdin or Ctrl-C ends the session cleanly, without a traceback.
        print("Ending chat session.")
        break
    if not user_question:
        # Nothing was typed: prompt again instead of sending an empty question.
        continue
    if user_question.lower() == 'exit':
        print("Ending chat session.")
        break
    get_answer(user_question)