In [None]:
import numpy as np
from pathlib import Path
import pickle
import logging
from quanthub.util import llm
from sklearn.metrics.pairwise import cosine_similarity
import heapq

# Set up logging
# Configure logging at INFO level, then create the module-level logger that
# the functions below use for progress and error messages.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def load_embeddings(pdf_path):
    """Load the cached page embeddings for a PDF.

    The cache file is expected at ``./cache/<file hash>/page_embeddings.pkl``,
    where the hash comes from ``get_file_hash(pdf_path)`` (defined outside
    this section).

    Args:
        pdf_path: Path of the PDF whose embeddings should be loaded.

    Returns:
        The unpickled object stored in the cache file, or ``None`` (after
        logging an error) when the cache file does not exist.
    """
    pickle_path = Path(f"./cache/{get_file_hash(pdf_path)}") / "page_embeddings.pkl"

    # Guard clause: nothing to load until the PDF has been processed.
    if not pickle_path.exists():
        logger.error("Embeddings file not found. Please process the PDF first.")
        return None

    # NOTE: pickle executes arbitrary code on load; only read caches that
    # this pipeline wrote itself.
    with pickle_path.open("rb") as handle:
        return pickle.load(handle)

def embed_question(question, openai_client, model="text-embedding-ada-002"):
    """Embed a question string with the client's embeddings endpoint.

    Args:
        question: The question text to embed.
        openai_client: Client object exposing ``embeddings.create(input=...,
            model=...)``.
        model: Name of the embedding model. Defaults to the previously
            hard-coded ``"text-embedding-ada-002"``.
            NOTE(review): presumably this must match the model used to build
            the stored page embeddings -- confirm against the indexing code.

    Returns:
        A NumPy array built from the first embedding in the response.
    """
    response = openai_client.embeddings.create(
        input=[question],
        model=model,
    )
    # Only one input was sent, so the first (and only) result is the question.
    return np.array(response.data[0].embedding)

def calculate_similarities(question_embedding, page_embeddings):
    """Score every page embedding against the question embedding.

    Each vector is reshaped to a single row before being passed to
    ``cosine_similarity``; the ``[0][0]`` index then pulls the single score
    out of the returned matrix.

    Args:
        question_embedding: Array-like with a ``reshape`` method.
        page_embeddings: Mapping of page number to an embedding that also
            supports ``reshape``.

    Returns:
        A dict mapping each page number to its similarity score.
    """
    return {
        page_num: cosine_similarity(
            question_embedding.reshape(1, -1),
            page_vector.reshape(1, -1),
        )[0][0]
        for page_num, page_vector in page_embeddings.items()
    }

def rank_pages(similarities, top_k=5):
    """Return the ``top_k`` highest-scoring pages, best first.

    Args:
        similarities: Mapping of page number to similarity score.
        top_k: Maximum number of ``(page, score)`` pairs to return.

    Returns:
        A list of ``(page, score)`` tuples ordered by descending score.
    """
    def score_of(entry):
        # Each entry is a (page, score) pair; rank on the score.
        return entry[1]

    scored_pages = similarities.items()
    return heapq.nlargest(top_k, scored_pages, key=score_of)

def generate_answer(question, top_pages, page_contents, openai_client, model="gpt-4-1106-preview",
                    max_tokens=300, temperature=0.7):
    """Ask the chat model to answer a question using the top-ranked pages.

    Args:
        question: The user's question.
        top_pages: Iterable of ``(page, score)`` pairs; only the page key is
            used, in the order given.
        page_contents: Mapping of page key to page text. A page missing from
            this mapping raises ``KeyError``.
        openai_client: Client exposing ``chat.completions.create(...)``.
        model: Chat model name.
        max_tokens: Upper bound on generated tokens (previously hard-coded
            to 300).
        temperature: Sampling temperature (previously hard-coded to 0.7).

    Returns:
        The model's reply with surrounding whitespace stripped, or an empty
        string when the response carries no text content.
    """
    context = "\n\n".join(f"Page {page}: {page_contents[page]}" for page, _ in top_pages)
    prompt = f"Context:\n{context}\n\nQuestion: {question}\n\nAnswer:"

    response = openai_client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant answering questions based on the provided context."},
            {"role": "user", "content": prompt},
        ],
        max_tokens=max_tokens,
        temperature=temperature,
    )

    # The message content may be None (e.g. a filtered or tool-only reply);
    # calling .strip() on it directly would raise AttributeError.
    content = response.choices[0].message.content
    return content.strip() if content else ""

def process_question(pdf_path, question, openai_client, generate_answer_flag=True):
    """Rank a PDF's pages against a question and optionally answer it.

    Args:
        pdf_path: Path of the PDF whose cached embeddings are used.
        question: The question text.
        openai_client: Client passed through to the embedding and chat calls.
        generate_answer_flag: When true, also generate an answer from the
            top-ranked pages.

    Returns:
        ``None`` when no embeddings could be loaded; the ranked
        ``(page, score)`` list when ``generate_answer_flag`` is false;
        otherwise a ``(ranked_pages, answer)`` tuple.
    """
    embeddings_by_page = load_embeddings(pdf_path)
    if not embeddings_by_page:
        # Missing (or empty) cache: nothing to rank.
        return None

    # Embed the question, score it against every page, keep the best pages.
    query_vector = embed_question(question, openai_client)
    ranked = rank_pages(calculate_similarities(query_vector, embeddings_by_page))

    logger.info("Top relevant pages:")
    for page_no, sim in ranked:
        logger.info(f"Page {page_no}: Similarity score {sim:.4f}")

    if not generate_answer_flag:
        return ranked

    # load_page_contents is not defined in this section; it must be provided
    # to match however page text is stored.
    contents = load_page_contents(pdf_path)

    reply = generate_answer(question, ranked, contents, openai_client)
    logger.info(f"Generated Answer: {reply}")
    return ranked, reply

# Main execution: run one example question against one PDF.
pdf_path = '/path/to/your/large.pdf'
question = "What is the company's revenue for the last fiscal year?"

# Build the LLM client through the project's helper module.
openai = llm.get_llm_client(llm.GPT_4_MODEL)

result = process_question(pdf_path, question, openai)

# process_question returns None when the embeddings cache is missing.
if not result:
    print("Failed to process the question.")
else:
    top_pages, answer = result
    print(f"Top pages: {top_pages}")
    print(f"Answer: {answer}")

In [None]:
import numpy as np

def cosine_similarity(a, b):
    """Return the cosine similarity of two vectors.

    Args:
        a: First vector (anything ``np.dot`` / ``np.linalg.norm`` accept).
        b: Second vector, same length as ``a``.

    Returns:
        ``dot(a, b) / (|a| * |b|)``, or ``0.0`` when either vector has zero
        norm (the ratio is undefined there and would otherwise evaluate to
        ``nan`` with a divide warning).
    """
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        # A zero vector has no direction; report "no similarity" instead of nan.
        return 0.0
    return np.dot(a, b) / denominator

def calculate_similarities(question_embedding, page_embeddings):
    """Score every page embedding against the question embedding.

    Args:
        question_embedding: The question's embedding vector.
        page_embeddings: Mapping of page number to that page's embedding.

    Returns:
        A dict mapping each page number to the value returned by
        ``cosine_similarity(question_embedding, page_embedding)``.
    """
    return {
        page_num: cosine_similarity(question_embedding, page_vector)
        for page_num, page_vector in page_embeddings.items()
    }

def rank_pages(similarities, top_k=5):
    """Return the first ``top_k`` pages after sorting by descending score.

    Args:
        similarities: Mapping of page number to similarity score.
        top_k: Slice bound applied to the sorted ``(page, score)`` list.

    Returns:
        A list of ``(page, score)`` tuples, highest score first.
    """
    ranked = list(similarities.items())
    # Stable sort: pages with equal scores keep their original order.
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:top_k]

# The rest of the code remains the same