In [None]:
import numpy as np
from pathlib import Path
import pickle
import logging
from quanthub.util import llm
from sklearn.metrics.pairwise import cosine_similarity
import heapq

# Set up logging: configure the root logger to emit INFO and above, then
# create the logger the functions in this cell write to.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def load_embeddings(pdf_path):
    """Return the cached page embeddings for ``pdf_path``, or ``None`` if missing.

    The cache file is ``./cache/<hash>/page_embeddings.pkl`` where ``<hash>`` is
    whatever ``get_file_hash(pdf_path)`` returns. When the file does not exist an
    error is logged and ``None`` is returned.
    """
    embeddings_file = Path(f"./cache/{get_file_hash(pdf_path)}") / "page_embeddings.pkl"

    if not embeddings_file.exists():
        logger.error("Embeddings file not found. Please process the PDF first.")
        return None

    # NOTE(review): unpickling can execute arbitrary code; this is only safe
    # as long as the cache directory is written exclusively by trusted code.
    with embeddings_file.open("rb") as handle:
        return pickle.load(handle)

def embed_question(question, openai_client):
    """Embed ``question`` with ``text-embedding-ada-002`` and return it as an array.

    Args:
        question: The text to embed; it is sent as a single-element input list.
        openai_client: Client object exposing ``embeddings.create(...)``.

    Returns:
        ``numpy.ndarray`` built from the embedding of the first returned item.
    """
    reply = openai_client.embeddings.create(
        model="text-embedding-ada-002",
        input=[question],
    )
    first_item = reply.data[0]
    return np.array(first_item.embedding)

def calculate_similarities(question_embedding, page_embeddings):
    """Map every page number to its cosine similarity with the question.

    Args:
        question_embedding: Array-like with a ``reshape`` method (the question vector).
        page_embeddings: Mapping of page number to that page's embedding array.

    Returns:
        ``dict`` of page number -> similarity score.
    """
    # Both vectors are reshaped to a single row because the pairwise
    # ``cosine_similarity`` call works on 2-D inputs; ``[0][0]`` pulls the one
    # score out of the resulting 1x1 matrix.
    return {
        page_num: cosine_similarity(
            question_embedding.reshape(1, -1),
            page_vector.reshape(1, -1),
        )[0][0]
        for page_num, page_vector in page_embeddings.items()
    }

def rank_pages(similarities, top_k=5):
    """Return the ``top_k`` highest-scoring ``(page, score)`` pairs, best first.

    Args:
        similarities: Mapping of page number to similarity score.
        top_k: Maximum number of pairs to return.
    """
    scored_pages = similarities.items()
    return heapq.nlargest(top_k, scored_pages, key=lambda pair: pair[1])

def generate_answer(question, top_pages, page_contents, openai_client, model="gpt-4-1106-preview"):
    """Ask the chat model to answer ``question`` using the text of ``top_pages``.

    Args:
        question: The user's question.
        top_pages: Iterable of ``(page, score)`` pairs; only the page key is used.
        page_contents: Mapping of page key to that page's text.
        openai_client: Client object exposing ``chat.completions.create(...)``.
        model: Chat model name passed through to the client.

    Returns:
        The first choice's message content with surrounding whitespace removed,
        or an empty string when the model returned no text content.

    Raises:
        KeyError: If a page in ``top_pages`` is missing from ``page_contents``.
    """
    context = "\n\n".join(f"Page {page}: {page_contents[page]}" for page, _ in top_pages)
    prompt = f"Context:\n{context}\n\nQuestion: {question}\n\nAnswer:"

    response = openai_client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant answering questions based on the provided context."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=300,
        temperature=0.7
    )

    # The message content may be None (no text returned); guard so that case
    # yields "" instead of raising AttributeError on ``None.strip()``.
    content = response.choices[0].message.content
    return (content or "").strip()

def process_question(pdf_path, question, openai_client, generate_answer_flag=True):
    """Find the pages most relevant to ``question`` and optionally answer it.

    Args:
        pdf_path: Path of the PDF whose cached embeddings should be used.
        question: The user's question.
        openai_client: Client used for embedding and for answer generation.
        generate_answer_flag: When true, also generate an answer from the top pages.

    Returns:
        ``None`` if no embeddings could be loaded; otherwise ``(top_pages, answer)``
        when ``generate_answer_flag`` is true, or just ``top_pages`` when it is false.
    """
    stored_embeddings = load_embeddings(pdf_path)
    if not stored_embeddings:
        return None

    # Embed the question, score every page against it, keep the best ones.
    query_vector = embed_question(question, openai_client)
    scores = calculate_similarities(query_vector, stored_embeddings)
    top_pages = rank_pages(scores)

    logger.info("Top relevant pages:")
    for page_number, similarity in top_pages:
        logger.info(f"Page {page_number}: Similarity score {similarity:.4f}")

    if not generate_answer_flag:
        return top_pages

    # load_page_contents is not defined in this cell; it must be supplied
    # according to how page text is stored.
    page_contents = load_page_contents(pdf_path)
    answer = generate_answer(question, top_pages, page_contents, openai_client)
    logger.info(f"Generated Answer: {answer}")
    return top_pages, answer

# Main execution: ask one question about one PDF and print the outcome.
pdf_path = '/path/to/your/large.pdf'
question = "What is the company's revenue for the last fiscal year?"

# Initialize your custom GPT client
openai = llm.get_llm_client(llm.GPT_4_MODEL)

result = process_question(pdf_path, question, openai)

# A falsy result means process_question could not load any embeddings.
if not result:
    print("Failed to process the question.")
else:
    top_pages, answer = result
    print(f"Top pages: {top_pages}")
    print(f"Answer: {answer}")

In [None]:
import numpy as np

def cosine_similarity(a, b):
    """Return the cosine similarity of vectors ``a`` and ``b``.

    Computed as ``dot(a, b) / (norm(a) * norm(b))``. If either vector has zero
    norm the similarity is undefined; ``0.0`` is returned in that case rather
    than dividing by zero (which would give NaN and break later ranking).

    NOTE(review): assumes ``a`` and ``b`` are 1-D arrays of equal length — confirm
    against the stored embeddings.
    """
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        return 0.0
    return np.dot(a, b) / denominator

def calculate_similarities(question_embedding, page_embeddings):
    """Map every page number to its cosine similarity with the question.

    Args:
        question_embedding: The question vector.
        page_embeddings: Mapping of page number to that page's embedding vector.

    Returns:
        ``dict`` of page number -> value returned by ``cosine_similarity``.
    """
    return {
        page_num: cosine_similarity(question_embedding, page_vector)
        for page_num, page_vector in page_embeddings.items()
    }

def rank_pages(similarities, top_k=5):
    """Return the first ``top_k`` ``(page, score)`` pairs ordered by score, highest first.

    Pages with equal scores keep the order in which they appear in ``similarities``.
    """
    ordered = list(similarities.items())
    ordered.sort(key=lambda pair: pair[1], reverse=True)
    return ordered[:top_k]

# The rest of the code remains the same

In [None]:
import duckdb
import pandas as pd
import time
import matplotlib.pyplot as plt
from io import StringIO

# 1. Create a sample dataset
def create_sample_data(rows=1000000):
    """Build an in-memory, header-less CSV with ``rows`` lines.

    Each line is ``<i>,value_<i % 100>,<i * 1.1>`` for ``i`` in ``range(rows)``.

    Returns:
        A ``StringIO`` holding the CSV text, rewound to position 0.
    """
    buffer = StringIO()
    buffer.writelines(f"{i},value_{i % 100},{i * 1.1}\n" for i in range(rows))
    buffer.seek(0)
    return buffer

# 2. Load data into DuckDB and Pandas
import os
import tempfile

data = create_sample_data()

# read_csv_auto('...') takes a file path, not the name of a Python variable,
# so the in-memory buffer is written to a temporary CSV file. Both engines
# then load that same file, which keeps the timing comparison like-for-like.
with tempfile.NamedTemporaryFile('w', suffix='.csv', delete=False, newline='') as tmp_file:
    tmp_file.write(data.getvalue())
    csv_path = tmp_file.name

# Double any single quote so the path is a valid SQL string literal.
escaped_csv_path = csv_path.replace("'", "''")

duck_conn = duckdb.connect(':memory:')
try:
    # The CSV has no header row, so the column names (and types) are given
    # explicitly; the queries below rely on `category` and `value`.
    start_time = time.perf_counter()
    duck_conn.execute(f"""
        CREATE TABLE test AS
        SELECT * FROM read_csv_auto(
            '{escaped_csv_path}',
            header=false,
            columns={{'id': 'BIGINT', 'category': 'VARCHAR', 'value': 'DOUBLE'}}
        )
    """)
    duck_load_time = time.perf_counter() - start_time

    start_time = time.perf_counter()
    df = pd.read_csv(csv_path, names=['id', 'category', 'value'])
    pandas_load_time = time.perf_counter() - start_time
finally:
    # Both engines hold their own copy of the data now (or loading failed).
    os.remove(csv_path)

print(f"DuckDB load time: {duck_load_time:.2f} seconds")
print(f"Pandas load time: {pandas_load_time:.2f} seconds")

# 3. Perform a simple aggregation
start_time = time.perf_counter()
duck_result = duck_conn.execute("SELECT category, AVG(value) as avg_value FROM test GROUP BY category").fetchdf()
duck_query_time = time.perf_counter() - start_time

start_time = time.perf_counter()
pandas_result = df.groupby('category')['value'].mean().reset_index()
pandas_query_time = time.perf_counter() - start_time

print(f"DuckDB query time: {duck_query_time:.2f} seconds")
print(f"Pandas query time: {pandas_query_time:.2f} seconds")

# 4. Visualize performance comparison
plt.figure(figsize=(10, 5))
plt.bar(['DuckDB Load', 'Pandas Load', 'DuckDB Query', 'Pandas Query'],
        [duck_load_time, pandas_load_time, duck_query_time, pandas_query_time])
plt.title('Performance Comparison: DuckDB vs Pandas')
plt.ylabel('Time (seconds)')
plt.show()

# 5. Demonstrate SQL capabilities
print(duck_conn.execute("""
    SELECT category,
           AVG(value) as avg_value,
           MIN(value) as min_value,
           MAX(value) as max_value,
           COUNT(*) as count
    FROM test
    GROUP BY category
    ORDER BY avg_value DESC
    LIMIT 5
""").fetchdf())

# 6. Show compatibility with Pandas
duck_conn.register('pandas_df', df)
print(duck_conn.execute("SELECT * FROM pandas_df LIMIT 5").fetchdf())

# 7. Demonstrate data export (writes sample_export.csv to the working directory)
duck_conn.execute("COPY (SELECT * FROM test LIMIT 1000) TO 'sample_export.csv' (HEADER, DELIMITER ',')")

# 8. Show support for complex queries
# The rank is computed per category, so the cut-off must be 10% of that
# category's own row count; comparing against 10% of the whole table would
# let every row through.
print(duck_conn.execute("""
    WITH ranked_values AS (
        SELECT *,
               ROW_NUMBER() OVER (PARTITION BY category ORDER BY value DESC) AS row_rank,
               COUNT(*) OVER (PARTITION BY category) AS category_count
        FROM test
    )
    SELECT category, AVG(value) as avg_top_10_percent
    FROM ranked_values
    WHERE row_rank <= category_count * 0.1
    GROUP BY category
    ORDER BY avg_top_10_percent DESC
    LIMIT 5
""").fetchdf())

# Clean up
duck_conn.close()