In [1]:
import ast
import re
import sqlite3
import time

import numpy as np
from openai import OpenAI
from tqdm.notebook import tqdm

In [None]:
# Project 2025 Document Retrieval System
#
# Shared setup for the functions below: loads the OpenAI API key, builds the
# API client, and opens the SQLite database holding the embedded chunks.

# Import the API key from config.py
try:
    from config import OPENAI_API_KEY
except ImportError as err:
    raise ImportError("Please create a config.py file with your OPENAI_API_KEY") from err

# Set up OpenAI client
client = OpenAI(api_key=OPENAI_API_KEY)

# Connect to the existing database.
# A plain sqlite3.connect(path) silently creates an empty file when the path
# is wrong, which only surfaces later as "no such table". Opening through a
# URI with mode=rw fails immediately if the database file does not exist.
DB_PATH = 'p2025_db.sqlite'
conn = sqlite3.connect(f'file:{DB_PATH}?mode=rw', uri=True)
cursor = conn.cursor()

def transform_query(query, verbose=False):
    """Ask GPT-4 to rewrite ``query`` so its embedding matches relevant chunks better.

    Args:
        query: The user's original question.
        verbose: When True, print the original and transformed query, or the
            reason the original query is being used instead.

    Returns:
        The model's reply with surrounding whitespace stripped. The original
        ``query`` is returned unchanged when the API call fails or when the
        model returns no usable text, so callers never embed an empty string.
    """
    system_message = """You are an AI assistant helping to optimize queries for semantic search. Your task is to rewrite the given query in a way that will improve its semantic matching with relevant document chunks."""

    user_message = f"""Hey, I have a query: "{query}", and a 900 page document called Project 2025, which is a sweeping, 900-page plan drummed up by a conservative think tank targeting the executive branch and laying out right-wing priorities for everything from America's education system to the border and abortion restrictions. The document is chunked out and vectorized using OpenAI embeddings. How might I rewrite the query such that when it gets vectorized, its nearest neighbors will contain the answers to the question?"""

    try:
        response = client.chat.completions.create(
            model="gpt-4",
            messages=[
                {"role": "system", "content": system_message},
                {"role": "user", "content": user_message}
            ]
        )
        # The message content can be None; treat that the same as empty text.
        transformed_query = (response.choices[0].message.content or "").strip()
    except Exception as e:
        # Best effort: retrieval can still proceed with the untouched query.
        if verbose:
            print(f"Error in transforming query: {e}")
        return query  # Return original query if transformation fails

    if not transformed_query:
        # An empty rewrite carries no meaning to embed; keep the original.
        if verbose:
            print("Transformed query was empty. Using the original query.")
        return query

    if verbose:
        print(f"Original query: {query}")
        print(f"Transformed query: {transformed_query}")
    return transformed_query

def encode_text(text, max_retries=10, backoff_factor=2, timeout=30, verbose=False):
    """Embed ``text`` with the OpenAI embeddings API, retrying with exponential backoff.

    Args:
        text: The string to embed.
        max_retries: Maximum number of API attempts.
        backoff_factor: Base wait in seconds; the wait after failed attempt
            ``n`` (0-based) is ``backoff_factor * 2 ** n``.
        timeout: Per-request timeout passed to the API call.
        verbose: When True, print progress and error messages.

    Returns:
        A tuple ``(embedding, shape)`` where ``embedding`` is a NumPy array
        built from the API response and ``shape`` is ``embedding.shape``.

    Raises:
        Exception: If every attempt fails. The last underlying error is
            attached as ``__cause__``.
    """
    if verbose:
        print(f"Starting to encode text of length {len(text)}")
    last_error = None
    for attempt in range(max_retries):
        try:
            if verbose:
                print(f"Attempt {attempt + 1} to encode text")
            response = client.embeddings.create(
                model="text-embedding-ada-002",
                input=[text],
                timeout=timeout
            )
            embedding = np.array(response.data[0].embedding)
            if verbose:
                print(f"Successfully encoded text")
            return embedding, embedding.shape
        except Exception as e:
            last_error = e
            if attempt == max_retries - 1:
                # No attempt follows, so waiting here would only delay the failure.
                if verbose:
                    print(f"Error occurred: {e}. No retries left.")
                break
            wait_time = backoff_factor * (2 ** attempt)
            if verbose:
                print(f"Error occurred: {e}. Retrying in {wait_time} seconds...")
            time.sleep(wait_time)
    if verbose:
        print("Failed to encode text after all attempts")
    # Chain the last error so the real cause is visible in the traceback.
    raise Exception("Failed to encode text after all attempts") from last_error

def retrieve_chunks(query, top_k=20, verbose=False):
    """Return the stored chunks most similar to ``query``, best match first.

    The query is first rewritten by ``transform_query`` and embedded by
    ``encode_text``. Every row of ``document_chunks`` is then scored by cosine
    similarity against that embedding. Stored embeddings are read as
    comma-separated floats, and the stored shape as a Python literal.

    Args:
        query: The user's question.
        top_k: Maximum number of chunks to return; values <= 0 return ``[]``.
        verbose: When True, print progress, warnings and a progress bar.

    Returns:
        A list of 1-tuples ``(content,)`` ordered by descending similarity.
        Rows whose embedding cannot be parsed, whose shape differs from the
        query embedding, or whose norm is zero are skipped. Returns ``[]``
        when nothing comparable is found.
    """
    transformed_query = transform_query(query, verbose)
    if verbose:
        print(f"Retrieving chunks for transformed query: '{transformed_query}'")
    query_embedding, query_shape = encode_text(transformed_query, verbose=verbose)
    if verbose:
        print(f"Query embedding shape: {query_shape}")

    cursor.execute('SELECT id, embedding, shape FROM document_chunks')
    rows = cursor.fetchall()

    if verbose:
        print(f"Comparing query to {len(rows)} stored chunks")

    # The query norm is loop-invariant, so compute it once.
    query_norm = np.linalg.norm(query_embedding)
    similarities = []
    for chunk_id, emb, shape in (tqdm(rows, desc="Comparing embeddings") if verbose else rows):
        try:
            # literal_eval only accepts Python literals, so text stored in the
            # database can never be executed as code (unlike eval).
            stored_shape = ast.literal_eval(shape)
            emb_array = np.array([float(x) for x in emb.split(',')]).reshape(stored_shape)
        except (ValueError, SyntaxError, TypeError, AttributeError) as e:
            if verbose:
                print(f"Warning: Skipping chunk {chunk_id} with unreadable embedding: {e}")
            continue

        if emb_array.shape != query_shape:
            if verbose:
                print(f"Warning: Embedding shape mismatch. Query: {query_shape}, Stored: {emb_array.shape}")
            continue

        denominator = query_norm * np.linalg.norm(emb_array)
        if denominator == 0:
            # Cosine similarity is undefined for a zero vector; a NaN score
            # would make the ranking below unreliable.
            if verbose:
                print(f"Warning: Skipping chunk {chunk_id} with zero-norm embedding.")
            continue

        similarity = np.dot(query_embedding, emb_array) / denominator
        similarities.append((chunk_id, similarity))

    if not similarities:
        if verbose:
            print("No valid embeddings found for comparison.")
        return []

    ranked = sorted(similarities, key=lambda x: x[1], reverse=True)
    top_ids = [chunk_id for chunk_id, _ in ranked[:max(top_k, 0)]]
    if not top_ids:
        return []

    placeholders = ','.join('?' for _ in top_ids)
    cursor.execute(f'SELECT id, content FROM document_chunks WHERE id IN ({placeholders})',
                   top_ids)
    # "WHERE id IN (...)" returns rows in table order, not in the order the
    # ids were supplied, so restore the similarity ranking explicitly.
    # NOTE(review): assumes ids are unique in document_chunks - confirm schema.
    content_by_id = dict(cursor.fetchall())
    return [(content_by_id[chunk_id],) for chunk_id in top_ids if chunk_id in content_by_id]

def select_promising_chunks(query, chunks, verbose=False):
    """Ask GPT-4 which of the retrieved chunks look most relevant to ``query``.

    The model sees a 200-character preview of each chunk and is asked to end
    its reply with the chosen chunk numbers in curly brackets.

    Args:
        query: The user's question.
        chunks: Sequence of rows whose first element is the chunk text.
        verbose: When True, print the model reply and the parsed selection.

    Returns:
        A list of distinct 1-based chunk numbers, each within
        ``1..len(chunks)``, in the order the model listed them. Falls back to
        the first five chunk numbers (or fewer if there are fewer chunks) when
        the API call fails or no valid numbers can be parsed. Returns ``[]``
        without calling the API when ``chunks`` is empty.
    """
    if not chunks:
        if verbose:
            print("No chunks to select from.")
        return []

    default_selection = list(range(1, min(6, len(chunks)+1)))

    system_message = """You are an AI assistant helping to identify relevant text chunks for a given query. Your task is to select the most promising chunks that are likely to contain information to answer the query. Provide your reasoning, then list the selected chunk numbers in a specific format."""

    chunk_previews = "\n".join([f"Chunk {i+1}: {chunk[0][:200]}..." for i, chunk in enumerate(chunks)])

    user_message = f"""Query: "{query}"

Here are previews of several text chunks:

{chunk_previews}

Please identify the chunks that seem most relevant to answering the query. Explain your reasoning briefly, then list only the numbers of the selected chunks.

End your response with the comma-separated chunk numbers enclosed in curly brackets, like this: {{1,4,7}}. Include only the most relevant chunks, aiming for 3-5 selections unless more are clearly relevant."""

    try:
        response = client.chat.completions.create(
            model="gpt-4",
            messages=[
                {"role": "system", "content": system_message},
                {"role": "user", "content": user_message}
            ]
        )
        # The message content can be None; treat that the same as empty text.
        full_response = (response.choices[0].message.content or "").strip()
    except Exception as e:
        if verbose:
            print(f"Error in selecting promising chunks: {e}")
        return default_selection  # Default to first 5 chunks if there's an error

    if verbose:
        print(f"GPT-4 response:\n{full_response}")

    # Use the LAST brace group rather than one anchored at the very end, so a
    # trailing period or closing remark does not discard a valid selection.
    brace_groups = re.findall(r'\{([^{}]+)\}', full_response)
    selected_chunks = []
    if brace_groups:
        for token in brace_groups[-1].split(','):
            token = token.strip()
            if not token.isdecimal():
                continue
            number = int(token)
            # Keep only numbers that map to a real chunk, and drop repeats;
            # callers index with number - 1, so 0 or too-large values are unsafe.
            if 1 <= number <= len(chunks) and number not in selected_chunks:
                selected_chunks.append(number)

    if selected_chunks:
        if verbose:
            print(f"Selected chunks: {selected_chunks}")
        return selected_chunks

    if verbose:
        print("No properly formatted chunk numbers found. Defaulting to first 5 chunks.")
    return default_selection

def print_chunks(chunks, verbose=False):
    """Print a numbered 200-character preview of each chunk when ``verbose`` is set.

    Args:
        chunks: Iterable of rows whose first element is the chunk text.
        verbose: Nothing is printed unless this is truthy.
    """
    if not verbose:
        return

    print("\nRelevant chunks for the query:")
    position = 0
    for row in chunks:
        position += 1
        preview = row[0][:200]  # Only the first 200 characters are shown
        print(f"Chunk {position}:")
        print(preview + "...")
        print()

def generate_final_answer(original_query, relevant_chunks, max_chunk_length=500, verbose=False):
    """Have GPT-4 answer ``original_query`` from the text of the selected chunks.

    Args:
        original_query: The question exactly as the user asked it.
        relevant_chunks: Rows whose first element is the chunk text.
        max_chunk_length: Each chunk is cut to this many characters before
            being joined (with single spaces) into the prompt context.
        verbose: When True, print the error if the API call fails.

    Returns:
        The model's reply with surrounding whitespace stripped, or a fixed
        fallback sentence when anything in the API round trip raises.
    """
    truncated_texts = (row[0][:max_chunk_length] for row in relevant_chunks)
    combined_context = " ".join(truncated_texts)

    system_message = """You are an AI assistant tasked with answering questions about Project 2025 based on provided context. Use the given information to answer the question accurately and concisely."""

    user_message = f"""Context from Project 2025 document:

{combined_context}

Based on this context, please answer the following question:
{original_query}

Provide a concise answer that directly addresses the question using only the information given in the context."""

    conversation = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_message},
    ]

    try:
        completion = client.chat.completions.create(model="gpt-4", messages=conversation)
        return completion.choices[0].message.content.strip()
    except Exception as err:
        if verbose:
            print(f"Error in generating final answer: {err}")
        return "Unable to generate a final answer due to an error."

def process_query(query, verbose=False):
    """Run the full pipeline for one question and print the answer.

    Steps: retrieve the top 20 chunks, let the model pick the promising ones,
    optionally print their previews, then generate and print the final answer.

    Args:
        query: The user's question.
        verbose: Passed to every stage; also enables the stage messages here.

    Returns:
        None. The query and the final answer are written to stdout.
    """
    print(f"Query: {query}")
    all_chunks = retrieve_chunks(query, top_k=20, verbose=verbose)

    if not all_chunks:
        # Without any context the model could only guess, so stop here.
        print("\nFinal Answer:")
        print("No relevant chunks were retrieved, so no answer could be generated.")
        return

    selected_indices = select_promising_chunks(query, all_chunks, verbose=verbose)

    # Chunk numbers are 1-based. Reject anything outside 1..len(all_chunks):
    # a too-large number would raise IndexError, and 0 would silently wrap
    # around to the last chunk through negative indexing.
    valid_indices = []
    for number in selected_indices:
        if not 1 <= number <= len(all_chunks):
            if verbose:
                print(f"Ignoring out-of-range chunk number: {number}")
            continue
        if number not in valid_indices:
            valid_indices.append(number)

    if not valid_indices:
        # Same default the selector uses: the first five retrieved chunks.
        valid_indices = list(range(1, min(6, len(all_chunks) + 1)))

    relevant_chunks = [all_chunks[number - 1] for number in valid_indices]

    print_chunks(relevant_chunks, verbose=verbose)

    if verbose:
        print("\nGenerating final answer...")
    final_answer = generate_final_answer(query, relevant_chunks, verbose=verbose)
    print("\nFinal Answer:")
    print(final_answer)

# # Example usage
# query = "What does Project 2025 say about the consolidation of power into the president's seat?"
# process_query(query, verbose=True)

# # Close the database connection when done
# conn.close()

In [None]:
# Example run: send one question through process_query with verbose=True so
# each stage prints its progress in addition to the final answer.
query = "What can be said about Bureau of Land Management?"
process_query(query, verbose=True)