In [1]:
import sqlite3
import numpy as np
from openai import OpenAI
import os
from tqdm.notebook import tqdm
import time
import requests
from requests.exceptions import Timeout

# Load the API key from a local config.py module. If that import fails,
# re-raise ImportError with a message telling the user to create the file.
try:
    from config import OPENAI_API_KEY
except ImportError:
    raise ImportError("Please create a config.py file with your OPENAI_API_KEY")

# Module-level client used by encode_text() and summarize_pair().
print("Setting up OpenAI client...")
client = OpenAI(api_key=OPENAI_API_KEY)

# A single connection and cursor, shared as globals by add_chunk() and
# retrieve_chunks(); the connection is closed at the very end of the script.
print("Initializing database...")
conn = sqlite3.connect('p2025_pyramid.sqlite')
cursor = conn.cursor()

# One row per stored chunk. add_chunk() writes `embedding` as comma-separated
# text (despite the BLOB column declaration) and `shape` as str(shape);
# `layer` is the chunk's index in the summary pyramid (0 = the raw chunks).
print("Creating tables if not exist...")
cursor.execute('''
CREATE TABLE IF NOT EXISTS document_chunks
(id INTEGER PRIMARY KEY, content TEXT, embedding BLOB, shape TEXT, layer INTEGER)
''')

def encode_text(text, max_retries=10, backoff_factor=2, timeout=30):
    """Embed ``text`` with the OpenAI embeddings API, retrying on failure.

    Each failed attempt is followed by an exponentially growing pause of
    ``backoff_factor * 2 ** attempt`` seconds. No pause is taken after the
    final attempt, because there is nothing left to wait for.

    Args:
        text: The string to embed.
        max_retries: Maximum number of attempts before giving up.
        backoff_factor: Multiplier for the exponential backoff, in seconds.
        timeout: Per-request timeout passed through to the client call.

    Returns:
        A tuple ``(embedding, shape)`` where ``embedding`` is a NumPy array
        built from the first item of the response and ``shape`` is its
        ``.shape``.

    Raises:
        Exception: If every attempt failed. The last underlying error is
            attached as ``__cause__``.
    """
    print(f"Starting to encode text of length {len(text)}")
    last_error = None
    for attempt in range(max_retries):
        try:
            print(f"Attempt {attempt + 1} to encode text")
            response = client.embeddings.create(
                model="text-embedding-ada-002",
                input=[text],
                timeout=timeout
            )
            embedding = np.array(response.data[0].embedding)
            print("Successfully encoded text")
            return embedding, embedding.shape
        except Timeout as e:
            # NOTE(review): `Timeout` comes from `requests`. The OpenAI v1
            # client presumably reports timeouts with its own exception type,
            # in which case they land in the generic handler below - confirm.
            last_error = e
            failure = "Request timed out."
        except Exception as e:
            last_error = e
            failure = f"Error occurred: {e}."

        if attempt + 1 < max_retries:
            wait_time = backoff_factor * (2 ** attempt)
            print(f"{failure} Retrying in {wait_time} seconds...")
            time.sleep(wait_time)
        else:
            # Last attempt: report the failure but do not sleep before raising.
            print(failure)

    print("Failed to encode text after all attempts")
    raise Exception("Failed to encode text after all attempts") from last_error

def add_chunk(content, embedding, shape, layer):
    """Insert one chunk row into ``document_chunks`` and commit it.

    The embedding is serialised as the comma-joined ``str()`` of each of its
    values, and the shape as ``str(shape)``; ``content`` and ``layer`` are
    stored as given. The row is committed immediately.
    """
    serialized_embedding = ','.join(str(value) for value in embedding)
    row = (content, serialized_embedding, str(shape), layer)
    insert_sql = 'INSERT INTO document_chunks (content, embedding, shape, layer) VALUES (?, ?, ?, ?)'
    cursor.execute(insert_sql, row)
    conn.commit()

def read_and_chunk_file(file_path, chunk_size=3500, overlap=500):
    """Read a UTF-8 text file and split it into overlapping chunks.

    Every chunk except the last is cut back to end just after the last ``.``
    in its window or, if the window has no usable ``.``, just after the last
    newline. If neither is found the full ``chunk_size`` window is kept. The
    next chunk starts ``overlap`` characters before the previous chunk's end.

    Args:
        file_path: Path of the text file to read.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared between consecutive chunks;
            must satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunk strings in file order (empty for an empty file).

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range; such
            values could otherwise prevent the loop from ever advancing.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must be non-negative and smaller than chunk_size")

    print(f"Reading file: {file_path}")
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    print(f"Chunking file (chunk size: {chunk_size}, overlap: {overlap})")
    chunks = []
    total = len(content)
    start = 0
    with tqdm(total=total, desc="Chunking progress") as pbar:
        while start < total:
            end = min(start + chunk_size, total)

            if end < total:
                window = content[start:end]
                sentence_end = window.rfind('.')
                paragraph_end = window.rfind('\n')
                if sentence_end > 0:
                    end = start + sentence_end + 1
                elif paragraph_end > 0:
                    end = start + paragraph_end + 1

            chunks.append(content[start:end])

            if end >= total:
                # The whole text is covered. Stopping here avoids emitting a
                # redundant trailing chunk that only repeats the overlap.
                pbar.update(total - start)
                break

            next_start = end - overlap
            if next_start <= start:
                # The cut landed within `overlap` characters of `start`.
                # Stepping back would stall or rewind the loop, so the
                # overlap is dropped for this one boundary.
                next_start = end

            # Advance the bar by the distance the start position moved.
            pbar.update(next_start - start)
            start = next_start

    print(f"Created {len(chunks)} chunks")
    return chunks

def summarize_pair(chunk1, chunk2):
    """Return a single summary covering two pieces of text.

    Both texts are placed in one user prompt, separated by a blank line, and
    sent with a summarisation system prompt to the ``gpt-4`` chat model. The
    content of the first returned choice is stripped of surrounding
    whitespace and returned.
    """
    messages = [
        {
            "role": "system",
            "content": "You are an AI assistant tasked with summarizing text. Provide a concise summary that captures the key points of the given text.",
        },
        {
            "role": "user",
            "content": f"Summarize the following text:\n\n{chunk1}\n\n{chunk2}",
        },
    ]
    completion = client.chat.completions.create(model="gpt-4", messages=messages)
    summary = completion.choices[0].message.content
    return summary.strip()

def create_summary_pyramid(chunks, max_layers=5):
    """Build a pyramid of progressively summarised layers from ``chunks``.

    Layer 0 is ``chunks`` itself. Each further layer is made by passing
    adjacent pairs of the previous layer to ``summarize_pair``; when a layer
    has an odd number of items, the last one is carried up unchanged.
    Building stops once a layer has at most one item or ``max_layers`` layers
    exist.

    Args:
        chunks: The bottom-layer texts.
        max_layers: Maximum number of layers, including the bottom one.

    Returns:
        A list of layers (each a list of strings), bottom layer first. For
        empty or single-item input this is just ``[chunks]``: there is
        nothing to pair, so no redundant or empty layers are added.
    """
    pyramid = [chunks]  # Bottom layer

    for layer in range(1, max_layers):
        current = pyramid[-1]
        if len(current) <= 1:
            # Top of the pyramid reached, or nothing to summarise at all.
            break

        print(f"Creating layer {layer}...")
        new_layer = []
        for i in range(0, len(current), 2):
            if i + 1 < len(current):
                new_layer.append(summarize_pair(current[i], current[i + 1]))
            else:
                # Odd item out: carried up to the next layer unchanged.
                new_layer.append(current[i])

        pyramid.append(new_layer)

    return pyramid

# Build step: chunk the source text, build the summary pyramid on top of the
# chunks, then embed and store every chunk of every layer.
print("Reading and chunking file...")
chunks = read_and_chunk_file('p2025.txt')

print("Creating summary pyramid...")
pyramid = create_summary_pyramid(chunks)

# `layer` is the index into the pyramid (0 = the raw chunks); it is stored
# with each row so retrieve_chunks() can report and rank by it.
# NOTE(review): nothing visible here clears document_chunks first, so
# re-running this cell presumably inserts the same chunks again - confirm.
print("Processing chunks and adding to database...")
for layer, layer_chunks in enumerate(pyramid):
    for chunk in tqdm(layer_chunks, desc=f"Processing layer {layer}"):
        print(f"Encoding chunk (length: {len(chunk)})")
        embedding, shape = encode_text(chunk)
        print(f"Adding chunk to database (embedding shape: {shape}, layer: {layer})")
        add_chunk(chunk, embedding, shape, layer)

# Total count across all layers of the pyramid.
print(f"Added {sum(len(layer) for layer in pyramid)} chunks to the database.")

def retrieve_chunks(query, top_k=5):
    """Return the stored chunks most similar to ``query``, best match first.

    The query is embedded with ``encode_text`` and compared by cosine
    similarity against every row of ``document_chunks``. Rows whose stored
    embedding shape differs from the query's are skipped with a warning.
    Ties in similarity are broken in favour of the lower layer.

    Args:
        query: The text to search for.
        top_k: Maximum number of chunks to return.

    Returns:
        A list of ``(content, layer)`` tuples ordered from most to least
        similar; empty if no stored embedding could be compared.
    """
    # Local import keeps this function self-contained; `ast` is standard library.
    import ast

    print(f"Retrieving chunks for query: '{query}'")
    query_embedding, query_shape = encode_text(query)
    print(f"Query embedding shape: {query_shape}")

    cursor.execute('SELECT id, embedding, shape, layer FROM document_chunks')
    results = cursor.fetchall()

    print(f"Comparing query to {len(results)} stored chunks")
    query_norm = np.linalg.norm(query_embedding)  # loop-invariant
    similarities = []
    for chunk_id, emb, shape, layer in tqdm(results, desc="Comparing embeddings"):
        # The shape column holds the str() of a tuple. literal_eval parses it
        # without executing arbitrary code, which eval() on database text
        # would have allowed.
        stored_shape = ast.literal_eval(shape)
        emb_array = np.array([float(x) for x in emb.split(',')]).reshape(stored_shape)

        if emb_array.shape != query_shape:
            print(f"Warning: Embedding shape mismatch. Query: {query_shape}, Stored: {emb_array.shape}")
            continue

        similarity = np.dot(query_embedding, emb_array) / (query_norm * np.linalg.norm(emb_array))
        similarities.append((chunk_id, similarity, layer))

    if not similarities:
        print("No valid embeddings found for comparison.")
        return []

    # Sort by similarity and then by layer (preferring lower layers for equal similarity)
    top_matches = sorted(similarities, key=lambda x: (x[1], -x[2]), reverse=True)[:top_k]
    if not top_matches:
        return []
    top_ids = [chunk_id for chunk_id, _, _ in top_matches]

    placeholders = ','.join('?' for _ in top_ids)
    cursor.execute(f'SELECT id, content, layer FROM document_chunks WHERE id IN ({placeholders})',
                   top_ids)
    # `IN (...)` returns rows in table order, not ranking order, so the rows
    # are re-ordered here to follow the similarity ranking.
    rows_by_id = {row_id: (content, layer) for row_id, content, layer in cursor.fetchall()}
    return [rows_by_id[chunk_id] for chunk_id in top_ids if chunk_id in rows_by_id]

# Smoke test: run one retrieval query against the freshly populated table.
print("Testing retrieval...")
test_query = "What is the overall theme of Project 2025?"
relevant_chunks = retrieve_chunks(test_query)

# Each result is a (content, layer) tuple; numbering starts at 1 for display.
print("\nRelevant chunks for the query:")
for i, (chunk, layer) in enumerate(relevant_chunks, 1):
    print(f"Chunk {i} (Layer {layer}):")
    print(chunk[:200] + "...")  # Preview only: first 200 characters; "..." is appended even for shorter chunks
    print()

# After this point the module-level `conn`/`cursor` can no longer be used by
# add_chunk() or retrieve_chunks().
print("Closing database connection...")
conn.close()
print("Done!")

Setting up OpenAI client...
Initializing database...
Creating tables if not exist...
Reading and chunking file...
Reading file: p2025.txt
Chunking file (chunk size: 3500, overlap: 500)


Chunking progress:   0%|          | 0/2418009 [00:00<?, ?it/s]

Created 835 chunks
Creating summary pyramid...
Creating layer 1...
Creating layer 2...
Creating layer 3...
Creating layer 4...
Processing chunks and adding to database...


Processing layer 0:   0%|          | 0/835 [00:00<?, ?it/s]

Encoding chunk (length: 3500)
Starting to encode text of length 3500
Attempt 1 to encode text
Successfully encoded text
Adding chunk to database (embedding shape: (1536,), layer: 0)
Encoding chunk (length: 3500)
Starting to encode text of length 3500
Attempt 1 to encode text
Successfully encoded text
Adding chunk to database (embedding shape: (1536,), layer: 0)
Encoding chunk (length: 2880)
Starting to encode text of length 2880
Attempt 1 to encode text
Successfully encoded text
Adding chunk to database (embedding shape: (1536,), layer: 0)
Encoding chunk (length: 3419)
Starting to encode text of length 3419
Attempt 1 to encode text
Successfully encoded text
Adding chunk to database (embedding shape: (1536,), layer: 0)
Encoding chunk (length: 3469)
Starting to encode text of length 3469
Attempt 1 to encode text
Successfully encoded text
Adding chunk to database (embedding shape: (1536,), layer: 0)
Encoding chunk (length: 3463)
Starting to encode text of length 3463
Attempt 1 to encode t

Processing layer 1:   0%|          | 0/418 [00:00<?, ?it/s]

Encoding chunk (length: 732)
Starting to encode text of length 732
Attempt 1 to encode text
Successfully encoded text
Adding chunk to database (embedding shape: (1536,), layer: 1)
Encoding chunk (length: 833)
Starting to encode text of length 833
Attempt 1 to encode text
Successfully encoded text
Adding chunk to database (embedding shape: (1536,), layer: 1)
Encoding chunk (length: 782)
Starting to encode text of length 782
Attempt 1 to encode text
Successfully encoded text
Adding chunk to database (embedding shape: (1536,), layer: 1)
Encoding chunk (length: 912)
Starting to encode text of length 912
Attempt 1 to encode text
Successfully encoded text
Adding chunk to database (embedding shape: (1536,), layer: 1)
Encoding chunk (length: 1769)
Starting to encode text of length 1769
Attempt 1 to encode text
Successfully encoded text
Adding chunk to database (embedding shape: (1536,), layer: 1)
Encoding chunk (length: 1062)
Starting to encode text of length 1062
Attempt 1 to encode text
Succ

Processing layer 2:   0%|          | 0/209 [00:00<?, ?it/s]

Encoding chunk (length: 701)
Starting to encode text of length 701
Attempt 1 to encode text
Successfully encoded text
Adding chunk to database (embedding shape: (1536,), layer: 2)
Encoding chunk (length: 794)
Starting to encode text of length 794
Attempt 1 to encode text
Successfully encoded text
Adding chunk to database (embedding shape: (1536,), layer: 2)
Encoding chunk (length: 708)
Starting to encode text of length 708
Attempt 1 to encode text
Successfully encoded text
Adding chunk to database (embedding shape: (1536,), layer: 2)
Encoding chunk (length: 809)
Starting to encode text of length 809
Attempt 1 to encode text
Successfully encoded text
Adding chunk to database (embedding shape: (1536,), layer: 2)
Encoding chunk (length: 694)
Starting to encode text of length 694
Attempt 1 to encode text
Successfully encoded text
Adding chunk to database (embedding shape: (1536,), layer: 2)
Encoding chunk (length: 992)
Starting to encode text of length 992
Attempt 1 to encode text
Successf

Processing layer 3:   0%|          | 0/105 [00:00<?, ?it/s]

Encoding chunk (length: 660)
Starting to encode text of length 660
Attempt 1 to encode text
Successfully encoded text
Adding chunk to database (embedding shape: (1536,), layer: 3)
Encoding chunk (length: 876)
Starting to encode text of length 876
Attempt 1 to encode text
Successfully encoded text
Adding chunk to database (embedding shape: (1536,), layer: 3)
Encoding chunk (length: 796)
Starting to encode text of length 796
Attempt 1 to encode text
Successfully encoded text
Adding chunk to database (embedding shape: (1536,), layer: 3)
Encoding chunk (length: 761)
Starting to encode text of length 761
Attempt 1 to encode text
Successfully encoded text
Adding chunk to database (embedding shape: (1536,), layer: 3)
Encoding chunk (length: 879)
Starting to encode text of length 879
Attempt 1 to encode text
Successfully encoded text
Adding chunk to database (embedding shape: (1536,), layer: 3)
Encoding chunk (length: 1135)
Starting to encode text of length 1135
Attempt 1 to encode text
Succes

Processing layer 4:   0%|          | 0/53 [00:00<?, ?it/s]

Encoding chunk (length: 763)
Starting to encode text of length 763
Attempt 1 to encode text
Successfully encoded text
Adding chunk to database (embedding shape: (1536,), layer: 4)
Encoding chunk (length: 761)
Starting to encode text of length 761
Attempt 1 to encode text
Successfully encoded text
Adding chunk to database (embedding shape: (1536,), layer: 4)
Encoding chunk (length: 793)
Starting to encode text of length 793
Attempt 1 to encode text
Successfully encoded text
Adding chunk to database (embedding shape: (1536,), layer: 4)
Encoding chunk (length: 905)
Starting to encode text of length 905
Attempt 1 to encode text
Successfully encoded text
Adding chunk to database (embedding shape: (1536,), layer: 4)
Encoding chunk (length: 1209)
Starting to encode text of length 1209
Attempt 1 to encode text
Successfully encoded text
Adding chunk to database (embedding shape: (1536,), layer: 4)
Encoding chunk (length: 1060)
Starting to encode text of length 1060
Attempt 1 to encode text
Succ

Comparing embeddings:   0%|          | 0/1620 [00:00<?, ?it/s]


Relevant chunks for the query:
Chunk 1 (Layer 0):
nder, Senior Graphic Designer Grace Desandro, and Senior Designer Melissa 
Bluey came together to bring the volume to life. We also thank the dedicated junior 
staff who provided immeasurable assistan...

Chunk 2 (Layer 1):
"Mandate for Leadership 2025: The Conservative Promise" is a collective work aimed at advancing positive change in America, contributed by various volunteers and conservative thinkers. While acknowled...

Chunk 3 (Layer 1):
Project 2025 aims to prepare the next conservative President to govern the increasingly complex and growing federal government. The project plans to build an army of trained and ready conservatives to...

Chunk 4 (Layer 2):
The "Project 2025 Presidential Transition Project" is a comprehensive document by the Heritage Foundation covering diverse topics, including governmental operations, defense, public welfare, economy, ...

Chunk 5 (Layer 2):
Project 2025 is an initiative aimed at preparing the 