In [2]:
import sqlite3
import numpy as np
from openai import OpenAI
import os
from tqdm.notebook import tqdm
import time

# Import the API key and org ID from config.py
try:
    from config import OPENAI_API_KEY
except ImportError:
    raise ImportError("Please create a config.py file with your OPENAI_API_KEY and OPENAI_ORG_ID")

# Set up OpenAI client
client = OpenAI(api_key=OPENAI_API_KEY)

# Initialize database
conn = sqlite3.connect('p2025_db.sqlite')
cursor = conn.cursor()

# Create table
cursor.execute('''
CREATE TABLE IF NOT EXISTS document_chunks
(id INTEGER PRIMARY KEY, content TEXT, embedding BLOB)
''')

# Function to encode text using OpenAI with rate limiting
def encode_text(text, max_retries=5, backoff_factor=1):
    for attempt in range(max_retries):
        try:
            response = client.embeddings.create(
                model="text-embedding-ada-002",
                input=[text]
            )
            return np.array(response.data[0].embedding)
        except Exception as e:
            if attempt == max_retries - 1:
                raise
            print(f"Error occurred: {e}. Retrying in {backoff_factor * (2 ** attempt)} seconds...")
            time.sleep(backoff_factor * (2 ** attempt))

# Function to add a chunk to the database
def add_chunk(content, embedding):
    cursor.execute('INSERT INTO document_chunks (content, embedding) VALUES (?, ?)',
                   (content, embedding.tobytes()))
    conn.commit()

# Function to read and chunk the file
def read_and_chunk_file(file_path, chunk_size=1000):
    chunks = []
    current_chunk = []
    current_size = 0
    
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            current_chunk.append(line)
            current_size += len(line)
            if current_size >= chunk_size:
                chunks.append(''.join(current_chunk))
                current_chunk = []
                current_size = 0
    
    if current_chunk:
        chunks.append(''.join(current_chunk))
    
    return chunks

# Read and process the file
chunks = read_and_chunk_file('p2025.txt')

# Add chunks to the database with progress bar
for chunk in tqdm(chunks, desc="Processing chunks"):
    embedding = encode_text(chunk)
    add_chunk(chunk, embedding)

print(f"Added {len(chunks)} chunks to the database.")

# Function to retrieve relevant chunks
def retrieve_chunks(query, top_k=5):
    query_embedding = encode_text(query)
    
    cursor.execute('SELECT id, embedding FROM document_chunks')
    results = cursor.fetchall()
    
    similarities = []
    for id, emb in results:
        emb_array = np.frombuffer(emb, dtype=np.float32)
        similarity = np.dot(query_embedding, emb_array)
        similarities.append((id, similarity))
    
    top_ids = sorted(similarities, key=lambda x: x[1], reverse=True)[:top_k]
    
    placeholders = ','.join('?' for _ in top_ids)
    cursor.execute(f'SELECT content FROM document_chunks WHERE id IN ({placeholders})', 
                   [id for id, _ in top_ids])
    return cursor.fetchall()

# Test the retrieval
test_query = "What does Project 2025 say about BLM's move west?"
relevant_chunks = retrieve_chunks(test_query)

print("\nRelevant chunks for the query:")
for chunk in relevant_chunks:
    print(chunk[0][:200] + "...")  # Print first 200 characters of each chunk
    print()

# Close the database connection
conn.close()

Processing chunks:   0%|          | 0/2317 [00:00<?, ?it/s]

Added 2317 chunks to the database.


ValueError: shapes (1536,) and (3072,) not aligned: 1536 (dim 0) != 3072 (dim 0)