In [1]:
import sqlite3
import numpy as np
from openai import OpenAI
import os
from tqdm.notebook import tqdm
import time
import requests
from requests.exceptions import Timeout


In [2]:
 #Import the API key from config.py
try:
    from config import OPENAI_API_KEY
except ImportError as err:
    # Chain the original failure so the traceback shows why config.py could
    # not be imported (missing file vs. missing name inside it).
    raise ImportError("Please create a config.py file with your OPENAI_API_KEY") from err

print("Setting up OpenAI client...")
client = OpenAI(api_key=OPENAI_API_KEY)

print("Initializing database...")
conn = sqlite3.connect('premera_plan.sqlite')
cursor = conn.cursor()

print("Creating tables if not exist...")
# One row per stored text chunk. `layer` is an integer tag per row.
# NOTE(review): `embedding`/`shape` presumably hold a serialized vector and
# its shape - confirm against the code that inserts rows.
cursor.execute('''
CREATE TABLE IF NOT EXISTS document_chunks
(id INTEGER PRIMARY KEY, content TEXT, embedding BLOB, shape TEXT, layer INTEGER)
''')
# Commit the schema explicitly. Ending the cell on commit() (which returns
# None) also stops the notebook from echoing the Cursor repr as cell output.
conn.commit()

Setting up OpenAI client...
Initializing database...
Creating tables if not exist...


<sqlite3.Cursor at 0x20d7129e640>

In [3]:
def _find_chunk_end(content, start, chunk_size, overlap):
    """Return the exclusive end index of the chunk that begins at ``start``.

    The chunk is cut just after the last '.' inside the ``chunk_size`` window,
    or failing that just after the last newline. A boundary is only accepted
    when the resulting chunk is longer than ``overlap``; otherwise the window
    is cut at ``chunk_size``. This keeps ``end - overlap`` strictly greater
    than ``start`` so the caller always moves forward.
    """
    total = len(content)
    end = start + chunk_size
    if end >= total:
        # Final chunk: take everything that is left.
        return total

    window = content[start:end]
    # Sentence ends take priority over line breaks.
    for boundary in ('.', '\n'):
        cut = window.rfind(boundary)
        # cut > 0: a boundary at the very first character is ignored.
        # cut + 1 > overlap: the chunk must outgrow the overlap.
        if cut > 0 and cut + 1 > overlap:
            return start + cut + 1
    return end


def read_and_chunk_file(file_path, chunk_size=500, overlap=100):
    """Read a UTF-8 text file and split it into overlapping chunks.

    Each chunk is at most ``chunk_size`` characters and, except for the last
    one, ends just after a sentence end ('.') or a newline when a usable one
    exists in the window. Every chunk after the first starts ``overlap``
    characters before the end of the previous chunk. Chunking stops with the
    chunk that reaches the end of the text, so no trailing chunk that merely
    repeats the tail of the previous one is produced.

    Args:
        file_path: Path of the text file to read.
        chunk_size: Maximum number of characters per chunk (must be > 0).
        overlap: Characters shared between consecutive chunks
            (must satisfy 0 <= overlap < chunk_size).

    Returns:
        List of chunk strings in file order; empty for an empty file.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    print(f"Reading file: {file_path}")
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    print(f"Chunking file (chunk size: {chunk_size}, overlap: {overlap})")
    total = len(content)
    chunks = []
    start = 0
    with tqdm(total=total, desc="Chunking progress") as pbar:
        while start < total:
            end = _find_chunk_end(content, start, chunk_size, overlap)
            chunks.append(content[start:end])

            if end >= total:
                # Reached the end of the text: finish the bar and stop.
                pbar.update(total - start)
                break

            # Step back by the overlap so the next chunk repeats the tail of
            # this one; report how far the start position actually advanced.
            next_start = end - overlap
            pbar.update(next_start - start)
            start = next_start

    print(f"Created {len(chunks)} chunks")
    return chunks

In [4]:
def summarize_pair(chunk1, chunk2):
    """Ask the chat model for one concise summary covering two pieces of text.

    The two texts are placed in a single user prompt, separated by a blank
    line, and sent to the module-level ``client`` with the "gpt-4" model.

    Args:
        chunk1: First piece of text.
        chunk2: Second piece of text.

    Returns:
        The content of the first returned choice, with surrounding
        whitespace stripped.
    """
    conversation = [
        {
            "role": "system",
            "content": "You are an AI assistant tasked with summarizing text. Provide a concise summary that captures the key points of the given text.",
        },
        {
            "role": "user",
            "content": f"Summarize the following text:\n\n{chunk1}\n\n{chunk2}",
        },
    ]
    completion = client.chat.completions.create(model="gpt-4", messages=conversation)
    summary = completion.choices[0].message.content
    return summary.strip()

In [5]:
def create_summary_pyramid(chunks, max_layers=5):
    """Build a pyramid of progressively shorter summaries over ``chunks``.

    Layer 0 is ``chunks`` itself. Each higher layer is produced by passing
    neighbouring pairs from the layer below to ``summarize_pair``; when a
    layer has an odd number of entries the last one is carried up unchanged.
    Building stops once a layer has at most one entry or ``max_layers``
    layers exist, whichever comes first.

    Args:
        chunks: List of text chunks forming the bottom layer.
        max_layers: Maximum number of layers, including the bottom one.

    Returns:
        List of layers (each a list of strings), bottom layer first. Empty or
        single-chunk input returns only the bottom layer, so no duplicate or
        empty layers are generated.
    """
    pyramid = [chunks]  # Bottom layer

    for layer in range(1, max_layers):
        current = pyramid[-1]
        if len(current) <= 1:
            # Nothing left to pair up: the top of the pyramid is reached.
            break

        print(f"Creating layer {layer}...")
        new_layer = [
            # A trailing unpaired entry is carried up as-is.
            summarize_pair(current[i], current[i + 1]) if i + 1 < len(current) else current[i]
            for i in range(0, len(current), 2)
        ]
        pyramid.append(new_layer)

    return pyramid

In [None]:
def encode_text(text, max_retries=10, backoff_factor=2, timeout=30):
    """Embed ``text`` with the OpenAI embeddings endpoint, retrying on failure.

    Each attempt calls the module-level ``client`` with the
    "text-embedding-ada-002" model. After a failed attempt the function waits
    ``backoff_factor * 2 ** attempt`` seconds before trying again; no wait is
    performed after the final attempt, since there is nothing left to retry.

    Args:
        text: The string to embed.
        max_retries: Maximum number of attempts.
        backoff_factor: Base multiplier, in seconds, for the exponential wait.
        timeout: Per-request timeout forwarded to the client call.

    Returns:
        Tuple ``(embedding, shape)`` where ``embedding`` is a NumPy array
        built from the returned vector and ``shape`` is ``embedding.shape``.

    Raises:
        RuntimeError: If every attempt failed; the last underlying error is
            attached as ``__cause__``.
    """
    print(f"Starting to encode text of length {len(text)}")
    last_error = None
    for attempt in range(max_retries):
        try:
            print(f"Attempt {attempt + 1} to encode text")
            response = client.embeddings.create(
                model="text-embedding-ada-002",
                input=[text],
                timeout=timeout
            )
            embedding = np.array(response.data[0].embedding)
            print("Successfully encoded text")
            return embedding, embedding.shape
        except Exception as e:
            last_error = e
            # NOTE(review): Timeout is requests' exception class; confirm the
            # OpenAI client actually raises it - otherwise timeouts are simply
            # reported through the generic message below.
            if isinstance(e, Timeout):
                reason = "Request timed out."
            else:
                reason = f"Error occurred: {e}."

            if attempt == max_retries - 1:
                # Out of attempts: skip the (longest) back-off sleep.
                print(f"{reason} No retries left.")
                break

            wait_time = backoff_factor * (2 ** attempt)
            print(f"{reason} Retrying in {wait_time} seconds...")
            time.sleep(wait_time)

    print("Failed to encode text after all attempts")
    raise RuntimeError("Failed to encode text after all attempts") from last_error

In [6]:
# Split the source text into overlapping chunks using the function's default
# chunk_size/overlap; `chunks` becomes the bottom layer of the summary pyramid.
print("Reading and chunking file...")
chunks = read_and_chunk_file('premera_paragraphs.txt')

Reading and chunking file...
Reading file: premera_paragraphs.txt
Chunking file (chunk size: 500, overlap: 100)


Chunking progress:   0%|          | 0/5888 [00:00<?, ?it/s]

Created 18 chunks


In [7]:
# Build the layered summaries on top of `chunks`. Every pairwise merge goes
# through summarize_pair, i.e. one chat-completion request per pair.
print("Creating summary pyramid...")
pyramid = create_summary_pyramid(chunks)

Creating summary pyramid...
Creating layer 1...
Creating layer 2...
Creating layer 3...
Creating layer 4...


In [None]:
# Embed every entry of every pyramid layer and hand it to add_chunk together
# with its layer index (0 = raw chunks, higher = summaries).
# NOTE(review): add_chunk is not defined in the visible cells - confirm it is
# defined before this cell runs, and that it (or a later cell) commits the
# inserts on `conn`.
print("Processing chunks and adding to database...")
for layer, layer_chunks in enumerate(pyramid):
    for chunk in tqdm(layer_chunks, desc=f"Processing layer {layer}"):
        print(f"Encoding chunk (length: {len(chunk)})")
        # One embeddings request per entry; encode_text retries internally.
        embedding, shape = encode_text(chunk)
        print(f"Adding chunk to database (embedding shape: {shape}, layer: {layer})")
        add_chunk(chunk, embedding, shape, layer)

# Total counts every entry in every layer, including odd entries that were
# carried up unchanged from the layer below.
print(f"Added {sum(len(layer) for layer in pyramid)} chunks to the database.")