In [1]:
import sqlite3
import numpy as np
from openai import OpenAI
import os
from tqdm.notebook import tqdm
import time
import requests
from requests.exceptions import Timeout


In [2]:
 #Import the API key from config.py
try:
    import sys
    sys.path.append('./guid/src')
    from config import OPENAI_API_KEY
except ImportError:
    raise ImportError("Please create a config.py file with your OPENAI_API_KEY")

print("Setting up OpenAI client...")
client = OpenAI(api_key=OPENAI_API_KEY)

print("Initializing database...")
conn = sqlite3.connect('premera_docs.sqlite')
cursor = conn.cursor()

print("Creating tables if not exist...")
cursor.execute('''
CREATE TABLE IF NOT EXISTS document_chunks
(id INTEGER PRIMARY KEY, content TEXT, embedding BLOB, shape TEXT, layer INTEGER)
''')

Setting up OpenAI client...
Initializing database...
Creating tables if not exist...


<sqlite3.Cursor at 0x21b1657e540>

In [3]:
def read_and_chunk_file(file_path, doc_name, chunk_size=500, overlap=100):
    print(f"Reading file: {file_path}")
    chunks = []
    #opening the file
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
    
    print(f"Chunking file (chunk size: {chunk_size}, overlap: {overlap})")
    start = 0
    #looping through file chunk at a time
    with tqdm(total=len(content), desc="Chunking progress") as pbar:
        while start < len(content):
            end = start + chunk_size
            chunk = content[start:end]
            
            if end < len(content):
                #finds sentence end of chunk or paragraph end of chunk then moves end to that spot + 1 after new para or period
                sentence_end = chunk.rfind('.')
                paragraph_end = chunk.rfind('\n')
                if sentence_end > 0:
                    end = start + sentence_end + 1
                elif paragraph_end > 0:
                    end = start + paragraph_end + 1
            
            chunks.append(f'For the {doc_name}, ' + content[start:end])
            #new start will be the end but minus the overlap so we can include the overlap in the next chunk
            start = end - overlap
            pbar.update(end - start)
    
    print(f"Created {len(chunks)} chunks")
    return chunks

In [4]:
def summarize_pair(chunk1, chunk2):
    system_message = "You are an AI assistant tasked with summarizing text. Provide a concise summary that captures the key points of the given text."
    user_message = f"Summarize the following text:\n\n{chunk1}\n\n{chunk2}"
    
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": user_message}
        ]
    )
    return response.choices[0].message.content.strip()

In [5]:
def create_summary_pyramid(chunks, max_layers=5):
    pyramid = [chunks]  # Bottom layer
    
    for layer in range(1, max_layers):
        print(f"Creating layer {layer}...")
        new_layer = []
        for i in range(0, len(pyramid[-1]), 2):
            if i + 1 < len(pyramid[-1]):
                combined = summarize_pair(pyramid[-1][i], pyramid[-1][i+1])
            else:
                combined = pyramid[-1][i]  # If odd number, keep last chunk as is
            new_layer.append(combined)
        
        pyramid.append(new_layer)
        
        if len(new_layer) == 1:
            break  # We've reached the top of the pyramid
    
    return pyramid

In [6]:
def encode_text(text, max_retries=10, backoff_factor=2, timeout=30):
    print(f"Starting to encode text of length {len(text)}")
    for attempt in range(max_retries):
        try:
            print(f"Attempt {attempt + 1} to encode text")
            response = client.embeddings.create(
                model="text-embedding-ada-002",
                input=[text],
                timeout=timeout
            )
            embedding = np.array(response.data[0].embedding)
            print(f"Successfully encoded text")
            return embedding, embedding.shape
        except Timeout:
            wait_time = backoff_factor * (2 ** attempt)
            print(f"Request timed out. Retrying in {wait_time} seconds...")
            time.sleep(wait_time)
        except Exception as e:
            wait_time = backoff_factor * (2 ** attempt)
            print(f"Error occurred: {e}. Retrying in {wait_time} seconds...")
            time.sleep(wait_time)
    print("Failed to encode text after all attempts")
    raise Exception("Failed to encode text after all attempts")

In [7]:
def add_chunk(content, embedding, shape, layer):
    cursor.execute('INSERT INTO document_chunks (content, embedding, shape, layer) VALUES (?, ?, ?, ?)',
                   (content, ','.join(map(str, embedding)), str(shape), layer))
    conn.commit()

In [8]:
def retrieve_chunks(query, top_k=5):
    print(f"Retrieving chunks for query: '{query}'")
    query_embedding, query_shape = encode_text(query)
    print(f"Query embedding shape: {query_shape}")
    
    cursor.execute('SELECT id, embedding, shape, layer FROM document_chunks')
    results = cursor.fetchall()
    
    print(f"Comparing query to {len(results)} stored chunks")
    similarities = []
    for id, emb, shape, layer in tqdm(results, desc="Comparing embeddings"):
        #takes the embedding that was put in as a string and makes a list of comma seperated numbers and turns them to be floats
        #it is then rehaped after being made into an np array to its original shape
        emb_array = np.array([float(x) for x in emb.split(',')]).reshape(eval(shape))
        
        if emb_array.shape != query_shape:
            print(f"Warning: Embedding shape mismatch. Query: {query_shape}, Stored: {emb_array.shape}")
            continue
        #this is how cosine similiarity is done, take dot product of both embeddings and divide by the euclidean norm of both multiplied
        similarity = np.dot(query_embedding, emb_array) / (np.linalg.norm(query_embedding) * np.linalg.norm(emb_array))
        similarities.append((id, similarity, layer))
    
    if not similarities:
        print("No valid embeddings found for comparison.")
        return []
    
    # Sort by similarity and then by layer (preferring lower layers for equal similarity)
    #sorted by similiarity and if they are the same takes the layer in desecending order which is why layer is negative
    top_ids = sorted(similarities, key=lambda x: (x[1], -x[2]), reverse=True)[:top_k]
    
    placeholders = ','.join('?' for _ in top_ids)
    cursor.execute(f'SELECT content, layer FROM document_chunks WHERE id IN ({placeholders})', 
                   [id for id, _, _ in top_ids])
    return cursor.fetchall()

In [9]:
print("Reading and chunking file...")
chunks = read_and_chunk_file('gold_summary.txt', 'gold plan')

Reading and chunking file...
Reading file: gold_summary.txt
Chunking file (chunk size: 500, overlap: 100)


Chunking progress:   0%|          | 0/4943 [00:00<?, ?it/s]

Created 16 chunks


In [10]:
print("Creating summary pyramid...")
pyramid = create_summary_pyramid(chunks)

Creating summary pyramid...
Creating layer 1...
Creating layer 2...
Creating layer 3...
Creating layer 4...


In [11]:
print("Processing chunks and adding to database...")
for layer, layer_chunks in enumerate(pyramid):
    for chunk in tqdm(layer_chunks, desc=f"Processing layer {layer}"):
        print(f"Encoding chunk (length: {len(chunk)})")
        embedding, shape = encode_text(chunk)
        print(f"Adding chunk to database (embedding shape: {shape}, layer: {layer})")
        add_chunk(chunk, embedding, shape, layer)

print(f"Added {sum(len(layer) for layer in pyramid)} chunks to the database.")

Processing chunks and adding to database...


Processing layer 0:   0%|          | 0/16 [00:00<?, ?it/s]

Encoding chunk (length: 497)
Starting to encode text of length 497
Attempt 1 to encode text
Successfully encoded text
Adding chunk to database (embedding shape: (1536,), layer: 0)
Encoding chunk (length: 444)
Starting to encode text of length 444
Attempt 1 to encode text
Successfully encoded text
Adding chunk to database (embedding shape: (1536,), layer: 0)
Encoding chunk (length: 421)
Starting to encode text of length 421
Attempt 1 to encode text
Successfully encoded text
Adding chunk to database (embedding shape: (1536,), layer: 0)
Encoding chunk (length: 487)
Starting to encode text of length 487
Attempt 1 to encode text
Successfully encoded text
Adding chunk to database (embedding shape: (1536,), layer: 0)
Encoding chunk (length: 433)
Starting to encode text of length 433
Attempt 1 to encode text
Successfully encoded text
Adding chunk to database (embedding shape: (1536,), layer: 0)
Encoding chunk (length: 451)
Starting to encode text of length 451
Attempt 1 to encode text
Successf

Processing layer 1:   0%|          | 0/8 [00:00<?, ?it/s]

Encoding chunk (length: 688)
Starting to encode text of length 688
Attempt 1 to encode text
Successfully encoded text
Adding chunk to database (embedding shape: (1536,), layer: 1)
Encoding chunk (length: 627)
Starting to encode text of length 627
Attempt 1 to encode text
Successfully encoded text
Adding chunk to database (embedding shape: (1536,), layer: 1)
Encoding chunk (length: 608)
Starting to encode text of length 608
Attempt 1 to encode text
Successfully encoded text
Adding chunk to database (embedding shape: (1536,), layer: 1)
Encoding chunk (length: 800)
Starting to encode text of length 800
Attempt 1 to encode text
Successfully encoded text
Adding chunk to database (embedding shape: (1536,), layer: 1)
Encoding chunk (length: 631)
Starting to encode text of length 631
Attempt 1 to encode text
Successfully encoded text
Adding chunk to database (embedding shape: (1536,), layer: 1)
Encoding chunk (length: 412)
Starting to encode text of length 412
Attempt 1 to encode text
Successf

Processing layer 2:   0%|          | 0/4 [00:00<?, ?it/s]

Encoding chunk (length: 940)
Starting to encode text of length 940
Attempt 1 to encode text
Successfully encoded text
Adding chunk to database (embedding shape: (1536,), layer: 2)
Encoding chunk (length: 849)
Starting to encode text of length 849
Attempt 1 to encode text
Successfully encoded text
Adding chunk to database (embedding shape: (1536,), layer: 2)
Encoding chunk (length: 678)
Starting to encode text of length 678
Attempt 1 to encode text
Successfully encoded text
Adding chunk to database (embedding shape: (1536,), layer: 2)
Encoding chunk (length: 558)
Starting to encode text of length 558
Attempt 1 to encode text
Successfully encoded text
Adding chunk to database (embedding shape: (1536,), layer: 2)


Processing layer 3:   0%|          | 0/2 [00:00<?, ?it/s]

Encoding chunk (length: 923)
Starting to encode text of length 923
Attempt 1 to encode text
Successfully encoded text
Adding chunk to database (embedding shape: (1536,), layer: 3)
Encoding chunk (length: 645)
Starting to encode text of length 645
Attempt 1 to encode text
Successfully encoded text
Adding chunk to database (embedding shape: (1536,), layer: 3)


Processing layer 4:   0%|          | 0/1 [00:00<?, ?it/s]

Encoding chunk (length: 640)
Starting to encode text of length 640
Attempt 1 to encode text
Successfully encoded text
Adding chunk to database (embedding shape: (1536,), layer: 4)
Added 31 chunks to the database.


In [12]:
print("Testing retrieval...")
test_query = "how much would i pay for hospice care?"
relevant_chunks = retrieve_chunks(test_query)

print("\nRelevant chunks for the query:")
for i, (chunk, layer) in enumerate(relevant_chunks, 1):
    print(f"Chunk {i} (Layer {layer}):")
    print(chunk[:500] + "...")  # Print first 200 characters of each chunk
    print()

print("Closing database connection...")
#conn.close()
print("Done!")

Testing retrieval...
Retrieving chunks for query: 'how much would i pay for hospice care?'
Starting to encode text of length 38
Attempt 1 to encode text
Successfully encoded text
Query embedding shape: (1536,)
Comparing query to 62 stored chunks


Comparing embeddings:   0%|          | 0/62 [00:00<?, ?it/s]


Relevant chunks for the query:
Chunk 1 (Layer 1):
The silver plan covers hospitalization including organ and tissue transplants, maternity and newborn care, mental health and substance use disorder services, and prescription drugs. Coverage generally applies after meeting a deductible, then a 30% coinsurance is in place. Prenatal, postnatal and abortion services are provided free of charge. Mental health visits have a $65 copay per visit, and preferred generic prescription drugs have a $25 copay for a 30-day supply....

Chunk 2 (Layer 0):
For the gold plan, ered for 10 visits per calendar year (PCY) and acupuncture for 12 visits PCY, both with a $15 copay.

Emergency Services: Emergency care is covered after the deductible with a 30% coinsurance, and the copay is waived if you are directly admitted to an inpatient facility. Ambulance transportation, both air and ground, is covered after the deductible with a 30% coinsurance.

Hospitalization: Inpatient services are covered after the d

In [13]:
def generate_final_answer(original_query, max_chunk_length=500, verbose=False):
    relevant_chunks = retrieve_chunks(original_query)
    combined_context = " ".join([chunk[0][:max_chunk_length] for chunk in relevant_chunks])
    system_message = """You are an AI assistant tasked with answering questions about a healthcare plan offered by a healthcare company based on provided context. Use the given information to answer the question accurately and concisely."""

    user_message = f"""Context from healthcare plan document:

{combined_context}

Based on this context, please answer the following question:
{original_query}

Provide a concise answer that directly addresses the question using only the information given in the context."""

    try:
        response = client.chat.completions.create(
            model="gpt-4",
            messages=[
                {"role": "system", "content": system_message},
                {"role": "user", "content": user_message}
            ]
        )
        final_answer = response.choices[0].message.content.strip()
        return final_answer
    except Exception as e:
        if verbose:
            print(f"Error in generating final answer: {e}")
        return "Unable to generate a final answer due to an error."


In [16]:
generate_final_answer("how much do i pay for doctors office visits between the silver and gold plan?", 1500, True)

Retrieving chunks for query: 'how much do i pay for doctors office visits between the silver and gold plan?'
Starting to encode text of length 77
Attempt 1 to encode text
Successfully encoded text
Query embedding shape: (1536,)
Comparing query to 62 stored chunks


Comparing embeddings:   0%|          | 0/62 [00:00<?, ?it/s]

'For the silver plan, the first two designated PCP office visits are fully covered, after which a $25 copay applies. Similarly, for the gold plan, the first two designated PCP office visits are also covered, but after that, a $15 co-pay applies.'

In [15]:
print(chunks)

['For the gold plan, Preferred Gold EPO 1500 Plan Summary\nThe Preferred Gold EPO 1500 Plan by Premera Blue Cross, effective January 1, 2024, is designed for individuals and families in Washington. This Exclusive Provider Organization (EPO) plan mandates the use of the Individual Signature Network of providers, with care outside the network not covered except in emergencies.\n\nKey Financial Details\nThe plan has an annual deductible of $1,500 per individual and $3,000 per family (in-network only).', 'For the gold plan, \nThe plan has an annual deductible of $1,500 per individual and $3,000 per family (in-network only). After meeting the deductible, you pay a coinsurance of 30%. The out-of-pocket maximum, which includes the deductible, coinsurance, and copays, is $6,800 per individual and $13,600 per family.\n\nEssential Health Benefits\nAmbulatory Patient Services: Outpatient services are covered after the deductible with a 30% coinsurance.', 'For the gold plan, atory Patient Services