In [1]:
import os
from dotenv import load_dotenv

In [2]:
from llama_index.core import Document
from llama_index.core import DocumentSummaryIndex
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core import PromptHelper
from llama_index.core import ServiceContext
from llama_index.core import StorageContext
from llama_index.core import Settings
from llama_index.core import GPTVectorStoreIndex
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.response_synthesizers import TreeSummarize
from llama_index.core.prompts import PromptTemplate

In [3]:
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
import openai

In [4]:
# 1. Load environment variables
def load_api_key():
    """Populate ``openai.api_key`` from the ``OPENAI_API_KEY`` environment variable.

    ``load_dotenv()`` runs first (presumably pulling values from a local
    ``.env`` file into the process environment -- confirm against the
    python-dotenv setup in use). When the variable is not set, the key is
    left as ``None``; no error is raised here.
    """
    load_dotenv()
    api_key = os.environ.get("OPENAI_API_KEY")
    openai.api_key = api_key

In [5]:
# 2. Set global LlamaIndex settings
def configure_llama_index():
    """Configure the global LlamaIndex ``Settings`` object used by the rest of the notebook.

    Sets the LLM (``gpt-4o-mini`` at temperature 0), the OpenAI embedding
    model, the node-splitting parameters and an explicit ``PromptHelper``.

    The chunk size is assigned through ``Settings.chunk_size``. The previous
    code assigned ``Settings.chunk_size_limit``, which is not one of the
    documented ``Settings`` options, so the intended 128-token chunking was
    never applied and the splitter kept its default chunk size.
    """
    Settings.llm = OpenAI(model="gpt-4o-mini", temperature=0.0)
    Settings.embed_model = OpenAIEmbedding()

    # Overlap is set before size so the splitter never passes through a state
    # where the overlap is larger than the chunk size.
    Settings.chunk_overlap = 50
    Settings.chunk_size = 128

    # NOTE(review): the PromptHelper assigned below carries its own
    # num_output=3000, which presumably supersedes this 2048 -- confirm which
    # value is intended and drop the other.
    Settings.num_output = 2048

    # NOTE(review): context_window=4096 with num_output=3000 leaves roughly
    # 1,000 tokens for prompt plus context -- confirm this split is intended.
    Settings.prompt_helper = PromptHelper(
        context_window=4096,
        num_output=3000,
        chunk_overlap_ratio=0.1,
        chunk_size_limit=512,
    )

In [6]:
# 3. Index the email with GPTVectorStoreIndex (which supports chunk retrieval)
def build_vector_index(email_text: str) -> GPTVectorStoreIndex:
    """Return a vector index built from a single ``Document`` holding ``email_text``."""
    email_doc = Document(text=email_text)
    return GPTVectorStoreIndex.from_documents([email_doc])

In [7]:
# Categorize an email as payment, billing, enrollment or uncategorized
def _normalize_category(raw_answer, categories):
    """Map a free-form LLM answer onto one of the allowed category names.

    The answer is lower-cased and every non-letter character is treated as a
    separator, so decorated replies such as ``**Enrollment**.`` still resolve.
    The first allowed category word found wins; if none is present the result
    is ``"uncategorized"``.
    """
    lowered = (raw_answer or "").lower()
    words = "".join(ch if ch.isalpha() else " " for ch in lowered).split()
    for word in words:
        if word in categories:
            return word
    return "uncategorized"


def categorize_email(email_text: str, index: "GPTVectorStoreIndex | None" = None) -> str:
    """Classify an email into one of the supported categories using retrieval plus the LLM.

    The top 3 chunks most similar to the categorization query are retrieved
    from the vector index, embedded into a rule-based prompt, and sent to
    ``Settings.llm``. The model's reply is then normalized onto the allowed
    category set.

    Args:
        email_text: Raw text of the email.
        index: Optional pre-built vector index for this email. When omitted,
            one is built from ``email_text`` via ``build_vector_index``.

    Returns:
        One of ``"payment"``, ``"billing"``, ``"enrollment"`` or
        ``"uncategorized"``. ``"uncategorized"`` is also returned, without
        calling the LLM, when no index is given and the text is blank, or when
        retrieval yields no chunks.
    """
    categories = ["payment", "billing", "enrollment", "uncategorized"]

    if index is None:
        # Nothing to embed or classify: skip the API calls entirely.
        if not email_text or not email_text.strip():
            return "uncategorized"
        index = build_vector_index(email_text)

    category_query = f"""
    Based on the relevant parts of this email, categorize it into one of the following categories:
    {', '.join(categories)}. If it does not fit any of these categories, please categorize it as 'uncategorized'.
    Respond only with the category name.
    """

    # Retrieve relevant chunks from the index
    retriever = VectorIndexRetriever(index=index, similarity_top_k=3)
    nodes = retriever.retrieve(category_query)
    if not nodes:
        return "uncategorized"

    # Combine retrieved chunks
    combined_text = "\n".join(node.text for node in nodes)

    # prompts for LLM
    full_prompt = f"""
    {combined_text}

    Categorize this email based on the above content into one of: {', '.join(categories)}.

    ---
    1. Enrollment  
    Categorize the email as Enrollment if any of the below keywords are found:  
    a) AWD  
    b) Enrollment  
    c) Autopay  

    2. Payment  
    Categorize the email as Payment if any of the below keywords are found:  
    a) Payment  
    b) Cheque  
    c) Remittance  
    d) Invoice  
    e) Coupon  
    f) Credit  
    g) Refund  

    3. Billing  
    Categorize the email as Billing if any of the below keywords are found:  
    a) Billing  
    b) Premium  
    c) Invoice  
    d) Incorrect Invoice  

    Post-Processing Steps:

    1. Autonomic Analysis Protocol  
    - Automatically identify patterns and context beyond just keywords (e.g., "I was charged wrongly" implies billing).  
    - Detect sentence structure and tone to infer category when explicit keywords are missing.

    2. Primary Intent Detection  
    - If multiple categories are detected, determine which intent is dominant based on keyword frequency, placement, and context.  
    - Prioritize the category mentioned in the subject or first few lines.

    3. Contradicting Evidence Check  
    - Look for conflicting phrases (e.g., “Refund not received” implies Payment, not Billing).  
    - Remove false positives caused by ambiguous keyword overlap (e.g., “Autopay invoice” likely relates to Enrollment, not Billing).

    4. Priority Rules  
    - If both Enrollment and Payment are detected, prioritize Enrollment.  
    - If both Billing and Payment are detected, prioritize Payment.  
    - If all three are mentioned, prioritize based on order: Enrollment > Payment > Billing.

    5. Confidence Assessment  
    - Assign a confidence score (0–100%) based on keyword density and clarity.  
    - If confidence is below 60%, flag the result for manual review.

    6. Integration of Atomic Signal  
    - Include other metadata if available (e.g., subject line, tags, sender type) to refine prediction.  
    - Example: If the sender is a known billing department, weight Billing higher.

    7. Output Validation  
    - Ensure the final category logically matches the context.  
    - If mismatch found, re-apply rules from step 1 to 6.  
    - Log output decision along with justification for traceability.

    If none of the keywords match, categorize the email as 'uncategorized'.
    Respond with only 1 word from the following : **payment**, **billing**, **enrollment**, **uncategorized**.
    """

    response = Settings.llm.complete(full_prompt)
    # The prompt shows the options in markdown bold and also asks for
    # confidence and justification, so the reply is not guaranteed to be a
    # bare category word; normalize it before returning.
    return _normalize_category(response.text, categories)


In [8]:
def get_existing_chunks(index: GPTVectorStoreIndex):
    """Return the node objects currently stored in the index's docstore as a new list."""
    stored_nodes = index.docstore.docs
    return [*stored_nodes.values()]

In [9]:
def recursive_summerize(index: "GPTVectorStoreIndex", summary_window_words: int = 100) -> str:
    """Summarize the email held in ``index`` to roughly ``summary_window_words`` words.

    The chunks already stored in the index are wrapped in a
    ``DocumentSummaryIndex`` and queried through a ``TreeSummarize``
    synthesizer that uses a custom summary prompt.

    Args:
        index: Vector index whose stored chunks should be summarized.
        summary_window_words: Approximate target length of the summary, in words.

    Returns:
        The stripped summary text, or an empty string when the index holds no
        chunks or the query engine returns no response text.
    """
    nodes = get_existing_chunks(index)
    if not nodes:
        return ""

    # Only the first literal is an f-string. The second is a plain string, so
    # the placeholder must use single braces: a doubled "{{context_str}}" would
    # reach the template engine as an escaped brace pair instead of a variable.
    summary_template = PromptTemplate(
        f"Summerize the following text into approximately {summary_window_words} words:\n\n"
        "{context_str}\n\nSummary:"
    )

    tree_summerizer = TreeSummarize(summary_template=summary_template)
    summary_index = DocumentSummaryIndex(nodes)

    query_engine = summary_index.as_query_engine(response_synthesizer=tree_summerizer)
    response = query_engine.query("Please summerize the entire content of this email.")

    # Guard against a missing response body before stripping.
    return (response.response or "").strip()


In [10]:
def load_email_from_file(file_path: str) -> str:
    """Read the file at ``file_path`` as UTF-8 text and return its full contents."""
    with open(file_path, mode="r", encoding="utf-8") as handle:
        return handle.read()

In [11]:
if __name__ == "__main__":
    # Set up the API key and the global LlamaIndex Settings before any
    # index is built or any LLM call is made.
    load_api_key()
    configure_llama_index()

    # Load email text from a file (path is relative to the working directory)
    email_text = load_email_from_file("emails/email4.txt")
    # This index is consumed by recursive_summerize below. The
    # categorize_email(email_text) call does not receive it and builds an
    # index of its own from the same text.
    index = build_vector_index(email_text)

    # Step 1: predict the category of the email.
    category = categorize_email(email_text)
    print(f"Predicted Category: {category}")

    # Step 2: produce a summary of roughly 50 words from the indexed chunks.
    summary = recursive_summerize(index, summary_window_words=50)
    print(f"\nEmail Summary: {summary}")


Predicted Category: enrollment
current doc id: b08ae810-2a18-4883-9e6d-f1b35831b5ba

Email Summary: Your enrollment in the Premium Plan is confirmed. Your subscription is active, with autopay enabled. You will receive updates and can modify your plan anytime. For questions about your enrollment or billing, contact our support team. Thank you for choosing our services!
