In [None]:
%load_ext autoreload
%autoreload 2

from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.prompts import ChatPromptTemplate
from IPython.display import Markdown, display

import utils.extractors as extractors

# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this is how the OpenAI API key reaches ChatOpenAI further down — confirm.
load_dotenv()

# Data Extraction


## Categories

In [None]:
# Fetch the budget categories through the project-local `extractors` module.
# The prompt-building cell later reads the 'parent_name' and 'category_name'
# columns of this frame.
df_cat = extractors.fetch_categories()
df_cat

## Transactions

In [None]:
# Fetch a sample of transactions through the project-local `extractors` module.
# NOTE(review): df_trans is only displayed here; no later cell visible in this
# notebook reads it.
df_trans = extractors.fetch_sample_transactions()
df_trans

# LangChain

In [None]:
# Queries must be embedded with the model that was used when the DB was created.
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Open the existing "transactions" collection persisted under ./chroma_db.
vectorstore = Chroma(
    embedding_function=embedding_model,
    persist_directory="./chroma_db",
    collection_name="transactions",
)

In [None]:
# Sanity check: look up the three stored transactions closest to a sample
# description and print each match's text followed by its metadata.
description = "DD *DOORDASH EINSTEINB"
results = vectorstore.similarity_search(description, k=3)

for doc in results:
    print(doc.page_content, doc.metadata, sep="\n")

In [None]:
def get_similar_transactions(description, as_str=True, k=3):
    """Look up the stored transactions most similar to ``description``.

    Runs a similarity search against the module-level ``vectorstore``.

    Args:
        description: Transaction description text to search with.
        as_str: When True (default), return the matches formatted as a
            bulleted text block; when False, return the raw search results.
        k: Number of nearest matches to retrieve. Defaults to 3, the value
            that used to be hard-coded.

    Returns:
        When ``as_str`` is True, a newline-joined string with one
        "- Description: ... / Category: ..." entry per match; otherwise the
        results exactly as returned by ``vectorstore.similarity_search``.

    Raises:
        KeyError: If ``as_str`` is True and a match has no
            ``'category_name'`` entry in its metadata.
    """
    results = vectorstore.similarity_search(description, k=k)
    if not as_str:
        return results
    return "\n".join(
        f"- Description: {doc.page_content}\n  Category: {doc.metadata['category_name']}"
        for doc in results
    )

# Preview the formatted examples for the sample `description` defined in the
# similarity-search cell. The previous argument, `query`, is not defined
# anywhere in this notebook and raised NameError on a fresh kernel.
display(Markdown(get_similar_transactions(description)))

In [None]:
# Markdown table of the available (parent_name, category_name) pairs shown to the model.
categories_markdown = df_cat.loc[:, ['parent_name', 'category_name']].to_markdown(index=False)

# Few-shot examples for the current `description`, built with the shared helper
# instead of re-implementing its formatting over the `results` variable left
# behind by an earlier cell. The template below keeps {examples} as a
# placeholder, so this value is for inspection only.
examples = get_similar_transactions(description)

# The text below is evaluated as a Python f-string now and parsed again as a
# LangChain f-string template later. Literal braces coming from the category
# data are doubled so the template parser does not read them as input variables.
categories_for_template = categories_markdown.replace("{", "{{").replace("}", "}}")

# Build the prompt text. {{examples}} and {{description}} survive the f-string
# as single-brace placeholders that are filled in at format time.
# NOTE(review): the prompt says both "Reply only with the category name" and
# "Return the best matching category and subcategory" — confirm which output is intended.
template_string = f"""
You are labeling the categories of transactions for a personal budget.
The following categories are available for you:

{categories_for_template}

Reply only with the category name, no explanation

Here are some past examples:
{{examples}}

Now categorize this new transaction:
- Description: {{description}}

Return the best matching category and subcategory.
Format:
Category: ...
"""
display(Markdown(template_string))

In [None]:
# Turn the prompt text into a chat prompt template.
prompt_template = ChatPromptTemplate.from_template(template_string)
# Show the variables that must be supplied when formatting; the template text
# contains the {examples} and {description} placeholders.
prompt_template.input_variables

In [None]:
# Chat model used to categorize transactions.
# NOTE(review): no model name is passed here, so whichever default the library
# picks is used — consider pinning one for reproducibility.
llm = ChatOpenAI(temperature=0)  # low temperature = more deterministic

In [None]:
# End-to-end run: fill the prompt with retrieved examples for one sample
# transaction, send it to the model, and print the reply text.
description = "DD *DOORDASH EINSTEINB"
messages = prompt_template.format_messages(
    examples=get_similar_transactions(description),
    description=description,
)
response = llm.invoke(messages)
print(response.content)