In [1]:
!pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-4.0.1-py3-none-any.whl.metadata (13 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Downloading transformers-4.50.1-py3-none-any.whl.metadata (39 kB)
Collecting torch>=1.11.0 (from sentence-transformers)
  Downloading torch-2.6.0-cp312-none-macosx_11_0_arm64.whl.metadata (28 kB)
Collecting sympy==1.13.1 (from torch>=1.11.0->sentence-transformers)
  Downloading sympy-1.13.1-py3-none-any.whl.metadata (12 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Downloading tokenizers-0.21.1-cp39-abi3-macosx_11_0_arm64.whl.metadata (6.8 kB)
Collecting safetensors>=0.4.3 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Downloading safetensors-0.5.3-cp38-abi3-macosx_11_0_arm64.whl.metadata (3.8 kB)
Downloading sentence_transformers-4.0.1-py3-none-any.whl (340 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m340.6/340.6 kB[0m [31

In [3]:
import os
import json
from sentence_transformers import SentenceTransformer, util

# Spending categories a receipt can be assigned to (five in total)
categories = [
    "Food & Dining",
    "Housing & Utilities",
    "Transportation",
    "Leisure",
    "Personal Care & Education",
]

# Pre-trained sentence-embedding model shared by every lookup below
model = SentenceTransformer("all-MiniLM-L6-v2")

# Embed the category labels once, up front, so that each merchant lookup only
# has to encode the merchant name itself
category_embeddings = model.encode(
    categories,
    convert_to_tensor=True,
)

# Function to determine the best matching category for a merchant name
def get_closest_category(merchant_name):
    """Return the spending category most similar to a merchant name.

    The name is embedded with the module-level ``model`` and compared by
    cosine similarity against the pre-computed ``category_embeddings``; the
    entry of ``categories`` with the highest score is returned. No minimum
    similarity is enforced, so every usable name receives some category.

    Args:
        merchant_name: Merchant name read from a receipt. May be ``None``.

    Returns:
        One of the strings in ``categories``, or ``"Uncategorized"`` when the
        name is missing, is not a string, or contains only whitespace.
    """
    # Missing (None) or non-text values cannot be embedded meaningfully.
    if not isinstance(merchant_name, str):
        return "Uncategorized"

    # Names may contain line breaks and repeated spacing (e.g.
    # "Primo\nFamily Restaurant"); collapse all whitespace runs so that a
    # whitespace-only name is treated as empty rather than being embedded.
    cleaned_name = " ".join(merchant_name.split())
    if not cleaned_name:
        return "Uncategorized"

    merchant_embedding = model.encode(cleaned_name, convert_to_tensor=True)
    similarities = util.cos_sim(merchant_embedding, category_embeddings)
    # argmax() without a dim works on the flattened score tensor; with a
    # single merchant that position is the category index.
    best_match_index = similarities.argmax().item()
    return categories[best_match_index]

# Get path to users folder (relative to where script is running)
base_dir = os.getcwd()
users_dir = os.path.abspath(os.path.join(base_dir, "..", "users"))

# Process each user folder and update receipt_type.
# Directory entries are visited in sorted order so that repeated runs process
# (and log) the files in the same order; os.listdir() order is arbitrary.
for folder in sorted(os.listdir(users_dir)):
    folder_path = os.path.join(users_dir, folder)
    if not (os.path.isdir(folder_path) and folder.startswith("user_")):
        continue

    for filename in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, filename)
        is_receipt_file = (
            os.path.isfile(file_path)
            and filename.endswith(".json")
            and filename != "user_total.json"  # skipped: not rewritten by this loop
        )
        if not is_receipt_file:
            continue

        # Best effort per file: a failure is reported and the loop moves on.
        try:
            # Explicit UTF-8 so non-ASCII merchant names load the same way on
            # every platform instead of depending on the locale default.
            with open(file_path, "r", encoding="utf-8") as f:
                receipts = json.load(f)

            # Fail with a clear message instead of an AttributeError on .get()
            # when the file does not hold a list of receipt objects.
            if not isinstance(receipts, list):
                raise ValueError(
                    f"expected a JSON list of receipts, got {type(receipts).__name__}"
                )

            for receipt in receipts:
                merchant = receipt.get("merchant_name", "")
                new_category = get_closest_category(merchant)
                receipt["receipt_type"] = new_category
                print(f"Updated: '{merchant}' → '{new_category}'")

            # Serialize first: if this raises, the original file has not been
            # opened for writing yet and therefore is not truncated.
            updated_json = json.dumps(receipts, indent=4)
            with open(file_path, "w", encoding="utf-8") as f:
                f.write(updated_json)
        except Exception as e:
            print(f"Error updating {file_path}: {e}")


Updated: 'นายการศาล สะภาพร์จะ' → 'Leisure'
Updated: 'MEAT MARKET' → 'Food & Dining'
Updated: 'Primo
Family Restaurant' → 'Food & Dining'
Updated: 'Thompson-Smith' → 'Transportation'
Updated: 'Fritz,
and' → 'Transportation'
Updated: 'HOME MASTER HARDWARE &
ELECTRICAL' → 'Housing & Utilities'
Updated: 'None' → 'Uncategorized'
Updated: '7-ELEVEN' → 'Food & Dining'
Updated: 'Peterson LLC' → 'Housing & Utilities'
Updated: 'OLD TOWN WHITE COFFE' → 'Leisure'
Updated: 'Subway#44969-0' → 'Transportation'
Updated: 'BEST BUY' → 'Transportation'
Updated: 'CAFE S' → 'Food & Dining'
Updated: 'Jacobs-Curtis' → 'Housing & Utilities'
Updated: 'Stevens and Sons' → 'Personal Care & Education'
Updated: 'QuikTrip' → 'Transportation'
Updated: 'Pasta Divina' → 'Food & Dining'
Updated: 'food
Basic$' → 'Food & Dining'
Updated: 'PINNACLES
RESTAURANT' → 'Food & Dining'
Updated: 'PERMAS JAYA JUSCO' → 'Personal Care & Education'
Updated: 'Fernandez Ltd' → 'Leisure'
Updated: 'UNIHAKKA INTERNATIONAL SDN BHD' → 'Tran