In [1]:
# Example orchestrator (could live in a separate script/notebook)
import os
from pathlib import Path

import pandas as pd
from preprocessing import MerchantCategoryPreprocessor, Paths

# Directory holding the trusted transactions export. Set the PFA_DATA_DIR
# environment variable to run this notebook on another machine; the fallback
# keeps the original location so existing runs behave exactly as before.
DATA_DIR = Path(
    os.environ.get(
        "PFA_DATA_DIR",
        "/Users/andreabosia/Projects/personal-finance-app/data/trusted",
    )
)

# Parquet artifact locations handed to the preprocessor (relative to the
# current working directory).
paths = Paths(
    merchant_emb_parquet="artifacts/features/merchant_embeddings.parquet",
    category_emb_parquet="artifacts/features/category_embeddings.parquet",
    sims_parquet="artifacts/features/tx_category_sims.parquet",
)

pp = MerchantCategoryPreprocessor(paths, model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", top_k=5)

# 1) After your extraction step: load the extracted transactions.
# The notebook later derives `transaction_id` itself and embeds the
# `descrizione` column as the merchant text.
tx = pd.read_csv(DATA_DIR / "transactions.csv")

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
import hashlib
def make_id(row):
    """Build a deterministic SHA-256 hex digest from a transaction row.

    The digest is computed over the row's operation date, value date,
    outflow, inflow, description and bank fields, joined with underscores.
    Missing keys contribute an empty string.

    Args:
        row: Any mapping-like object exposing ``.get(key, default)``
            (e.g. a ``dict`` or a ``pandas.Series``).

    Returns:
        str: 64-character hexadecimal digest.

    Note:
        The ID is deterministic, not unique: two rows with identical values
        in all six fields receive the same digest.
        NOTE(review): the rendered frame in this notebook shows a column
        labelled ``fineco`` and no ``bank`` column; if ``bank`` is absent it
        contributes an empty string to the hash -- confirm the column name.
    """
    key_fields = (
        "data_operazione",
        "data_valuta",
        "uscite",
        "entrate",
        "descrizione",
        "bank",
    )
    # f-string formatting of each value mirrors a single combined f-string.
    fingerprint = "_".join(f"{row.get(field, '')}" for field in key_fields)
    digest = hashlib.sha256(fingerprint.encode("utf-8"))
    return digest.hexdigest()

In [6]:

# Hash every row (row-wise apply) into a deterministic ID column, then show
# the resulting frame as the cell output.
tx["transaction_id"] = tx.apply(make_id, axis="columns")
tx


Unnamed: 0,data_operazione,data_valuta,uscite,entrate,descrizione,fineco,transaction_id
0,2024-01-02,2023-12-27,8.14,,Beats And Burgers Krabi Th,fineco,e6e6476fbceba08f1df984452b224d07c3db62a30370b8...
1,2024-01-02,2023-12-28,53.63,,Andalay Beach Bar & Ca Krabi Th,fineco,6a99afa953e9d22a154c4bde499c8f1dfb744b0eb1166e...
2,2024-01-02,2023-12-28,3.18,,Anyavee Klongmuangbeac Krabi Th,fineco,69f2176ba22c7a7c8f94ea2480be717a5a2ea999a5dcf1...
3,2024-01-02,2023-12-29,1.45,,Spese per Visa Debit,fineco,ce587eaa620fe0eb1fb345cd35c71a50a5b87b382e2fdf...
4,2024-01-02,2023-12-29,117.57,,Ao Nang Landmark Bangkok Th,fineco,cf809af84d53608f18e929fdb81a935af6c76b63a40189...
...,...,...,...,...,...,...,...
869,2025-06-30,2025-06-27,20.00,,Kanji Lite Milano It,fineco,c53af997ac5358e3846de47937f875fa2f115ba26bf14a...
870,2025-06-30,2025-06-28,617.71,,Revolut**8423* Dublin Ie,fineco,0702e9a5c90979996407b2d8db4a6106cee35d952a2d8c...
871,2025-06-30,2025-06-28,17.13,,Seebad Enge Zurich Ch,fineco,61be049d0de5106484909cef38910e47212a4981e2e0b7...
872,2025-06-30,2025-06-28,126.32,,Relais De Lentrecote Zuerich Ch,fineco,6e20e95f07ef13807b2d4de7fb7d05123a0564f09fc090...


In [33]:
# 2) Define your categories once (you can enrich with multilingual synonyms).
# `category_text` is the text that gets embedded, so spelling matters there.
# NOTE(review): the id "bar&restaurnat" is misspelled but deliberately left
# unchanged -- it is an identifier that already appears in computed results;
# renaming it should be done together with any stored artifacts.
cats = pd.DataFrame([
    {"category_id": "bar&restaurnat", "category_text": "Includes bar coffee shop restaurant food"},
    {"category_id": "groceries", "category_text": "Includes supermarket groceries food"},
    {"category_id": "subscription", "category_text": "Includes netflix, disney, spotify, apple, amazon"},
    {"category_id": "bills&utilities", "category_text": "Includes electricity gas water internet phone mobile rent mortgage insurance"},
    {"category_id": "personal_care", "category_text": "Includes hairdresser barber beauty salon spa wellness gym sports fitness"},
    {"category_id": "shopping", "category_text": "Includes shopping mall (unes, coop, migros, carrefour) clothes shoes fashion electronics gadgets furniture"},
    {"category_id": "transportation", "category_text": "Any means of transportation bus taxi uber train tram metro subway"},
    {"category_id": "salary", "category_text": "The payment of the stipendio which in almost all cases is from Prometeia"},
])

# 3) Embed & append (idempotent)
# NOTE(review): if the append skips category ids that are already stored, the
# corrected texts above only take effect after those categories are
# re-embedded -- confirm against the preprocessor implementation.
pp.embed_and_append_categories(cats, category_id_col="category_id", category_text_col="category_text")
pp.embed_and_append_merchants(tx, transaction_id_col="transaction_id", merchant_text_col="descrizione")

# 4) Compute & append top-K similarities (idempotent)
sims = pp.compute_and_append_similarities(top_k=5)

In [34]:
# Attach the transaction details to each similarity row (left join on the ID).
tmp = sims.join(tx.set_index("transaction_id"), on="transaction_id", how="left")
# Show transactions after 2024-08-01 in date order. The comparison is done on
# the values as stored; it is chronological for ISO `YYYY-MM-DD` strings.
tmp.loc[
    tmp["data_operazione"] > "2024-08-01",
    ["descrizione", "topk_category_ids", "data_operazione"],
].sort_values("data_operazione")

Unnamed: 0,descrizione,topk_category_ids,data_operazione
263,Sconto Canone Mensile Luglio 2024,"[salary, groceries, bills&utilities, bar&resta...",2024-08-05
264,Canone Mensile Conto Luglio 2024,"[salary, groceries, bar&restaurnat, shopping, ...",2024-08-05
265,Ristorante Punta Est D San Teodoro It,"[bar&restaurnat, salary, groceries, shopping, ...",2024-08-05
266,Su Sinniperu di Mereu Baunei It,"[salary, bar&restaurnat, shopping, groceries, ...",2024-08-07
267,Coop. Turistica Sisine Baunei It,"[bar&restaurnat, salary, shopping, transportat...",2024-08-08
268,Apple.com/Bill 800915904 Ie,"[subscription, groceries, shopping, bills&util...",2024-08-08
269,Treatwell Milano It,"[salary, bar&restaurnat, transportation, perso...",2024-08-09
270,Ordinante: Paolo Caloni Brembilla Beneficiario...,"[salary, bills&utilities, groceries, bar&resta...",2024-08-15
273,Ord: Manelli Gabriele Ben: Andrea Bosia Dt-Ord...,"[salary, shopping, bar&restaurnat, groceries, ...",2024-08-19
272,Ord: Zago Elisa Ben: Andrea Bosia Dt-Ord: 19/0...,"[salary, bar&restaurnat, groceries, shopping, ...",2024-08-19


In [15]:
# Remove the row cap on DataFrame display. This is a global pandas option, so
# every frame displayed after this cell runs is rendered in full.
pd.options.display.max_rows = None