In [None]:
# Enable IPython's autoreload extension so that edits to imported modules are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

import logging
# Configure the root logger to emit records of level INFO and above.
logging.basicConfig(level=logging.INFO)



In [None]:
from pathlib import Path

import pymupdf4llm

# Directory that holds the receipt PDFs to analyse.
# NOTE(review): machine-specific absolute path -- adjust before running elsewhere.
receipts_path = Path("/mnt/c/Users/alexk/Meine Ablage/KFZ/VW ID3/Rechnungen_Laden/")

# Collect every PDF exactly once, whatever the casing of its extension.
# Globbing "*.PDF" and "*.pdf" separately misses mixed-case names such as
# "x.Pdf" and yields each file twice where glob matching is case-insensitive
# (e.g. on Windows), which would double-count receipts further down.
# Sorting makes the processing order reproducible between runs.
paths = sorted(
    path for path in receipts_path.glob("*") if path.suffix.lower() == ".pdf"
)

# Convert each receipt to Markdown text; md_texts[i] belongs to paths[i].
md_texts = [pymupdf4llm.to_markdown(path) for path in paths]

In [None]:
import pandas as pd
from datetime import datetime

def parse_name_and_address(text: str):
    """Split a header line into a name and an address.

    The line is cut at single spaces. The first token is discarded, the
    next two tokens form the name and all remaining tokens form the address.

    Args:
        text: The line to split.

    Returns:
        A ``(name, address)`` tuple of space-joined strings; either part is
        an empty string when the line has too few tokens.
    """
    # str.split always yields at least one element, so the unpacking is safe.
    _, *remainder = text.split(" ")
    name = " ".join(remainder[:2])
    address = " ".join(remainder[2:])
    return name, address


def _last_token(line: str) -> str:
    """Return the last space-separated token of ``line``.

    Trailing whitespace is removed first, so a line that ends in a space
    yields its last real token instead of an empty string.
    """
    return line.rstrip().split(" ")[-1].strip()


def _parse_amount(token: str) -> float:
    """Convert a decimal-comma amount such as ``12,34`` or ``1.234,56`` to a float."""
    if "." in token and token.rfind(",") > token.rfind("."):
        # Dots in front of the decimal comma are thousands separators.
        token = token.replace(".", "")
    return float(token.replace(",", "."))


def parse_md_to_dict(md_text: str):
    """Extract the fields of one receipt from its Markdown text.

    The first line is passed to ``parse_name_and_address``. Every line is then
    scanned for these labels (a line may match several of them):

    * ``Rechnungsnummer`` (only on lines starting with ``#``) -> ``invoice_number``
    * ``Rechnungsdatum`` -> ``invoice_date`` (``pd.Timestamp``, parsed as ``%d.%m.%Y``)
    * ``Kundennummer`` -> ``customer_number``
    * ``Vertragskonto`` -> ``contract_account``
    * ``Startdatum`` -> start and end date, read from the 2nd and 4th
      space-separated token of the line
    * ``Startzeit`` -> start and end time, read from the 2nd and 4th
      space-separated token of the line
    * ``Gesamtbetrag`` -> ``total_amount`` (float)
    * ``Charging Station`` -> ``charging_station`` (the 2nd and 3rd line after
      the label line, joined by a space)

    For the single-value labels the value is the last space-separated token of
    the line. If a label occurs more than once, the last occurrence wins.

    NOTE(review): the token positions assume lines shaped like
    ``Startdatum <date> Enddatum <date>`` -- verify against a real receipt.

    Args:
        md_text: Markdown text of a single receipt.

    Returns:
        A dict with ``name``, ``address``, whichever of the labelled fields were
        found, and ``start`` / ``end`` (``pd.Timestamp``) plus ``duration``
        (``end - start``).

    Raises:
        KeyError: If the start/end date or time was not found in the text.
        IndexError: If a ``Startdatum`` / ``Startzeit`` line has fewer than
            four space-separated tokens.
        ValueError: If a date, time or amount token cannot be parsed.
    """
    lines = md_text.split("\n")
    name, address = parse_name_and_address(lines[0])
    result = {"name": name, "address": address}
    # Raw date/time strings; kept out of ``result`` because they are only
    # needed to build the combined timestamps below.
    span = {}

    for line_idx, line in enumerate(lines):
        if line.startswith("#") and "Rechnungsnummer" in line:
            result["invoice_number"] = _last_token(line)
        if "Rechnungsdatum" in line:
            result["invoice_date"] = pd.Timestamp(datetime.strptime(_last_token(line), '%d.%m.%Y'))
        if "Kundennummer" in line:
            result["customer_number"] = _last_token(line)
        if "Vertragskonto" in line:
            result["contract_account"] = _last_token(line)
        if "Startdatum" in line:
            tokens = line.split(" ")
            span["start_date"] = tokens[1].strip()
            span["end_date"] = tokens[3].strip()
        if "Startzeit" in line:
            tokens = line.split(" ")
            span["start_time"] = tokens[1].strip()
            span["end_time"] = tokens[3].strip()
        if "Gesamtbetrag" in line:
            result["total_amount"] = _parse_amount(_last_token(line))
        if "Charging Station" in line:
            # Slicing past the end of ``lines`` is safe; it just yields fewer lines.
            result["charging_station"] = " ".join(lines[line_idx+2:line_idx+4])

    # Fail with a message that names what is missing instead of a bare
    # KeyError('start_date') from the lookups below.
    missing = [key for key in ("start_date", "start_time", "end_date", "end_time") if key not in span]
    if missing:
        raise KeyError(f"receipt is missing required field(s): {', '.join(missing)}")

    stamp_format = '%d.%m.%Y %H:%M:%S'
    result["start"] = pd.Timestamp(datetime.strptime(span["start_date"] + " " + span["start_time"], stamp_format))
    result["end"] = pd.Timestamp(datetime.strptime(span["end_date"] + " " + span["end_time"], stamp_format))
    result["duration"] = result["end"] - result["start"]
    return result

# Parse every receipt into a record and assemble all records into one table,
# ordered chronologically by the start of the charging session.
df = pd.DataFrame([parse_md_to_dict(md_text) for md_text in md_texts])
df = df.sort_values("start")

In [None]:
# Show which fields were extracted from the receipts.
print(df.columns)

# Total spend and the calendar span between the first and the last session start.
total = df["total_amount"].sum()
start = df.start.min().date()
end = df.start.max().date()
days = (end - start).days

summary = f"Over {days} days, you spent {total:.2f} EUR."
if days > 0:
    # Extrapolate to a 30-day month. Skipped when every session starts on the
    # same day: dividing by a zero-day span yields inf/nan (or raises) instead
    # of a meaningful rate.
    summary += f" That's {total/days*30:.2f} EUR per month."
print(summary)
df

In [None]:
# Store the results next to the receipts: the parsed table as CSV and the
# summary sentence as a text file, both named after the covered date range.
save_path = receipts_path / f"rechnungen_{start}_{end}.csv"
print(save_path)
stats_path = save_path.with_name(f"rechnungen_{start}_{end}_statistik.txt")
stats_path.write_text(summary)
df.to_csv(save_path, index=False)

In [None]:
from datetime import datetime

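# Scratch notes on the date formats parsed in this notebook:
# datetime.strptime('14.11.2024', '%d.%m.%Y')
# date only:     14.11.2024           -> '%d.%m.%Y'
# date and time: 14.11.2024 08:01:15  -> '%d.%m.%Y %H:%M:%S'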