In [None]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules before
# each cell execution so edits to local .py files are picked up without a restart.
%load_ext autoreload
%autoreload 2

import logging

# Configure the root logger so records of level INFO and above are emitted.
logging.basicConfig(level=logging.INFO)

In [None]:
from pathlib import Path
from tqdm import tqdm

import pymupdf4llm

# Candidate locations of the receipts folder, tried in order. The two entries
# differ only in the "Meine Ablage" / "My Drive" component (presumably the same
# synced drive folder under its German and English name -- confirm).
candidate_dirs = [
    Path("/mnt/c/Users/alexk/Meine Ablage/KFZ/VW ID3/Rechnungen_Laden/"),
    Path("/mnt/c/Users/alexk/My Drive/KFZ/VW ID3/Rechnungen_Laden/"),
]
receipts_path = next((candidate for candidate in candidate_dirs if candidate.exists()), None)
if receipts_path is None:
    # Report every location that was tried, not just the last one.
    tried = ", ".join(str(candidate) for candidate in candidate_dirs)
    raise FileNotFoundError(f"None of the receipt folders exist: {tried}")

# Match the suffix case-insensitively (".pdf", ".PDF", ".Pdf", ...) and sort so
# the processing order is deterministic across runs.
paths = sorted(p for p in receipts_path.iterdir() if p.is_file() and p.suffix.lower() == ".pdf")

# Convert every PDF to markdown text; md_texts[i] belongs to paths[i].
md_texts = []
for path in tqdm(paths):
    md_text = pymupdf4llm.to_markdown(path)
    md_texts.append(md_text)

In [None]:
import pandas as pd
from datetime import datetime


class BoschReceiptParser:
    """Parser for Bosch charging receipts that were converted from PDF to markdown."""

    @staticmethod
    def parse_md_to_dict(md_text: str):
        """Extract the charging session data from one receipt.

        The markdown is scanned line by line for known German keywords; values
        are taken from fixed token positions of the space-separated line.

        Args:
            md_text: Markdown text of a single receipt.

        Returns:
            dict with the keys ``total_amount`` (float), ``kWh`` (float),
            ``start`` / ``end`` (``pd.Timestamp``), ``duration``
            (``pd.Timedelta``) and ``Eur/kWh`` (float), plus
            ``invoice_number``, ``invoice_date`` and ``charging_station``
            when the corresponding lines are present.

        Raises:
            KeyError: if a field needed for the derived values was not found.
            ValueError: if a date, time or number cannot be parsed.
        """
        lines = md_text.split("\n")
        result = {}
        for line_idx, line in enumerate(lines):
            if line.startswith("#") and "Rechnungsnummer" in line:
                result["invoice_number"] = line.split(" ")[-1].strip()
            if "Rechnungsdatum" in line:
                result["invoice_date"] = pd.Timestamp(datetime.strptime(line.split(" ")[-1].strip(), "%d.%m.%Y"))
            if "Startdatum" in line:
                # Tokens 1 and 3 hold the start and end date of the session.
                split = line.split(" ")
                result["start_date"] = split[1]
                result["end_date"] = split[3]
            if "Startzeit" in line:
                # Tokens 1 and 3 hold the start and end time of the session.
                split = line.split(" ")
                result["start_time"] = split[1]
                result["end_time"] = split[3]
            if "Gesamtbetrag" in line:
                # Decimal comma -> decimal point; a trailing "*" marker is dropped.
                result["total_amount"] = float(line.split(" ")[-1].strip().replace(",", ".").replace("*", ""))
            if "Charging Station" in line:
                # The station description is taken from the two lines that
                # start two lines below the marker line.
                result["charging_station"] = " ".join(lines[line_idx + 2 : line_idx + 4])
            if "BOSCH_CHARGING Ladevorgang - kWh" in line:
                # The energy is the third-to-last token. Store it as float so the
                # column has the same dtype as the one produced by MVVReceiptParser.
                result["kWh"] = float(line.split(" ")[-3].strip().replace(",", "."))

        # Fail with a descriptive message instead of a bare KeyError('start_date').
        required = ("start_date", "start_time", "end_date", "end_time", "total_amount", "kWh")
        missing = [key for key in required if key not in result]
        if missing:
            raise KeyError(f"Bosch receipt is missing expected fields: {missing}")

        result["start"] = pd.Timestamp(
            datetime.strptime(result["start_date"] + " " + result["start_time"], "%d.%m.%Y %H:%M:%S")
        )
        result["end"] = pd.Timestamp(
            datetime.strptime(result["end_date"] + " " + result["end_time"], "%d.%m.%Y %H:%M:%S")
        )
        result["duration"] = result["end"] - result["start"]
        result["Eur/kWh"] = result["total_amount"] / result["kWh"]
        # The raw date/time strings are only intermediates for start/end.
        for key in ["start_date", "start_time", "end_date", "end_time"]:
            result.pop(key)
        return result


class MVVReceiptParser:
    """Parser for MVV charging receipts that were converted from PDF to markdown."""

    @staticmethod
    def _parse_date(line: str):
        """Parse the last space-separated token of ``line`` as ``dd.mm.YYYY``.

        Markdown bold markers (``*``) and surrounding whitespace are ignored.
        """
        token = line.split(" ")[-1].replace("*", "").strip()
        return pd.Timestamp(datetime.strptime(token, "%d.%m.%Y"))

    @staticmethod
    def parse_md_to_dict(md_text: str):
        """Extract the charging session data from one receipt.

        Header fields are read from the first 30 non-empty lines; the session
        itself is read from the section starting at the
        ``### **Einzelverbindungsnachweis**`` heading.

        Args:
            md_text: Markdown text of a single receipt.

        Returns:
            dict with the keys ``charging_station`` (str), ``total_amount``
            (float), ``kWh`` (float), ``start`` / ``end`` (``pd.Timestamp``),
            ``duration`` (``pd.Timedelta``) and ``Eur/kWh`` (float), plus
            ``invoice_date`` and ``invoice_number`` when present.

        Raises:
            ValueError: if the section heading is missing or a value cannot be parsed.
            KeyError: if no ``Strombezug`` line was found.
        """
        section_header = "### **Einzelverbindungsnachweis**"
        result = {}
        lines = [line for line in md_text.split("\n") if line.strip()]

        # Invoice metadata is only searched near the top of the document.
        for line in lines[:30]:
            if line.startswith("**Datum:**"):
                result["invoice_date"] = MVVReceiptParser._parse_date(line)
            if line.startswith("**Rechnungsnummer"):
                result["invoice_number"] = line.split(" ")[-1].replace("*", "")

        # Locate the heading while tolerating leading/trailing whitespace.
        header_idx = next((idx for idx, line in enumerate(lines) if line.strip() == section_header), None)
        if header_idx is None:
            raise ValueError(f"Section {section_header!r} not found in MVV receipt.")
        lines = lines[header_idx:]
        # The line directly below the heading is used as the station description.
        result["charging_station"] = lines[1]

        for line in lines:
            if line.startswith("**Datum:*"):
                # A date inside the section overrides the one found in the header;
                # it is parsed with the same bold-marker handling as above.
                result["invoice_date"] = MVVReceiptParser._parse_date(line)
            if line.startswith("Strombezug"):
                # Fixed token layout of the session line: 1/2 = start date/time,
                # 4/5 = end date/time, 6 = energy, 12 = amount.
                # NOTE(review): if a receipt lists several "Strombezug" lines only
                # the last one is kept -- confirm receipts hold a single session.
                split = line.split(" ")
                result["start_date"] = split[1]
                result["start_time"] = split[2]
                result["end_date"] = split[4]
                result["end_time"] = split[5]
                result["total_amount"] = float(split[12].strip().replace(",", "."))
                result["kWh"] = float(split[6].strip().replace(",", "."))

        # Fail with a descriptive message instead of a bare KeyError('start_date').
        required = ("start_date", "start_time", "end_date", "end_time", "total_amount", "kWh")
        missing = [key for key in required if key not in result]
        if missing:
            raise KeyError(f"MVV receipt is missing expected fields: {missing}")

        result["start"] = pd.Timestamp(
            datetime.strptime(result["start_date"] + " " + result["start_time"], "%d.%m.%Y %H:%M")
        )
        result["end"] = pd.Timestamp(datetime.strptime(result["end_date"] + " " + result["end_time"], "%d.%m.%Y %H:%M"))
        result["duration"] = result["end"] - result["start"]
        result["Eur/kWh"] = result["total_amount"] / result["kWh"]
        # The raw date/time strings are only intermediates for start/end.
        for key in ["start_date", "start_time", "end_date", "end_time"]:
            result.pop(key)
        return result


# Parse every converted receipt with the parser that matches its file-name
# prefix and collect one record (dict) per receipt.
records = []
for path, md_text in zip(paths, md_texts):
    print(path)
    if path.name.startswith("IhreRechnung_"):
        records.append(BoschReceiptParser.parse_md_to_dict(md_text))
    elif path.name.startswith("mvv_"):
        records.append(MVVReceiptParser.parse_md_to_dict(md_text))
    elif path.name.startswith("Rechnung_"):
        # Files with this prefix are deliberately ignored.
        continue
    else:
        # Without this branch an unknown file would either raise NameError (first
        # file) or silently append the previous receipt's record a second time.
        logging.warning("No parser available for %s, skipping.", path.name)

# Build the frame once from all records and order it chronologically.
df = pd.DataFrame(records).sort_values("start")

In [None]:
# Summarize total spending over the period covered by the receipts.
print(df.columns)
total = df["total_amount"].sum()
start = df.start.min().date()
end = df.start.max().date()
days = (end - start).days
summary = f"Over {days} days, you spent {total:.2f} EUR."
if days > 0:
    # Project the spending onto a 30-day month. With all receipts on the same
    # day (days == 0) the projection would divide by zero, so it is omitted.
    summary += f" That's {total / days * 30:.2f} EUR per month."
print(summary)
df

In [None]:
# Store the results in the receipts folder: the summary sentence as a text file
# and the parsed receipts as CSV, both named after the covered date range.
save_path = receipts_path / f"rechnungen_{start}_{end}.csv"
print(save_path)
stats_path = save_path.with_name(f"rechnungen_{start}_{end}_statistik.txt")
stats_path.write_text(summary)
df.to_csv(save_path, index=False)