In [2]:
from typing import List, Callable

import yaml
import pandas as pd
import typer
from toolz import pipe


from contextlib import contextmanager



@contextmanager
def skip_lines_until(file_name: str, predicate: Callable[[str], bool]):
    """Skips lines until the predicate is true.
    Then yields the file _including_ the line where the predicate is true."""
    with open(file_name, mode="r", encoding="iso-8859-1") as f:
        pos = f.tell()
        while not predicate(f.readline()):
            pos = f.tell()
            # Go back to the bqeginning of the line
        f.seek(pos)
        yield f


def to_raw_df(file_name: str) -> pd.DataFrame:
    with skip_lines_until(file_name, lambda line: line.startswith("Buchung;Valuta;Auftraggeber")) as f:
        raw_df = pd.read_csv(f, sep=";", encoding="iso-8859-1")
        raw_df.rename(
            columns={
                "Währung.1": "currency1",
                "Währung": "currency",
                "Auftraggeber/Empfänger": "party",
                "Buchungstext": "book_text",
                "Verwendungszweck": "purpose",
            },
            inplace=True,
        )
        raw_df["book_date"] = pd.to_datetime(raw_df["Buchung"], dayfirst=True)
        raw_df["valuta_date"] = pd.to_datetime(raw_df["Valuta"], dayfirst=True)
        raw_df["amount"] = pd.to_numeric(
            raw_df["Betrag"].str.replace(".", "", regex=False).str.replace(",", ".", regex=False)
        )
        raw_df["balance"] = pd.to_numeric(
            raw_df["Saldo"].str.replace(".", "", regex=False).str.replace(",", ".", regex=False)
        )
    return raw_df


def to_df(raw_df: pd.DataFrame) -> pd.DataFrame:
    df = raw_df[["book_date", "valuta_date", "party", "book_text", "purpose", "amount", "balance"]]
    category_attribute_subs_map = {
        "bargeld": {"party": ["bargeldauszahlung"]},
        "einkaufen": {
            "party": [
                "bio company",
                "biobackhaus",
                "edeka",
                "dm-drogerie",
                "steinecke",
                "nah und gut",
                "visa ralf oelmann",
                "combi verbrauchermarkt",
                "tchibo",
                "REWE MARKT",
                "VISA REWE VIKTOR ADLER",
                "VISA LPG BIOMARKT",
                "VISA BILLA DANKT",
            ],
            "purpose": [
                "KoRo Handels GmbH",
                "KoRo Drogerie GmbH",
                "BIO COMPANY GmbH",
                "gewuerzland",
            ],
        },
        "einnahmen": {"party": ["andreas edmond profous"]},
        "einnahmen::dividende": {"purpose": ["dividende"]},
        "geschenk": {
            "party": ["VISA SPIELVOGEL"],
            "purpose": ["superiore.de", "geschenk mama", "Marimekko"],
        },
        "gesundheit": {
            "party": [
                "ZAHNARZT DR. MUELLER",
                "JOSEPHINEN APOTHEKE",
                "PRAGER APOTHEKE",
                "FORTUNA APOTHEKE",
            ],
            "purpose": ["Center-Apotheke im Minipreis", "SPEICKSHOP", "SHAVING.IE"],
        },
        "kleidung": {"party": ["VISA MAGAZZINO"]},
        "kinder": {
            "party": [
                "Carolina Sgro",
                "Musikschule City West",
                "KINDER- UND JUGEND-, REIT- UND FAHRVEREIN ZEHLENDORF E.V.",
            ],
            "purpose": ["Zoologischer Garten Be", "Kinderschwimmen"],
        },
        "kinder::sparen": {"purpose": ["Sparen Depot Paula"]},
        "kinder::schule": {"purpose": ["Kassenzeichen: 2134900496613 Paula Profous"]},
        "media": {
            "party": ["amznprime", "prime video", "abo lage der nation", "aws emea", "thalia.de"],
            "purpose": ["Spotify AB", "audible.de", "netflix.com"],
        },
        "mobilitaet::auto": {
            "party": [
                "sprint station",
                "visa shell",
                "riller & schnauck",
                "Bundeskasse in Kiel",
                "VISA STOP + GO SYSTEMZENTRA",
                "ARAL AG",
                "Worldline Sweden AB fuer Shell",
                "VISA ARAL STATION",
            ],
            "purpose": ["CosmosDirekt Kfz Beitrag"],
        },
        "mobilitaet::autoleihen": {"party": ["VISA ENTERPRISE RENT A CAR", "VISA RENTALCARS.COM"]},
        "mobilitaet::fahrrad": {"party": ["bike market city", "FAHRRADLADEN MEHRINGHOF"]},
        "mobilitaet::fliegen": {
            "party": ["RYANAIR"],
            "purpose": [
                "ryanair limited",
                "deutsche lufthansa",
                "Koninklijke Luchtvaart Maatschappij",
            ],
        },
        "mobilitaet::oeffentlich": {
            "party": ["bvg app", "DB Fernverkehr AG"],
            "purpose": ["DB Vertrieb GmbH"],
        },
        "intern": {"party": ["andreas profous"]},
        "intern::rente": {"purpose": ["Wertpapierkauf"], "book_text": ["Wertpapierkauf"]},
        "intern::steuerklasse": {"purpose": ["Ausgleich Steuerklasse"]},
        "restaurant": {
            "party": [
                "cocolo ramen",
                "HAPPINESSHEART",
                "lieferando.de",
                "VISA RESTAURANT LENZIG",
                "VISA RESTAURANT KOINONIA",
                "VISA RESTAURANT BEL MONDO",
                "RESTAURANT PARACAS",
                "VISA EATAROUND DELIVERY",
                "VISA ZIMT UND ZUCKER",
                "VISA SPC*RESTAURANT BAHADUR",
                "VISA RESTAURANTE CALIBOCCA",
                "VISA SY RESTAURANT",
            ]
        },
        "spenden": {"party": ["Aerzte ohne Grenzen eV"]},
        "sport": {"party": ["Katherine Finger"]},
        "urlaub": {"purpose": ["Airbnb Payments", "airbnb"]},
        "wohnen": {"purpose": ["Rate, Putzen, Naturstrom", "Ausgleich WEG"]},
    }

    for category, subs_map in category_attribute_subs_map.items():
        for attribute, subs in subs_map.items():
            for sub in subs:
                df.loc[
                    df[attribute].fillna("").str.lower().str.contains(sub.lower()), "category"
                ] = category

    df.loc[
        df.party.fillna("").str.lower().str.contains("VISA APPLE.COM/BILL".lower())
        & (df.amount > -50),
        "category",
    ] = "media"

    return df

In [3]:
df = pipe("downloads/Umsatzanzeige_DE97500105175409854125_20220412.csv", to_raw_df, to_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[


In [11]:
def to_yaml(df: pd.DataFrame):
    """
    Convert the dataframe to a yaml file.

    Args:
        df: The dataframe to convert.

    Returns:
        The dataframe as a yaml file.
    """
    df["book_date"] = df.book_date.df.strftime("%Y-%m-%d")
    df["valuta_date"] = df.valuta_date.df.strftime("%Y-%m-%d")
    yml = yaml.dump(
        df.reset_index().to_dict(orient='records'),
        sort_keys=False, width=72, indent=4,
        default_flow_style=None)
    print(yml)

In [12]:
to_yaml(df)

AttributeError: 'Series' object has no attribute 'df'

In [16]:
#!/Users/anpr/.pyenv/versions/pandacount-3.10.0/bin/python
from typing import List, Callable

import yaml
import pandas as pd
import typer
from toolz import pipe


from contextlib import contextmanager


@contextmanager
def skip_lines_until(file_name: str, predicate: Callable[[str], bool]):
    """Skips lines until the predicate is true.
    Then yields the file _including_ the line where the predicate is true."""
    with open(file_name, mode="r", encoding="iso-8859-1") as f:
        pos = f.tell()
        while not predicate(f.readline()):
            pos = f.tell()
        # Go back to the bqeginning of the line
        f.seek(pos)
        yield f


def to_raw_df(file_name: str) -> pd.DataFrame:
    with skip_lines_until(
            file_name, lambda line: line.startswith("Buchung;Valuta;Auftraggeber")
    ) as f:
        raw_df = pd.read_csv(f, sep=";", encoding="iso-8859-1")
        raw_df.rename(
            columns={
                "Währung.1": "currency1",
                "Währung": "currency",
                "Auftraggeber/Empfänger": "party",
                "Buchungstext": "book_text",
                "Verwendungszweck": "purpose",
            },
            inplace=True,
        )
        raw_df["book_date"] = pd.to_datetime(raw_df["Buchung"], dayfirst=True)
        raw_df["valuta_date"] = pd.to_datetime(raw_df["Valuta"], dayfirst=True)
        raw_df["amount"] = pd.to_numeric(
            raw_df["Betrag"].str.replace(".", "", regex=False).str.replace(",", ".", regex=False)
        )
        raw_df["balance"] = pd.to_numeric(
            raw_df["Saldo"].str.replace(".", "", regex=False).str.replace(",", ".", regex=False)
        )
    raw_df = raw_df[["book_date", "valuta_date", "party", "book_text", "purpose", "amount", "balance"]]
    return raw_df


def categorize(df: pd.DataFrame) -> pd.DataFrame:
    category_attribute_subs_map = {
        "bargeld": {"party": ["bargeldauszahlung"]},
        "einkaufen": {
            "party": [
                "bio company",
                "biobackhaus",
                "edeka",
                "dm-drogerie",
                "steinecke",
                "nah und gut",
                "visa ralf oelmann",
                "combi verbrauchermarkt",
                "tchibo",
                "REWE MARKT",
                "VISA REWE VIKTOR ADLER",
                "VISA LPG BIOMARKT",
                "VISA BILLA DANKT",
            ],
            "purpose": [
                "KoRo Handels GmbH",
                "KoRo Drogerie GmbH",
                "BIO COMPANY GmbH",
                "gewuerzland",
            ],
        },
        "einnahmen": {"party": ["andreas edmond profous"]},
        "einnahmen::dividende": {"purpose": ["dividende"]},
        "geschenk": {
            "party": ["VISA SPIELVOGEL"],
            "purpose": ["superiore.de", "geschenk mama", "Marimekko"],
        },
        "gesundheit": {
            "party": [
                "ZAHNARZT DR. MUELLER",
                "JOSEPHINEN APOTHEKE",
                "PRAGER APOTHEKE",
                "FORTUNA APOTHEKE",
            ],
            "purpose": ["Center-Apotheke im Minipreis", "SPEICKSHOP", "SHAVING.IE"],
        },
        "kleidung": {"party": ["VISA MAGAZZINO"]},
        "kinder": {
            "party": [
                "Carolina Sgro",
                "Musikschule City West",
                "Erika Tribbioli",
                "KINDER- UND JUGEND-, REIT- UND FAHRVEREIN ZEHLENDORF E.V.",
            ],
            "purpose": ["Zoologischer Garten Be", "Kinderschwimmen"],
        },
        "kinder::sparen": {"purpose": ["Sparen Depot Paula"]},
        "kinder::schule": {"purpose": ["Kassenzeichen: 2134900496613 Paula Profous"]},
        "media": {
            "party": ["amznprime", "prime video", "abo lage der nation", "aws emea", "thalia.de"],
            "purpose": ["Spotify AB", "audible.de", "netflix.com"],
        },
        "mobilitaet::auto": {
            "party": [
                "sprint station",
                "visa shell",
                "riller & schnauck",
                "Bundeskasse in Kiel",
                "VISA STOP + GO SYSTEMZENTRA",
                "ARAL AG",
                "Worldline Sweden AB fuer Shell",
                "VISA ARAL STATION",
            ],
            "purpose": ["CosmosDirekt Kfz Beitrag"],
        },
        "mobilitaet::autoleihen": {"party": ["VISA ENTERPRISE RENT A CAR", "VISA RENTALCARS.COM"]},
        "mobilitaet::fahrrad": {"party": ["bike market city", "FAHRRADLADEN MEHRINGHOF"]},
        "mobilitaet::fliegen": {
            "party": ["RYANAIR"],
            "purpose": [
                "ryanair limited",
                "deutsche lufthansa",
                "Koninklijke Luchtvaart Maatschappij",
            ],
        },
        "mobilitaet::oeffentlich": {
            "party": ["bvg app", "DB Fernverkehr AG"],
            "purpose": ["DB Vertrieb GmbH"],
        },
        "intern": {"party": ["andreas profous"]},
        "intern::rente": {"purpose": ["Wertpapierkauf"], "book_text": ["Wertpapierkauf"]},
        "intern::steuerklasse": {"purpose": ["Ausgleich Steuerklasse"]},
        "restaurant": {
            "party": [
                "cocolo ramen",
                "HAPPINESSHEART",
                "lieferando.de",
                "VISA RESTAURANT LENZIG",
                "VISA RESTAURANT KOINONIA",
                "VISA RESTAURANT BEL MONDO",
                "RESTAURANT PARACAS",
                "VISA EATAROUND DELIVERY",
                "VISA ZIMT UND ZUCKER",
                "VISA SPC*RESTAURANT BAHADUR",
                "VISA RESTAURANTE CALIBOCCA",
                "VISA SY RESTAURANT",
            ]
        },
        "spenden": {"party": ["Aerzte ohne Grenzen eV"]},
        "sport": {"party": ["Katherine Finger"]},
        "urlaub": {"purpose": ["Airbnb Payments", "airbnb"]},
        "wohnen": {"purpose": ["Rate, Putzen, Naturstrom", "Ausgleich WEG"]},
    }

    for category, subs_map in category_attribute_subs_map.items():
        for attribute, subs in subs_map.items():
            for sub in subs:
                df.loc[
                    df[attribute].fillna("").str.lower().str.contains(sub.lower()), "category"
                ] = category

    df.loc[
        df.party.fillna("").str.lower().str.contains("VISA APPLE.COM/BILL".lower())
        & (df.amount > -50),
        "category",
    ] = "media"

    return df


def to_yaml(df: pd.DataFrame) -> str:
    """
    Convert the dataframe to a yaml file.

    Args:
        df: The dataframe to convert.

    Returns:
        The dataframe as a yaml file.
    """
    df["book_date"] = df.book_date.dt.strftime("%Y-%m-%d")
    df["valuta_date"] = df.valuta_date.dt.strftime("%Y-%m-%d")
    yml = yaml.dump(
        df.reset_index().to_dict(orient="records"),
        sort_keys=False,
        width=120,
        indent=2,
        default_flow_style=False,
    )
    return yml


def from_yaml(yml: str) -> pd.DataFrame:
    """
    Convert a yaml file to a dataframe.

    Args:
        yml: The yaml file to convert.

    Returns:
        The yaml file as a dataframe.
    """
    df = pd.DataFrame(yaml.load(yml, yaml.Loader))
    df["book_date"] = pd.to_datetime(df["book_date"])
    df["valuta_date"] = pd.to_datetime(df["valuta_date"])
    return df


def from_yaml_file() -> pd.DataFrame:
    with open("pandacount.yml", "r") as f:
        pc = from_yaml(f.read())
    return pc


def to_yaml_file(pc: pd.DataFrame):
    yml = to_yaml(pc)
    with open("pandacount.yml", "w") as f:
        f.write(yml)


def import_to_pandacount(pc: pd.DataFrame, df: pd.DataFrame) -> pd.DataFrame:
    return pc.merge(df, on=["book_date", "valuta_date", "party", "book_text", "purpose", "amount"])


def main(file_list: List[str]):
    pc = from_yaml_file()
    for file in file_list:
        typer.echo(f"Processing {file}")
        df = pipe(file, to_raw_df, categorize)
        pc = import_to_pandacount(pc, df)

    to_yaml_file(pc)
    print(pc.head())

In [17]:
pc = from_yaml_file()

In [18]:
pc.shape

(173, 9)

In [19]:
df = pipe("downloads/Umsatzanzeige_DE97500105175409854125_20220412.csv", to_raw_df, categorize)

In [20]:
df.shape

(173, 8)

In [21]:
df.head()

Unnamed: 0,book_date,valuta_date,party,book_text,purpose,amount,balance,category
0,2022-04-12,2022-04-12,"AMAZON EU S.A R.L., NIEDERLASSUNG DEUTSCHLAND",Lastschrift,304-5833107-6686730 Amazon.de 6PMP9 55UCADFB0VK,-6.5,1431.57,
1,2022-04-12,2022-04-12,"AMAZON EU S.A R.L., NIEDERLASSUNG DEUTSCHLAND",Lastschrift,304-7367240-1654768 Amazon.de 75N6I 5E8570Z3QZG,-6.5,1438.07,
2,2022-04-12,2022-04-12,"AMAZON EU S.A R.L., NIEDERLASSUNG DEUTSCHLAND",Lastschrift,304-7367240-1654768 Amazon.de 6O504 2GUSBFQ238Q,-11.8,1444.57,
3,2022-04-12,2022-04-12,"AMAZON EU S.A R.L., NIEDERLASSUNG DEUTSCHLAND",Lastschrift,304-7367240-1654768 Amazon.de 4672P 80A6KWWRF53,-7.9,1456.37,
4,2022-04-12,2022-04-12,"AMAZON EU S.A R.L., NIEDERLASSUNG DEUTSCHLAND",Lastschrift,304-7367240-1654768 Amazon.de 2YAW6 BI38UXWHIHX,-13.8,1464.27,


In [22]:
pc.head()

Unnamed: 0,index,book_date,valuta_date,party,book_text,purpose,amount,balance,category
0,0,2022-04-12,2022-04-12,"AMAZON EU S.A R.L., NIEDERLASSUNG DEUTSCHLAND",Lastschrift,304-5833107-6686730 Amazon.de 6PMP9 55UCADFB0VK,-6.5,1431.57,
1,1,2022-04-12,2022-04-12,"AMAZON EU S.A R.L., NIEDERLASSUNG DEUTSCHLAND",Lastschrift,304-7367240-1654768 Amazon.de 75N6I 5E8570Z3QZG,-6.5,1438.07,
2,2,2022-04-12,2022-04-12,"AMAZON EU S.A R.L., NIEDERLASSUNG DEUTSCHLAND",Lastschrift,304-7367240-1654768 Amazon.de 6O504 2GUSBFQ238Q,-11.8,1444.57,
3,3,2022-04-12,2022-04-12,"AMAZON EU S.A R.L., NIEDERLASSUNG DEUTSCHLAND",Lastschrift,304-7367240-1654768 Amazon.de 4672P 80A6KWWRF53,-7.9,1456.37,
4,4,2022-04-12,2022-04-12,"AMAZON EU S.A R.L., NIEDERLASSUNG DEUTSCHLAND",Lastschrift,304-7367240-1654768 Amazon.de 2YAW6 BI38UXWHIHX,-13.8,1464.27,


In [25]:
pc.drop(labels=["index"], axis=1, inplace=True)
pc

Unnamed: 0,book_date,valuta_date,party,book_text,purpose,amount,balance,category
0,2022-04-12,2022-04-12,"AMAZON EU S.A R.L., NIEDERLASSUNG DEUTSCHLAND",Lastschrift,304-5833107-6686730 Amazon.de 6PMP9 55UCADFB0VK,-6.50,1431.57,
1,2022-04-12,2022-04-12,"AMAZON EU S.A R.L., NIEDERLASSUNG DEUTSCHLAND",Lastschrift,304-7367240-1654768 Amazon.de 75N6I 5E8570Z3QZG,-6.50,1438.07,
2,2022-04-12,2022-04-12,"AMAZON EU S.A R.L., NIEDERLASSUNG DEUTSCHLAND",Lastschrift,304-7367240-1654768 Amazon.de 6O504 2GUSBFQ238Q,-11.80,1444.57,
3,2022-04-12,2022-04-12,"AMAZON EU S.A R.L., NIEDERLASSUNG DEUTSCHLAND",Lastschrift,304-7367240-1654768 Amazon.de 4672P 80A6KWWRF53,-7.90,1456.37,
4,2022-04-12,2022-04-12,"AMAZON EU S.A R.L., NIEDERLASSUNG DEUTSCHLAND",Lastschrift,304-7367240-1654768 Amazon.de 2YAW6 BI38UXWHIHX,-13.80,1464.27,
...,...,...,...,...,...,...,...,...
168,2021-04-30,2021-04-30,Andreas Profous,Gutschrift aus Dauerauftrag,"Rate, Putzen, Naturstrom O2, HUK24, GEZ, Kita",1300.00,2541.40,wohnen
169,2021-04-19,2021-04-19,Erika Tribbioli,Überweisung,"Teatro, Paula Profous, 2d",-32.00,1241.40,kinder
170,2021-04-15,2021-04-15,NaturStromHandel GmbH,Lastschrift,Rechnungsnummer DR-16081796 / Vertr agsnummer ...,-105.42,1273.40,
171,2021-04-13,2021-04-13,Bundesagentur fur Arbeit - Familienkasse,Gutschrift,KG039922FK356574 0421 48044238560/3 000106812943,438.00,1378.82,


In [26]:
pc == df

Unnamed: 0,book_date,valuta_date,party,book_text,purpose,amount,balance,category
0,True,True,True,True,True,True,True,False
1,True,True,True,True,True,True,True,False
2,True,True,True,True,True,True,True,False
3,True,True,True,True,True,True,True,False
4,True,True,True,True,True,True,True,False
...,...,...,...,...,...,...,...,...
168,True,True,True,True,True,True,True,True
169,True,True,True,True,True,True,True,True
170,True,True,True,True,True,True,True,False
171,True,True,True,True,True,True,True,False


In [37]:
pc = pc.merge(df, on=["book_date", "valuta_date", "party", "book_text", "purpose", "amount"], suffixes=("", "_y"))
pc.drop(axis=1, labels=["balance_y", "category_y"], inplace=True)
pc

Unnamed: 0,book_date,valuta_date,party,book_text,purpose,amount,balance,category
0,2022-04-12,2022-04-12,"AMAZON EU S.A R.L., NIEDERLASSUNG DEUTSCHLAND",Lastschrift,304-5833107-6686730 Amazon.de 6PMP9 55UCADFB0VK,-6.50,1431.57,
1,2022-04-12,2022-04-12,"AMAZON EU S.A R.L., NIEDERLASSUNG DEUTSCHLAND",Lastschrift,304-7367240-1654768 Amazon.de 75N6I 5E8570Z3QZG,-6.50,1438.07,
2,2022-04-12,2022-04-12,"AMAZON EU S.A R.L., NIEDERLASSUNG DEUTSCHLAND",Lastschrift,304-7367240-1654768 Amazon.de 6O504 2GUSBFQ238Q,-11.80,1444.57,
3,2022-04-12,2022-04-12,"AMAZON EU S.A R.L., NIEDERLASSUNG DEUTSCHLAND",Lastschrift,304-7367240-1654768 Amazon.de 4672P 80A6KWWRF53,-7.90,1456.37,
4,2022-04-12,2022-04-12,"AMAZON EU S.A R.L., NIEDERLASSUNG DEUTSCHLAND",Lastschrift,304-7367240-1654768 Amazon.de 2YAW6 BI38UXWHIHX,-13.80,1464.27,
...,...,...,...,...,...,...,...,...
168,2021-04-30,2021-04-30,Andreas Profous,Gutschrift aus Dauerauftrag,"Rate, Putzen, Naturstrom O2, HUK24, GEZ, Kita",1300.00,2541.40,wohnen
169,2021-04-19,2021-04-19,Erika Tribbioli,Überweisung,"Teatro, Paula Profous, 2d",-32.00,1241.40,kinder
170,2021-04-15,2021-04-15,NaturStromHandel GmbH,Lastschrift,Rechnungsnummer DR-16081796 / Vertr agsnummer ...,-105.42,1273.40,
171,2021-04-13,2021-04-13,Bundesagentur fur Arbeit - Familienkasse,Gutschrift,KG039922FK356574 0421 48044238560/3 000106812943,438.00,1378.82,
