<a href="https://colab.research.google.com/github/axjasf/YNAB-Categorizer/blob/main/YNAB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

## Libraries

In [1]:
# Installing libraries
!pip install fuzzywuzzy
!pip install python-Levenshtein

Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0
Collecting python-Levenshtein
  Downloading python_Levenshtein-0.21.1-py3-none-any.whl (9.4 kB)
Collecting Levenshtein==0.21.1 (from python-Levenshtein)
  Downloading Levenshtein-0.21.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (172 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m172.5/172.5 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rapidfuzz<4.0.0,>=2.3.0 (from Levenshtein==0.21.1->python-Levenshtein)
  Downloading rapidfuzz-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m58.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, Levenshtein, python-Levenshtein
Successfully installed Levenshtein-0.21.1 python-Levenshtein-0.21.1 rapidf

In [2]:
# Importing libraries
import pandas as pd
import numpy as np
import os
from ipywidgets import widgets
from IPython.display import display
import json
import io
from fuzzywuzzy import process


## Files

In [3]:
# Folder names and File names

HOME_PATH = "/content/drive/MyDrive/Colab Notebooks/YNAB/"
CONFIG_PATH = HOME_PATH + "/config/"
TRANSACTIONS_PATH = HOME_PATH + "/transactions/"

TRANSACTIONS_FILE = "chase.csv"


In [4]:
# Mounting Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
### Config Files ###

# Read Grocery keywords
grocery_keywords_df = pd.read_csv(CONFIG_PATH + 'grocery_keywords.csv')
grocery_keywords = grocery_keywords_df.iloc[:, 0].str.lower().tolist()

# Checking, if data is valid
# grocery_keywords_df.head()

In [6]:
### Amazon Orders ###

# Read Amazon Order Headers and Items
amazon_order_headers_df = pd.read_csv(TRANSACTIONS_PATH + 'amazon_order_headers.csv')
amazon_order_items_df = pd.read_csv(TRANSACTIONS_PATH + 'amazon_order_items.csv')

# Checking, if data is valid
# amazon_order_headers_df.head()
# amazon_order_items_df.head()

In [7]:
### Transactions ###

# Read bank transactions
transactions_df = pd.read_csv(TRANSACTIONS_PATH + TRANSACTIONS_FILE)

# Checking, if data is valid
# transactions_df.head()


## Global Variables

In [8]:
### CONSTANTS ###

# Major categories
category= {
    "grocery" : "Food, Sports, Entertainment:Food - Groceries & Errands"
}

# Transactions preparation

## Amazon Payee identification

In [10]:
# interactive

import time

CONFIG_FILE_PATH = HOME_PATH + "/config/gas_stations_patterns_config.json"

# Add a global variable to keep track of payees to remove
payees_to_remove = []

def load_config(file_path):
    """Load configurations from the given file path."""
    with open(file_path, 'r') as file:
        return json.load(file)

def identify_potential_outliers(transactions_df, config):
    patterns = config["patterns"]
    threshold = config["fuzzy_threshold"]
    payee_field = "Description"
    negative_list = config.get("negative_list", [])

    # Identify matched entries
    direct_matches = direct_match(transactions_df, patterns, payee_field)
    fuzzy_matches = fuzzy_match(transactions_df, patterns, payee_field)

    all_matches = list(set(direct_matches + [item[0] for item in fuzzy_matches if item[1] >= threshold]))

    # Filter out any payees present in the negative_list from the all_matches
    all_matches = [match for match in all_matches if match not in negative_list]

    # Assign matched_df to all matches
    matched_df = transactions_df[transactions_df[payee_field].isin(all_matches)]

    uncertain_matches = log_uncertain_matches(transactions_df, patterns, threshold, payee_field)
    uncertain_payees = [entry[0] for entry in uncertain_matches]

    # Filter out any payees present in the negative_list from the uncertain_payees
    uncertain_payees = [payee for payee in uncertain_payees if payee not in negative_list]

    return uncertain_payees, all_matches, matched_df, threshold

def process_feedback(entry, action):
    """Process user feedback."""
    if action == "add_to_patterns":
        config['patterns'].append(entry)
    elif action == "remove_from_df":
        # Add to the negative list
        config['negative_list'].append(entry)
        # Logic to remove the entry from the results DataFrame goes here
    elif action == "add_to_pending_review":
        config['monitoring']['pending_review'].append(entry)

    # Save the updated config back to the file
    with open(CONFIG_PATH + 'gas_stations_patterns_config.json', 'w') as file:
        json.dump(config, file)

def direct_match(transactions_df, patterns, payee_field):
    """Identify direct matches based on patterns."""
    matched_entries = []
    for pattern in patterns:
        matched_entries.extend(transactions_df[transactions_df[payee_field].str.contains(pattern, case=False, na=False)][payee_field].unique().tolist())
    return matched_entries

def fuzzy_match(transactions_df, patterns, payee_field):
    """Identify all fuzzy matches based on patterns."""
    matched_entries_with_scores = []
    for payee in transactions_df[payee_field].unique():
        for pattern in patterns:
            score = process.extractOne(pattern, [payee])[1]
            matched_entries_with_scores.append((payee, score))
    return matched_entries_with_scores

def log_uncertain_matches(transactions_df, patterns, threshold, payee_field):
    """Log uncertain matches for review."""
    log_entries = []
    for payee in transactions_df[payee_field].unique():
        for pattern in patterns:
            score = process.extractOne(pattern, [payee])[1]
            if 70 <= score < threshold:
                log_entries.append((payee, score))
    return log_entries

def create_feedback_widgets(matches_with_scores, threshold):
    """Create interactive widgets for user feedback on matches with scores."""

    # Filter matches_with_scores based on the threshold
    filtered_matches_with_scores = [(entry, score) for entry, score in matches_with_scores if score >= threshold]

    for entry, score in filtered_matches_with_scores:
        checkbox = widgets.Checkbox(value=True, description=f"{entry} (Score: {score})")

        # Preselect dropdown based on score
        preselected_action = 'add_to_patterns'

        action_dropdown = widgets.Dropdown(
            options=['add_to_patterns', 'remove_from_df', 'add_to_pending_review'],
            value=preselected_action,
            description='Action:'
        )

        # Display the widget immediately
        display(widgets.HBox([checkbox, action_dropdown]))

    # Add bulk action buttons
    def bulk_action(action):
        for hbox in widgets_list:
            _, action_dropdown = hbox.children
            action_dropdown.value = action

    accept_all_button = widgets.Button(description="Accept all")
    accept_all_button.on_click(lambda x: bulk_action('add_to_patterns'))

    remove_all_button = widgets.Button(description="Remove from DF all")
    remove_all_button.on_click(lambda x: bulk_action('remove_from_df'))

    review_all_button = widgets.Button(description="All into review")
    review_all_button.on_click(lambda x: bulk_action('add_to_pending_review'))

    bulk_actions = widgets.HBox([accept_all_button, remove_all_button, review_all_button])

    submit_button = widgets.Button(description="Submit Feedback")

    def on_submit(button):
        """Handle the feedback upon submission."""
        global payees_to_remove

        for hbox in widgets_list:
            checkbox, action_dropdown = hbox.children
            entry = checkbox.description.split(" (Score:")[0]

            # If the checkbox is unchecked, add the payee to the removal list
            if not checkbox.value:
                payees_to_remove.append(entry)
            else:
                # Handle feedback for checked checkboxes
                action = action_dropdown.value
                update_monitor_list(config, entry, action)

        # Remove unchecked payees from the dataframe
        transactions_df.drop(transactions_df[transactions_df['Description'].isin(payees_to_remove)].index, inplace=True)

    submit_button.on_click(on_submit)

    return widgets.VBox([bulk_actions] + widgets_list + [submit_button])

def update_monitor_list(config, entry, action):
    """Update the monitoring list and potentially the main patterns based on user input."""

    # Remove from pending_review
    if entry in config['monitoring']['pending_review']:
        config['monitoring']['pending_review'].remove(entry)

    # Handle based on action
    if action == "add_to_patterns":
        config['patterns'].append(entry)
    elif action == "remove_from_df":
        config['negative_list'].append(entry)
        payees_to_remove.append(entry)  # Also add it to the global list of payees to be removed
    elif action == "add_to_pending_review":
        config['monitoring']['pending_review'].append(entry)

    # Save the updated config back to the file
    with open(CONFIG_FILE_PATH, 'w') as file:
        json.dump(config, file)


# Load the config
config = load_config(CONFIG_FILE_PATH)

# Identify potential outliers and matches
uncertain_payees, matched_payees, matched_df, threshold = identify_potential_outliers(transactions_df, config)
# Get fuzzy matches with scores
fuzzy_matches_with_scores = fuzzy_match(transactions_df, config["patterns"], "Description")
input("Press Enter to continue...3")
# Display the feedback widgets for all fuzzy matches above the threshold
feedback_widgets = create_feedback_widgets(fuzzy_matches_with_scores, threshold)
display(feedback_widgets)


# def test_widget_display():
#    checkbox = widgets.Checkbox(value=True, description=f"Test Entry (Score: 90)")
#    action_dropdown = widgets.Dropdown(
#        options=['add_to_patterns', 'remove_from_df', 'add_to_pending_review'],
#        value='add_to_patterns',
#        description='Action:'
#    )
#
#    hbox = widgets.HBox([checkbox, action_dropdown])
#    vbox = widgets.VBox([hbox])
#    display(vbox)
# test_widget_display()





Press Enter to continue...3


HBox(children=(Checkbox(value=True, description='SHELL OIL 57444216105 (Score: 90)'), Dropdown(description='Ac…

HBox(children=(Checkbox(value=True, description='CHEVRON 0090878 (Score: 90)'), Dropdown(description='Action:'…

HBox(children=(Checkbox(value=True, description='Aral Station 140980178 (Score: 90)'), Dropdown(description='A…

HBox(children=(Checkbox(value=True, description='Payment Thank You-Mobile (Score: 90)'), Dropdown(description=…

HBox(children=(Checkbox(value=True, description='Aral Station 140486127 (Score: 90)'), Dropdown(description='A…

NameError: ignored

## NEXT / General ideas
General algo
*   For all Amazon transactions
  * run the Amazon Orders program and put them into a QIF file
*   Go through all other payees
  *   List item



# Amazon Orders

## Pricing

In [None]:
### PRICE TOTALS CALCULATIONS ###
### HEADER ###

# Extract the individual payments for one row and put them into columns
def extract_individual_payments_into_columns(df, column_name):
    # Extract dollar values from the specified column
    payments_dollar_values = df[column_name].str.findall(r'\$(\d+\.\d+)')

    # Determine the maximum number of payments in any row
    max_payments = payments_dollar_values.apply(len).max()

    # Pad the lists to ensure they all have a length of max_payments
    padded_payments = payments_dollar_values.apply(lambda x: x + [None] * (max_payments - len(x)))

    # Convert the padded lists to a DataFrame with dynamic column names
    payment_columns = [f'payment_{i+1}' for i in range(max_payments)]
    df[payment_columns] = pd.DataFrame(padded_payments.tolist(), index=df.index)

    # Convert string values to float for further analysis if needed
    for col in payment_columns:
        df[col] = df[col].astype(float)

# Apply the function to the original dataframe
extract_individual_payments_into_columns(amazon_order_headers, "payments")

# amazon_order_headers.head()

In [None]:
### PRICE TOTALS CALCULATIONS ###
### ITEMS ###

# Item Totals (Quantity * Price) #
amazon_order_items = amazon_order_items.copy()
if amazon_order_items['price'].dtype == 'object':
    amazon_order_items['price'] = pd.to_numeric(amazon_order_items['price'].str.replace('[\$,]', '', regex=True), errors='coerce')
amazon_order_items['quantity'] = amazon_order_items['quantity'].replace([' ', ''], np.nan)
amazon_order_items['quantity'] = pd.to_numeric(amazon_order_items['quantity'], errors='coerce').fillna(1)
amazon_order_items['total'] = amazon_order_items['price'] * amazon_order_items['quantity']

### Distribution of taxes etc. to individual order items ###

# Group the items dataframe by 'order id' and compute the sum of the 'total' column for each order
# Merge the summed items total with the headers dataframe on 'order id'
# Convert 'total_header' column to numeric format, setting non-numeric values to NaN
# Compute the difference between the 'total' from the headers dataframe and the computed sum of items for each order
# Compute the count of items for each order
# Merge the count of items with the merged dataframe
# Compute the 'overall' value for each order
# Merge the 'overall' value with the original items dataframe to create the 'overall' column

items_sum = amazon_order_items.groupby('order id')['total'].sum().reset_index()
merged_df = amazon_order_headers.merge(items_sum, on='order id', how='left', suffixes=('_header', '_items'))
merged_df['total_header'] = pd.to_numeric(merged_df['total_header'].str.replace('[\$,]', '', regex=True), errors='coerce')
merged_df['difference'] = merged_df['total_header'] - merged_df['total_items']
items_count = amazon_order_items.groupby('order id').size().reset_index(name='count')
merged_df = merged_df.merge(items_count, on='order id', how='left')
merged_df['residual'] = merged_df['difference'] / merged_df['count']
amazon_order_items = amazon_order_items.merge(merged_df[['order id', 'residual']], on='order id', how='left')
amazon_order_items['grand_total'] = amazon_order_items['total'] + amazon_order_items['residual'].round(2)

amazon_order_items.head()

## Item categorization

In [None]:
# Initialize a category column with default value as 'non-grocery'
amazon_order_items['category'] = 'n/a'

# Check each item description against the keywords
for keyword in grocery_keywords:
    amazon_order_items.loc[amazon_order_items['description'].str.lower().str.contains(keyword, na=False), 'category'] = 'grocery'

amazon_order_items.head()


# Output

## Transaction creation

In [None]:
### SPLIT TRANSACTION ###

def create_split(category, memo, amount):
    return (category, memo, amount)

def create_transaction(date, total_amount, payee, memo, category=None, splits=None):
    return {
        "date": date,
        "total_amount": total_amount,
        "payee": payee,
        "memo": memo,
        "category": category,
        "splits": splits
    }

def export_qif_transactions(filename, transactions):
    qif_content = "!Type:Cash\n"

    for transaction in transactions:
        # Add the transaction details
        qif_content += f"D{transaction['date']}\n"
        qif_content += f"T{transaction['total_amount']}\n"
        qif_content += f"P{transaction['payee']}\n"
        qif_content += f"M{transaction['memo']}\n"

        # If a category is provided for a simple transaction, add it
        if transaction.get('category') and not transaction.get('splits'):
            qif_content += f"L{transaction['category']}\n"

        # If splits exist for the transaction, add them
        splits = transaction.get('splits')
        if splits:
            qif_content += "LSplit Transaction\n"
            for split in splits:
                category, memo, amount = split
                qif_content += f"S{category}\n"
                qif_content += f"E{memo}\n"
                qif_content += f"${amount}\n"

        # End the transaction
        qif_content += "^\n"

    # Export to a file in Google Drive
    with open(HOME_PATH + "/results/" + filename, "w") as file:
        file.write(qif_content)

def export_qif_transactions_by_chunk(transactions, x, base_filename="transactions"):
    """
    Export the transactions in QIF format, splitting them into separate files,
    each containing up to x transactions.

    Parameters:
    - transactions: List of transaction dictionaries.
    - x: Number of transactions per file.
    - base_filename: Base name for the output files. Files will be named as base_filename_1.qif, base_filename_2.qif, ...

    Returns:
    - List of saved file paths.
    """

    # Validate x
    if not (1 <= x <= 100):
        raise ValueError("x should be between 1 and 100")

    saved_files = []

    for idx, i in enumerate(range(0, len(transactions), x)):
        chunk = transactions[i:i+x]
        qif_content = "!Type:Cash\n"

        for transaction in chunk:
            # Add the transaction details
            qif_content += f"D{transaction['date']}\n"
            qif_content += f"T{transaction['total_amount']}\n"
            qif_content += f"P{transaction['payee']}\n"
            qif_content += f"M{transaction['memo']}\n"

            # If a category is provided for a simple transaction, add it
            if transaction.get('category') and not transaction.get('splits'):
                qif_content += f"L{transaction['category']}\n"

            # If splits exist for the transaction, add them
            splits = transaction.get('splits')
            if splits:
                qif_content += "LSplit Transaction\n"
                for split in splits:
                    category, memo, amount = split
                    qif_content += f"S{category}\n"
                    qif_content += f"E{memo}\n"
                    qif_content += f"${amount}\n"

            # End the transaction
            qif_content += "^\n"

        # Define filename for this chunk
        filename = f"{base_filename}_{idx+1}.qif"
        filepath = HOME_PATH + "/results/" + filename

        # Export to a file
        with open(filepath, "w") as file:
            file.write(qif_content)

        saved_files.append(filepath)

    return saved_files


## QIF File creation

In [None]:
# Using the provided functions to regenerate the QIF transactions and export them

# Generate the QIF transactions

transactions = []

from datetime import datetime

# Iterate over each unique order
for order_id, group in amazon_order_items.groupby('order id'):
    order_date = group['order date'].iloc[0]
    total_amount = group['grand_total'].sum()
    payee = "Amazon" + " " + order_id

    if len(group) == 1: # Check for single item transactions
        row = group.iloc[0]
        if row['category'] == 'grocery':
            transaction = create_transaction(order_date, f"{row['grand_total']:.2f}", "Amazon " + order_id, "", category=category["grocery"])
        else:
            transaction = create_transaction(order_date, f"{total_amount:.2f}", payee, memo=row['description'])
        transactions.append(transaction)

    else:
        # For multiple items in the transaction:
        # Grocery split amount
        grocery_total = group.loc[group['category'] == 'grocery', 'grand_total'].sum()

        # Non-grocery splits
        non_grocery_splits = []
        for _, row in group[group['category'] != 'grocery'].iterrows():
          # Ensure row has valid data
          if 'description' in row and 'grand_total' in row:
            split = create_split("", row['description'], f"{row['grand_total']:.2f}")
            non_grocery_splits.append(split)

        # If grocery total is not zero, add it as a split
        if grocery_total != 0:
            grocery_split = create_split(category["grocery"], "", f"{grocery_total:.2f}")
            non_grocery_splits.append(grocery_split)

        # Create the transaction
        transaction = create_transaction(order_date, f"{total_amount:.2f}", "Amazon " + order_id, "", splits=non_grocery_splits)
        transactions.append(transaction)

# Export the transactions to a QIF file by chunks
export_qif_transactions_by_chunk(transactions, 100)
